Re: [PATCH 2/2] btrfs-progs: device delete to accept devid

2015-08-17 Thread Goffredo Baroncelli
On 2015-08-14 12:36, Anand Jain wrote:
> This patch introduces new option  for the command
> 
>   btrfs device delete [..]  
> 
> In a user reported issue on a 3-disk-RAID1, one disk failed with its
> SB unreadable. Now with this patch user will have a choice to delete
> the device using devid.
> 
> The other method we could do, is to match the input device_path
> to the available device_paths with in the kernel. But that won't
> work in all the cases, like what if user provided mapper path
> when the path within the kernel is a non-mapper path.
> 
> This patch depends on the below kernel patch for the new feature to work,
> however it will fail-back to the old interface for the kernel without the
> patch
> 
>   Btrfs: device delete by devid
> 
> Signed-off-by: Anand Jain 
> ---
>  Documentation/btrfs-device.asciidoc |  2 +-
>  cmds-device.c   | 45 
> -
>  ioctl.h |  8 +++
>  3 files changed, 44 insertions(+), 11 deletions(-)
> 
> diff --git a/Documentation/btrfs-device.asciidoc 
> b/Documentation/btrfs-device.asciidoc
> index 2827598..61ede6e 100644
> --- a/Documentation/btrfs-device.asciidoc
> +++ b/Documentation/btrfs-device.asciidoc
> @@ -74,7 +74,7 @@ do not perform discard by default
>  -f|--force
>  force overwrite of existing filesystem on the given disk(s)
>  
> -*remove*  [...] ::
> +*remove* | [|...] ::
>  Remove device(s) from a filesystem identified by .
>  
>  *delete*  [...] ::
also here is missing  (below you added)

> diff --git a/cmds-device.c b/cmds-device.c
> index 0e60500..eb4358d 100644
> --- a/cmds-device.c
> +++ b/cmds-device.c
> @@ -164,16 +164,34 @@ static int _cmd_rm_dev(int argc, char **argv, const 
> char * const *usagestr)
>   struct  btrfs_ioctl_vol_args arg;
>   int res;
>  
> - if (!is_block_device(argv[i])) {
> + struct  btrfs_ioctl_vol_args_v3 argv3 = {0};
> + int its_num = false;
> +
> + if (is_numerical(argv[i])) {
> + argv3.devid = arg_strtou64(argv[i]);
> + its_num = true;
> + } else if (is_block_device(argv[i])) {
> + strncpy_null(argv3.name, argv[i]);
> + } else {
>   fprintf(stderr,
> - "ERROR: %s is not a block device\n", argv[i]);
> + "ERROR: %s is not a block device or devid\n", 
> argv[i]);
>   ret++;
>   continue;
>   }
> - memset(&arg, 0, sizeof(arg));
> - strncpy_null(arg.name, argv[i]);
> - res = ioctl(fdmnt, BTRFS_IOC_RM_DEV, &arg);
> + res = ioctl(fdmnt, BTRFS_IOC_RM_DEV_V2, &argv3);
>   e = errno;
> + if (res && e == ENOTTY) {
> + if (its_num) {
> + fprintf(stderr,
> + "Error: Kernel does not support delete by 
> devid\n");
> + ret = 1;
> + continue;
> + }
> + memset(&arg, 0, sizeof(arg));
> + strncpy_null(arg.name, argv[i]);
> + res = ioctl(fdmnt, BTRFS_IOC_RM_DEV, &arg);
> + e = errno;
> + }
>   if (res) {
>   const char *msg;
>  
> @@ -181,9 +199,16 @@ static int _cmd_rm_dev(int argc, char **argv, const char 
> * const *usagestr)
>   msg = btrfs_err_str(res);
>   else
>   msg = strerror(e);
> - fprintf(stderr,
> - "ERROR: error removing the device '%s' - %s\n",
> - argv[i], msg);
> +
> + if (its_num)
> + fprintf(stderr,
> + "ERROR: error removing the devid '%llu' 
> - %s\n",
> + argv3.devid, msg);
> + else
> + fprintf(stderr,
> + "ERROR: error removing the device '%s' 
> - %s\n",
> + argv[i], msg);
> +
>   ret++;
>   }
>   }
> @@ -193,7 +218,7 @@ static int _cmd_rm_dev(int argc, char **argv, const char 
> * const *usagestr)
>  }
>  
>  static const char * const cmd_rm_dev_usage[] = {
> - "btrfs device remove  [...] ",
> + "btrfs device remove | [|...] ",
>   "Remove a device from a filesystem",
>   NULL
>  };
> @@ -204,7 +229,7 @@ static int cmd_rm_dev(int argc, char **argv)
>  }
>  
>  static const char * const cmd_del_dev_usage[] = {
> - "btrfs device delete  [...] ",
> + "btrfs device delete | [|...] ",
>   "Remove a device from a filesystem",
>   NULL
>  };
> diff --git a/ioctl.

Re: [PATCH v2] fstests: btrfs: Add reserved space leak check for rewrite dirty page

2015-08-17 Thread Filipe David Manana
On Mon, Aug 17, 2015 at 7:07 AM, Qu Wenruo  wrote:
> Btrfs qgroup reserve codes lacks check for rewriting dirty page, causing
> every write, even rewriting a uncommitted dirty page, to reserve space.
>
> But only written data will free the reserved space, resulting reserved
> space leaking.
>
> The bug exists almost from the beginning of btrfs qgroup codes, but
> nobody found it.
>
> For example:
>
> 1)Write [0, 12K) into file A
>   reserve 12K space
>
> File A:
> 0   4K  8K  12K
> ||
> reserved: 12K
>
> 2)Write [0,4K) into file A
> 0   4K  8K  12K
> ||
> reserved: 16K <<< Should be 12K
>
> 3) Commit transaction
> Dirty pages [0,12) written to disk.
> Free 12K reserved space.
> reserved: 4K <<< Should be 0
>
> This testcase will test such problem.
> Kernel fix will need some huge change, so won't be soon.
>
> Signed-off-by: Qu Wenruo 

Thanks for doing this Qu.
Just a few comments inlined below.

> ---
> changelog:
> v2:
>   Reduce write size inside loop to ensure even commit happens, we can still
>   continue write.
> ---
>  tests/btrfs/089 | 85 
> +
>  tests/btrfs/089.out | 13 
>  tests/btrfs/group   |  1 +
>  3 files changed, 99 insertions(+)
>  create mode 100755 tests/btrfs/089
>  create mode 100644 tests/btrfs/089.out
>
> diff --git a/tests/btrfs/089 b/tests/btrfs/089
> new file mode 100755
> index 000..d521910
> --- /dev/null
> +++ b/tests/btrfs/089
> @@ -0,0 +1,85 @@
> +#! /bin/bash
> +# FS QA Test 089
> +#
> +# Check for btrfs reserved space leaking caused by overlap dirty range.

No mention of qgroups here. Re-phrasing it as "Check for qgroup
reserved space leaks caused by re-writing dirty ranges." would make it
more clear imho.

> +# This problem exists almost from qgroup function.

Confusing phrase. Function = implementation? Almost = always? Perhaps
we don't need this phrase at all, or just say it's a bug that's been
present in many implementations of btrfs' qgroup feature and for a
long time.

> +#
> +#---
> +# Copyright (c) 2015 Fujitsu. All Rights Reserved.
> +#
> +# This program is free software; you can redistribute it and/or
> +# modify it under the terms of the GNU General Public License as
> +# published by the Free Software Foundation.
> +#
> +# This program is distributed in the hope that it would be useful,
> +# but WITHOUT ANY WARRANTY; without even the implied warranty of
> +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> +# GNU General Public License for more details.
> +#
> +# You should have received a copy of the GNU General Public License
> +# along with this program; if not, write the Free Software Foundation,
> +# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
> +#---
> +#
> +
> +seq=`basename $0`
> +seqres=$RESULT_DIR/$seq
> +echo "QA output created by $seq"
> +
> +here=`pwd`
> +tmp=/tmp/$$
> +status=1   # failure is the default!
> +trap "_cleanup; exit \$status" 0 1 2 3 15
> +
> +_cleanup()
> +{
> +   cd /
> +   rm -f $tmp.*
> +}
> +
> +# get standard environment, filters and checks
> +. ./common/rc
> +. ./common/filter
> +
> +# real QA test starts here
> +
> +# Modify as appropriate.
> +_supported_fs btrfs
> +_supported_os Linux
> +_require_scratch
> +_need_to_be_root
> +
> +# Use big blocksize to ensure there is still enough space left for metadata
> +# space reserve.
> +BLOCKSIZE=$(( 2 * 1024 * 1024 )) # 2M block size
> +FILESIZE=$(( 128 * 1024 * 1024 )) # 128M file size
> +
> +_scratch_mkfs >> $seqres.full 2>&1
> +_scratch_mount
> +_require_fs_space $SCRATCH_MNT $(($FILESIZE * 2 / 1024))
> +
> +_run_btrfs_util_prog quota enable $SCRATCH_MNT
> +_run_btrfs_util_prog qgroup limit $FILESIZE 5 $SCRATCH_MNT
> +
> +# loop 5 times without sync to ensure reserved space leak will happen
> +for i in `seq 1 5`; do
> +   # Use 1/4 of the file size, to ensure even commit is trigger by
> +   # dirty page threshold or commit interval, we should still be
> +   # able to continue write
> +   $XFS_IO_PROG -f -c "pwrite -b $BLOCKSIZE 0 $(($FILESIZE / 4))" \
> +   $SCRATCH_MNT/foo | _filter_xfs_io
> +done
> +
> +# remove the file and sync, to ensure all space freed

This comment feels like it should go to above the call to "rm", and
here a small explanation of why we do "sync" before calling "rm"
wouldn't hurt (not everyone is too familiar with the qgroups
implementation and knows that it frees space reservation at
transaction commit time).

> +sync
> +# error shouldn't happen, as BLOCKSIZE is large enough for metdata cow
> +rm $SCRATCH_MNT/foo || _fail "reserved space leak detected"

Why do we need || _fail ...? If rm fails it prints an error message to
stderr which makes the test fail.

> +sync
> +
> +# We should be able to write $FILESIZE - $B

Re: [PATCH v2] fstests: btrfs: Add reserved space leak check for rewrite dirty page

2015-08-17 Thread Qu Wenruo



Filipe David Manana wrote on 2015/08/17 10:18 +0100:

On Mon, Aug 17, 2015 at 7:07 AM, Qu Wenruo  wrote:

Btrfs qgroup reserve codes lacks check for rewriting dirty page, causing
every write, even rewriting a uncommitted dirty page, to reserve space.

But only written data will free the reserved space, resulting reserved
space leaking.

The bug exists almost from the beginning of btrfs qgroup codes, but
nobody found it.

For example:

1)Write [0, 12K) into file A
   reserve 12K space

File A:
0   4K  8K  12K
||
reserved: 12K

2)Write [0,4K) into file A
0   4K  8K  12K
||
reserved: 16K <<< Should be 12K

3) Commit transaction
Dirty pages [0,12) written to disk.
Free 12K reserved space.
reserved: 4K <<< Should be 0

This testcase will test such problem.
Kernel fix will need some huge change, so won't be soon.

Signed-off-by: Qu Wenruo 


Thanks for doing this Qu.
Just a few comments inlined below.


Thanks for reviewing Filipe.




---
changelog:
v2:
   Reduce write size inside loop to ensure even commit happens, we can still
   continue write.
---
  tests/btrfs/089 | 85 +
  tests/btrfs/089.out | 13 
  tests/btrfs/group   |  1 +
  3 files changed, 99 insertions(+)
  create mode 100755 tests/btrfs/089
  create mode 100644 tests/btrfs/089.out

diff --git a/tests/btrfs/089 b/tests/btrfs/089
new file mode 100755
index 000..d521910
--- /dev/null
+++ b/tests/btrfs/089
@@ -0,0 +1,85 @@
+#! /bin/bash
+# FS QA Test 089
+#
+# Check for btrfs reserved space leaking caused by overlap dirty range.


No mention of qgroups here. Re-phrasing it as "Check for qgroup
reserved space leaks caused by re-writing dirty ranges." would make it
more clear imho.


+# This problem exists almost from qgroup function.


Confusing phrase. Function = implementation? Almost = always? Perhaps
we don't need this phrase at all, or just say it's a bug that's been
present in many implementations of btrfs' qgroup feature and for a
long time.
My bad, I was originally meaning "The problem exists since initial 
qgroup codes" or something like that.

Anyway, I'll use your expression.



+#
+#---
+# Copyright (c) 2015 Fujitsu. All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#---
+#
+
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+
+here=`pwd`
+tmp=/tmp/$$
+status=1   # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+_cleanup()
+{
+   cd /
+   rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+
+# real QA test starts here
+
+# Modify as appropriate.
+_supported_fs btrfs
+_supported_os Linux
+_require_scratch
+_need_to_be_root
+
+# Use big blocksize to ensure there is still enough space left for metadata
+# space reserve.
+BLOCKSIZE=$(( 2 * 1024 * 1024 )) # 2M block size
+FILESIZE=$(( 128 * 1024 * 1024 )) # 128M file size
+
+_scratch_mkfs >> $seqres.full 2>&1
+_scratch_mount
+_require_fs_space $SCRATCH_MNT $(($FILESIZE * 2 / 1024))
+
+_run_btrfs_util_prog quota enable $SCRATCH_MNT
+_run_btrfs_util_prog qgroup limit $FILESIZE 5 $SCRATCH_MNT
+
+# loop 5 times without sync to ensure reserved space leak will happen
+for i in `seq 1 5`; do
+   # Use 1/4 of the file size, to ensure even commit is trigger by
+   # dirty page threshold or commit interval, we should still be
+   # able to continue write
+   $XFS_IO_PROG -f -c "pwrite -b $BLOCKSIZE 0 $(($FILESIZE / 4))" \
+   $SCRATCH_MNT/foo | _filter_xfs_io
+done
+
+# remove the file and sync, to ensure all space freed


This comment feels like it should go to above the call to "rm", and
here a small explanation of why we do "sync" before calling "rm"
wouldn't hurt (not everyone is too familiar with the qgroups
implementation and knows that it frees space reservation at
transaction commit time).


Make sense.




+sync
+# error shouldn't happen, as BLOCKSIZE is large enough for metdata cow
+rm $SCRATCH_MNT/foo || _fail "reserved space leak detected"


Why do we need || _fail ...? If rm fails it prints an error message to
stderr which makes the test fail.

Just to end the test as soon as possible

[PATCH 1/2] btrfs: Remove useless condition in start_log_trans()

2015-08-17 Thread Zhao Lei
Dan Carpenter  reported a smatch warning
for start_log_trans():
 fs/btrfs/tree-log.c:178 start_log_trans()
 warn: we tested 'root->log_root' before and it was 'false'

 fs/btrfs/tree-log.c
 147  if (root->log_root) {
 We test "root->log_root" here.
 ...

Reason:
 Condition of:
 fs/btrfs/tree-log.c:178: if (!root->log_root) {
 is not necessary after commit: 7237f1833

 It caused a smatch warning, and no functionally error.

Fix:
 Deleting above condition will make smatch shut up,
 but a better way is to do cleanup for start_log_trans()
 to remove duplicated code and make code more readable.

Reported-by: Dan Carpenter 
Signed-off-by: Zhao Lei 
---
 fs/btrfs/tree-log.c | 43 +--
 1 file changed, 17 insertions(+), 26 deletions(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 9c45431..66a8a58 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -140,55 +140,46 @@ static int start_log_trans(struct btrfs_trans_handle 
*trans,
   struct btrfs_root *root,
   struct btrfs_log_ctx *ctx)
 {
-   int index;
-   int ret;
+   int ret = 0;
 
mutex_lock(&root->log_mutex);
+
if (root->log_root) {
if (btrfs_need_log_full_commit(root->fs_info, trans)) {
ret = -EAGAIN;
goto out;
}
+
if (!root->log_start_pid) {
-   root->log_start_pid = current->pid;
clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
+   root->log_start_pid = current->pid;
} else if (root->log_start_pid != current->pid) {
set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
}
+   } else {
+   mutex_lock(&root->fs_info->tree_log_mutex);
+   if (!root->fs_info->log_root_tree)
+   ret = btrfs_init_log_root_tree(trans, root->fs_info);
+   mutex_unlock(&root->fs_info->tree_log_mutex);
+   if (ret)
+   goto out;
 
-   atomic_inc(&root->log_batch);
-   atomic_inc(&root->log_writers);
-   if (ctx) {
-   index = root->log_transid % 2;
-   list_add_tail(&ctx->list, &root->log_ctxs[index]);
-   ctx->log_transid = root->log_transid;
-   }
-   mutex_unlock(&root->log_mutex);
-   return 0;
-   }
-
-   ret = 0;
-   mutex_lock(&root->fs_info->tree_log_mutex);
-   if (!root->fs_info->log_root_tree)
-   ret = btrfs_init_log_root_tree(trans, root->fs_info);
-   mutex_unlock(&root->fs_info->tree_log_mutex);
-   if (ret)
-   goto out;
-
-   if (!root->log_root) {
ret = btrfs_add_log_tree(trans, root);
if (ret)
goto out;
+
+   clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
+   root->log_start_pid = current->pid;
}
-   clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
-   root->log_start_pid = current->pid;
+
atomic_inc(&root->log_batch);
atomic_inc(&root->log_writers);
if (ctx) {
-   index = root->log_transid % 2;
+   int index = root->log_transid % 2;
list_add_tail(&ctx->list, &root->log_ctxs[index]);
ctx->log_transid = root->log_transid;
}
+
 out:
mutex_unlock(&root->log_mutex);
return ret;
-- 
1.8.5.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/2] btrfs: Remove unused arguments in tree-log.c

2015-08-17 Thread Zhao Lei
Following arguments are not used in tree-log.c:
 insert_one_name(): path, type
 wait_log_commit(): trans
 wait_for_writer(): trans

This patch remove them.

Signed-off-by: Zhao Lei 
---
 fs/btrfs/tree-log.c | 25 +++--
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 66a8a58..5816a5d 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1540,9 +1540,8 @@ static noinline int link_to_fixup_dir(struct 
btrfs_trans_handle *trans,
  */
 static noinline int insert_one_name(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
-   struct btrfs_path *path,
u64 dirid, u64 index,
-   char *name, int name_len, u8 type,
+   char *name, int name_len,
struct btrfs_key *location)
 {
struct inode *inode;
@@ -1710,8 +1709,8 @@ insert:
goto out;
}
btrfs_release_path(path);
-   ret = insert_one_name(trans, root, path, key->objectid, key->offset,
- name, name_len, log_type, &log_key);
+   ret = insert_one_name(trans, root, key->objectid, key->offset,
+ name, name_len, &log_key);
if (ret && ret != -ENOENT && ret != -EEXIST)
goto out;
update_size = false;
@@ -2526,8 +2525,7 @@ static int update_log_root(struct btrfs_trans_handle 
*trans,
return ret;
 }
 
-static void wait_log_commit(struct btrfs_trans_handle *trans,
-   struct btrfs_root *root, int transid)
+static void wait_log_commit(struct btrfs_root *root, int transid)
 {
DEFINE_WAIT(wait);
int index = transid % 2;
@@ -2552,8 +2550,7 @@ static void wait_log_commit(struct btrfs_trans_handle 
*trans,
 atomic_read(&root->log_commit[index]));
 }
 
-static void wait_for_writer(struct btrfs_trans_handle *trans,
-   struct btrfs_root *root)
+static void wait_for_writer(struct btrfs_root *root)
 {
DEFINE_WAIT(wait);
 
@@ -2633,7 +2630,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 
index1 = log_transid % 2;
if (atomic_read(&root->log_commit[index1])) {
-   wait_log_commit(trans, root, log_transid);
+   wait_log_commit(root, log_transid);
mutex_unlock(&root->log_mutex);
return ctx->log_ret;
}
@@ -2642,7 +2639,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 
/* wait for previous tree log sync to complete */
if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
-   wait_log_commit(trans, root, log_transid - 1);
+   wait_log_commit(root, log_transid - 1);
 
while (1) {
int batch = atomic_read(&root->log_batch);
@@ -2653,7 +2650,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
schedule_timeout_uninterruptible(1);
mutex_lock(&root->log_mutex);
}
-   wait_for_writer(trans, root);
+   wait_for_writer(root);
if (batch == atomic_read(&root->log_batch))
break;
}
@@ -2750,7 +2747,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages,
mark);
btrfs_wait_logged_extents(trans, log, log_transid);
-   wait_log_commit(trans, log_root_tree,
+   wait_log_commit(log_root_tree,
root_log_ctx.log_transid);
mutex_unlock(&log_root_tree->log_mutex);
if (!ret)
@@ -2761,11 +2758,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
atomic_set(&log_root_tree->log_commit[index2], 1);
 
if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
-   wait_log_commit(trans, log_root_tree,
+   wait_log_commit(log_root_tree,
root_log_ctx.log_transid - 1);
}
 
-   wait_for_writer(trans, log_root_tree);
+   wait_for_writer(log_root_tree);
 
/*
 * now that we've moved on to the tree of log tree roots,
-- 
1.8.5.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/2] btrfs-progs: device delete to accept devid

2015-08-17 Thread Anand Jain





-*remove*  [...] ::
+*remove* | [|...] ::
  Remove device(s) from a filesystem identified by .

  *delete*  [...] ::

also here is missing  (below you added)


 Thank you. Got this fixed.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: The performance is not as expected when used several disks on raid0.

2015-08-17 Thread Austin S Hemmelgarn

On 2015-08-15 02:30, Duncan wrote:

Austin S Hemmelgarn posted on Fri, 14 Aug 2015 15:58:30 -0400 as
excerpted:


FWIW, running BTRFS on top of MDRAID actually works very well,
especially for BTRFS raid1 on top of MD-RAID0 (I get an almost 50%
performance increase for this usage over BTRFS raid10, although most of
this is probably due to how btrfs dispatches I/O's to disks in
multi-disk stetups).


Of course that's effectively a raid01, which is normally supposed to most
often be a mistakenly reversed raid10 implementation, mistakenly, due to
the IO cost of the rebuild should a device fail, since the whole raid0 of
the one raid1 side would have to be rereplicated to the other, vs only
having to rereplicate one device to the other locally, in a raid10
arrangement.

However, in this case it's a very smart arrangement, actually, the only
md-raid-under-btrfs-raid arrangement that makes real sense (well, other
than raid00, raid0 at both levels, perhaps), in particular because the
btrfs raid1 on top still gives you the full benefit of btrfs file
integrity features as well as the usual raid1 redundancy, tho in this
case it's only at the one raid0 against the other as the pair of btrfs
raid1 copies.  And the mdraid0 is much better optimized than btrfs raid0,
so there's that bonus, while at the same time the btrfs raid1 redundancy
nicely balances the usual "Russian Roulette" quality of raid0.

Very nice configuration! =:^)

Thanks for mentioning it, as I guess I was effectively ruling it out as
an option before even really considering it due to the usual raid10's
better than raid01 thing, and thus was entirely blind to the
possibility.  Which was bad, because as I alluded to, mdraid's lack of
file integrity features and thus lack of any way to have btrfs scrub
properly filter down to the mdraid level when there's mdraid level
redundancy, kind of makes a mess of things, otherwise.  But btrfs raid1
on mdraid0 effectively balances and eliminates the negatives at each
level with the strengths of the other level, and is really a quite
awesome solution, that until now I was entirely blinded to! =:^)

I've also found that BTRFS raid5/6 on top of MD RAID0 mitigates (to a 
certain extent that is) the performance penalty of doing raid5/6 if you 
aren't on ridiculously fast storage, probably not something that should 
be used in production yet, but it's how I've got the near-line backups 
setup on my home server system.  It may also be worth pointing out that 
BTRFS raid6 lets you use 4 disks minimum, as opposed to most other raid6 
implementations that (unnecessarily, as a 4 disk RAID6 is not a 
degenerate form) require 5.




smime.p7s
Description: S/MIME Cryptographic Signature


Re: So, wipe it out and start over or keep debugging?

2015-08-17 Thread Austin S Hemmelgarn

On 2015-08-15 17:46, Timothy Normand Miller wrote:

To those of you who have been helping out with my 4-drive RAID1
situation, is there anything further we should do to investigate this,
in case we can uncover any more bugs, or should I just wipe everything
out and restore from backup?

If you need the system back online, then my suggestion would be to use 
btrfs-image to get metadata images of the disks (there's an option to 
clear out private data if need be), and then restore from backup.  That 
way, we still have the problematic images to work with and examine.




smime.p7s
Description: S/MIME Cryptographic Signature


Re: Btrfs is amazing! (a lack-of-bug report)

2015-08-17 Thread Austin S Hemmelgarn

On 2015-08-16 23:35, Tyler Bletsch wrote:

I just wanted to drop you guys a line to say that I am stunned with how
excellent btrfs is. I did some testing, and the things that it did were
amazing. I took a 4-disk RAID 5 and walked it all the way down to a
one-disk volume and back again, mixed in devices of different sizes in
different modes, balanced it in every direction, trashed data on drives
without the OS knowing, and did every other form of torture I could
think of, all while looping file integrity tests, and it was perfect.

The ease of use and simplicity were great.  I was dreading having to
administer ZFS in order to get snapshots and other features, but now I
don't have to. With the exception of enterprisey features like SSD
intent logs and stuff, it is hands down far better than ZFS.

Thanks for the great work!

It's nice to hear success stories for once.  I would suggest being 
careful of using BTRFS raid5 or raid6 in production, there are probably 
still bugs that haven't yet been discovered.  Secondarily, if you can 
deal with slightly more setup and maintenance overhead, BTRFS works 
_very_ well on top of LVM (it makes online data migration much easier, 
and provides easy ways to do layered RAID setups).





smime.p7s
Description: S/MIME Cryptographic Signature


Re: kernel BUG at fs/btrfs/extent-tree.c:8113! (4.1.3 kernel)

2015-08-17 Thread Marc MERLIN
On Mon, Aug 17, 2015 at 10:01:16AM +0800, Qu Wenruo wrote:
> Hi Marc,
 
Hi Qu, thanks for your answer and looking at this.

> Did btrfs-debug-tree also has the crash?
> 
> If not, would you please attach the output if it doesn't contain
> classified data.
 
Sure thing:
btrfs-debug-tree /dev/mapper/crypt_sdd1 > /tmp/tree.out
parent transid verify failed on 2968115101696 wanted 34855 found 39533
parent transid verify failed on 2968115101696 wanted 34855 found 39533
parent transid verify failed on 2968115101696 wanted 34855 found 39533
parent transid verify failed on 2968115101696 wanted 34855 found 39533
Ignoring transid failure
parent transid verify failed on 2968115134464 wanted 34855 found 39533
parent transid verify failed on 2968115134464 wanted 34855 found 39533
parent transid verify failed on 2968115134464 wanted 34855 found 39533
parent transid verify failed on 2968115134464 wanted 34855 found 39533
Ignoring transid failure
parent transid verify failed on 2968115150848 wanted 34855 found 39533
parent transid verify failed on 2968115150848 wanted 34855 found 39533
parent transid verify failed on 2968115150848 wanted 34855 found 39533
parent transid verify failed on 2968115150848 wanted 34855 found 39533
Ignoring transid failure
parent transid verify failed on 2968115691520 wanted 34855 found 39533
parent transid verify failed on 2968115691520 wanted 34855 found 39533
parent transid verify failed on 2968115691520 wanted 34855 found 39533
parent transid verify failed on 2968115691520 wanted 34855 found 39533
Ignoring transid failure
parent transid verify failed on 1291597152256 wanted 35830 found 39530
parent transid verify failed on 1291597152256 wanted 35830 found 39530
parent transid verify failed on 1291597152256 wanted 35830 found 39530
parent transid verify failed on 1291597152256 wanted 35830 found 39530
Ignoring transid failure
parent transid verify failed on 2968116592640 wanted 34855 found 39533
parent transid verify failed on 2968116592640 wanted 34855 found 39533
parent transid verify failed on 2968116592640 wanted 34855 found 39533
parent transid verify failed on 2968116592640 wanted 34855 found 39533
Ignoring transid failure
parent transid verify failed on 2968116609024 wanted 34855 found 39533
parent transid verify failed on 2968116609024 wanted 34855 found 39533
parent transid verify failed on 2968116609024 wanted 34855 found 39533
parent transid verify failed on 2968116609024 wanted 34855 found 39533
Ignoring transid failure
print-tree.c:1094: btrfs_print_tree: Assertion failed.
btrfs-debug-tree[0x805ce93]
btrfs-debug-tree(btrfs_print_tree+0x26d)[0x805eb51]
btrfs-debug-tree(btrfs_print_tree+0x279)[0x805eb5d]
btrfs-debug-tree(main+0x8b5)[0x804dfb7]
/lib/i386-linux-gnu/libc.so.6(__libc_start_main+0xf3)[0xb757c4d3]
btrfs-debug-tree[0x804e221]

Do you want the actual output?
(it's 1.1GB uncompressed)

Marc


> Thanks,
> Qu
> 
> Marc MERLIN wrote on 2015/08/12 10:19 -0700:
> >On Wed, Aug 12, 2015 at 12:18:45PM -0400, Josef Bacik wrote:
> >>Going to need more info to figure this one out
> >
> >Thanks for the patch, here's the output:
> >enabling repair mode
> >Checking filesystem on /dev/mapper/crypt_sdd1
> >UUID: 024ba4d0-dacb-438d-9f1b-eeb34083fe49
> >checking extents
> >wtf, parent 575708413952 <<
> >cmds-check.c:4488: add_data_backref: Assertion `back->bytes != max_size` 
> >failed.
> >/tmp/btrfs[0x8066a83]
> >/tmp/btrfs[0x8066ab4]
> >/tmp/btrfs[0x80679d8]
> >/tmp/btrfs[0x806b4f2]
> >/tmp/btrfs[0x806b9ea]
> >/tmp/btrfs[0x806c5f9]
> >/tmp/btrfs(cmd_check+0x1088)[0x806ee26]
> >/tmp/btrfs(main+0x153)[0x80557d6]
> >/lib/i386-linux-gnu/libc.so.6(__libc_start_main+0xf3)[0xb75a54d3]
> >/tmp/btrfs[0x80557fc]
> >
> >Marc
> >
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

-- 
"A mouse is a device used to point at the xterm you want to type in" - A.S.R.
Microsoft is to operating systems 
   what McDonalds is to gourmet cooking
Home page: http://marc.merlins.org/ | PGP 1024R/763BE901
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] btrfs-progs: Introduce warning() and error() for common use

2015-08-17 Thread Zhao Lei
Current code use fprintf(stderr, "...") to output warnning and
error information.

The error message have different style, as:
 # grep fprintf *.c
 fprintf(stderr, "Open ctree failed\n");
 fprintf(stderr, "%s: open ctree failed\n", __func__);
 fprintf(stderr, "ERROR: cannot open ctree\n");
 ...

And sometimes, we forgot add tailed '\n', or use printf instead:
 printf("warning, device %llu is missing\n",

This patch introduce warning() and error() as common function,
to make:
1: Each warning and error information have same format
2: Easy to search/change all error message
3: Easy to modify function's internal for debug or other requirement,
   as:
   print function/linenumber in error()
   dumpstack in error()
   add some trace for some style of message
   add support for -v, -vv, ...
   ...something is joke, just for example...

Converting all source is a big work, this patch convert cmds-scrub.c
I'll convert others these days, and new code can use these function
directly.

Signed-off-by: Zhao Lei 
---
 cmds-scrub.c | 171 +--
 utils.c  |  20 +++
 utils.h  |  12 +
 3 files changed, 117 insertions(+), 86 deletions(-)

diff --git a/cmds-scrub.c b/cmds-scrub.c
index 5a85dc4..c971b2e 100644
--- a/cmds-scrub.c
+++ b/cmds-scrub.c
@@ -127,11 +127,6 @@ static void print_scrub_full(struct btrfs_scrub_progress 
*sp)
printf("\tlast_physical: %lld\n", sp->last_physical);
 }
 
-#define ERR(test, ...) do {\
-   if (test)   \
-   fprintf(stderr, __VA_ARGS__);   \
-} while (0)
-
 #define PRINT_SCRUB_ERROR(test, desc) do { \
if (test)   \
printf(" %s=%llu", desc, test); \
@@ -462,7 +457,7 @@ static int scrub_kvread(int *i, int len, int avail, const 
char *buf,
 
 #define _SCRUB_INVALID do {\
if (report_errors)  \
-   fprintf(stderr, "WARNING: invalid data in line %d pos " \
+   warning("invalid data in line %d pos "  \
"%d state %d (near \"%.*s\") at %s:%d\n",   \
lineno, i, state, 20 > avail ? avail : 20,  \
l + i,  __FILE__, __LINE__);\
@@ -839,8 +834,7 @@ static void *scrub_one_dev(void *ctx)
  IOPRIO_PRIO_VALUE(sp->ioprio_class,
sp->ioprio_classdata));
if (ret)
-   fprintf(stderr,
-   "WARNING: setting ioprio failed: %s (ignored).\n",
+   warning("setting ioprio failed: %s (ignored).\n",
strerror(errno));
 
ret = ioctl(sp->fd, BTRFS_IOC_SCRUB, &sp->scrub_args);
@@ -1186,9 +1180,9 @@ static int scrub_start(int argc, char **argv, int resume)
do_print = 0;
 
if (mkdir_p(datafile)) {
-   ERR(!do_quiet, "WARNING: cannot create scrub data "
-  "file, mkdir %s failed: %s. Status recording "
-  "disabled\n", datafile, strerror(errno));
+   warningon(!do_quiet,
+ "cannot create scrub data file, mkdir %s failed: %s. 
Status recording disabled\n",
+ datafile, strerror(errno));
do_record = 0;
}
free(datafile);
@@ -1199,24 +1193,27 @@ static int scrub_start(int argc, char **argv, int 
resume)
 
if (fdmnt < 0) {
if (errno == EINVAL)
-   ERR(!do_quiet,
-   "ERROR: '%s' is not a mounted btrfs device\n",
-   path);
+   erroron(!do_quiet,
+   "'%s' is not a mounted btrfs device\n",
+   path);
else
-   ERR(!do_quiet, "ERROR: can't access '%s': %s\n",
-   path, strerror(errno));
+   erroron(!do_quiet,
+   "can't access '%s': %s\n",
+   path, strerror(errno));
return 1;
}
 
ret = get_fs_info(path, &fi_args, &di_args);
if (ret) {
-   ERR(!do_quiet, "ERROR: getting dev info for scrub failed: "
-   "%s\n", strerror(-ret));
+   erroron(!do_quiet,
+   "getting dev info for scrub failed: %s\n",
+   strerror(-ret));
err = 1;
goto out;
}
if (!fi_args.num_devices) {
-   ERR(!do_quiet, "ERROR: no devices found\n");
+   erroron(!do_quiet,
+   "no devices found\n");
err = 1;
goto out;
}
@@ -1224,13 +1221,15 @@ static int scrub_start(int argc, char **argv, in

Re: So, wipe it out and start over or keep debugging?

2015-08-17 Thread Timothy Normand Miller
I'm not sure if I'm doing this wrong.  Here's what I'm seeing:

# btrfs-image -c9 -t4 -w /mnt/btrfs ~/btrfs_dump.z
Superblock bytenr is larger than device size
Open ctree failed
create failed (No such file or directory)


On Mon, Aug 17, 2015 at 7:43 AM, Austin S Hemmelgarn
 wrote:
> On 2015-08-15 17:46, Timothy Normand Miller wrote:
>>
>> To those of you who have been helping out with my 4-drive RAID1
>> situation, is there anything further we should do to investigate this,
>> in case we can uncover any more bugs, or should I just wipe everything
>> out and restore from backup?
>>
> If you need the system back online, then my suggestion would be to use
> btrfs-image to get metadata images of the disks (there's an option to clear
> out private data if need be), and then restore from backup.  That way, we
> still have the problematic images to work with and examine.
>



-- 
Timothy Normand Miller, PhD
Assistant Professor of Computer Science, Binghamton University
http://www.cs.binghamton.edu/~millerti/
Open Graphics Project
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Btrfs is amazing! (a lack-of-bug report)

2015-08-17 Thread Tyler Bletsch
Thanks. I will be trying raid5 in production, but "production" in this 
case just means my home file server, with btrfs snapshot+sync for all 
data and appropriate offsite non-btrfs backups for critical data. If it 
hoses up, I'll post a bug report.


Going to try to avoid LVM, since half the appeal of btrfs for me is 
getting away from the multiple duct-taped layers of indirection that I 
you get currently with ext4/MD/LVM setups.


- Tyler

On 8/17/2015 7:48 AM, Austin S Hemmelgarn wrote:

On 2015-08-16 23:35, Tyler Bletsch wrote:

I just wanted to drop you guys a line to say that I am stunned with how
excellent btrfs is. I did some testing, and the things that it did were
amazing. I took a 4-disk RAID 5 and walked it all the way down to a
one-disk volume and back again, mixed in devices of different sizes in
different modes, balanced it in every direction, trashed data on drives
without the OS knowing, and did every other form of torture I could
think of, all while looping file integrity tests, and it was perfect.

The ease of use and simplicity were great.  I was dreading having to
administer ZFS in order to get snapshots and other features, but now I
don't have to. With the exception of enterprisey features like SSD
intent logs and stuff, it is hands down far better than ZFS.

Thanks for the great work!

It's nice to hear success stories for once.  I would suggest being 
careful of using BTRFS raid5 or raid6 in production, there are 
probably still bugs that haven't yet been discovered. Secondarily, if 
you can deal with slightly more setup and maintenance overhead, BTRFS 
works _very_ well on top of LVM (it makes online data migration much 
easier, and provides easy ways to do layered RAID setups).





--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: The performance is not as expected when used several disks on raid0.

2015-08-17 Thread Eduardo Bach
Hi Calvin.

thanks a lot for the quick answer and sorry for my delayed to reply.
We got some security issues at some machines. I will answer almost al
the replies below.

Yes raid0 is huge risk. This setup is just for performance demos and
other very specific occasions.
I understand the the need of the development be focused on stability now.

Based on previous testing with a smaller number of disk I'm suspecting
that the 32 disks are not all being used. With 12 discs I got more
speed with btrfs thanmdadm+xfs. With, btrfs, 12 disks and large files
we got the entire theoretical speed, 12 x 200MB/s per disk. My hope
was to get some light from you guys to debug the problem so the btrfs
use the 32 discs (assuming this is the problem). Perhaps the debug
this problem may be of interest to devs?

Thanks again.
Eduardo.

2015-08-14 13:35 GMT-03:00 Calvin Walton :
> On Fri, 2015-08-14 at 12:30 -0400, Calvin Walton wrote:
>> On Fri, 2015-08-14 at 12:16 -0300, Eduardo Bach wrote:
>> > Hi all,
>> >
>> > This is my first email to this list, so please excuse any gaffe.
>> >
>> > I am in the evaluation early stages of a new storage, an SGI MIS,
>> > currently with two HBAs LSI and 32 disks.
>> > The hba controllers are LSI 9207-8i and the disks are Seagate 6TB,
>> > model ST6000NM0004-1FT17Z.
>> >
>> > To evaluate the performance I am using IOzone over a raid0 using
>> > all
>> > the 32 disks, with the parameters: iozone -i0 -i1 -t5 -s 20G  -P0.
>> >
>> > With btrfs the result approaches 3.5GB/s. When using mdadm+xfs the
>> > result reaches 6gb/s, which is the expected value when compared
>> > with
>> > parallel dd made on discs.
>> > When used btrfs with only half of the disc the result is about
>> > 3GB/s.
>>
>> There's two things in particular to pay attention with on btrfs  with
>> this sort of setup right now:
>
> Umm, Ok, I made a mistake. You can ignore paragraph #1 - I got some
> details about the btrfs raid1 and raid0 modes mixed up!
> Btrfs RAID0 is n-way striping across all available drives which have
> room for allocations.
>
>>1. btrfs's "raid0" is not an n-way stripe; it's a 2-way stripe
>> only. (n
>>   -way stripe is a long requested feature, but there is no
>> timeline on
>>   its completion) A single-threaded disk write will only ever be
>>   writing to two disks at the same time. The total throughput you
>> get
>>   for multithreaded writes is up to which blocks the allocator
>> happens
>>   to pick; it will probably often happen that multiple threads
>> will
>>   both be using the same chunk, sharing IO from only 2 disks.
>>2. Btrfs development is currently primarily focused on
>> functionality
>>   over performance. There's several places where placeholder or
>>   untuned algorithms are used (e.g. the multi-mirror io read
>>   scheduling just does pid % number_of_mirrors to pick a mirror).
>>
>> This kind of a performance difference on large performance-oriented
>> RAID systems between btrfs's built-in raid and mdadm is interesting
>> to
>> see, but for the moment I'd say it's mostly expected.
>>
>> One of the developers here might have some more precise information
>> on
>> exactly why you're seeing such a performance difference.
>>
>> As an aside, you have 192TB in RAID0? That's certainly pretty
>> impressive, but as soon as one disk dies, you're going to lose a
>> *lot*
>> of data.
>>
>
> --
> Calvin Walton 
>
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: The performance is not as expected when used several disks on raid0.

2015-08-17 Thread Eduardo Bach
The bether xfs performance we got was using 32 disks and 128KB mdadm chunk size.
Could the be the problem we are seen? if each disk get 4KB, 64KB will
be optimal for just 16 disks when usint raid0 with btrfs?

2015-08-14 15:31 GMT-03:00 Chris Murphy :
> On Fri, Aug 14, 2015 at 9:16 AM, Eduardo Bach  wrote:
>
>> With btrfs the result approaches 3.5GB/s. When using mdadm+xfs the
>> result reaches 6gb/s, which is the expected value when compared with
>> parallel dd made on discs.
>
> mdadm with what chunk (strip) size? The default for mdadm is 512KiB.
> On Btrfs it's fixed at 64KiB. While testing with 64KiB chunk with XFS
> on md RAID might improve its performance relative to Btrfs, at least
> it's a more apples to apples comparison.
>
> --
> Chris Murphy
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: The performance is not as expected when used several disks on raid0.

2015-08-17 Thread Calvin Walton
On Mon, 2015-08-17 at 16:44 -0300, Eduardo Bach wrote:
> Based on previous testing with a smaller number of disk I'm
> suspecting
> that the 32 disks are not all being used. With 12 discs I got more
> speed with btrfs thanmdadm+xfs. With, btrfs, 12 disks and large files
> we got the entire theoretical speed, 12 x 200MB/s per disk. My hope
> was to get some light from you guys to debug the problem so the btrfs
> use the 32 discs (assuming this is the problem). Perhaps the debug
> this problem may be of interest to devs?

>From the sounds of this, you must be hitting some bottleneck in the
btrfs code. One thing I'm actually curious about: How is the CPU usage
during these tests?

Btrfs can more work on the CPU than mdadm+xfs - in particular, data che
cksums are enabled by default. If you have compression enabled, that
would obviously be a major hit as well. Make sure you don't have
compression enabled (it's off by default, or you can use the mount
option "compress=no"). You could try with the "nodatasum" option to see
if checksums make a difference.

It could be possible that you're saturating the CPU, and that's why
you're not seeing any additional gains over 3.5GB/s. Taking a look at
top output while the test is running might be informative.

On the other hand, if the CPU isn't saturated and the disk io isn't
saturated, then it's probably a scaling issue in btrfs, possibly
something like lock contention.

-- 
Calvin Walton 

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Major qgroup regression in 4.2?

2015-08-17 Thread Mark Fasheh
Hi Qu,

Firstly thanks for the response. I have a few new questions and comments
below,

On Mon, Aug 17, 2015 at 09:33:54AM +0800, Qu Wenruo wrote:
> Thanks for pointing out the problem.
> But it's already known and we didn't have a good idea to solve it yet.
> 
> BTW, the old framework won't handle it well either.
> Liu Bo's testcase (btrfs/017) can easily trigger it.
> So it's not a regression.

I don't understand your meaning here. btrfs/017 doesn't have anything to do
with subvolume deletion. The regression I am talking about is that deleting
a subvolume in btrfs on 4.2 breaks qgroup numbers, whereas it did not in
4.1. Btw, I do acknowledge that 017 *was* broken before and fixed by your
patches (thanks) but that's of course been at the expense of reintroducing
the subvol regression :(


> Let me explain the problem a little and then introduce the fixing plan.
> 
> [Qgroup for subvolume delete]
> Here are two subvolumes, sharing the whole tree.
> The whole trees are consistent with 8 tree blocks.
> A B are the tree root of subv 257 and 258 respectively.
> C~H are all shared by the two subvolumes.
> And I is one data extent which is recorded in tree block H.
> 
> 
> subv 257(A)subv258(B)
>| \   /|
>C  D
>   / \/ \
>   E FG H
>:
>:
>I
> 
> Let's assume the tree block are all in 4K size(just for easy
> calculation), and I is in 16K size.
> 
> Now qgroup info should be:
> 257: excl: 4K, rfer: 44K (6 shared tree block + 1 data + 1 tree root)
> 258: excl: 4K, rfer: 44K
> 
> If we are going to delete subvolume 258, then expected qgroup should be:
> 257: excl 44K, rfer 44K

Ok, this is basically explaining how we expect the numbers from a subvolume
delete to work out. Everything there looks normal to me.


> Which means we need to recalculate all the reference relation of
> tree block C~H and data extent I.
>
> [[The real problem]]
> Now the real problem is, we need to mark *ALL* metadata and data
> extent of a subvolume dirty to make subvolume deletion work.

I don't understand why we have to mark all nodes of a subvol. The previous
implementation was able to get this correct without recalculating every
block in the tree. It only passed shared nodes and items through to the
qgroup accounting code. The accounting code in turn had very simple logic
for working it out - if the extent is now exclusively owned, we adjust
qgroup->excl for the (now only remaing) owner.

If you go back to qgroup_subtree_accounting() it, you would see
*that it never* changes ->rfer on a qgroup. That is because the rfer counts
on any related subvolumes don't change during subvol delete. Your example above 
in
fact illustrates this.

So to be clear,

- Why do we have to visit all tree nodes with the qgroup code given that
we only cared about shared ones before"


> Forgot to say, that's at transaction commit time, if we really do that,
> commit time consumption is not acceptable if the subvolume is huge.

This too is confusing to me. Even if we assume that qgroups will get every
node now instead of just the shared ones:

- Drop subvol already visits every node (though it doesn't pass them all to
the qgroup code). We were doing this before just fine to my knowledge - what
changed? The commit-time qgroup accounting?

- Also, btrfs_drop_snapshot() throttles transaction commits (via
btrfs_end_transaction_throttle()), so anything done at commit time looks
like it would be broken up into smaller chunks of work for us.

Am I mistaken in how I understand these functions?


Also I must ask... How did you discover this performance issue? Are we
talking about something theoretical here or was your implementation slow on
subvolume delete?


> [[Possible fix plan]]
> As you can see the biggest problem is the number of child
> metadata/data extents can be quite large, it's not possible to mark
> them all and account at commit transaction time.

... right so handling qgroups at commit time is performance sensitive in
that too many qgroup updates will cause the commit code to do a lot of work.
I think that actually answers one of my questions above, though my followup 
would be:

Would I be correct if I said this is a problem for any workload that creates
a large number of qgroup updates in a short period of time?


> But the good news is, we can keep the snapshot and account them in
> several commits.
> The new extent-orient accounting is quite accurate as it happens at
> commit time.

Btw, one thing I should say is that in general I really like your rewrite
grats on that - the code is far easier to read through and I like very much
that we've taken some of the open-coded qgroup code out of our extent
inc/dec code.


> So one possible fix is, if we want to delete a subvolume, we put the
> subvolume into an orphan status, and let qgroup account to ignore
> that qgroup from then on, and mark some extent dirty in 

You guys do an amazing job - I am blown away!

2015-08-17 Thread George Mitchell
Two years ago I installed btrfs across 8 hard drives on my desktop 
system with the entire system ending up on btrfs RAID 1.  I did all of 
this with btrfs-progs-0.20.  Since that time I have been dreading 
updating my system because of fear that the old btrfs volumes would 
become unstable in the process.  I was finally driven to update by 
otherwise unresolvable security issues.  I went from 32bit OS to 64bit 
OS with new btrfs-progs 3.19.1.  I simply cannot believe that everything 
just worked without a hitch.  In terms of btrfs the upgrade from what 
was then "experimental" to what is now "stable" was totally 
transparent.  In the past two years I have had zero problems with btrfs 
involving multiple TB of data.  I really appreciate all that you guys 
have done to make btrfs such an amazing file system.

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: The performance is not as expected when used several disks on raid0.

2015-08-17 Thread Duncan
Austin S Hemmelgarn posted on Mon, 17 Aug 2015 07:38:13 -0400 as
excerpted:

> I've also found that BTRFS raid5/6 on top of MD RAID0 mitigates (to a
> certain extent that is) the performance penalty of doing raid5/6 if you
> aren't on ridiculously fast storage, probably not something that should
> be used in production yet, but it's how I've got the near-line backups
> setup on my home server system.

As should be clear from my previous posts on the subject, I'm 
conservative enough not to be comfortable with the btrfs raid56 
implementation yet.  My recommendation has been, and remains, unless 
you're deliberately testing it in ordered to help find/report/workout 
bugs, give it a year after the nominally full implementation (3.19, so 
until 4.4), before expecting it to be reasonably as stable as the rest of 
btrfs (which itself isn't fully stable yet).

But the almost-released 4.2 does seem to be past the initial nominally 
btrfs raid56 full-code bugs, and I'd call an intermediate level backup, 
with working copies in front and itself backed up in back, a reasonable 
first working (as opposed to testing) deployment.

And yes, btrfs raid5/6 over mdraid0  would have the same general 
complementary nature as btrfs raid1/10 over mdraid0.

> It may also be worth pointing out that
> BTRFS raid6 lets you use 4 disks minimum, as opposed to most other raid6
> implementations that (unnecessarily, as a 4 disk RAID6 is not a
> degenerate form) require 5.

4-device raid6, btrfs and mdraid both allow that, good point.  But of 
course mdraid6 doesn't have the data integrity, only rebuild-parity.

-- 
Duncan - List replies preferred.   No HTML msgs.
"Every nonfree program has a lord, a master --
and if you use the program, he is your master."  Richard Stallman

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


kernel BUG as fs/btrfs/extent-tree/c:1833 while executing device remove under 4.2.0-rc5

2015-08-17 Thread derek
A couple of days ago while running 4.2.0-RC5 I had a suspected fault on one of 
the disks of my 6-disk RAID-1 btrfs filesystem so removed the offending drive 
and tried a "btrfs device remove...". 

While trying it, the system repeated hung with a kernel BUG after varying 
amounts of time ranging from a couple of minutes to over an hour. I eventually 
booted the Debian release kernel 4.1.0-1 and completed the device remove using 
that without any error.

The kernel only local modification to the kernel was the change the export of 
one sysmbol in workqueue.c to allow installation of the latest NVIDIA driver 
(hence the Tainted and -dirty on the version below).

Aug 15 10:42:44 capella kernel: [ cut here ]
Aug 15 10:42:44 capella kernel: kernel BUG at fs/btrfs/extent-tree.c:1833!
Aug 15 10:42:44 capella kernel: invalid opcode:  [#1] SMP 
Aug 15 10:42:44 capella kernel: Modules linked in: nfsd nfs_acl rpcsec_gss_krb5 
auth_rpcgss oid_registry nfsv4 dns_resolver nfs lockd grace sunrpc fscache 
qt1010 af9013 dvb_usb_af9015 dvb_usb_v2 dvb_core rc_core sp5100_tco kvm_amd kvm 
pcspkr snd_hda_codec_hdmi evdev amd64_edac_mod edac_mce_amd edac_core 
nvidia(PO) i2c_piix4 snd_hda_intel snd_hda_codec snd_hda_core sg snd_hwdep 
snd_pcm tpm_infineon tpm_tis tpm snd_timer snd soundcore drm acpi_cpufreq 
processor thermal_sys button shpchp md_mod k10temp jc42 i2c_core loop 
parport_pc ppdev lp parport autofs4 ext4 crc16 mbcache jbd2 crc32c_generic 
btrfs xor raid6_pq dm_mod hid_generic usbhid hid sd_mod usb_storage ohci_pci 
tg3 ptp pps_core libphy ahci libahci libata ehci_pci ohci_hcd ehci_hcd scsi_mod 
usbcore usb_common
Aug 15 10:42:44 capella kernel: CPU: 1 PID: 95 Comm: kworker/u8:7 Tainted: P
   O4.2.0-rc5-derek-00033-g6c84461-dirty #5
Aug 15 10:42:44 capella kernel: Hardware name: HP ProLiant MicroServer, BIOS 
O41 10/01/2013
Aug 15 10:42:44 capella kernel: Workqueue: btrfs-extent-refs 
btrfs_extent_refs_helper [btrfs]
Aug 15 10:42:44 capella kernel: task: 880213b8cd00 ti: 880213b9 
task.ti: 880213b9
Aug 15 10:42:44 capella kernel: RIP: 0010:[]  
[] insert_inline_extent_backref+0xe3/0xf0 [btrfs]
Aug 15 10:42:44 capella kernel: RSP: 0018:880213b93af8  EFLAGS: 00010293
Aug 15 10:42:44 capella kernel: RAX:  RBX:  
RCX: 0001
Aug 15 10:42:44 capella kernel: RDX: 8800 RSI: 0001 
RDI: 
Aug 15 10:42:44 capella kernel: RBP: 8800daef2800 R08: 4000 
R09: 880213b93a08
Aug 15 10:42:44 capella kernel: R10:  R11: 0003 
R12: 8801d797dad0
Aug 15 10:42:44 capella kernel: R13: 4b18 R14:  
R15: 1c7de0f8c000
Aug 15 10:42:44 capella kernel: FS:  7fab947cb8c0() 
GS:88021fc8() knlGS:
Aug 15 10:42:44 capella kernel: CS:  0010 DS:  ES:  CR0: 
8005003b
Aug 15 10:42:44 capella kernel: CR2: 010928d8 CR3: aa465000 
CR4: 06e0
Aug 15 10:42:44 capella kernel: Stack:
Aug 15 10:42:44 capella kernel:  1c7de0f8c000 4b18 
 
Aug 15 10:42:44 capella kernel:  0001 0282 
8801d797dad0 a020f9c0
Aug 15 10:42:44 capella kernel:  880213b93bb4 34bd 
8801d797dae0 8800db81b000
Aug 15 10:42:44 capella kernel: Call Trace:
Aug 15 10:42:44 capella kernel:  [] ? 
__btrfs_free_extent.isra.68+0x320/0xd50 [btrfs]
Aug 15 10:42:44 capella kernel:  [] ? 
__btrfs_inc_extent_ref.isra.52+0xa7/0x280 [btrfs]
Aug 15 10:42:44 capella kernel:  [] ? find_ref_head+0x52/0x70 
[btrfs]
Aug 15 10:42:44 capella kernel:  [] ? 
__btrfs_run_delayed_refs+0xc41/0x1070 [btrfs]
Aug 15 10:42:44 capella kernel:  [] ? sched_clock+0x5/0x10
Aug 15 10:42:44 capella kernel:  [] ? 
__sb_start_write+0x42/0xe0
Aug 15 10:42:44 capella kernel:  [] ? 
btrfs_run_delayed_refs.part.73+0x71/0x270 [btrfs]
Aug 15 10:42:44 capella kernel:  [] ? update_curr+0xb0/0xf0
Aug 15 10:42:44 capella kernel:  [] ? 
delayed_ref_async_start+0x78/0x90 [btrfs]
Aug 15 10:42:44 capella kernel:  [] ? 
btrfs_scrubparity_helper+0xc0/0x280 [btrfs]
Aug 15 10:42:44 capella kernel:  [] ? 
process_one_work+0x1a1/0x430
Aug 15 10:42:44 capella kernel:  [] ? worker_thread+0x47/0x4a0
Aug 15 10:42:44 capella kernel:  [] ? 
process_one_work+0x430/0x430
Aug 15 10:42:44 capella kernel:  [] ? kthread+0xc1/0xe0
Aug 15 10:42:44 capella kernel:  [] ? 
kthread_worker_fn+0x170/0x170
Aug 15 10:42:44 capella kernel:  [] ? ret_from_fork+0x3f/0x70
Aug 15 10:42:44 capella kernel:  [] ? 
kthread_worker_fn+0x170/0x170
Aug 15 10:42:44 capella kernel: Code: 89 d9 4c 89 34 24 4d 89 e8 4c 89 f9 4c 89 
e6 48 89 ef 48 89 44 24 10 8b 84 24 a8 00 00 00 89 44 24 08 e8 f1 d6 ff ff 31 
c0 eb b3 <0f> 0b 90 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 41 56 41 
Aug 15 10:42:44 capella kernel: RIP  [] 
insert_inline_extent_backref+0xe3/0xf0 [btrfs]
Aug 15 10:4

Re: Major qgroup regression in 4.2?

2015-08-17 Thread Qu Wenruo



Mark Fasheh wrote on 2015/08/17 14:13 -0700:

Hi Qu,

Firstly thanks for the response. I have a few new questions and comments
below,

On Mon, Aug 17, 2015 at 09:33:54AM +0800, Qu Wenruo wrote:

Thanks for pointing out the problem.
But it's already known and we didn't have a good idea to solve it yet.

BTW, the old framework won't handle it well either.
Liu Bo's testcase (btrfs/017) can easily trigger it.
So it's not a regression.


I don't understand your meaning here. btrfs/017 doesn't have anything to do
with subvolume deletion. The regression I am talking about is that deleting
a subvolume in btrfs on 4.2 breaks qgroup numbers, whereas it did not in
4.1. Btw, I do acknowledge that 017 *was* broken before and fixed by your
patches (thanks) but that's of course been at the expense of reintroducing
the subvol regression :(

Oh, it seems that my memory went wrong about the test case.

But I did remember old implement has something wrong with subvolume 
deletion too.

(Or maybe I'm wrong again?)




Let me explain the problem a little and then introduce the fixing plan.

[Qgroup for subvolume delete]
Here are two subvolumes, sharing the whole tree.
The whole trees are consistent with 8 tree blocks.
A B are the tree root of subv 257 and 258 respectively.
C~H are all shared by the two subvolumes.
And I is one data extent which is recorded in tree block H.


subv 257(A)subv258(B)
| \   /|
C  D
   / \/ \
   E FG H
:
:
I

Let's assume the tree block are all in 4K size(just for easy
calculation), and I is in 16K size.

Now qgroup info should be:
257: excl: 4K, rfer: 44K (6 shared tree block + 1 data + 1 tree root)
258: excl: 4K, rfer: 44K

If we are going to delete subvolume 258, then expected qgroup should be:
257: excl 44K, rfer 44K


Ok, this is basically explaining how we expect the numbers from a subvolume
delete to work out. Everything there looks normal to me.



Which means we need to recalculate all the reference relation of
tree block C~H and data extent I.

[[The real problem]]
Now the real problem is, we need to mark *ALL* metadata and data
extent of a subvolume dirty to make subvolume deletion work.


I don't understand why we have to mark all nodes of a subvol. The previous
implementation was able to get this correct without recalculating every
block in the tree. It only passed shared nodes and items through to the
qgroup accounting code. The accounting code in turn had very simple logic
for working it out - if the extent is now exclusively owned, we adjust
qgroup->excl for the (now only remaing) owner.

Oh, you're right here.
I'm stupid on this, as exclusive extent doesn't matter in the case.

So no need to iterate the whole tree, but only shared extents.
That's already a lot of extent we can skip.



If you go back to qgroup_subtree_accounting() it, you would see
*that it never* changes ->rfer on a qgroup. That is because the rfer counts
on any related subvolumes don't change during subvol delete. Your example above 
in
fact illustrates this.

So to be clear,

- Why do we have to visit all tree nodes with the qgroup code given that
we only cared about shared ones before"


My fault, as I didn't jump out of the box and still using the idea of 
rescan whole tree to do it.






Forgot to say, that's at transaction commit time, if we really do that,
commit time consumption is not acceptable if the subvolume is huge.


This too is confusing to me. Even if we assume that qgroups will get every
node now instead of just the shared ones:

- Drop subvol already visits every node (though it doesn't pass them all to
the qgroup code). We were doing this before just fine to my knowledge - what
changed? The commit-time qgroup accounting?


Nothing, as even old implement does qgroup accounting at 
run_delayed_refs time, which is also done at commit_transaction.


So that's just my meaningless worry.



- Also, btrfs_drop_snapshot() throttles transaction commits (via
btrfs_end_transaction_throttle()), so anything done at commit time looks
like it would be broken up into smaller chunks of work for us.

Am I mistaken in how I understand these functions?


No, you're right.
Overall my concern about commit-time qgroup accouting performance impact 
is meaningless now.





Also I must ask... How did you discover this performance issue? Are we
talking about something theoretical here or was your implementation slow on
subvolume delete?

Mainly theoretical.
And theoretically, the new implement should be faster than old implement.

But I'm not quite sure about the old implement is fast enough for 
subvolume deletion.

Anyway, your fix seems no slower than old implement.
So I'm completely OK with it now.





[[Possible fix plan]]
As you can see the biggest problem is the number of child
metadata/data extents can be quite large, it's not possible to mark
them all and account at

[PATCH v3] fstests: btrfs: Add reserved space leak check for rewrite dirty page

2015-08-17 Thread Qu Wenruo
Btrfs qgroup reserve codes lacks check for rewrite dirty page, causing
every write, even rewriting a uncommitted dirty page, to reserve space.

But only written data will free the reserved space, causing reserved
space leaking.

The bug exists almost from the beginning of btrfs qgroup codes, but
nobody found it.

For example:

1)Write [0, 12K) into file A
  reserve 12K space

File A:
0   4K  8K  12K
||
reserved: 12K

2)Write [0,4K) into file A
0   4K  8K  12K
||
reserved: 16K <<< Should be 12K

3) Commit transaction
Dirty pages [0,12) written to disk.
Free 12K reserved space.
reserved: 4K <<< Should be 0

This testcase will test such problem.
Kernel fix will need some huge change, so won't be soon.

Signed-off-by: Qu Wenruo 
---
Changelog:
v2:
  Use smaller write size inside loop, in case commit is trigger by dirty page
  threshold, and ensure following write won't trigger EQUOT
v3: 
  Add more comments and fix some expression.
---
 tests/btrfs/089 | 87 +
 tests/btrfs/089.out | 13 
 tests/btrfs/group   |  1 +
 3 files changed, 101 insertions(+)
 create mode 100755 tests/btrfs/089
 create mode 100644 tests/btrfs/089.out

diff --git a/tests/btrfs/089 b/tests/btrfs/089
new file mode 100755
index 000..afbb1e4
--- /dev/null
+++ b/tests/btrfs/089
@@ -0,0 +1,87 @@
+#! /bin/bash
+# FS QA Test 089
+#
+# Check for qgroup reserved space leaks caused by re-writing dirty ranges
+# This bug has been present in btrfs qgroup for a long time
+#
+#---
+# Copyright (c) 2015 Fujitsu. All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#---
+#
+
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+
+here=`pwd`
+tmp=/tmp/$$
+status=1   # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+_cleanup()
+{
+   cd /
+   rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+
+# real QA test starts here
+
+# Modify as appropriate.
+_supported_fs btrfs
+_supported_os Linux
+_require_scratch
+_need_to_be_root
+
+# Use big blocksize to ensure there is still enough space left for metadata
+# space reserve.
+BLOCKSIZE=$(( 2 * 1024 * 1024 )) # 2M block size
+FILESIZE=$(( 128 * 1024 * 1024 )) # 128M file size
+
+_scratch_mkfs >> $seqres.full 2>&1
+_scratch_mount
+_require_fs_space $SCRATCH_MNT $(($FILESIZE * 2 / 1024))
+
+_run_btrfs_util_prog quota enable $SCRATCH_MNT
+_run_btrfs_util_prog qgroup limit $FILESIZE 5 $SCRATCH_MNT
+
+# loop 5 times without sync to ensure reserved space leak will happen
+for i in `seq 1 5`; do
+   # Use 1/4 of the file size, to ensure even commit is trigger by
+   # dirty page threshold or commit interval, we should still be
+   # able to continue write
+   $XFS_IO_PROG -f -c "pwrite -b $BLOCKSIZE 0 $(($FILESIZE / 4))" \
+   $SCRATCH_MNT/foo | _filter_xfs_io
+done
+
+# Sync to make sure all the dirty pages are written to disk, which should
+# free all the reserved space
+sync
+
+# remove the file and sync, to ensure all quota space freed
+rm $SCRATCH_MNT/foo
+sync
+
+# We should be able to write $FILESIZE - $BLOCKSIZE data now
+$XFS_IO_PROG -f -c "pwrite -b $BLOCKSIZE 0 $(($FILESIZE - $BLOCKSIZE))" \
+   $SCRATCH_MNT/foo | _filter_xfs_io
+
+# success, all done
+status=0
+exit
diff --git a/tests/btrfs/089.out b/tests/btrfs/089.out
new file mode 100644
index 000..642bede
--- /dev/null
+++ b/tests/btrfs/089.out
@@ -0,0 +1,13 @@
+QA output created by 089
+wrote 33554432/33554432 bytes at offset 0
+XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 33554432/33554432 bytes at offset 0
+XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 33554432/33554432 bytes at offset 0
+XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 33554432/33554432 bytes at offset 0
+XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 33554432/33554432 bytes at offset 0
+XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 132120576/132120576 bytes at offset 0
+XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
diff --git a/tests/btrfs/group b/tests/btrfs/group

Re: [PATCH v6 1/3] xfstests: btrfs: add functions to create dm-error device

2015-08-17 Thread anand jain


Hi Dave,

 All comments accepted thanks. except for this.



+_mount_dmerror()
+{
+   $MOUNT_PROG -t $FSTYP $MOUNT_OPTIONS $DMERROR_DEV $SCRATCH_MNT
+}


Should mirror _scratch_mount.

_mount -t $FSTYP `_scratch_mount_options` $DMERROR_DEV $SCRATCH_MNT



`_scratch_mount_options` also returns $SCRATCH_DEV.
in case of tests involving dmerror module, dmerror_init would use 
$SCRATCH_DEV as backing device and provide $DMERROR_DEV to be used 
instead of $SCRATCH_DEV. So I am proposing..


+	_mount -t $FSTYP $SCRATCH_OPTIONS $MOUNT_OPTIONS 
$SELINUX_MOUNT_OPTIONS $* $DMERROR_DEV $SCRATCH_MNT




Thanks, Anand

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v7 2/3] xfstests: btrfs: test device replace, with EIO on the src dev

2015-08-17 Thread Anand Jain
From: Anand Jain 

This test case will test to confirm the replace works with
the failed (EIO) replacing source device. EIO condition is
achieved using the DM device.

Signed-off-by: Anand Jain 
Reviewed-by: Filipe Manana 
---
v6->v7: accepts Dave's comments
(. in line with changes as in 1/3)

v5->v6: accepts Eryu and Filipe's comments
(. commit update
 . correct cleanup with tmp.*
 .  don't redirect run_check $FSFTRESS_PROG to dev null
)

v4->v5: rebase on latest xfstests code and accepts Filipe comment
(. cut line length to 80 char
 . accommodate to the regression introduced by 
a092363bbdfa6cc6e44353136bae9d3aa81baae2
)
  
v3->v4: rebase on latest xfstests code

v2->v3: accepts Filipe Manana's review comments, thanks
(. induce FS load using FSSTRESS_PROG, actually I missed it in v2
)

v1->v2: accepts Dave Chinner's review comments, thanks
(. move 'start with the clean devices' code to actually at the
   start of the script - damn why I didn't think like this before.
   Thanks Dave.
 . add .out file to contain the logs to verify.
)
 tests/btrfs/098 | 81 +
 tests/btrfs/098.out | 11 
 tests/btrfs/group   |  1 +
 3 files changed, 93 insertions(+)
 create mode 100755 tests/btrfs/098
 create mode 100644 tests/btrfs/098.out

diff --git a/tests/btrfs/098 b/tests/btrfs/098
new file mode 100755
index 000..8891655
--- /dev/null
+++ b/tests/btrfs/098
@@ -0,0 +1,81 @@
+#! /bin/bash
+# FS QA Test No. btrfs/098
+#
+# Test device replace works when the source device has EIO
+#
+# Copyright (c) 2015 Oracle.  All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#
+
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+
+here=`pwd`
+tmp=/tmp/$$
+
+status=1   # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+
+_cleanup()
+{
+   _dmerror_cleanup
+   rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+. ./common/filter.btrfs
+. ./common/dmerror
+
+_supported_fs btrfs
+_supported_os Linux
+_need_to_be_root
+_require_scratch_dev_pool 3
+_dmerror_required
+
+rm -f $seqres.full
+
+dev1="`echo $SCRATCH_DEV_POOL | $AWK_PROG '{print $2}'`"
+dev2="`echo $SCRATCH_DEV_POOL | $AWK_PROG '{print $3}'`"
+
+_dmerror_init
+_mkfs_dev "-f -d raid1 -m raid1 $dev1 $DMERROR_DEV"
+_dmerror_mount
+
+_run_btrfs_util_prog filesystem show -m $SCRATCH_MNT
+$BTRFS_UTIL_PROG filesystem show -m $SCRATCH_MNT | 
_filter_btrfs_filesystem_show
+
+error_devid=`$BTRFS_UTIL_PROG filesystem show -m $SCRATCH_MNT |\
+   egrep $DMERROR_DEV | $AWK_PROG '{print $2}'`
+
+snapshot_cmd="$BTRFS_UTIL_PROG subvolume snapshot -r $SCRATCH_MNT"
+snapshot_cmd="$snapshot_cmd $SCRATCH_MNT/snap_\`date +'%H_%M_%S_%N'\`"
+run_check $FSSTRESS_PROG -d $SCRATCH_MNT -n 200 -p 8 $FSSTRESS_AVOID -x \
+   "$snapshot_cmd" -X 50
+
+# now load the error into the DMERROR_DEV
+_dmerror_load_table
+
+_run_btrfs_util_prog replace start -B $error_devid $dev2 $SCRATCH_MNT
+
+_run_btrfs_util_prog filesystem show -m $SCRATCH_MNT
+$BTRFS_UTIL_PROG filesystem show -m $SCRATCH_MNT | 
_filter_btrfs_filesystem_show
+
+echo "=== device replace completed"
+
+status=0; exit
diff --git a/tests/btrfs/098.out b/tests/btrfs/098.out
new file mode 100644
index 000..eb2f87f
--- /dev/null
+++ b/tests/btrfs/098.out
@@ -0,0 +1,11 @@
+QA output created by 098
+Label: none  uuid:  
+   Total devices  FS bytes used 
+   devid  size  used  path SCRATCH_DEV
+   devid  size  used  path /dev/mapper/error-test
+
+Label: none  uuid:  
+   Total devices  FS bytes used 
+   devid  size  used  path SCRATCH_DEV
+
+=== device replace completed
diff --git a/tests/btrfs/group b/tests/btrfs/group
index e13865a..c8a53b5 100644
--- a/tests/btrfs/group
+++ b/tests/btrfs/group
@@ -100,3 +100,4 @@
 095 auto quick metadata
 096 auto quick clone
 097 auto quick send clone
+098 auto quick replace
-- 
2.4.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v7 0/3] dm error based test cases

2015-08-17 Thread Anand Jain
This is v7 of this patch set.
Mainly accepts Dave's latest review comments with thanks.
For actual change-list please ref to the individial patch. 

Anand Jain (3):
  xfstests: btrfs: add functions to create dm-error device
  xfstests: btrfs: test device replace, with EIO on the src dev
  xfstests: btrfs: test device delete with EIO on src dev

 common/dmerror  | 71 ++
 common/rc   |  7 +
 tests/btrfs/098 | 81 
 tests/btrfs/098.out | 11 +++
 tests/btrfs/099 | 82 +
 tests/btrfs/099.out | 11 +++
 tests/btrfs/group   |  2 ++
 7 files changed, 265 insertions(+)
 create mode 100644 common/dmerror
 create mode 100755 tests/btrfs/098
 create mode 100644 tests/btrfs/098.out
 create mode 100755 tests/btrfs/099
 create mode 100644 tests/btrfs/099.out

-- 
2.4.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v7 3/3] xfstests: btrfs: test device delete with EIO on src dev

2015-08-17 Thread Anand Jain
From: Anand Jain 

This test case tests if the device delete works with
the failed (EIO) source device. EIO errors are achieved
usign the DM device.

This test would need following btrfs-progs and btrfs
kernel patch
   btrfs-progs: device delete to accept devid
   Btrfs: device delete by devid

However when btrfs-progs patch is not found this test will
not run, and when kernel patch is not found btrfs-progs
will fail gracefully and thus the test script.

Signed-off-by: Anand Jain 
---
v6->v7: accepts Dave's comments
(. in line with changes as in 1/3)

v5->v6: accepts Eryu and Filipe's comments, thanks
(. correct cleanup with tmp.*
 .  don't redirect run_check $FSFTRESS_PROG to dev null
)

v4->v5: rebase on latest xfstests code, and accepts Filipe comment
(. cut line length to 80 char
 . accommodate to the regression introduced by 
a092363bbdfa6cc6e44353136bae9d3aa81baae2
)

v3->v4: rebase on latest xfstests code

v2->v3: accepts Filipe Manana's review comments, thanks
( . induce FS load using FSSTRESS_PROG
)

v1->v2: accepts Dave Chinner's review comments, thanks
(. move 'start with the clean devices' code to actually at the
   start of the script - damn why I didn't think like this
   before. Thanks Dave.
 . update .out file to contain the logs to verify.
)
 common/rc   |  7 +
 tests/btrfs/099 | 82 +
 tests/btrfs/099.out | 11 +++
 tests/btrfs/group   |  1 +
 4 files changed, 101 insertions(+)
 create mode 100755 tests/btrfs/099
 create mode 100644 tests/btrfs/099.out

diff --git a/common/rc b/common/rc
index 70d2fa8..86c5192 100644
--- a/common/rc
+++ b/common/rc
@@ -2728,6 +2728,13 @@ _require_meta_uuid()
umount $SCRATCH_MNT
 }
 
+_require_btrfs_dev_del_by_devid()
+{
+   $BTRFS_UTIL_PROG device delete --help | egrep devid > /dev/null 2>&1
+   [ $? -eq 0 ] || _notrun "$BTRFS_UTIL_PROG too old "\
+   "(must support 'btrfs device delete  /')"
+}
+
 _get_total_inode()
 {
if [ -z "$1" ]; then
diff --git a/tests/btrfs/099 b/tests/btrfs/099
new file mode 100755
index 000..9793c9b
--- /dev/null
+++ b/tests/btrfs/099
@@ -0,0 +1,82 @@
+#! /bin/bash
+# FS QA Test No. btrfs/099
+#
+# Test device delete when the source device has EIO
+#
+# Copyright (c) 2015 Oracle.  All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#
+
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+
+here=`pwd`
+tmp=/tmp/$$
+
+status=1   # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+
+_cleanup()
+{
+   _dmerror_cleanup
+   rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+. ./common/filter.btrfs
+. ./common/dmerror
+
+_supported_fs btrfs
+_supported_os Linux
+_need_to_be_root
+_require_scratch_dev_pool 3
+_require_btrfs_dev_del_by_devid
+_dmerror_required
+
+rm -f $seqres.full
+
+dev1="`echo $SCRATCH_DEV_POOL | $AWK_PROG '{print $2}'`"
+dev2="`echo $SCRATCH_DEV_POOL | $AWK_PROG '{print $3}'`"
+
+_dmerror_init
+_mkfs_dev -f -d raid1 -m raid1 $dev1 $dev2 $DMERROR_DEV
+_dmerror_mount
+
+_run_btrfs_util_prog filesystem show -m $SCRATCH_MNT
+$BTRFS_UTIL_PROG filesystem show -m $SCRATCH_MNT | 
_filter_btrfs_filesystem_show
+
+error_devid=`$BTRFS_UTIL_PROG filesystem show -m $SCRATCH_MNT |\
+   egrep $DMERROR_DEV | $AWK_PROG '{print $2}'`
+
+snapshot_cmd="$BTRFS_UTIL_PROG subvolume snapshot -r $SCRATCH_MNT"
+snapshot_cmd="$snapshot_cmd $SCRATCH_MNT/snap_\`date +'%H_%M_%S_%N'\`"
+run_check $FSSTRESS_PROG -d $SCRATCH_MNT -n 200 -p 8 $FSSTRESS_AVOID -x \
+   "$snapshot_cmd" -X 50
+
+# now load the error into the DMERROR_DEV
+_dmerror_load_table
+
+_run_btrfs_util_prog device delete $error_devid $SCRATCH_MNT
+
+_run_btrfs_util_prog filesystem show -m $SCRATCH_MNT
+$BTRFS_UTIL_PROG filesystem show -m $SCRATCH_MNT | 
_filter_btrfs_filesystem_show
+
+echo "=== device delete completed"
+
+status=0; exit
diff --git a/tests/btrfs/099.out b/tests/btrfs/099.out
new file mode 100644
index 000..ec74e45
--- /dev/null
+++ b/tests/btrfs/099.out
@@ -0,0 +1,11 @@
+QA output created by 099
+Label: none  uuid:  
+   Total devices  FS bytes used 
+   devid  size  used  path SCRATCH_DEV
+   devid  si

[PATCH v7 1/3] xfstests: btrfs: add functions to create dm-error device

2015-08-17 Thread Anand Jain
From: Anand Jain 

Controlled EIO from the device is achieved using the dm device.
Helper functions are at common/dmerror.

Broadly steps will include calling _dmerror_init().
_dmerror_init() will use SCRATCH_DEV to create dm linear device and assign
DMERROR_DEV to /dev/mapper/error-test.

When test script is ready to get EIO, the test cases can call
_dmerror_load_table() which then it will load the dm error.
so that reading DMERROR_DEV will cause EIO. After the test case is
complete, cleanup must be done by calling _dmerror_cleanup().

Signed-off-by: Anand Jain 
Reviewed-by: Filipe Manana 
---
v6->v7:
 rename _init_dmerror() to _dmerror_init()
 remove _scratch_mkfs_dmerror()
 rename _mount_dmerror() to _dmerror_mount()
 rename _cleaup_dmerror() to _dmerror_cleanup()
 rename _load_dmerror_table() to _dmerror_load_table()
 rename BLK_DEV_SIZE to blk_dev_size
 remove _unmount_dmerror there were no consumer of it
 use _fail instead of _fatal in rc/dmerror
 update error log to make crisp sense
 move _require_dmerror() from common/rc to common/dmerror and rename
   it to dmerror_required() so that its consistent with other function
   names with in the file

v5->v6: accepts Eryu's comments, thanks
  (. added missing $MKFS_OPTIONS at _scratch_mkfs_dmerror()
   . used $MOUNT_PROG instead of mount at _mount_dmerror()
   . correct typo $UMOUNT_PROG, no S at the end in _unmount_dmerror()
  )

v4->v5: No Change. keep up with the patch set

v3->v4: rebase on latest xfstests code

v2.1->v3: accepts Filipe Manana's review comments, thanks
   (. correct if else statement in _require_dm_error()
. fix indent 
  a missed Dave comment in v1. looks like I goofed with git cli
   )

v2->v2.1: fixed missed typo error fixup in the commit.


v1->v2: accepts Dave Chinner's review comments, thanks
   (. use SCRATCH_DEV for dmerror backing device
. remove duplicate check of DM_BLK_DEV in _init_dm_error_dev()
. remove a wrong check when reading block size in _init_dm_error_dev()
. remove a wrong check with blockdev --setra in _init_dm_error_dev()
. remove unnecessary check in _load_dm_error_table()
. remove unnecessary dmerror device test by using dd
   )

 common/dmerror | 71 ++
 1 file changed, 71 insertions(+)
 create mode 100644 common/dmerror

diff --git a/common/dmerror b/common/dmerror
new file mode 100644
index 000..f8b9ce6
--- /dev/null
+++ b/common/dmerror
@@ -0,0 +1,71 @@
+##/bin/bash
+#
+# Copyright (c) 2015 Oracle.  All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#
+#
+# common functions for setting up and tearing down a dmerror device
+
+# this test requires the device mapper error target
+#
+_dmerror_required()
+{
+   _require_command "$DMSETUP_PROG" dmsetup
+   $DMSETUP_PROG targets | grep error >/dev/null 2>&1
+   [ $? -ne 0 ] && _notrun "This test requires dm error support"
+}
+
+_dmerror_init()
+{
+   local dm_backing_dev=$SCRATCH_DEV
+
+   $DMSETUP_PROG remove error-test > /dev/null 2>&1
+
+   local blk_dev_size=`blockdev --getsz $dm_backing_dev`
+
+   DMERROR_DEV='/dev/mapper/error-test'
+
+   DMLINEAR_TABLE="0 $blk_dev_size linear $dm_backing_dev 0"
+
+   $DMSETUP_PROG create error-test --table "$DMLINEAR_TABLE" || \
+   _fatal "failed to create dm linear device"
+
+   DMERROR_TABLE="0 $blk_dev_size error $dm_backing_dev 0"
+}
+
+_dmerror_mount()
+{
+   _scratch_options mount
+   _mount -t $FSTYP $SCRATCH_OPTIONS $MOUNT_OPTIONS $SELINUX_MOUNT_OPTIONS 
$* \
+   $DMERROR_DEV $SCRATCH_MNT
+}
+
+_dmerror_cleanup()
+{
+   $UMOUNT_PROG $SCRATCH_MNT > /dev/null 2>&1
+   $DMSETUP_PROG remove error-test > /dev/null 2>&1
+}
+
+_dmerror_load_table()
+{
+   $DMSETUP_PROG suspend error-test
+   [ $? -ne 0 ] && _fail  "dmsetup suspend failed"
+
+   $DMSETUP_PROG load error-test --table "$DMERROR_TABLE"
+   [ $? -ne 0 ] && _fail "dmsetup failed to load error table"
+
+   $DMSETUP_PROG resume error-test
+   [ $? -ne 0 ] && _fail  "dmsetup resume failed"
+}
-- 
2.4.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html