Re: [dm-devel] [PATCH 08/15] dm mpath: merge do_end_io_bio into multipath_end_io_bio

2017-05-25 Thread Christoph Hellwig
On Mon, May 22, 2017 at 08:51:20PM +0200, Martin Wilck wrote:
> >  
> > -   if (!error)
> > -   return 0;   /* I/O complete */
> > +   BUG_ON(!mpio);
> 
> You dereferenced mpio already above.

Indeed.  I removed the BUG_ON for the next version.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 02/15] scsi/osd: don't save block errors into req_results

2017-05-25 Thread h...@lst.de
On Wed, May 24, 2017 at 04:04:40PM +0000, Bart Van Assche wrote:
> Are you sure that that code is not necessary? From osd_initiator.c:
> 
> static void _put_request(struct request *rq)
> {
>   /*
>* If osd_finalize_request() was called but the request was not
>* executed through the block layer, then we must release BIOs.
>* TODO: Keep error code in or->async_error. Need to audit all
>*   code paths.
>*/
>   if (unlikely(rq->bio))
>   blk_end_request(rq, -ENOMEM, blk_rq_bytes(rq));
>   else
>   blk_put_request(rq);
> }

Which isn't using it at all.  It has a ten year old comment to pass
on some error, but even then ORing two different error types together
would not be very helpful.

> 
> Bart.---end quoted text---
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 1/5] btrfs-progs: check: inode nbytes fix in lowmem

2017-05-25 Thread Su Yue
After checking one inode item, we should get the actual nbytes of the
inode item.
Introduce function 'repair_inode_nbytes_lowmem' to set nbytes in struct
btrfs_inode_item to the actual nbytes. After call of the function, the
wrong nbytes should have been corrected.

Signed-off-by: Su Yue 
---
 cmds-check.c | 76 +---
 1 file changed, 73 insertions(+), 3 deletions(-)

diff --git a/cmds-check.c b/cmds-check.c
index ad7c81b2..2797ab9e 100644
--- a/cmds-check.c
+++ b/cmds-check.c
@@ -1922,6 +1922,9 @@ static int process_one_leaf_v2(struct btrfs_root *root, 
struct btrfs_path *path,
 again:
err |= check_inode_item(root, path, ext_ref);
 
+   /* modified cur since check_inode_item may change path */
+   cur = path->nodes[0];
+
if (err & LAST_ITEM)
goto out;
 
@@ -2271,6 +2274,7 @@ static int walk_down_tree_v2(struct btrfs_root *root, 
struct btrfs_path *path,
}
ret = process_one_leaf_v2(root, path, nrefs,
  level, ext_ref);
+   cur = path->nodes[*level];
break;
} else {
ret = btrfs_check_node(root, NULL, cur);
@@ -4854,10 +4858,69 @@ static int check_file_extent(struct btrfs_root *root, 
struct btrfs_key *fkey,
 }
 
 /*
+ * Set inode item nbytes to @nbytes
+ *
+ * Returns <0  means on error
+ * Returns  0  means successful repair
+ */
+static int repair_inode_nbytes_lowmem(struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 ino, u64 nbytes)
+{
+   struct btrfs_trans_handle *trans;
+   struct btrfs_inode_item *ii;
+   struct btrfs_key key;
+   struct btrfs_key research_key;
+   int ret;
+   int ret2;
+
+   key.objectid = ino;
+   key.type = BTRFS_INODE_ITEM_KEY;
+   key.offset = 0;
+   btrfs_item_key_to_cpu(path->nodes[0], _key, path->slots[0]);
+   btrfs_release_path(path);
+
+   trans = btrfs_start_transaction(root, 1);
+   if (IS_ERR(trans)) {
+   ret = PTR_ERR(trans);
+   goto out;
+   }
+
+   ret = btrfs_search_slot(trans, root, , path, 0, 1);
+   if (ret < 0)
+   goto out;
+   if (ret > 0) {
+   ret = -ENOENT;
+   goto out;
+   }
+
+   ii = btrfs_item_ptr(path->nodes[0], path->slots[0],
+   struct btrfs_inode_item);
+   btrfs_set_inode_nbytes(path->nodes[0], ii, nbytes);
+   btrfs_mark_buffer_dirty(path->nodes[0]);
+
+   printf("reset nbytes for inode %llu root %llu\n", ino,
+  root->root_key.objectid);
+
+   btrfs_commit_transaction(trans, root);
+out:
+   if (ret < 0)
+   error("failed to reset nbytes for inode %llu root %llu due to 
%s",
+ ino, root->root_key.objectid, strerror(-ret));
+
+   /* research path */
+   btrfs_release_path(path);
+   ret2 = btrfs_search_slot(NULL, root, _key, path, 0, 0);
+   return ret2 < 0 ? ret2 : ret;
+}
+
+/*
  * Check INODE_ITEM and related ITEMs (the same inode number)
  * 1. check link count
  * 2. check inode ref/extref
  * 3. check dir item/index
+ * Be Careful, if repair is enable, @path may be changed.
+ * Remember to reassign any context about @path in repair mode.
  *
  * @ext_ref:   the EXTENDED_IREF feature
  *
@@ -5007,9 +5070,16 @@ out:
}
 
if (nbytes != extent_size) {
-   err |= NBYTES_ERROR;
-   error("root %llu INODE[%llu] nbytes(%llu) not equal to 
extent_size(%llu)",
- root->objectid, inode_id, nbytes, extent_size);
+   if (repair) {
+   ret = repair_inode_nbytes_lowmem(root, path,
+   inode_id, extent_size);
+   }
+   if (!repair || ret) {
+   err |= NBYTES_ERROR;
+   error("root %llu INODE[%llu] nbytes(%llu) not 
equal to extent_size(%llu)",
+ root->objectid, inode_id, nbytes,
+ extent_size);
+   }
}
}
 
-- 
2.13.0



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 0/5] btrfs-progs: check: simple errors repair in lowmem

2017-05-25 Thread Su Yue
This series includes the following changes:

1) Repair wrong nbytes of file inode item.
   After traversal of extents in one file, we should get the actual nbytes
   of the file. If nbytes in the file inode differs from the actual value,
   set the value to actual one.
   
   The wrong nbytes of file inode case corresponds to fsck-test/016.
   
2) Repair wrong isize of directory inode item.
   After traversal of dir_index and dir_item in one dir, we should get
   the actual isize of the directory. If size in the inode item differs
   from the actual value, set the value to actual one.

   New test case 'fsck-test/026' is added for the case wrong isize of
   directory inode.
   
3) Allow fsck check test to repair in lowmem mode for certain test cases
   if TEST_ENABLE_OVERRIDE=true.


Changelog:
v2:
Rebase to v4.11.0.

Qu Wenruo (1):
  btrfs-progs: fsck-check: Allow fsck check test to repair in lowmem
mode for certain test cases

Su Yue (4):
  btrfs-progs: check: inode nbytes fix in lowmem
  btrfs-progs: check: dir isize fix in lowmem
  btrfs-progs: check: enable lowmem repair
  btrfs-progs: fsck-check: test cases for nbytes and dir isize

 cmds-check.c   | 149 +++--
 tests/common.local |  14 +-
 .../016-wrong-inode-nbytes/.lowmem_repairable  |   0
 .../026-wrong-dir-inode-isize/.lowmem_repairable   |   0
 .../026-wrong-dir-inode-isize/default_case.img | Bin 0 -> 4096 bytes
 5 files changed, 153 insertions(+), 10 deletions(-)
 create mode 100644 tests/fsck-tests/016-wrong-inode-nbytes/.lowmem_repairable
 create mode 100644 
tests/fsck-tests/026-wrong-dir-inode-isize/.lowmem_repairable
 create mode 100644 tests/fsck-tests/026-wrong-dir-inode-isize/default_case.img

-- 
2.13.0



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 4/5] btrfs-progs: fsck-check: Allow fsck check test to repair in lowmem mode for certain test cases

2017-05-25 Thread Su Yue
From: Qu Wenruo 

Since lowmem mode can repair certain corruptions (mostly in fs tree),
insert a beacon into fsck test cases to allow some of them to be
tested in lowmem mode.

With this patch, fsck option override will check for the beacon file
".lowmem_repairable" in the same directory as the test image, and if the
beacon exists, then it will also run lowmem mode repair to repair the
image.

Signed-off-by: Qu Wenruo 
---
 tests/common.local | 14 +-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/tests/common.local b/tests/common.local
index 4f56bb08..af372f16 100644
--- a/tests/common.local
+++ b/tests/common.local
@@ -15,11 +15,23 @@ TEST_ARGS_CHECK=--mode=lowmem
 # gets arguments of a current command and can decide if the argument insertion
 # should happen, eg. if some option combination does not make sense or would
 # break tests
+#
+# Return 0 if we need to skip option override
+# Return 1 if we don't need to skip option override
 _skip_spec()
 {
+   beacon=.lowmem_repairable
+
+   # For loemem repair, only support fs tree repair yet
+   # So we place lowmem repair beacon in the same dir of the
+   # test case
if echo "$TEST_ARGS_CHECK" | grep -q 'mode=lowmem' &&
   echo "$@" | grep -q -- '--repair'; then
-   return 0
+   dir="$(dirname ${@: -1})"
+   if [ -f ${dir}/${beacon} ]; then
+   return 1;
+   fi
+   return 0;
fi
return 1
 }
-- 
2.13.0



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 3/5] btrfs-progs: check: enable lowmem repair

2017-05-25 Thread Su Yue
Enable btrfsck option '--repair' with option '--mode=lowmem'.
Now lowmem mode only repairs wrong nbytes, dir isize.

Signed-off-by: Su Yue 
---
 cmds-check.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/cmds-check.c b/cmds-check.c
index cf4d8e09..58839314 100644
--- a/cmds-check.c
+++ b/cmds-check.c
@@ -12973,11 +12973,10 @@ int cmd_check(int argc, char **argv)
}
 
/*
-* Not supported yet
+* Support partially
 */
if (repair && check_mode == CHECK_MODE_LOWMEM) {
-   error("low memory mode doesn't support repair yet");
-   exit(1);
+   warning("low memory mode support repair partially");
}
 
radix_tree_init();
-- 
2.13.0



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 2/5] btrfs-progs: check: dir isize fix in lowmem

2017-05-25 Thread Su Yue
After traversal of whole directory, we should get the actual isize.

Introduce function 'repair_dir_isize_lowmem' to set isize in the directory
inode item to actual size. After call of the function, the wrong dir isize
should have been corrected.

Signed-off-by: Su Yue 
---
 cmds-check.c | 68 +---
 1 file changed, 65 insertions(+), 3 deletions(-)

diff --git a/cmds-check.c b/cmds-check.c
index 2797ab9e..cf4d8e09 100644
--- a/cmds-check.c
+++ b/cmds-check.c
@@ -4915,6 +4915,62 @@ out:
 }
 
 /*
+ * Set dir isize to @isize
+ *
+ * Returns <0  means on error
+ * Returns  0  means successful repair
+ */
+static int repair_dir_isize_lowmem(struct btrfs_root *root,
+  struct btrfs_path *path,
+  u64 ino, u64 isize)
+{
+   struct btrfs_trans_handle *trans;
+   struct btrfs_inode_item *ii;
+   struct btrfs_key key;
+   struct btrfs_key research_key;
+   int ret;
+   int ret2;
+
+   key.objectid = ino;
+   key.type = BTRFS_INODE_ITEM_KEY;
+   key.offset = 0;
+
+   btrfs_item_key_to_cpu(path->nodes[0], _key, path->slots[0]);
+   btrfs_release_path(path);
+
+   trans = btrfs_start_transaction(root, 1);
+   if (IS_ERR(trans)) {
+   ret = PTR_ERR(trans);
+   goto out;
+   }
+
+   ret = btrfs_search_slot(trans, root, , path, 0, 1);
+   if (ret < 0)
+   goto out;
+   if (ret > 0) {
+   ret = -ENOENT;
+   goto out;
+   }
+
+   ii = btrfs_item_ptr(path->nodes[0], path->slots[0],
+   struct btrfs_inode_item);
+   btrfs_set_inode_size(path->nodes[0], ii, isize);
+   btrfs_mark_buffer_dirty(path->nodes[0]);
+
+   printf("reset isize for inode %llu root %llu\n", ino,
+  root->root_key.objectid);
+
+   btrfs_commit_transaction(trans, root);
+out:
+   if (ret < 0)
+   error("failed to reset isize for inode %llu root %llu due to 
%s",
+ ino, root->root_key.objectid, strerror(-ret));
+   btrfs_release_path(path);
+   ret2 = btrfs_search_slot(NULL, root, _key, path, 0, 0);
+   return ret2 < 0 ? ret2 : ret;
+}
+
+/*
  * Check INODE_ITEM and related ITEMs (the same inode number)
  * 1. check link count
  * 2. check inode ref/extref
@@ -5050,9 +5106,15 @@ out:
}
 
if (isize != size) {
-   err |= ISIZE_ERROR;
-   error("root %llu DIR INODE [%llu] size(%llu) not equal 
to %llu",
- root->objectid, inode_id, isize, size);
+   if (repair)
+   ret = repair_dir_isize_lowmem(root, path,
+ inode_id, size);
+
+   if (!repair || ret) {
+   err |= ISIZE_ERROR;
+   error("root %llu DIR INODE [%llu] size(%llu) 
not equal to %llu",
+ root->objectid, inode_id, isize, size);
+   }
}
} else {
if (nlink != refs) {
-- 
2.13.0



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 5/5] btrfs-progs: fsck-check: test cases for nbytes and dir isize

2017-05-25 Thread Su Yue
Create test case '026-wrong-dir-inode-isize'.
Create beacon files '.lowmem_repairable' under tests/fsck-tests/016 and 026.

Now 'make test-fsck' will test lowmem repairable test cases if
TEST_ENABLE_OVERRIDE=true.

Signed-off-by: Su Yue 
---
 .../fsck-tests/016-wrong-inode-nbytes/.lowmem_repairable |   0
 .../026-wrong-dir-inode-isize/.lowmem_repairable |   0
 .../026-wrong-dir-inode-isize/default_case.img   | Bin 0 -> 4096 bytes
 3 files changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 tests/fsck-tests/016-wrong-inode-nbytes/.lowmem_repairable
 create mode 100644 
tests/fsck-tests/026-wrong-dir-inode-isize/.lowmem_repairable
 create mode 100644 tests/fsck-tests/026-wrong-dir-inode-isize/default_case.img

diff --git a/tests/fsck-tests/016-wrong-inode-nbytes/.lowmem_repairable 
b/tests/fsck-tests/016-wrong-inode-nbytes/.lowmem_repairable
new file mode 100644
index ..e69de29b
diff --git a/tests/fsck-tests/026-wrong-dir-inode-isize/.lowmem_repairable 
b/tests/fsck-tests/026-wrong-dir-inode-isize/.lowmem_repairable
new file mode 100644
index ..e69de29b
diff --git a/tests/fsck-tests/026-wrong-dir-inode-isize/default_case.img 
b/tests/fsck-tests/026-wrong-dir-inode-isize/default_case.img
new file mode 100644
index 
..a060cca86e945f2659595d812b3017c66c570d5e
GIT binary patch
literal 4096
zcmeHHX*d*I8)g_@X;2Lby@Ra9kg`N}hBQhtuf2#!WM`~nnW8LZ8=4OjNm<8M%Gk5C
zVH*27BfA*OSf(N7lkff2*Prj-cU^DiI@kT&?_xlX(b#&<%}uAb%Z!MPV%

Re: [PATCH] Btrfs: skip commit transaction if we don't have enough pinned bytes

2017-05-25 Thread Liu Bo
On Thu, May 25, 2017 at 06:50:48PM +0200, David Sterba wrote:
> On Tue, May 23, 2017 at 12:06:40PM +0300, Nikolay Borisov wrote:
> > 
> > 
> > On 19.05.2017 20:39, Liu Bo wrote:
> > > We commit transaction in order to reclaim space from pinned bytes because
> > > it could process delayed refs, and in may_commit_transaction(), we check
> > > first if pinned bytes are enough for the required space, we then check if
> > > that plus bytes reserved for delayed insert are enough for the required
> > > space.
> > > 
> > > This changes the code to the above logic.
> > > 
> > > Signed-off-by: Liu Bo 
> > 
> > Please add:
> > Fixes: b150a4f10d87 ("Btrfs: use a percpu to keep track of possibly
> > pinned bytes")
> > 
> > > ---
> > >  fs/btrfs/extent-tree.c | 2 +-
> > >  1 file changed, 1 insertion(+), 1 deletion(-)
> > > 
> > > diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> > > index e390451c72e6..bded1ddd1bb6 100644
> > > --- a/fs/btrfs/extent-tree.c
> > > +++ b/fs/btrfs/extent-tree.c
> > > @@ -4837,7 +4837,7 @@ static int may_commit_transaction(struct 
> > > btrfs_fs_info *fs_info,
> > >  
> > >   spin_lock(_rsv->lock);
> > >   if (percpu_counter_compare(_info->total_bytes_pinned,
> > > -bytes - delayed_rsv->size) >= 0) {
> > > +bytes - delayed_rsv->size) < 0) {
> > >   spin_unlock(_rsv->lock);
> > >   return -ENOSPC;
> > >   }
> > > 
> > 
> > With the minor nit above:
> > 
> > Reviewed-by: Nikolay Borisov 
> > Tested-by: Nikolay Borisov 
> 
> Patch applied with updated tags.

Thank you for that!

-liubo
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 6/6] Btrfs: add sanity check of extent item in scrub

2017-05-25 Thread Liu Bo
Currently scrub only verifies the checksums of metadata and data, and
cannot detect an invalid extent_item.

This adds a sanity check for extent items, so scrub can now check
whether extent_inline_ref_type is valid.

Signed-off-by: Liu Bo 
---
 fs/btrfs/scrub.c | 43 +++
 1 file changed, 43 insertions(+)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index b0251eb..e87b752 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3058,6 +3058,39 @@ static noinline_for_stack int scrub_raid56_parity(struct 
scrub_ctx *sctx,
return ret < 0 ? ret : 0;
 }
 
+static int check_extent_item(struct extent_buffer *l, int slot,
+struct btrfs_extent_item *ei, int key_type)
+{
+   unsigned long ptr;
+   unsigned long end;
+   struct btrfs_extent_inline_ref *iref;
+   u64 flags = btrfs_extent_flags(l, ei);
+   int is_data = !!(flags & BTRFS_EXTENT_FLAG_DATA);
+   int type;
+
+   ptr = (unsigned long)(ei + 1);
+   if (!is_data &&
+   key_type != BTRFS_METADATA_ITEM_KEY)
+   ptr += sizeof(struct btrfs_tree_block_info);
+   end = (unsigned long)ei +
+   btrfs_item_size_nr(l, slot);
+
+   while (1) {
+   if (ptr >= end) {
+   WARN_ON(ptr > end);
+   break;
+   }
+
+   iref = (struct btrfs_extent_inline_ref *)ptr;
+   type = btrfs_get_extent_inline_ref_type(l, iref, is_data);
+   if (type < 0)
+   return type;
+
+   ptr += btrfs_extent_inline_ref_size(type);
+   }
+   return 0;
+}
+
 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
   struct map_lookup *map,
   struct btrfs_device *scrub_dev,
@@ -3318,6 +3351,16 @@ static noinline_for_stack int scrub_stripe(struct 
scrub_ctx *sctx,
goto next;
}
 
+   /* sanity check for extent inline ref type */
+   if (check_extent_item(l, slot, extent, key.type)) {
+   btrfs_err(fs_info,
+ "scrub: extent %llu(0x%llx) has an 
invalid extent inline ref type, ignored.",
+ key.objectid, key.objectid);
+   spin_lock(>stat_lock);
+   sctx->stat.uncorrectable_errors++;
+   spin_unlock(>stat_lock);
+   goto next;
+   }
 again:
extent_logical = key.objectid;
extent_len = bytes;
-- 
2.9.4

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/6] Btrfs: convert to use btrfs_get_extent_inline_ref_type

2017-05-25 Thread Liu Bo
Since we have a helper which can do sanity check, this converts all
btrfs_extent_inline_ref_type to it.

Signed-off-by: Liu Bo 
---
 fs/btrfs/backref.c |  9 +++--
 fs/btrfs/extent-tree.c | 33 ++---
 fs/btrfs/relocation.c  | 15 +--
 3 files changed, 46 insertions(+), 11 deletions(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 7699e16..6ffc6bb 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1009,7 +1009,10 @@ static int __add_inline_refs(struct btrfs_path *path, 
u64 bytenr,
int type;
 
iref = (struct btrfs_extent_inline_ref *)ptr;
-   type = btrfs_extent_inline_ref_type(leaf, iref);
+   type = btrfs_get_extent_inline_ref_type(leaf, iref, 2);
+   if (type == -EINVAL)
+   return -EINVAL;
+
offset = btrfs_extent_inline_ref_offset(leaf, iref);
 
switch (type) {
@@ -1905,7 +1908,9 @@ static int __get_extent_inline_ref(unsigned long *ptr, 
struct extent_buffer *eb,
 
end = (unsigned long)ei + item_size;
*out_eiref = (struct btrfs_extent_inline_ref *)(*ptr);
-   *out_type = btrfs_extent_inline_ref_type(eb, *out_eiref);
+   *out_type = btrfs_get_extent_inline_ref_type(eb, *out_eiref, 2);
+   if (*out_type == -EINVAL)
+   return -EINVAL;
 
*ptr += btrfs_extent_inline_ref_size(*out_type);
WARN_ON(*ptr > end);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index fba8ca0..ecbed56 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1421,12 +1421,18 @@ static noinline u32 extent_data_ref_count(struct 
btrfs_path *path,
struct btrfs_extent_data_ref *ref1;
struct btrfs_shared_data_ref *ref2;
u32 num_refs = 0;
+   int type;
 
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, , path->slots[0]);
if (iref) {
-   if (btrfs_extent_inline_ref_type(leaf, iref) ==
-   BTRFS_EXTENT_DATA_REF_KEY) {
+   /*
+* If type is invalid, we should have bailed out earlier than
+* this call.
+*/
+   type = btrfs_get_extent_inline_ref_type(leaf, iref, 1);
+   ASSERT(type > 0);
+   if (type == BTRFS_EXTENT_DATA_REF_KEY) {
ref1 = (struct btrfs_extent_data_ref *)(>offset);
num_refs = btrfs_extent_data_ref_count(leaf, ref1);
} else {
@@ -1587,6 +1593,7 @@ int lookup_inline_extent_backref(struct 
btrfs_trans_handle *trans,
int ret;
int err = 0;
bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
+   int is_data = !!(owner >= BTRFS_FIRST_FREE_OBJECTID);
 
key.objectid = bytenr;
key.type = BTRFS_EXTENT_ITEM_KEY;
@@ -1603,7 +1610,7 @@ int lookup_inline_extent_backref(struct 
btrfs_trans_handle *trans,
 * Owner is our parent level, so we can just add one to get the level
 * for the block we are interested in.
 */
-   if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
+   if (skinny_metadata && !is_data) {
key.type = BTRFS_METADATA_ITEM_KEY;
key.offset = owner;
}
@@ -1685,7 +1692,12 @@ int lookup_inline_extent_backref(struct 
btrfs_trans_handle *trans,
break;
}
iref = (struct btrfs_extent_inline_ref *)ptr;
-   type = btrfs_extent_inline_ref_type(leaf, iref);
+   type = btrfs_get_extent_inline_ref_type(leaf, iref, is_data);
+   if (type == -EINVAL) {
+   err = -EINVAL;
+   goto out;
+   }
+
if (want < type)
break;
if (want > type) {
@@ -1877,7 +1889,12 @@ void update_inline_extent_backref(struct btrfs_fs_info 
*fs_info,
if (extent_op)
__run_delayed_extent_op(extent_op, leaf, ei);
 
-   type = btrfs_extent_inline_ref_type(leaf, iref);
+   /*
+* If type is invalid, we should have bailed out after
+* lookup_inline_extent_backref().
+*/
+   type = btrfs_get_extent_inline_ref_type(leaf, iref, 2);
+   ASSERT(type > 0);
 
if (type == BTRFS_EXTENT_DATA_REF_KEY) {
dref = (struct btrfs_extent_data_ref *)(>offset);
@@ -3146,6 +3163,7 @@ static noinline int check_committed_ref(struct btrfs_root 
*root,
struct btrfs_extent_item *ei;
struct btrfs_key key;
u32 item_size;
+   int type;
int ret;
 
key.objectid = bytenr;
@@ -3187,8 +3205,9 @@ static noinline int check_committed_ref(struct btrfs_root 
*root,
goto out;
 
iref = (struct btrfs_extent_inline_ref *)(ei + 1);
-   if (btrfs_extent_inline_ref_type(leaf, iref) !=
-   

[PATCH 1/6] Btrfs: add a helper to retrieve extent inline ref type

2017-05-25 Thread Liu Bo
An invalid value of extent inline ref type may be read from a
malicious image, which may force btrfs to crash.

This adds a helper which does a sanity check on the ref type: if the
type is sane, the helper returns it, otherwise it returns an error.

Signed-off-by: Liu Bo 
---
 fs/btrfs/ctree.h   |  4 
 fs/btrfs/extent-tree.c | 35 +++
 2 files changed, 39 insertions(+)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index c411590..206ae8c 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2542,6 +2542,10 @@ static inline gfp_t btrfs_alloc_write_mask(struct 
address_space *mapping)
 
 /* extent-tree.c */
 
+int btrfs_get_extent_inline_ref_type(struct extent_buffer *eb,
+struct btrfs_extent_inline_ref *iref,
+int is_data);
+
 u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes);
 
 static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index be54776..fba8ca0 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1117,6 +1117,41 @@ static int convert_extent_item_v0(struct 
btrfs_trans_handle *trans,
 }
 #endif
 
+/*
+ * is_data == 0, tree block type is required,
+ * is_data == 1, data type is requried,
+ * is_data == 2, either type is OK.
+ */
+int btrfs_get_extent_inline_ref_type(struct extent_buffer *eb,
+struct btrfs_extent_inline_ref *iref,
+int is_data)
+{
+   int type = btrfs_extent_inline_ref_type(eb, iref);
+
+   if (type == BTRFS_TREE_BLOCK_REF_KEY ||
+   type == BTRFS_SHARED_BLOCK_REF_KEY ||
+   type == BTRFS_SHARED_DATA_REF_KEY ||
+   type == BTRFS_EXTENT_DATA_REF_KEY) {
+   if (is_data == 2) {
+   return type;
+   } else if (is_data == 1) {
+   if (type == BTRFS_EXTENT_DATA_REF_KEY ||
+   type == BTRFS_SHARED_DATA_REF_KEY)
+   return type;
+   } else {
+   if (type == BTRFS_TREE_BLOCK_REF_KEY ||
+   type == BTRFS_SHARED_BLOCK_REF_KEY)
+   return type;
+   }
+   }
+
+   btrfs_print_leaf(eb->fs_info, eb);
+   WARN(1, "eb %llu(%s block) invalid extent inline ref type %d\n",
+eb->start, (is_data) ? "data" : "tree", type);
+
+   return -EINVAL;
+}
+
 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
 {
u32 high_crc = ~(u32)0;
-- 
2.9.4

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 5/6] Btrfs: remove BUG() in add_data_reference

2017-05-25 Thread Liu Bo
Now that we have a helper to report invalid value of extent inline ref
type, we need to quit gracefully instead of throwing out a kernel panic.

Signed-off-by: Liu Bo 
---
 fs/btrfs/relocation.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index b043e200..8b984bd 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3774,7 +3774,10 @@ int add_data_references(struct reloc_control *rc,
ret = find_data_references(rc, extent_key,
   eb, dref, blocks);
} else {
-   BUG();
+   ret = -EINVAL;
+   WARN(1,
+"extent %llu has an invalid inline ref type\n",
+eb->start);
}
if (ret) {
err = ret;
-- 
2.9.4

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 4/6] Btrfs: remove BUG() in print_extent_item

2017-05-25 Thread Liu Bo
btrfs_print_leaf() is used in btrfs_get_extent_inline_ref_type, so
here we really want to print the invalid value of ref type instead of
causing a kernel panic.

Signed-off-by: Liu Bo 
---
 fs/btrfs/print-tree.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index fcae61e..4448be6 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -63,6 +63,7 @@ static void print_extent_item(struct extent_buffer *eb, int 
slot, int type)
u32 item_size = btrfs_item_size_nr(eb, slot);
u64 flags;
u64 offset;
+   int is_data;
 
if (item_size < sizeof(*ei)) {
 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
@@ -98,6 +99,8 @@ static void print_extent_item(struct extent_buffer *eb, int 
slot, int type)
iref = (struct btrfs_extent_inline_ref *)(ei + 1);
}
 
+   is_data = !!(flags & BTRFS_EXTENT_FLAG_DATA);
+
ptr = (unsigned long)iref;
end = (unsigned long)ei + item_size;
while (ptr < end) {
@@ -121,7 +124,10 @@ static void print_extent_item(struct extent_buffer *eb, 
int slot, int type)
   offset, btrfs_shared_data_ref_count(eb, sref));
break;
default:
-   BUG();
+   btrfs_err(eb->fs_info,
+ "extent %llu has invalid ref type %d\n",
+ eb->start, type);
+   return;
}
ptr += btrfs_extent_inline_ref_size(type);
}
-- 
2.9.4

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/6] Btrfs: remove BUG() in btrfs_extent_inline_ref_size

2017-05-25 Thread Liu Bo
Now that btrfs_get_extent_inline_ref_type() can report if type is a
valid one and all callers can gracefully deal with that, we don't need
to crash here.

Signed-off-by: Liu Bo 
---
 fs/btrfs/ctree.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 206ae8c..54bbac3 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1776,7 +1776,6 @@ static inline u32 btrfs_extent_inline_ref_size(int type)
if (type == BTRFS_EXTENT_DATA_REF_KEY)
return sizeof(struct btrfs_extent_data_ref) +
   offsetof(struct btrfs_extent_inline_ref, offset);
-   BUG();
return 0;
 }
 
-- 
2.9.4

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/6] add sanity check for extent inline ref type

2017-05-25 Thread Liu Bo
An invalid extent inline ref type could be read from a btrfs image and
end up causing a panic[1]. This set deals with the insane value
gracefully in patches 1-2 and cleans up BUG() in the code in patches 3-5.

Patch 6 adds scrub support to detect the corruption, so users can be
notified when they run scrub on a regular basis.

I'm not sure what may result in this corruption in the real world, but
I've seen several reports on the ML about __btrfs_free_extent saying
something was missing (or simply wrong). While testing this set with
btrfs-corrupt-block, I found that switching the ref type could end up in
that situation as well, e.g. a data extent's ref type
(BTRFS_EXTENT_DATA_REF_KEY) is switched to (BTRFS_TREE_BLOCK_REF_KEY).

Hopefully this can give people more insight the next time that
happens.

[1]:https://www.spinics.net/lists/linux-btrfs/msg65646.html

Liu Bo (6):
  Btrfs: add a helper to retrieve extent inline ref type
  Btrfs: convert to use btrfs_get_extent_inline_ref_type
  Btrfs: remove BUG() in btrfs_extent_inline_ref_size
  Btrfs: remove BUG() in print_extent_item
  Btrfs: remove BUG() in add_data_reference
  Btrfs: add sanity check of extent item in scrub

 fs/btrfs/backref.c |  9 +--
 fs/btrfs/ctree.h   |  5 +++-
 fs/btrfs/extent-tree.c | 68 --
 fs/btrfs/print-tree.c  |  8 +-
 fs/btrfs/relocation.c  | 20 ---
 fs/btrfs/scrub.c   | 43 +++
 6 files changed, 139 insertions(+), 14 deletions(-)

-- 
2.9.4

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] Btrfs: let btrfs_print_leaf print more about block group

2017-05-25 Thread Liu Bo
This adds chunk_objectid and flags, with flags we can recognize whether
the block group is about data or metadata.

Signed-off-by: Liu Bo 
---
 fs/btrfs/print-tree.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index cdafbf9..fcae61e 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -261,8 +261,11 @@ void btrfs_print_leaf(struct btrfs_fs_info *fs_info, 
struct extent_buffer *l)
case BTRFS_BLOCK_GROUP_ITEM_KEY:
bi = btrfs_item_ptr(l, i,
struct btrfs_block_group_item);
-   pr_info("\t\tblock group used %llu\n",
-  btrfs_disk_block_group_used(l, bi));
+   pr_info(
+  "\t\tblock group used %llu chunk_objectid %llu flags %llu\n",
+   btrfs_disk_block_group_used(l, bi),
+   btrfs_disk_block_group_chunk_objectid(l, bi),
+   btrfs_disk_block_group_flags(l, bi));
break;
case BTRFS_CHUNK_ITEM_KEY:
print_chunk(l, btrfs_item_ptr(l, i,
-- 
2.9.4

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs: Use kvzalloc instead of kzalloc/vmalloc in alloc_bitmap

2017-05-25 Thread Omar Sandoval
On Thu, May 25, 2017 at 12:18:02PM -0700, Vinnie Magro wrote:
> Replace alloc_bitmap with call to kvzalloc. kvzalloc preserves the
> same fallback heuristic.

Looks good, thanks.

Reviewed-by: Omar Sandoval 

> Signed-off-by: Vinnie Magro 
> ---
>  fs/btrfs/free-space-tree.c | 23 ++-
>  1 file changed, 2 insertions(+), 21 deletions(-)
> 
> diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
> index fc0bd84..9e87129 100644
> --- a/fs/btrfs/free-space-tree.c
> +++ b/fs/btrfs/free-space-tree.c
> @@ -151,25 +151,6 @@ static inline u32 free_space_bitmap_size(u64 size, u32 
> sectorsize)
>   return DIV_ROUND_UP((u32)div_u64(size, sectorsize), BITS_PER_BYTE);
>  }
>  
> -static u8 *alloc_bitmap(u32 bitmap_size)
> -{
> - void *mem;
> -
> - /*
> -  * The allocation size varies, observed numbers were < 4K up to 16K.
> -  * Using vmalloc unconditionally would be too heavy, we'll try
> -  * contiguous allocations first.
> -  */
> - if  (bitmap_size <= PAGE_SIZE)
> - return kzalloc(bitmap_size, GFP_NOFS);
> -
> - mem = kzalloc(bitmap_size, GFP_NOFS | __GFP_NOWARN);
> - if (mem)
> - return mem;
> -
> - return __vmalloc(bitmap_size, GFP_NOFS | __GFP_ZERO, PAGE_KERNEL);
> -}
> -
>  int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
> struct btrfs_fs_info *fs_info,
> struct btrfs_block_group_cache *block_group,
> @@ -189,7 +170,7 @@ int convert_free_space_to_bitmaps(struct 
> btrfs_trans_handle *trans,
>  
>   bitmap_size = free_space_bitmap_size(block_group->key.offset,
>fs_info->sectorsize);
> - bitmap = alloc_bitmap(bitmap_size);
> + bitmap = kvzalloc(bitmap_size, GFP_NOFS);
>   if (!bitmap) {
>   ret = -ENOMEM;
>   goto out;
> @@ -330,7 +311,7 @@ int convert_free_space_to_extents(struct 
> btrfs_trans_handle *trans,
>  
>   bitmap_size = free_space_bitmap_size(block_group->key.offset,
>fs_info->sectorsize);
> - bitmap = alloc_bitmap(bitmap_size);
> + bitmap = kvzalloc(bitmap_size, GFP_NOFS);
>   if (!bitmap) {
>   ret = -ENOMEM;
>   goto out;
> -- 
> 2.9.3
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] btrfs: Use kvzalloc instead of kzalloc/vmalloc in alloc_bitmap

2017-05-25 Thread Vinnie Magro
Replace alloc_bitmap with call to kvzalloc. kvzalloc preserves the
same fallback heuristic.

Signed-off-by: Vinnie Magro 
---
 fs/btrfs/free-space-tree.c | 23 ++-
 1 file changed, 2 insertions(+), 21 deletions(-)

diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index fc0bd84..9e87129 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -151,25 +151,6 @@ static inline u32 free_space_bitmap_size(u64 size, u32 
sectorsize)
return DIV_ROUND_UP((u32)div_u64(size, sectorsize), BITS_PER_BYTE);
 }
 
-static u8 *alloc_bitmap(u32 bitmap_size)
-{
-   void *mem;
-
-   /*
-* The allocation size varies, observed numbers were < 4K up to 16K.
-* Using vmalloc unconditionally would be too heavy, we'll try
-* contiguous allocations first.
-*/
-   if  (bitmap_size <= PAGE_SIZE)
-   return kzalloc(bitmap_size, GFP_NOFS);
-
-   mem = kzalloc(bitmap_size, GFP_NOFS | __GFP_NOWARN);
-   if (mem)
-   return mem;
-
-   return __vmalloc(bitmap_size, GFP_NOFS | __GFP_ZERO, PAGE_KERNEL);
-}
-
 int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
  struct btrfs_fs_info *fs_info,
  struct btrfs_block_group_cache *block_group,
@@ -189,7 +170,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle 
*trans,
 
bitmap_size = free_space_bitmap_size(block_group->key.offset,
 fs_info->sectorsize);
-   bitmap = alloc_bitmap(bitmap_size);
+   bitmap = kvzalloc(bitmap_size, GFP_NOFS);
if (!bitmap) {
ret = -ENOMEM;
goto out;
@@ -330,7 +311,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle 
*trans,
 
bitmap_size = free_space_bitmap_size(block_group->key.offset,
 fs_info->sectorsize);
-   bitmap = alloc_bitmap(bitmap_size);
+   bitmap = kvzalloc(bitmap_size, GFP_NOFS);
if (!bitmap) {
ret = -ENOMEM;
goto out;
-- 
2.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v4 2/2] Btrfs: compression must free at least one sector size

2017-05-25 Thread Timofey Titovets
Btrfs already skips storing data when compression doesn't
free at least one byte. Let's improve the logic and check
that compression frees at least one sector size,
because otherwise it is useless to store this data compressed.

Signed-off-by: Timofey Titovets 
---
 fs/btrfs/lzo.c  | 9 -
 fs/btrfs/zlib.c | 7 ++-
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index bd0b0938..4aafae6f 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -26,6 +26,7 @@
 #include 
 #include 
 #include "compression.h"
+#include "ctree.h"

 #define LZO_LEN4

@@ -99,6 +100,7 @@ static int lzo_compress_pages(struct list_head *ws,
int nr_pages = 0;
struct page *in_page = NULL;
struct page *out_page = NULL;
+   u32 sectorsize;
unsigned long bytes_left;
unsigned long len = *total_out;
unsigned long nr_dest_pages = *out_pages;
@@ -229,8 +231,13 @@ static int lzo_compress_pages(struct list_head *ws,
in_len = min(bytes_left, PAGE_SIZE);
}

-   if (tot_out > tot_in)
+   /* Compression must save at least one sectorsize */
+   sectorsize = btrfs_inode_sectorsize(mapping->host);
+
+   if (tot_out + sectorsize > tot_in) {
+   ret = -E2BIG;
goto out;
+   }

/* store the size of all chunks of compressed data */
cpage_out = kmap(pages[0]);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 135b1082..f9957248 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -31,6 +31,7 @@
 #include 
 #include 
 #include "compression.h"
+#include "ctree.h"

 struct workspace {
z_stream strm;
@@ -86,6 +87,7 @@ static int zlib_compress_pages(struct list_head *ws,
int nr_pages = 0;
struct page *in_page = NULL;
struct page *out_page = NULL;
+   u32 sectorsize;
unsigned long bytes_left;
unsigned long len = *total_out;
unsigned long nr_dest_pages = *out_pages;
@@ -191,7 +193,10 @@ static int zlib_compress_pages(struct list_head *ws,
goto out;
}

-   if (workspace->strm.total_out >= workspace->strm.total_in) {
+   /* Compression must save at least one sectorsize */
+   sectorsize = btrfs_inode_sectorsize(mapping->host);
+
+   if (workspace->strm.total_out + sectorsize > workspace->strm.total_in) {
ret = -E2BIG;
goto out;
}
--
2.13.0
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v4 1/2] Btrfs: lzo.c pr_debug() deflate->lzo

2017-05-25 Thread Timofey Titovets
Fix copy paste typo in debug message for lzo.c, lzo is not deflate

Signed-off-by: Timofey Titovets 
---
 fs/btrfs/lzo.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index f48c8c14..bd0b0938 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -141,7 +141,7 @@ static int lzo_compress_pages(struct list_head *ws,
ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf,
   &out_len, workspace->mem);
if (ret != LZO_E_OK) {
-   pr_debug("BTRFS: deflate in loop returned %d\n",
+   pr_debug("BTRFS: lzo in loop returned %d\n",
   ret);
ret = -EIO;
goto out;
--
2.13.0
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v4 0/2] Btrfs: compression fixes

2017-05-25 Thread Timofey Titovets
First patch:
Fix copy paste typo in debug message for lzo.c, lzo is not deflate

Second patch:
Force btrfs to not store data as compressed
if compression will not free at least one sector size,
because it's useless in terms of saving storage space and
reading data from disk; as a result, productivity suffers.

Changes since v1:
- Merge patches for zlib and lzo in one
- Sync check logic for zlib and lzo
- Check profit after all data are compressed (not while compressing)

Changes since v2:
- Fix comparison logic, it's enough if:
  compressed size + PAGE_SIZE is not bigger than input data size

Changes since v3:
- Use btrfs sector size directly instead of assuming that PAGE_SIZE == sectorsize

Timofey Titovets (2):
  Btrfs: lzo.c pr_debug() deflate->lzo
  Btrfs: compression must free at least one sector size

 fs/btrfs/lzo.c  | 11 +--
 fs/btrfs/zlib.c |  7 ++-
 2 files changed, 15 insertions(+), 3 deletions(-)

--
2.13.0
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 2/2] Btrfs: compression must free at least PAGE_SIZE

2017-05-25 Thread Timofey Titovets
2017-05-25 15:51 GMT+03:00 Chandan Rajendra :
...
> Apologies for the delayed response.
>
> I am not really sure if compression code must save at least one sectorsize
> worth of space. But if other developers agree to it, then the above
> 'if' condition can be replaced with,
>
> u32 sectorsize = btrfs_inode_sectorsize(mapping->host);
> ...
> ...
>
> if (tot_out + sectorsize > tot_in) {
> --
> chandan
>

Thanks a lot!
This approach is much simpler than I imagined; I will update the patch set and resend.

Thank you!
-- 
Have a nice day,
Timofey.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] Btrfs: clear EXTENT_DEFRAG bits in finish_ordered_io

2017-05-25 Thread David Sterba
On Fri, May 19, 2017 at 01:01:42PM -0700, Liu Bo wrote:
> On Fri, May 19, 2017 at 09:06:42PM +0200, David Sterba wrote:
> > On Tue, May 09, 2017 at 05:02:15PM -0600, Liu Bo wrote:
> > > Before this, we use 'filled' mode here, ie. if all range has been filled
> > > with EXTENT_DEFRAG bits, get to clear it, but if the defrag range joins
> > > the adjacent delalloc range, then we'll leave EXTENT_DEFRAG bits until
> > > evicting inode.
> > >
> > > This clears the bit if any was found within the ordered extent.
> > 
> > What effects, good or bad, can this have?
> > 
> > Is it worth backporting to stable trees?
> 
> The good effect of this patch is to free extent_state quickly if we
> don't need it, without this, it can't be freed since the extent_state
> has at least EXTENT_DEFRAG bit in ->state.
> 
> Just notice that I made a mistake in the changelog, the bit will be
> cleared until releasing pages, which may be called by
> invalidate_mapping_ranges(), not evicting inode.
> 
> No, I don't think it's a candidate for stable tree.

Thanks for the answers. Please update the patch changelog and resend.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] Btrfs: skip commit transaction if we don't have enough pinned bytes

2017-05-25 Thread David Sterba
On Tue, May 23, 2017 at 12:06:40PM +0300, Nikolay Borisov wrote:
> 
> 
> On 19.05.2017 20:39, Liu Bo wrote:
> > We commit transaction in order to reclaim space from pinned bytes because
> > it could process delayed refs, and in may_commit_transaction(), we check
> > first if pinned bytes are enough for the required space, we then check if
> > that plus bytes reserved for delayed insert are enough for the required
> > space.
> > 
> > This changes the code to the above logic.
> > 
> > Signed-off-by: Liu Bo 
> 
> Please add:
> Fixes: b150a4f10d87 ("Btrfs: use a percpu to keep track of possibly
> pinned bytes")
> 
> > ---
> >  fs/btrfs/extent-tree.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> > 
> > diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> > index e390451c72e6..bded1ddd1bb6 100644
> > --- a/fs/btrfs/extent-tree.c
> > +++ b/fs/btrfs/extent-tree.c
> > @@ -4837,7 +4837,7 @@ static int may_commit_transaction(struct 
> > btrfs_fs_info *fs_info,
> >  
> > spin_lock(&delayed_rsv->lock);
> > if (percpu_counter_compare(&space_info->total_bytes_pinned,
> > -  bytes - delayed_rsv->size) >= 0) {
> > +  bytes - delayed_rsv->size) < 0) {
> > spin_unlock(&delayed_rsv->lock);
> > return -ENOSPC;
> > }
> > 
> 
> With the minor nit above:
> 
> Reviewed-by: Nikolay Borisov 
> Tested-by: Nikolay Borisov 

Patch applied with updated tags.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] Btrfs: work around maybe-uninitialized warning

2017-05-25 Thread David Sterba
On Fri, May 19, 2017 at 09:20:53PM +0200, Arnd Bergmann wrote:
> On Fri, May 19, 2017 at 8:10 PM, Liu Bo  wrote:
> > On Thu, May 18, 2017 at 03:33:29PM +0200, Arnd Bergmann wrote:
> >> A rewrite of btrfs_submit_direct_hook appears to have introduced a warning:
> >>
> >> fs/btrfs/inode.c: In function 'btrfs_submit_direct_hook':
> >> fs/btrfs/inode.c:8467:14: error: 'bio' may be used uninitialized in this 
> >> function [-Werror=maybe-uninitialized]
> >>
> >> Where the 'bio' variable was previously initialized unconditionally, it
> >> is now set in the "while (submit_len > 0)" loop that would never execute
> >> if submit_len is zero.
> >>
> >> Assuming this cannot happen in practice, we can avoid the warning
> >> by simply replacing the while{} loop with a do{}while() loop so
> >> the compiler knows that it will always be entered at least once.
> >>
> >
> > Thanks for the fix.  I think it's a false positive one and I've updated it 
> > in v2
> > with a 'struct bio *bio = NULL' to make compiler happy, could you please 
> > help
> > review it?
> 
> Right, it is a false positive and adding the =NULL initialization shuts up the
> warning. The reason my patch used a different approach is to make the
> code more robust, see https://rusty.ozlabs.org/?p=232
> 
> Generally speaking initializing a local variable to an illegal value, and 
> later
> using the variable without a check for that original value is error-prone.
> Even though the code is correct at the moment, someone else might
> modify it later. My first (broken) solution avoided this by checking for
> the condition that led to the warning, my newer solution is nicer as it
> makes it much clearer to the reader what is going on, compared to
> the NULL initialization that does not help readability but makes
> it slightly harder to understand why you wrote the code specifically that
> way.

I like this approach better, so I'll undo "= NULL" and apply your patch.
Thanks.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 2/2] Btrfs: compression must free at least PAGE_SIZE

2017-05-25 Thread Chandan Rajendra
On Sunday, May 21, 2017 12:10:39 AM IST Timofey Titovets wrote:
> Btrfs already skip store of data where compression didn't free at least one 
> byte.
> So make logic better and make check that compression free at least one 
> PAGE_SIZE,
> because in another case it useless to store this data compressed
> 
> Signed-off-by: Timofey Titovets 
> ---
>  fs/btrfs/lzo.c  | 5 -
>  fs/btrfs/zlib.c | 3 ++-
>  2 files changed, 6 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
> index bd0b0938..39678499 100644
> --- a/fs/btrfs/lzo.c
> +++ b/fs/btrfs/lzo.c
> @@ -229,8 +229,11 @@ static int lzo_compress_pages(struct list_head *ws,
>   in_len = min(bytes_left, PAGE_SIZE);
>   }
> 
> - if (tot_out > tot_in)
> + /* Compression must save at least one PAGE_SIZE */
> + if (tot_out + PAGE_SIZE > tot_in) {
> + ret = -E2BIG;
>   goto out;
> + }

Apologies for the delayed response.

I am not really sure if compression code must save at least one sectorsize
worth of space. But if other developers agree to it, then the above
'if' condition can be replaced with,

u32 sectorsize = btrfs_inode_sectorsize(mapping->host);
...
...

if (tot_out + sectorsize > tot_in) {

> 
>   /* store the size of all chunks of compressed data */
>   cpage_out = kmap(pages[0]);
> diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
> index 135b1082..11e117b5 100644
> --- a/fs/btrfs/zlib.c
> +++ b/fs/btrfs/zlib.c
> @@ -191,7 +191,8 @@ static int zlib_compress_pages(struct list_head *ws,
>   goto out;
>   }
> 
> - if (workspace->strm.total_out >= workspace->strm.total_in) {
> + /* Compression must save at least one PAGE_SIZE */
> + if (workspace->strm.total_out + PAGE_SIZE > workspace->strm.total_in) {
>   ret = -E2BIG;
>   goto out;
>   }
> --
> 2.13.0
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> 


-- 
chandan

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] btrfs: btrfs_wait_tree_block_writeback can be void return

2017-05-25 Thread Jeff Layton
Nothing checks its return value.

Signed-off-by: Jeff Layton 
Reviewed-by: Jan Kara 
Reviewed-by: Liu Bo 
---
 fs/btrfs/disk-io.c | 6 +++---
 fs/btrfs/disk-io.h | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8685d67185d0..17acb72fed0f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1222,10 +1222,10 @@ int btrfs_write_tree_block(struct extent_buffer *buf)
buf->start + buf->len - 1);
 }
 
-int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
+void btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
 {
-   return filemap_fdatawait_range(buf->pages[0]->mapping,
-  buf->start, buf->start + buf->len - 1);
+   filemap_fdatawait_range(buf->pages[0]->mapping,
+   buf->start, buf->start + buf->len - 1);
 }
 
 struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 
bytenr,
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 21f1ceb85b76..f92f3e177d70 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -127,7 +127,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, 
struct inode *inode,
extent_submit_bio_hook_t *submit_bio_done);
 unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
 int btrfs_write_tree_block(struct extent_buffer *buf);
-int btrfs_wait_tree_block_writeback(struct extent_buffer *buf);
+void btrfs_wait_tree_block_writeback(struct extent_buffer *buf);
 int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
 struct btrfs_fs_info *fs_info);
 int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
-- 
2.9.4

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 06/10] fs: Introduce IOMAP_NOWAIT

2017-05-25 Thread Jan Kara
On Wed 24-05-17 11:41:46, Goldwyn Rodrigues wrote:
> From: Goldwyn Rodrigues 
> 
> IOCB_NOWAIT translates to IOMAP_NOWAIT for iomaps.
> This is used by XFS in the XFS patch.
> 
> Signed-off-by: Goldwyn Rodrigues 
> Reviewed-by: Christoph Hellwig 

Looks good. You can add:

Reviewed-by: Jan Kara 

Honza

> ---
>  fs/iomap.c| 2 ++
>  include/linux/iomap.h | 1 +
>  2 files changed, 3 insertions(+)
> 
> diff --git a/fs/iomap.c b/fs/iomap.c
> index 4b10892967a5..5d85ec6e7b20 100644
> --- a/fs/iomap.c
> +++ b/fs/iomap.c
> @@ -879,6 +879,8 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
>   } else {
>   dio->flags |= IOMAP_DIO_WRITE;
>   flags |= IOMAP_WRITE;
> + if (iocb->ki_flags & IOCB_NOWAIT)
> + flags |= IOMAP_NOWAIT;
>   }
>  
>   ret = filemap_write_and_wait_range(mapping, start, end);
> diff --git a/include/linux/iomap.h b/include/linux/iomap.h
> index f753e788da31..69f4e9470084 100644
> --- a/include/linux/iomap.h
> +++ b/include/linux/iomap.h
> @@ -52,6 +52,7 @@ struct iomap {
>  #define IOMAP_REPORT (1 << 2) /* report extent status, e.g. FIEMAP */
>  #define IOMAP_FAULT  (1 << 3) /* mapping for page fault */
>  #define IOMAP_DIRECT (1 << 4) /* direct I/O */
> +#define IOMAP_NOWAIT (1 << 5) /* Don't wait for writeback */
>  
>  struct iomap_ops {
>   /*
> -- 
> 2.12.0
> 
-- 
Jan Kara 
SUSE Labs, CR
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 05/10] fs: return if direct write will trigger writeback

2017-05-25 Thread Jan Kara
On Wed 24-05-17 11:41:45, Goldwyn Rodrigues wrote:
> From: Goldwyn Rodrigues 
> 
> Find out if the write will trigger a wait due to writeback. If yes,
> return -EAGAIN.
> 
> Return -EINVAL for buffered AIO: there are multiple causes of
> delay such as page locks, dirty throttling logic, page loading
> from disk etc. which cannot be taken care of.
> 
> Signed-off-by: Goldwyn Rodrigues 
> Reviewed-by: Christoph Hellwig 

Looks good. You can add:

Reviewed-by: Jan Kara 

Honza

> ---
>  mm/filemap.c | 17 ++---
>  1 file changed, 14 insertions(+), 3 deletions(-)
> 
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 097213275461..bc146efa6815 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -2675,6 +2675,9 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, 
> struct iov_iter *from)
>  
>   pos = iocb->ki_pos;
>  
> + if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
> + return -EINVAL;
> +
>   if (limit != RLIM_INFINITY) {
>   if (iocb->ki_pos >= limit) {
>   send_sig(SIGXFSZ, current, 0);
> @@ -2743,9 +2746,17 @@ generic_file_direct_write(struct kiocb *iocb, struct 
> iov_iter *from)
>   write_len = iov_iter_count(from);
>   end = (pos + write_len - 1) >> PAGE_SHIFT;
>  
> - written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 
> 1);
> - if (written)
> - goto out;
> + if (iocb->ki_flags & IOCB_NOWAIT) {
> + /* If there are pages to writeback, return */
> + if (filemap_range_has_page(inode->i_mapping, pos,
> +pos + iov_iter_count(from)))
> + return -EAGAIN;
> + } else {
> + written = filemap_write_and_wait_range(mapping, pos,
> + pos + write_len - 1);
> + if (written)
> + goto out;
> + }
>  
>   /*
>* After a write we want buffered reads to be sure to go to disk to get
> -- 
> 2.12.0
> 
-- 
Jan Kara 
SUSE Labs, CR
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 04/10] fs: Introduce RWF_NOWAIT

2017-05-25 Thread Jan Kara
On Wed 24-05-17 11:41:44, Goldwyn Rodrigues wrote:
> From: Goldwyn Rodrigues 
> 
> RWF_NOWAIT informs kernel to bail out if an AIO request will block
> for reasons such as file allocations, or a writeback triggered,
> or would block while allocating requests while performing
> direct I/O.
> 
> RWF_NOWAIT is translated to IOCB_NOWAIT for iocb->ki_flags.
> 
> The check for -EOPNOTSUPP is placed in generic_file_write_iter(). This
> is called by most filesystems, either through fsops.write_iter() or through
> the function defined by write_iter(). If not, we perform the check defined
> by .write_iter() which is called for direct IO specifically.
> 
> Filesystems xfs, btrfs and ext4 would be supported in the following patches.
> 
> Signed-off-by: Goldwyn Rodrigues 
> Reviewed-by: Christoph Hellwig 

Looks good now. You can add:

Reviewed-by: Jan Kara 

Honza


> ---
>  fs/9p/vfs_file.c|  3 +++
>  fs/aio.c| 13 +
>  fs/ceph/file.c  |  3 +++
>  fs/cifs/file.c  |  3 +++
>  fs/fuse/file.c  |  3 +++
>  fs/nfs/direct.c |  3 +++
>  fs/ocfs2/file.c |  3 +++
>  include/linux/fs.h  |  5 -
>  include/uapi/linux/fs.h |  1 +
>  mm/filemap.c|  3 +++
>  10 files changed, 39 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
> index 3de3b4a89d89..403681db7723 100644
> --- a/fs/9p/vfs_file.c
> +++ b/fs/9p/vfs_file.c
> @@ -411,6 +411,9 @@ v9fs_file_write_iter(struct kiocb *iocb, struct iov_iter 
> *from)
>   loff_t origin;
>   int err = 0;
>  
> + if (iocb->ki_flags & IOCB_NOWAIT)
> + return -EOPNOTSUPP;
> +
>   retval = generic_write_checks(iocb, from);
>   if (retval <= 0)
>   return retval;
> diff --git a/fs/aio.c b/fs/aio.c
> index 020fa0045e3c..9616dc733103 100644
> --- a/fs/aio.c
> +++ b/fs/aio.c
> @@ -1592,6 +1592,19 @@ static int io_submit_one(struct kioctx *ctx, struct 
> iocb __user *user_iocb,
>   goto out_put_req;
>   }
>  
> + if (req->common.ki_flags & IOCB_NOWAIT) {
> + if (!(req->common.ki_flags & IOCB_DIRECT)) {
> + ret = -EOPNOTSUPP;
> + goto out_put_req;
> + }
> +
> + if ((iocb->aio_lio_opcode != IOCB_CMD_PWRITE) &&
> + (iocb->aio_lio_opcode != IOCB_CMD_PWRITEV)) {
> + ret = -EINVAL;
> + goto out_put_req;
> + }
> + }
> +
>   ret = put_user(KIOCB_KEY, &user_iocb->aio_key);
>   if (unlikely(ret)) {
>   pr_debug("EFAULT: aio_key\n");
> diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> index 3fdde0b283c9..a53fd2675b1b 100644
> --- a/fs/ceph/file.c
> +++ b/fs/ceph/file.c
> @@ -1300,6 +1300,9 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, 
> struct iov_iter *from)
>   int err, want, got;
>   loff_t pos;
>  
> + if (iocb->ki_flags & IOCB_NOWAIT)
> + return -EOPNOTSUPP;
> +
>   if (ceph_snap(inode) != CEPH_NOSNAP)
>   return -EROFS;
>  
> diff --git a/fs/cifs/file.c b/fs/cifs/file.c
> index 0fd081bd2a2f..ff84fa9ddb6c 100644
> --- a/fs/cifs/file.c
> +++ b/fs/cifs/file.c
> @@ -2725,6 +2725,9 @@ ssize_t cifs_user_writev(struct kiocb *iocb, struct 
> iov_iter *from)
>* write request.
>*/
>  
> + if (iocb->ki_flags & IOCB_NOWAIT)
> + return -EOPNOTSUPP;
> +
>   rc = generic_write_checks(iocb, from);
>   if (rc <= 0)
>   return rc;
> diff --git a/fs/fuse/file.c b/fs/fuse/file.c
> index 3ee4fdc3da9e..812c7bd0c290 100644
> --- a/fs/fuse/file.c
> +++ b/fs/fuse/file.c
> @@ -1425,6 +1425,9 @@ static ssize_t fuse_direct_write_iter(struct kiocb 
> *iocb, struct iov_iter *from)
>   struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(file);
>   ssize_t res;
>  
> + if (iocb->ki_flags & IOCB_NOWAIT)
> + return -EOPNOTSUPP;
> +
>   if (is_bad_inode(inode))
>   return -EIO;
>  
> diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
> index 6fb9fad2d1e6..c8e7dd76126c 100644
> --- a/fs/nfs/direct.c
> +++ b/fs/nfs/direct.c
> @@ -979,6 +979,9 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct 
> iov_iter *iter)
>   dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n",
>   file, iov_iter_count(iter), (long long) iocb->ki_pos);
>  
> + if (iocb->ki_flags & IOCB_NOWAIT)
> + return -EOPNOTSUPP;
> +
>   result = generic_write_checks(iocb, iter);
>   if (result <= 0)
>   return result;
> diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
> index bfeb647459d9..e7f8ba890305 100644
> --- a/fs/ocfs2/file.c
> +++ b/fs/ocfs2/file.c
> @@ -2235,6 +2235,9 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
>   if (count == 0)
>   return 0;
>  
> + if (iocb->ki_flags & 

Re: [PATCH 03/10] fs: Use RWF_* flags for AIO operations

2017-05-25 Thread Jan Kara
On Wed 24-05-17 11:41:43, Goldwyn Rodrigues wrote:
> From: Goldwyn Rodrigues 
> 
> aio_rw_flags is introduced in struct iocb (using aio_reserved1) which will
> carry the RWF_* flags. We cannot use aio_flags because they are not
> checked for validity which may break existing applications.
> 
> Note, the only place RWF_HIPRI comes in effect is dio_await_one().
> All the rest of the locations, aio code return -EIOCBQUEUED before the
> checks for RWF_HIPRI.
> 
> Signed-off-by: Goldwyn Rodrigues 
> Reviewed-by: Christoph Hellwig 

Looks good. You can add:

Reviewed-by: Jan Kara 

Honza

> ---
>  fs/aio.c | 8 +++-
>  include/uapi/linux/aio_abi.h | 2 +-
>  2 files changed, 8 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/aio.c b/fs/aio.c
> index f52d925ee259..020fa0045e3c 100644
> --- a/fs/aio.c
> +++ b/fs/aio.c
> @@ -1541,7 +1541,7 @@ static int io_submit_one(struct kioctx *ctx, struct 
> iocb __user *user_iocb,
>   ssize_t ret;
>  
>   /* enforce forwards compatibility on users */
> - if (unlikely(iocb->aio_reserved1 || iocb->aio_reserved2)) {
> + if (unlikely(iocb->aio_reserved2)) {
>   pr_debug("EINVAL: reserve field set\n");
>   return -EINVAL;
>   }
> @@ -1586,6 +1586,12 @@ static int io_submit_one(struct kioctx *ctx, struct 
> iocb __user *user_iocb,
>   req->common.ki_flags |= IOCB_EVENTFD;
>   }
>  
> + ret = kiocb_set_rw_flags(&req->common, iocb->aio_rw_flags);
> + if (unlikely(ret)) {
> + pr_debug("EINVAL: aio_rw_flags\n");
> + goto out_put_req;
> + }
> +
>   ret = put_user(KIOCB_KEY, &user_iocb->aio_key);
>   if (unlikely(ret)) {
>   pr_debug("EFAULT: aio_key\n");
> diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h
> index bb2554f7fbd1..a2d4a8ac94ca 100644
> --- a/include/uapi/linux/aio_abi.h
> +++ b/include/uapi/linux/aio_abi.h
> @@ -79,7 +79,7 @@ struct io_event {
>  struct iocb {
>   /* these are internal to the kernel/libc. */
>   __u64   aio_data;   /* data to be returned in event's data */
> - __u32   PADDED(aio_key, aio_reserved1);
> + __u32   PADDED(aio_key, aio_rw_flags);
>   /* the kernel sets aio_key to the req # */
>  
>   /* common fields */
> -- 
> 2.12.0
> 
-- 
Jan Kara 
SUSE Labs, CR
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 02/10] fs: Introduce filemap_range_has_page()

2017-05-25 Thread Jan Kara
On Wed 24-05-17 11:41:42, Goldwyn Rodrigues wrote:
> From: Goldwyn Rodrigues 
> 
> filemap_range_has_page() returns true if the file's mapping has
> a page within the range mentioned. This function will be used
> to check if a write() call will cause a writeback of previous
> writes.
> 
> Signed-off-by: Goldwyn Rodrigues 
> Reviewed-by: Christoph Hellwig 

Looks good. You can add:

Reviewed-by: Jan Kara 

Honza

> ---
>  include/linux/fs.h |  2 ++
>  mm/filemap.c   | 33 +
>  2 files changed, 35 insertions(+)
> 
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index f53867140f43..dc0ab585cd56 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -2517,6 +2517,8 @@ extern int filemap_fdatawait(struct address_space *);
>  extern void filemap_fdatawait_keep_errors(struct address_space *);
>  extern int filemap_fdatawait_range(struct address_space *, loff_t lstart,
>  loff_t lend);
> +extern int filemap_range_has_page(struct address_space *, loff_t lstart,
> +   loff_t lend);
>  extern int filemap_write_and_wait(struct address_space *mapping);
>  extern int filemap_write_and_wait_range(struct address_space *mapping,
>   loff_t lstart, loff_t lend);
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 6f1be573a5e6..87aba7698584 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -376,6 +376,39 @@ int filemap_flush(struct address_space *mapping)
>  }
>  EXPORT_SYMBOL(filemap_flush);
>  
> +/**
> + * filemap_range_has_page - check if a page exists in range.
> + * @mapping:   address space structure to wait for
> + * @start_byte:offset in bytes where the range starts
> + * @end_byte:  offset in bytes where the range ends (inclusive)
> + *
> + * Find at least one page in the range supplied, usually used to check if
> + * direct writing in this range will trigger a writeback.
> + */
> +int filemap_range_has_page(struct address_space *mapping,
> +loff_t start_byte, loff_t end_byte)
> +{
> + pgoff_t index = start_byte >> PAGE_SHIFT;
> + pgoff_t end = end_byte >> PAGE_SHIFT;
> + struct pagevec pvec;
> + int ret;
> +
> + if (end_byte < start_byte)
> + return 0;
> +
> + if (mapping->nrpages == 0)
> + return 0;
> +
> + pagevec_init(&pvec, 0);
> + ret = pagevec_lookup(&pvec, mapping, index, 1);
> + if (!ret)
> + return 0;
> + ret = (pvec.pages[0]->index <= end);
> + pagevec_release(&pvec);
> + return ret;
> +}
> +EXPORT_SYMBOL(filemap_range_has_page);
> +
>  static int __filemap_fdatawait_range(struct address_space *mapping,
>loff_t start_byte, loff_t end_byte)
>  {
> -- 
> 2.12.0
> 
-- 
Jan Kara 
SUSE Labs, CR
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 01/10] fs: Separate out kiocb flags setup based on RWF_* flags

2017-05-25 Thread Jan Kara
On Wed 24-05-17 11:41:41, Goldwyn Rodrigues wrote:
> From: Goldwyn Rodrigues 
> 
> Signed-off-by: Goldwyn Rodrigues 
> Reviewed-by: Christoph Hellwig 

Looks good. You can add:

Reviewed-by: Jan Kara 

Honza

> ---
>  fs/read_write.c| 12 +++-
>  include/linux/fs.h | 14 ++
>  2 files changed, 17 insertions(+), 9 deletions(-)
> 
> diff --git a/fs/read_write.c b/fs/read_write.c
> index 47c1d4484df9..53c816c61122 100644
> --- a/fs/read_write.c
> +++ b/fs/read_write.c
> @@ -678,16 +678,10 @@ static ssize_t do_iter_readv_writev(struct file *filp, 
> struct iov_iter *iter,
>   struct kiocb kiocb;
>   ssize_t ret;
>  
> - if (flags & ~(RWF_HIPRI | RWF_DSYNC | RWF_SYNC))
> - return -EOPNOTSUPP;
> -
>   init_sync_kiocb(&kiocb, filp);
> - if (flags & RWF_HIPRI)
> - kiocb.ki_flags |= IOCB_HIPRI;
> - if (flags & RWF_DSYNC)
> - kiocb.ki_flags |= IOCB_DSYNC;
> - if (flags & RWF_SYNC)
> - kiocb.ki_flags |= (IOCB_DSYNC | IOCB_SYNC);
> + ret = kiocb_set_rw_flags(&kiocb, flags);
> + if (ret)
> + return ret;
>   kiocb.ki_pos = *ppos;
>  
>   if (type == READ)
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 803e5a9b2654..f53867140f43 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -3056,6 +3056,20 @@ static inline int iocb_flags(struct file *file)
>   return res;
>  }
>  
> +static inline int kiocb_set_rw_flags(struct kiocb *ki, int flags)
> +{
> + if (unlikely(flags & ~(RWF_HIPRI | RWF_DSYNC | RWF_SYNC)))
> + return -EOPNOTSUPP;
> +
> + if (flags & RWF_HIPRI)
> + ki->ki_flags |= IOCB_HIPRI;
> + if (flags & RWF_DSYNC)
> + ki->ki_flags |= IOCB_DSYNC;
> + if (flags & RWF_SYNC)
> + ki->ki_flags |= (IOCB_DSYNC | IOCB_SYNC);
> + return 0;
> +}
> +
>  static inline ino_t parent_ino(struct dentry *dentry)
>  {
>   ino_t res;
> -- 
> 2.12.0
> 
-- 
Jan Kara 
SUSE Labs, CR
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v4 01/20] btrfs-progs: raid56: Introduce raid56 header for later recovery usage

2017-05-25 Thread Qu Wenruo
Introduce a new header, kernel-lib/raid56.h, for later raid56 works.

It contains 2 functions, from original btrfs-progs code:
void raid6_gen_syndrome(int disks, size_t bytes, void **ptrs);
int raid5_gen_result(int nr_devs, size_t stripe_len, int dest, void **data);

It will be expanded later, and some parts of it (the RAID6 recovery part) may
be kept in sync with the kernel later.

Signed-off-by: Qu Wenruo 
---
 Makefile|  4 ++--
 disk-io.h   |  4 
 kernel-lib/raid56.h | 28 
 volumes.c   |  1 +
 4 files changed, 31 insertions(+), 6 deletions(-)
 create mode 100644 kernel-lib/raid56.h

diff --git a/Makefile b/Makefile
index 81598df1..92063a90 100644
--- a/Makefile
+++ b/Makefile
@@ -108,8 +108,8 @@ libbtrfs_objects = send-stream.o send-utils.o 
kernel-lib/rbtree.o btrfs-list.o \
   uuid-tree.o utils-lib.o rbtree-utils.o
 libbtrfs_headers = send-stream.h send-utils.h send.h kernel-lib/rbtree.h 
btrfs-list.h \
   kernel-lib/crc32c.h kernel-lib/list.h kerncompat.h \
-  kernel-lib/radix-tree.h kernel-lib/sizes.h extent-cache.h \
-  extent_io.h ioctl.h ctree.h btrfsck.h version.h
+  kernel-lib/radix-tree.h kernel-lib/sizes.h kernel-lib/raid56.h \
+  extent-cache.h extent_io.h ioctl.h ctree.h btrfsck.h version.h
 convert_objects = convert/main.o convert/common.o convert/source-fs.o \
  convert/source-ext2.o
 mkfs_objects = mkfs/main.o mkfs/common.o
diff --git a/disk-io.h b/disk-io.h
index cd4fe929..ad8efb43 100644
--- a/disk-io.h
+++ b/disk-io.h
@@ -201,8 +201,4 @@ int write_tree_block(struct btrfs_trans_handle *trans,
 struct extent_buffer *eb);
 int write_and_map_eb(struct btrfs_root *root, struct extent_buffer *eb);
 
-/* raid56.c */
-void raid6_gen_syndrome(int disks, size_t bytes, void **ptrs);
-int raid5_gen_result(int nr_devs, size_t stripe_len, int dest, void **data);
-
 #endif
diff --git a/kernel-lib/raid56.h b/kernel-lib/raid56.h
new file mode 100644
index ..fa8fa260
--- /dev/null
+++ b/kernel-lib/raid56.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (C) 2017 Fujitsu.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef _BTRFS_PROGS_RAID56_H
+#define _BTRFS_PROGS_RAID56_H
+/*
+ * Headers for RAID5/6 operations.
+ * Original headers from original RAID5/6 codes, not from kernel header.
+ */
+
+void raid6_gen_syndrome(int disks, size_t bytes, void **ptrs);
+int raid5_gen_result(int nr_devs, size_t stripe_len, int dest, void **data);
+#endif
diff --git a/volumes.c b/volumes.c
index b350e259..8c2ffd92 100644
--- a/volumes.c
+++ b/volumes.c
@@ -28,6 +28,7 @@
 #include "print-tree.h"
 #include "volumes.h"
 #include "utils.h"
+#include "kernel-lib/raid56.h"
 
 struct stripe {
struct btrfs_device *dev;
-- 
2.13.0



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v4 13/20] btrfs-progs: scrub: Introduce function to scrub one data stripe

2017-05-25 Thread Qu Wenruo
Introduce new function, scrub_one_data_stripe(), to check all data and
tree blocks inside the data stripe.

This function will not try to recover from any error; it only checks whether
any data/tree block has a csum mismatch.

If data is missing its csum, which is completely valid for cases like
nodatasum, the function will just record it, but not report it as an error.

Signed-off-by: Qu Wenruo 
---
 scrub.c | 129 
 1 file changed, 129 insertions(+)

diff --git a/scrub.c b/scrub.c
index 4302aafa..1944f216 100644
--- a/scrub.c
+++ b/scrub.c
@@ -627,3 +627,132 @@ invalid_arg:
error("invalid parameter for %s", __func__);
return -EINVAL;
 }
+
+/*
+ * Scrub one full data stripe of RAID5/6.
+ * This means it will check any data/metadata extent in the data stripe
+ * spcified by @stripe and @stripe_len
+ *
+ * This function will only *CHECK* if the data stripe has any corruption.
+ * Won't repair at this function.
+ *
+ * Return 0 if the full stripe is OK.
+ * Return <0 if any error is found.
+ * Note: Missing csum is not counted as error(NODATACSUM is valid)
+ */
+static int scrub_one_data_stripe(struct btrfs_fs_info *fs_info,
+struct btrfs_scrub_progress *scrub_ctx,
+struct scrub_stripe *stripe, u32 stripe_len)
+{
+   struct btrfs_path *path;
+   struct btrfs_root *extent_root = fs_info->extent_root;
+   struct btrfs_key key;
+   u64 extent_start;
+   u64 extent_len;
+   u64 orig_csum_discards;
+   int ret;
+
+   if (!is_data_stripe(stripe))
+   return -EINVAL;
+
+   path = btrfs_alloc_path();
+   if (!path)
+   return -ENOMEM;
+
+   key.objectid = stripe->logical + stripe_len;
+   key.offset = 0;
+   key.type = 0;
+
+   ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
+   if (ret < 0)
+   goto out;
+   while (1) {
+   struct btrfs_extent_item *ei;
+   struct extent_buffer *eb;
+   char *data;
+   int slot;
+   int metadata = 0;
+   u64 check_start;
+   u64 check_len;
+
+   ret = btrfs_previous_extent_item(extent_root, path, 0);
+   if (ret > 0) {
+   ret = 0;
+   goto out;
+   }
+   if (ret < 0)
+   goto out;
+   eb = path->nodes[0];
+   slot = path->slots[0];
+   btrfs_item_key_to_cpu(eb, &key, slot);
+   extent_start = key.objectid;
+   ei = btrfs_item_ptr(eb, slot, struct btrfs_extent_item);
+
+   /* tree block scrub */
+   if (key.type == BTRFS_METADATA_ITEM_KEY ||
+   btrfs_extent_flags(eb, ei) & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+   extent_len = extent_root->nodesize;
+   metadata = 1;
+   } else {
+   extent_len = key.offset;
+   metadata = 0;
+   }
+
+   /* Current extent is out of our range, loop comes to end */
+   if (extent_start + extent_len <= stripe->logical)
+   break;
+
+   if (metadata) {
+   /*
+* Check crossing stripe first, which can't be scrubbed
+*/
+   if (check_crossing_stripes(fs_info, extent_start,
+   extent_root->nodesize)) {
+   error("tree block at %llu is crossing stripe 
boundary, unable to scrub",
+   extent_start);
+   ret = -EIO;
+   goto out;
+   }
+   data = stripe->data + extent_start - stripe->logical;
+   ret = check_tree_mirror(fs_info, scrub_ctx,
+   data, extent_start, 0);
+   /* Any csum/verify error means the stripe is screwed */
+   if (ret < 0) {
+   stripe->csum_mismatch = 1;
+   ret = -EIO;
+   goto out;
+   }
+   ret = 0;
+   continue;
+   }
+   /* Restrict the extent range to fit stripe range */
+   check_start = max(extent_start, stripe->logical);
+   check_len = min(extent_start + extent_len, stripe->logical +
+   stripe_len) - check_start;
+
+   /* Record original csum_discards to detect missing csum case */
+   orig_csum_discards = scrub_ctx->csum_discards;
+
+   data = stripe->data + check_start - stripe->logical;
+   ret = 

[PATCH v4 07/20] btrfs-progs: Allow __btrfs_map_block_v2 to remove unrelated stripes

2017-05-25 Thread Qu Wenruo
For READ, the caller normally hopes to get exactly what it requested, rather
than the full stripe map.

In this case, we should remove the unrelated stripes from the map, as in the
following example:
   32K   96K
   |<-request range->|
 0  64k   128K
RAID0:   |Data 1|   Data 2|
  disk1 disk2
Before this patch, we return the full stripe:
Stripe 0: Logical 0, Physical X, Len 64K, Dev disk1
Stripe 1: Logical 64k, Physical Y, Len 64K, Dev disk2

After this patch, we limit the stripe result to the request range:
Stripe 0: Logical 32K, Physical X+32K, Len 32K, Dev disk1
Stripe 1: Logical 64k, Physical Y, Len 32K, Dev disk2

And if it's a RAID5/6 stripe, we just handle it like RAID0, ignoring
parities.

This should make it easier for callers to use.

Signed-off-by: Qu Wenruo 
---
 volumes.c | 103 +-
 1 file changed, 102 insertions(+), 1 deletion(-)

diff --git a/volumes.c b/volumes.c
index 985e5661..e61ddcd6 100644
--- a/volumes.c
+++ b/volumes.c
@@ -1737,6 +1737,107 @@ static int fill_full_map_block(struct map_lookup *map, 
u64 start, u64 length,
return 0;
 }
 
+static void del_one_stripe(struct btrfs_map_block *map_block, int i)
+{
+   int cur_nr = map_block->num_stripes;
+   int size_left = (cur_nr - 1 - i) * sizeof(struct btrfs_map_stripe);
+
+   memmove(&map_block->stripes[i], &map_block->stripes[i + 1], size_left);
+   map_block->num_stripes--;
+}
+
+static void remove_unrelated_stripes(struct map_lookup *map,
+int rw, u64 start, u64 length,
+struct btrfs_map_block *map_block)
+{
+   int i = 0;
+   /*
+* RAID5/6 write must use full stripe.
+* No need to do anything.
+*/
+   if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) &&
+   rw == WRITE)
+   return;
+
+   /*
+* For RAID0/1/10/DUP, whatever read/write, we can remove unrelated
+* stripes without causing anything wrong.
+* RAID5/6 READ is just like RAID0, we don't care parity unless we need
+* to recovery.
+* For recovery, rw should be set to WRITE.
+*/
+   while (i < map_block->num_stripes) {
+   struct btrfs_map_stripe *stripe;
+   u64 orig_logical; /* Original stripe logical start */
+   u64 orig_end; /* Original stripe logical end */
+
+   stripe = &map_block->stripes[i];
+
+   /*
+* For READ, we don't really care parity
+*/
+   if (stripe->logical == BTRFS_RAID5_P_STRIPE ||
+   stripe->logical == BTRFS_RAID6_Q_STRIPE) {
+   del_one_stripe(map_block, i);
+   continue;
+   }
+   /* Completely unrelated stripe */
+   if (stripe->logical >= start + length ||
+   stripe->logical + stripe->length <= start) {
+   del_one_stripe(map_block, i);
+   continue;
+   }
+   /* Covered stripe, modify its logical and physical */
+   orig_logical = stripe->logical;
+   orig_end = stripe->logical + stripe->length;
+   if (start + length <= orig_end) {
+   /*
+* |<--range-->|
+*   |  stripe   |
+* Or
+* ||
+*   |  stripe   |
+*/
+   stripe->logical = max(orig_logical, start);
+   stripe->length = start + length;
+   stripe->physical += stripe->logical - orig_logical;
+   } else if (start >= orig_logical) {
+   /*
+* |<-range--->|
+* |  stripe |
+* Or
+* ||
+* |  stripe |
+*/
+   stripe->logical = start;
+   stripe->length = min(orig_end, start + length);
+   stripe->physical += stripe->logical - orig_logical;
+   }
+   /*
+* Remaining case:
+* ||
+*   | stripe |
+* No need to do any modification
+*/
+   i++;
+   }
+
+   /* Recaculate map_block size */
+   map_block->start = 0;
+   map_block->length = 0;
+   for (i = 0; i < map_block->num_stripes; i++) {
+   struct btrfs_map_stripe *stripe;
+
+   stripe = &map_block->stripes[i];
+   if (stripe->logical > map_block->start)
+   map_block->start = stripe->logical;
+   if 

[PATCH v4 18/20] btrfs-progs: scrub: Introduce a function to scrub one full stripe

2017-05-25 Thread Qu Wenruo
Introduce a new function, scrub_one_full_stripe(), to check a full
stripe.

It handles the full stripe scrub in the following steps:
0) Check if we need to check full stripe
   If full stripe contains no extent, why waste our CPU and IO?

1) Read out full stripe
   Then we know how many devices are missing or have read error.
   If out of repair, then exit

   If have missing device or have read error, try recover here.

2) Check data stripe against csum
   We add data stripe with csum error as corrupted stripe, just like
   dev missing or read error.
   Then recheck if csum mismatch is still below tolerance.

Finally we check the full stripe using 2 factors only:
A) Whether the full stripe has ever gone through recovery
B) Whether the full stripe has any csum error

Combine factor A and B we get:
1) A && B: Recovered, csum mismatch
   Screwed up totally
2) A && !B: Recovered, csum match
   Recoverable, data corrupted but P/Q is good to recover
3) !A && B: Not recovered, csum mismatch
   Try to recover corrupted data stripes
   If recovered csum match, then recoverable
   Else, screwed up
4) !A && !B: Not recovered, no csum mismatch
   Best case, just check if P/Q matches.
   If P/Q matches, everything is good
   Else, just P/Q is screwed up, still recoverable.

Signed-off-by: Qu Wenruo 
---
 scrub.c | 285 
 1 file changed, 285 insertions(+)

diff --git a/scrub.c b/scrub.c
index 94981f2b..1c9a7fc4 100644
--- a/scrub.c
+++ b/scrub.c
@@ -918,5 +918,290 @@ static int write_full_stripe(struct scrub_full_stripe 
*fstripe)
 out:
free(ptrs);
return ret;
+}
+
+/*
+ * Return 0 if we still have chance to recover
+ * Return <0 if we have no more chance
+ */
+static int report_recoverablity(struct scrub_full_stripe *fstripe)
+{
+   int max_tolerance;
+   u64 start = fstripe->logical_start;
+
+   if (fstripe->bg_type & BTRFS_BLOCK_GROUP_RAID5)
+   max_tolerance = 1;
+   else
+   max_tolerance = 2;
+
+   if (fstripe->nr_corrupted_stripes > max_tolerance) {
+   error(
+   "full stripe %llu CORRUPTED: too many read error or corrupted devices",
+   start);
+   error(
+   "full stripe %llu: tolerance: %d, missing: %d, read error: %d, csum 
error: %d",
+   start, max_tolerance, fstripe->err_read_stripes,
+   fstripe->err_missing_devs, fstripe->err_csum_dstripes);
+   return -EIO;
+   }
+   return 0;
+}
+
+static void clear_corrupted_stripe_record(struct scrub_full_stripe *fstripe)
+{
+   fstripe->corrupted_index[0] = -1;
+   fstripe->corrupted_index[1] = -1;
+   fstripe->nr_corrupted_stripes = 0;
+}
+
+static void record_corrupted_stripe(struct scrub_full_stripe *fstripe,
+   int index)
+{
+   int i = 0;
+
+   for (i = 0; i < 2; i++) {
+   if (fstripe->corrupted_index[i] == -1) {
+   fstripe->corrupted_index[i] = index;
+   break;
+   }
+   }
+   fstripe->nr_corrupted_stripes++;
+}
+
+/*
+ * Scrub one full stripe.
+ *
+ * If everything matches, that's good.
+ * If data stripe corrupted badly, no mean to recovery, it will report it.
+ * If data stripe corrupted, try recovery first and recheck csum, to
+ * determine if it's recoverable or screwed up.
+ */
+static int scrub_one_full_stripe(struct btrfs_fs_info *fs_info,
+struct btrfs_scrub_progress *scrub_ctx,
+u64 start, u64 *next_ret, int write)
+{
+   struct scrub_full_stripe *fstripe;
+   struct btrfs_map_block *map_block = NULL;
+   u32 stripe_len = BTRFS_STRIPE_LEN;
+   u64 bg_type;
+   u64 len;
+   int i;
+   int ret;
+
+   if (!next_ret) {
+   error("invalid argument for %s", __func__);
+   return -EINVAL;
+   }
+
+   ret = __btrfs_map_block_v2(fs_info, WRITE, start, stripe_len,
+  &map_block);
+   if (ret < 0) {
+   /* Let caller to skip the whole block group */
+   *next_ret = (u64)-1;
+   return ret;
+   }
+   start = map_block->start;
+   len = map_block->length;
+   *next_ret = start + len;
+
+   /*
+* Step 0: Check if we need to scrub the full stripe
+*
+* If no extent lies in the full stripe, not need to check
+*/
+   ret = btrfs_check_extent_exists(fs_info, start, len);
+   if (ret < 0) {
+   free(map_block);
+   return ret;
+   }
+   /* No extents in range, no need to check */
+   if (ret == 0) {
+   free(map_block);
+   return 0;
+   }
+
+   bg_type = map_block->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
+   if (bg_type != BTRFS_BLOCK_GROUP_RAID5 &&
+   bg_type != 

[PATCH v4 19/20] btrfs-progs: scrub: Introduce function to check a whole block group

2017-05-25 Thread Qu Wenruo
Introduce new function, scrub_one_block_group(), to scrub a block group.

For Single/DUP/RAID0/RAID1/RAID10, we use old mirror number based
map_block, and check extent by extent.

For parity based profile (RAID5/6), we use new map_block_v2() and check
full stripe by full stripe.

Signed-off-by: Qu Wenruo 
---
 scrub.c | 92 +
 1 file changed, 92 insertions(+)

diff --git a/scrub.c b/scrub.c
index 1c9a7fc4..5fa2260b 100644
--- a/scrub.c
+++ b/scrub.c
@@ -1205,3 +1205,95 @@ out:
free(map_block);
return ret;
 }
+
+/*
+ * Scrub one block group.
+ *
+ * This function will handle all profiles current btrfs supports.
+ * Return 0 for scrubbing the block group. Found error will be recorded into
+ * scrub_ctx.
+ * Return <0 for fatal error preventing scrubing the block group.
+ */
+static int scrub_one_block_group(struct btrfs_fs_info *fs_info,
+struct btrfs_scrub_progress *scrub_ctx,
+struct btrfs_block_group_cache *bg_cache,
+int write)
+{
+   struct btrfs_root *extent_root = fs_info->extent_root;
+   struct btrfs_path *path;
+   struct btrfs_key key;
+   u64 bg_start = bg_cache->key.objectid;
+   u64 bg_len = bg_cache->key.offset;
+   int ret;
+
+   if (bg_cache->flags &
+   (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
+   u64 cur = bg_start;
+   u64 next;
+
+   while (cur < bg_start + bg_len) {
+   ret = scrub_one_full_stripe(fs_info, scrub_ctx, cur,
+   &next, write);
+   /* Ignore any non-fatal error */
+   if (ret < 0 && ret != -EIO) {
+   error("fatal error happens checking one full 
stripe at bytenr: %llu: %s",
+   cur, strerror(-ret));
+   return ret;
+   }
+   cur = next;
+   }
+   /* Ignore any -EIO error, such error will be reported at last */
+   return 0;
+   }
+   /* None parity based profile, check extent by extent */
+   key.objectid = bg_start;
+   key.type = 0;
+   key.offset = 0;
+
+   path = btrfs_alloc_path();
+   if (!path)
+   return -ENOMEM;
+   ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
+   if (ret < 0)
+   goto out;
+   while (1) {
+   struct extent_buffer *eb = path->nodes[0];
+   int slot = path->slots[0];
+   u64 extent_start;
+   u64 extent_len;
+
+   btrfs_item_key_to_cpu(eb, &key, slot);
+   if (key.objectid >= bg_start + bg_len)
+   break;
+   if (key.type != BTRFS_EXTENT_ITEM_KEY &&
+   key.type != BTRFS_METADATA_ITEM_KEY)
+   goto next;
+
+   extent_start = key.objectid;
+   if (key.type == BTRFS_METADATA_ITEM_KEY)
+   extent_len = extent_root->nodesize;
+   else
+   extent_len = key.offset;
+
+   ret = scrub_one_extent(fs_info, scrub_ctx, path, extent_start,
+   extent_len, write);
+   if (ret < 0 && ret != -EIO) {
+   error("fatal error checking extent bytenr %llu len 
%llu: %s",
+   extent_start, extent_len, strerror(-ret));
+   goto out;
+   }
+   ret = 0;
+next:
+   ret = btrfs_next_extent_item(extent_root, path, bg_start +
+bg_len);
+   if (ret < 0)
+   goto out;
+   if (ret > 0) {
+   ret = 0;
+   break;
+   }
+   }
+out:
+   btrfs_free_path(path);
+   return ret;
+}
-- 
2.13.0



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v4 09/20] btrfs-progs: scrub: Introduce structures to support offline scrub for RAID56

2017-05-25 Thread Qu Wenruo
Introduce new local structures, scrub_full_stripe and scrub_stripe, for
incoming offline RAID56 scrub support.

For pure stripe/mirror based profiles, like raid0/1/10/dup/single, we
will follow the original bytenr and mirror number based iteration, so
they don't need any extra structures for these profiles.

Signed-off-by: Qu Wenruo 
---
 Makefile |   2 +-
 scrub.c  | 126 +++
 2 files changed, 127 insertions(+), 1 deletion(-)
 create mode 100644 scrub.c

diff --git a/Makefile b/Makefile
index e6d7c187..b3c70e04 100644
--- a/Makefile
+++ b/Makefile
@@ -95,7 +95,7 @@ objects = ctree.o disk-io.o kernel-lib/radix-tree.o 
extent-tree.o print-tree.o \
  qgroup.o free-space-cache.o kernel-lib/list_sort.o props.o \
  kernel-shared/ulist.o qgroup-verify.o backref.o string-table.o 
task-utils.o \
  inode.o file.o find-root.o free-space-tree.o help.o send-dump.o \
- fsfeatures.o kernel-lib/tables.o kernel-lib/raid56.o csum.o
+ fsfeatures.o kernel-lib/tables.o kernel-lib/raid56.o csum.o scrub.o
 cmds_objects = cmds-subvolume.o cmds-filesystem.o cmds-device.o cmds-scrub.o \
   cmds-inspect.o cmds-balance.o cmds-send.o cmds-receive.o \
   cmds-quota.o cmds-qgroup.o cmds-replace.o cmds-check.o \
diff --git a/scrub.c b/scrub.c
new file mode 100644
index ..a757dff6
--- /dev/null
+++ b/scrub.c
@@ -0,0 +1,126 @@
+/*
+ * Copyright (C) 2017 Fujitsu.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+/*
+ * Main part to implement offline(unmounted) btrfs scrub
+ */
+
+#include 
+#include "ctree.h"
+#include "volumes.h"
+#include "disk-io.h"
+#include "utils.h"
+
+/*
+ * For parity based profile(RAID56)
+ * Mirror/stripe based on won't need this. They are iterated by bytenr and
+ * mirror number.
+ */
+struct scrub_stripe {
+   /* For P/Q logical start will be BTRFS_RAID5/6_P/Q_STRIPE */
+   u64 logical;
+
+   u64 physical;
+
+   /* Device is missing */
+   unsigned int dev_missing:1;
+
+   /* Any tree/data csum mismatches */
+   unsigned int csum_mismatch:1;
+
+   /* Some data doesn't have csum(nodatasum) */
+   unsigned int csum_missing:1;
+
+   /* Device fd, to write correct data back to disc */
+   int fd;
+
+   char *data;
+};
+
+/*
+ * RAID56 full stripe(data stripes + P/Q)
+ */
+struct scrub_full_stripe {
+   u64 logical_start;
+   u64 logical_len;
+   u64 bg_type;
+   u32 nr_stripes;
+   u32 stripe_len;
+
+   /* Read error stripes */
+   u32 err_read_stripes;
+
+   /* Missing devices */
+   u32 err_missing_devs;
+
+   /* Csum error data stripes */
+   u32 err_csum_dstripes;
+
+   /* Missing csum data stripes */
+   u32 missing_csum_dstripes;
+
+   /* currupted stripe index */
+   int corrupted_index[2];
+
+   int nr_corrupted_stripes;
+
+   /* Already recovered once? */
+   unsigned int recovered:1;
+
+   struct scrub_stripe stripes[];
+};
+
+static void free_full_stripe(struct scrub_full_stripe *fstripe)
+{
+   int i;
+
+   for (i = 0; i < fstripe->nr_stripes; i++)
+   free(fstripe->stripes[i].data);
+   free(fstripe);
+}
+
+static struct scrub_full_stripe *alloc_full_stripe(int nr_stripes,
+   u32 stripe_len)
+{
+   struct scrub_full_stripe *ret;
+   int size = sizeof(*ret) + sizeof(unsigned long *) +
+   nr_stripes * sizeof(struct scrub_stripe);
+   int i;
+
+   ret = malloc(size);
+   if (!ret)
+   return NULL;
+
+   memset(ret, 0, size);
+   ret->nr_stripes = nr_stripes;
+   ret->stripe_len = stripe_len;
+   ret->corrupted_index[0] = -1;
+   ret->corrupted_index[1] = -1;
+
+   /* Alloc data memory for each stripe */
+   for (i = 0; i < nr_stripes; i++) {
+   struct scrub_stripe *stripe = &ret->stripes[i];
+
+   stripe->data = malloc(stripe_len);
+   if (!stripe->data) {
+   free_full_stripe(ret);
+   return NULL;
+   }
+   }
+   return ret;
+}
-- 
2.13.0



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to 

[PATCH v4 10/20] btrfs-progs: scrub: Introduce functions to scrub mirror based tree block

2017-05-25 Thread Qu Wenruo
Introduce new functions, check/recover_tree_mirror(), to check and
recover mirror-based tree blocks (Single/DUP/RAID0/1/10).

check_tree_mirror() can also be used on in-memory tree blocks using @data
parameter.
This is very handy for RAID5/6 case, either checking the data stripe
tree block by @bytenr and 0 as @mirror, or using @data parameter for
recovered in-memory data.

While recover_tree_mirror() is only used for mirror-based profiles, as
RAID56 recovery is done by stripe unit, not mirror unit.

Signed-off-by: Qu Wenruo 
---
 disk-io.c |   4 +-
 disk-io.h |   2 +
 scrub.c   | 145 ++
 3 files changed, 149 insertions(+), 2 deletions(-)

diff --git a/disk-io.c b/disk-io.c
index 6aa6d98a..8340915d 100644
--- a/disk-io.c
+++ b/disk-io.c
@@ -51,8 +51,8 @@ static u32 max_nritems(u8 level, u32 nodesize)
sizeof(struct btrfs_key_ptr));
 }
 
-static int check_tree_block(struct btrfs_fs_info *fs_info,
-   struct extent_buffer *buf)
+int check_tree_block(struct btrfs_fs_info *fs_info,
+struct extent_buffer *buf)
 {
 
struct btrfs_fs_devices *fs_devices;
diff --git a/disk-io.h b/disk-io.h
index ad8efb43..dbb51fc5 100644
--- a/disk-io.h
+++ b/disk-io.h
@@ -126,6 +126,8 @@ static inline struct extent_buffer* read_tree_block(
parent_transid);
 }
 
+int check_tree_block(struct btrfs_fs_info *fs_info,
+struct extent_buffer *buf);
 int read_extent_data(struct btrfs_root *root, char *data, u64 logical,
 u64 *len, int mirror);
 void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
diff --git a/scrub.c b/scrub.c
index a757dff6..f5a5c205 100644
--- a/scrub.c
+++ b/scrub.c
@@ -124,3 +124,148 @@ static struct scrub_full_stripe *alloc_full_stripe(int 
nr_stripes,
}
return ret;
 }
+
+static inline int is_data_stripe(struct scrub_stripe *stripe)
+{
+   u64 bytenr = stripe->logical;
+
+   if (bytenr == BTRFS_RAID5_P_STRIPE || bytenr == BTRFS_RAID6_Q_STRIPE)
+   return 0;
+   return 1;
+}
+
+/*
+ * Check one tree mirror given by @bytenr and @mirror, or @data.
+ * If @data is not given(NULL), the function will try to read out tree block
+ * using @bytenr and @mirror.
+ * If @data is given, use data directly, won't try to read from disk.
+ *
+ * The extra @data prameter is handy for RAID5/6 recovery code to verify
+ * the recovered data.
+ *
+ * Return 0 if everything is OK.
+ * Return <0 something goes wrong, and @scrub_ctx accounting will be updated
+ * if it's a data corruption.
+ */
+static int check_tree_mirror(struct btrfs_fs_info *fs_info,
+struct btrfs_scrub_progress *scrub_ctx,
+char *data, u64 bytenr, int mirror)
+{
+   struct extent_buffer *eb;
+   u32 nodesize = fs_info->tree_root->nodesize;
+   int ret;
+
+   if (!IS_ALIGNED(bytenr, fs_info->tree_root->sectorsize)) {
+   /* Such error will be reported by check_tree_block() */
+   scrub_ctx->verify_errors++;
+   return -EIO;
+   }
+
+   eb = btrfs_find_create_tree_block(fs_info, bytenr, nodesize);
+   if (!eb)
+   return -ENOMEM;
+   if (data) {
+   memcpy(eb->data, data, nodesize);
+   } else {
+   ret = read_whole_eb(fs_info, eb, mirror);
+   if (ret) {
+   scrub_ctx->read_errors++;
+   error("failed to read tree block %llu mirror %d",
+ bytenr, mirror);
+   goto out;
+   }
+   }
+
+   scrub_ctx->tree_bytes_scrubbed += nodesize;
+   if (csum_tree_block(fs_info->tree_root, eb, 1)) {
+   error("tree block %llu mirror %d checksum mismatch", bytenr,
+   mirror);
+   scrub_ctx->csum_errors++;
+   ret = -EIO;
+   goto out;
+   }
+   ret = check_tree_block(fs_info, eb);
+   if (ret < 0) {
+   error("tree block %llu mirror %d is invalid", bytenr, mirror);
+   scrub_ctx->verify_errors++;
+   goto out;
+   }
+
+   scrub_ctx->tree_extents_scrubbed++;
+out:
+   free_extent_buffer(eb);
+   return ret;
+}
+
+/*
+ * read_extent_data() helper
+ *
+ * This function will handle short read and update @scrub_ctx when read
+ * error happens.
+ */
+static int read_extent_data_loop(struct btrfs_fs_info *fs_info,
+struct btrfs_scrub_progress *scrub_ctx,
+char *buf, u64 start, u64 len, int mirror)
+{
+   int ret = 0;
+   u64 cur = 0;
+
+   while (cur < len) {
+   u64 read_len = len - cur;
+
+   ret = read_extent_data(fs_info->tree_root, buf + cur,
+   start + cur, _len, 

[PATCH v4 11/20] btrfs-progs: scrub: Introduce functions to scrub mirror based data blocks

2017-05-25 Thread Qu Wenruo
Introduce new function, check/recover_data_mirror(), to check and recover
mirror based data blocks.

Unlike tree blocks, data blocks must be recovered sector by sector, so we
introduce corrupt_bitmap for checking and recovery.

Signed-off-by: Qu Wenruo 
Signed-off-by: Su Yue 
---
 scrub.c | 212 
 1 file changed, 212 insertions(+)

diff --git a/scrub.c b/scrub.c
index f5a5c205..e473d168 100644
--- a/scrub.c
+++ b/scrub.c
@@ -25,6 +25,7 @@
 #include "volumes.h"
 #include "disk-io.h"
 #include "utils.h"
+#include "kernel-lib/bitops.h"
 
 /*
  * For parity based profile(RAID56)
@@ -269,3 +270,214 @@ out:
free(buf);
return ret;
 }
+
+/*
+ * Check one data mirror given by @start @len and @mirror, or @data
+ * If @data is not given, try to read it from disk.
+ * This function will try to read out all the data then check sum.
+ *
+ * If @data is given, just use the data.
+ * This behavior is useful for RAID5/6 recovery code to verify recovered data.
+ *
+ * If @corrupt_bitmap is given, restore corrupted sector to that bitmap.
+ * This is useful for mirror based profiles to recover its data.
+ *
+ * Return 0 if everything is OK.
+ * Return <0 if something goes wrong, and @scrub_ctx accounting will be updated
+ * if it's a data corruption.
+ */
+static int check_data_mirror(struct btrfs_fs_info *fs_info,
+struct btrfs_scrub_progress *scrub_ctx,
+char *data, u64 start, u64 len, int mirror,
+unsigned long *corrupt_bitmap)
+{
+   u32 sectorsize = fs_info->tree_root->sectorsize;
+   u32 data_csum;
+   u32 *csums = NULL;
+   char *buf = NULL;
+   int ret = 0;
+   int err = 0;
+   int i;
+   unsigned long *csum_bitmap = NULL;
+
+   if (!data) {
+   buf = malloc(len);
+   if (!buf)
+   return -ENOMEM;
+   ret = read_extent_data_loop(fs_info, scrub_ctx, buf, start,
+len, mirror);
+   if (ret < 0)
+   goto out;
+   scrub_ctx->data_bytes_scrubbed += len;
+   } else {
+   buf = data;
+   }
+
+   /* Alloc and Check csums */
+   csums = malloc(len / sectorsize * sizeof(data_csum));
+   if (!csums) {
+   ret = -ENOMEM;
+   goto out;
+   }
+   csum_bitmap = malloc(calculate_bitmap_len(len / sectorsize));
+   if (!csum_bitmap) {
+   ret = -ENOMEM;
+   goto out;
+   }
+
+   if (corrupt_bitmap)
+   memset(corrupt_bitmap, 0,
+   calculate_bitmap_len(len / sectorsize));
+   ret = btrfs_read_data_csums(fs_info, start, len, csums, csum_bitmap);
+   if (ret < 0)
+   goto out;
+
+   for (i = 0; i < len / sectorsize; i++) {
+   if (!test_bit(i, csum_bitmap)) {
+   scrub_ctx->csum_discards++;
+   continue;
+   }
+
+   data_csum = ~(u32)0;
+   data_csum = btrfs_csum_data(buf + i * sectorsize, data_csum,
+   sectorsize);
+   btrfs_csum_final(data_csum, (u8 *)&data_csum);
+
+   if (memcmp(&data_csum, (char *)csums + i * sizeof(data_csum),
+  sizeof(data_csum))) {
+   error("data at bytenr %llu mirror %d csum mismatch, 
have 0x%08x expect 0x%08x",
+ start + i * sectorsize, mirror, data_csum,
+ *(u32 *)((char *)csums + i * sizeof(data_csum)));
+   err = 1;
+   scrub_ctx->csum_errors++;
+   if (corrupt_bitmap)
+   set_bit(i, corrupt_bitmap);
+   continue;
+   }
+   scrub_ctx->data_bytes_scrubbed += sectorsize;
+   }
+out:
+   if (!data)
+   free(buf);
+   free(csums);
+   free(csum_bitmap);
+
+   if (!ret && err)
+   return -EIO;
+   return ret;
+}
+
+/* Helper to check all mirrors for a good copy */
+static int has_good_mirror(unsigned long *corrupt_bitmaps[], int num_copies,
+  int bit, int *good_mirror)
+{
+   int found_good = 0;
+   int i;
+
+   for (i = 0; i < num_copies; i++) {
+   if (!test_bit(bit, corrupt_bitmaps[i])) {
+   found_good = 1;
+   if (good_mirror)
+   *good_mirror = i + 1;
+   break;
+   }
+   }
+   return found_good;
+}
+
+/*
+ * Helper function to check @corrupt_bitmaps, to verify if it's recoverable
+ * for mirror based data extent.
+ *
+ * Return 1 for recoverable, and 0 for not recoverable
+ */
+static int 

[PATCH v4 12/20] btrfs-progs: scrub: Introduce function to scrub one mirror-based extent

2017-05-25 Thread Qu Wenruo
Introduce a new function, scrub_one_extent(), as a wrapper to check one
mirror-based extent.

It accepts a btrfs_path parameter @path, which must point to a
META/EXTENT_ITEM.
It also accepts @start and @len, which must describe a subset of that
META/EXTENT_ITEM.

Signed-off-by: Qu Wenruo 
---
 scrub.c | 148 +++-
 1 file changed, 147 insertions(+), 1 deletion(-)

diff --git a/scrub.c b/scrub.c
index e473d168..4302aafa 100644
--- a/scrub.c
+++ b/scrub.c
@@ -441,7 +441,7 @@ static int recover_data_mirror(struct btrfs_fs_info 
*fs_info,
 
num_copies = btrfs_num_copies(_info->mapping_tree, start, len);
for (i = 0; i < num_copies; i++) {
-   for_each_set_bit(bit, corrupt_bitmaps[i], BITS_PER_LONG) {
+   for_each_set_bit(bit, corrupt_bitmaps[i], len / sectorsize) {
u64 cur = start + bit * sectorsize;
int good;
 
@@ -481,3 +481,149 @@ out:
free(buf);
return ret;
 }
+
+/* Btrfs only supports up to 2 copies of data, yet */
+#define BTRFS_MAX_COPIES   2
+
+/*
+ * Check all copies of range @start, @len.
+ * Caller must ensure the range is covered by EXTENT_ITEM/METADATA_ITEM
+ * specified by leaf of @path.
+ * And @start, @len must be a subset of the EXTENT_ITEM/METADATA_ITEM.
+ *
+ * Return 0 if the range is all OK or recovered or recoverable.
+ * Return <0 if the range can't be recoverable.
+ */
+static int scrub_one_extent(struct btrfs_fs_info *fs_info,
+   struct btrfs_scrub_progress *scrub_ctx,
+   struct btrfs_path *path, u64 start, u64 len,
+   int write)
+{
+   struct btrfs_key key;
+   struct btrfs_extent_item *ei;
+   struct extent_buffer *leaf = path->nodes[0];
+   u32 sectorsize = fs_info->tree_root->sectorsize;
+   unsigned long *corrupt_bitmaps[BTRFS_MAX_COPIES] = { NULL };
+   int slot = path->slots[0];
+   int num_copies;
+   int meta_corrupted = 0;
+   int meta_good_mirror = 0;
+   int data_bad_mirror = 0;
+   u64 extent_start;
+   u64 extent_len;
+   int metadata = 0;
+   int i;
+   int ret = 0;
+
+   btrfs_item_key_to_cpu(leaf, &key, slot);
+   if (key.type != BTRFS_METADATA_ITEM_KEY &&
+   key.type != BTRFS_EXTENT_ITEM_KEY)
+   goto invalid_arg;
+
+   extent_start = key.objectid;
+   if (key.type == BTRFS_METADATA_ITEM_KEY) {
+   extent_len = fs_info->tree_root->nodesize;
+   metadata = 1;
+   } else {
+   extent_len = key.offset;
+   ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
+   if (btrfs_extent_flags(leaf, ei) & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+   metadata = 1;
+   }
+   if (start >= extent_start + extent_len ||
+   start + len <= extent_start)
+   goto invalid_arg;
+
+   for (i = 0; i < BTRFS_MAX_COPIES; i++) {
+   corrupt_bitmaps[i] = malloc(
+   calculate_bitmap_len(len / sectorsize));
+   if (!corrupt_bitmaps[i])
+   goto out;
+   }
+   num_copies = btrfs_num_copies(&fs_info->mapping_tree, start, len);
+   for (i = 1; i <= num_copies; i++) {
+   if (metadata) {
+   ret = check_tree_mirror(fs_info, scrub_ctx,
+   NULL, extent_start, i);
+   scrub_ctx->tree_extents_scrubbed++;
+   if (ret < 0)
+   meta_corrupted++;
+   else
+   meta_good_mirror = i;
+   } else {
+   ret = check_data_mirror(fs_info, scrub_ctx, NULL, start,
+   len, i, corrupt_bitmaps[i - 1]);
+   scrub_ctx->data_extents_scrubbed++;
+   }
+   }
+
+   /* Metadata recover and report */
+   if (metadata) {
+   if (!meta_corrupted) {
+   goto out;
+   } else if (meta_corrupted && meta_corrupted < num_copies) {
+   if (write) {
+   ret = recover_tree_mirror(fs_info, scrub_ctx,
+   start, meta_good_mirror);
+   if (ret < 0) {
+   error("failed to recover tree block at 
bytenr %llu",
+   start);
+   goto out;
+   }
+   printf("extent %llu len %llu REPAIRED: has 
corrupted mirror, repaired\n",
+   start, len);
+   goto out;
+   }
+   printf("extent %llu len %llu 

[PATCH v4 00/20] Btrfs-progs offline scrub

2017-05-25 Thread Qu Wenruo
For anyone who wants to try it, it can be fetched from my repo:
https://github.com/adam900710/btrfs-progs/tree/offline_scrub

Several reports of kernel scrub screwing up good data stripes have been on
the ML for some time.

And since kernel scrub won't account for P/Q corruption, it is quite hard
for us to detect errors like the kernel screwing up P/Q when scrubbing.

To get a comparable tool for kernel scrub, we need a user-space tool to
act as benchmark to compare their different behaviors.

So here is the patchset for user-space scrub.

Which can do:
1) All mirror/backup check for non-parity based stripe
   Which means for RAID1/DUP/RAID10, we can really check all mirrors
   other than the 1st good mirror.

   Current "--check-data-csum" option should be finally replaced by
   offline scrub.
   As "--check-data-csum" doesn't really check all mirrors, if it hits
   a good copy, then the remaining copies will just be ignored.

   In v4 update, data check is further improved, inspired by kernel
   behavior, now data extent is checked sector by sector, so it can
   handle the following corruption case:

   Data extent A contains data from 0~28K.
   And |///| = corrupted  |   | = good
 0   4k  8k  12k 16k 20k 24k 28k
   Mirror 0  |///|   |///|   |///|   |   |
   Mirror 1  |   |///|   |///|   |///|   |

   Extent A should be RECOVERABLE, while in v3 we treat data extent A as
   a whole unit, above case is reported as CORRUPTED.

2) RAID5/6 full stripe check
   It will take full use of btrfs csum(both tree and data).
   It will only recover the full stripe if all recovered data matches
   with its csum.

   NOTE: Due to the lack of good bitmap facilities, RAID56 sector by
   sector repair will be quite complex, especially when NODATASUM is
   involved.

   So current RAID56 doesn't support vertical sector recovery yet.

   Data extent A contains data from 0~64K
   And |///| = corrupted while |   | = good
  0   8K  16K 24K 32K 40K 48K 56K 64K
   Data stripe 0  |///|   |///|   |///|   |///|   |
   Data stripe 1  |   |///|   |///|   |///|   |///|
   Parity |   |   |   |   |   |   |   |   |

   Kernel will recover it, while current scrub will report it as
   CORRUPTED.

3) Repair
   In this v4 update, repair is finally added.

And this patchset also introduces a new btrfs_map_block() function, which is
more flexible than the current btrfs_map_block(), and has a unified interface
for all profiles, not just an extra array for RAID56.

Check the 6th and 7th patch for details.

They are already used in RAID5/6 scrub, but can also be used for other
profiles too.

The to-do list has been shortened, since repair is added in v4 update.
1) Test cases
   Need to make the infrastructure able to handle multi-device first.

2) Make btrfsck able to handle RAID5 with missing device
   Now it doesn't even open RAID5 btrfs with missing device, even though
   scrub should be able to handle it.

3) RAID56 vertical sector repair
   Although I consider such case is minor compared to RAID1 vertical
   sector repair.
   As for RAID1, an extent can be as large as 128M, while for RAID56 one
   stripe will always be 64K, much smaller than RAID1 case, making the
   possibility lower.

   I prefer to add this function after the patchset gets merged, as no
   one really likes getting 20 mails every time I update the patchset.

For those who want to review the patchset, there is a slide deck showing
the basic function relationships.
I hope this will reduce the time needed to understand what the patchset is
doing.
https://docs.google.com/presentation/d/1tAU3lUVaRUXooSjhFaDUeyW3wauHDSg9H-AiLBOSuIM/edit?usp=sharing


Qu Wenruo (20):
  btrfs-progs: raid56: Introduce raid56 header for later recovery usage
  btrfs-progs: raid56: Introduce tables for RAID6 recovery
  btrfs-progs: raid56: Allow raid6 to recover 2 data stripes
  btrfs-progs: raid56: Allow raid6 to recover data and p
  btrfs-progs: Introduce wrapper to recover raid56 data
  btrfs-progs: Introduce new btrfs_map_block function which returns more
unified result.
  btrfs-progs: Allow __btrfs_map_block_v2 to remove unrelated stripes
  btrfs-progs: csum: Introduce function to read out data csums
  btrfs-progs: scrub: Introduce structures to support offline scrub for
RAID56
  btrfs-progs: scrub: Introduce functions to scrub mirror based tree
block
  btrfs-progs: scrub: Introduce functions to scrub mirror based data
blocks
  btrfs-progs: scrub: Introduce function to scrub one mirror-based
extent
  btrfs-progs: scrub: Introduce function to scrub one data stripe
  btrfs-progs: scrub: Introduce function to verify parities
  btrfs-progs: extent-tree: Introduce function to check if there is any
extent in given range.
  btrfs-progs: scrub: Introduce function to recover data parity
  btrfs-progs: scrub: Introduce helper to write a full stripe
  btrfs-progs: scrub: Introduce a function to scrub one full stripe
  btrfs-progs: scrub: Introduce function to check a whole block group
  btrfs-progs: scrub: Introduce 

[PATCH v4 17/20] btrfs-progs: scrub: Introduce helper to write a full stripe

2017-05-25 Thread Qu Wenruo
Introduce an internal helper, write_full_stripe(), to calculate P/Q and
write the whole full stripe.

This is useful to recover RAID56 stripes.

Signed-off-by: Qu Wenruo 
---
 scrub.c | 44 
 1 file changed, 44 insertions(+)

diff --git a/scrub.c b/scrub.c
index 3163cacb..94981f2b 100644
--- a/scrub.c
+++ b/scrub.c
@@ -876,3 +876,47 @@ static int recover_from_parities(struct btrfs_fs_info 
*fs_info,
free(ptrs);
return ret;
 }
+
+/*
+ * Helper to write a full stripe to disk
+ * P/Q will be re-calculated.
+ */
+static int write_full_stripe(struct scrub_full_stripe *fstripe)
+{
+   void **ptrs;
+   int nr_stripes = fstripe->nr_stripes;
+   int stripe_len = BTRFS_STRIPE_LEN;
+   int i;
+   int ret = 0;
+
+   ptrs = malloc(sizeof(void *) * fstripe->nr_stripes);
+   if (!ptrs)
+   return -ENOMEM;
+
+   for (i = 0; i < fstripe->nr_stripes; i++)
+   ptrs[i] = fstripe->stripes[i].data;
+
+   if (fstripe->bg_type & BTRFS_BLOCK_GROUP_RAID6) {
+   raid6_gen_syndrome(nr_stripes, stripe_len, ptrs);
+   } else {
+   ret = raid5_gen_result(nr_stripes, stripe_len, nr_stripes - 1,
+   ptrs);
+   if (ret < 0)
+   goto out;
+   }
+
+   for (i = 0; i < fstripe->nr_stripes; i++) {
+   struct scrub_stripe *stripe = &fstripe->stripes[i];
+
+   ret = pwrite(stripe->fd, stripe->data, fstripe->stripe_len,
+stripe->physical);
+   if (ret != fstripe->stripe_len) {
+   ret = -EIO;
+   goto out;
+   }
+   }
+out:
+   free(ptrs);
+   return ret;
+
+}
-- 
2.13.0



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v4 20/20] btrfs-progs: scrub: Introduce offline scrub function

2017-05-25 Thread Qu Wenruo
Now, btrfs-progs has a kernel scrub equivalent.
A new option, --offline is added to "btrfs scrub start".

If --offline is given, btrfs scrub will just act like kernel scrub, to
check every copy of extent and do a report on corrupted data and if it's
recoverable.

The advantages compared to kernel scrub are:
1) No race
   Unlike kernel scrub, which is done in parallel, offline scrub is done
   by a single thread.
   Although it may be slower than the kernel one, it's safer and raises no
   false alerts.

2) Correctness
   Kernel has a known bug (fix submitted) which will recover RAID5/6
   data but screw up P/Q, due to the coding difficulty in the kernel.
   While in btrfs-progs, with no pages and (almost) no memory size limit, we
   can focus on the scrub, and make things easier.

New offline scrub can detect and report P/Q corruption with
recoverability report, while kernel will only report data stripe error.

Signed-off-by: Qu Wenruo 
Signed-off-by: Su 
---
 Documentation/btrfs-scrub.asciidoc |   9 +++
 cmds-scrub.c   | 116 +++--
 ctree.h|   6 ++
 scrub.c|  71 +++
 utils.h|   6 ++
 5 files changed, 204 insertions(+), 4 deletions(-)

diff --git a/Documentation/btrfs-scrub.asciidoc 
b/Documentation/btrfs-scrub.asciidoc
index eb90a1c4..49527c2a 100644
--- a/Documentation/btrfs-scrub.asciidoc
+++ b/Documentation/btrfs-scrub.asciidoc
@@ -78,6 +78,15 @@ set IO priority classdata (see `ionice`(1) manpage)
 force starting new scrub even if a scrub is already running,
 this can useful when scrub status file is damaged and reports a running
 scrub although it is not, but should not normally be necessary
+--offline
+Do offline scrub.
+NOTE: it's experimental and repair is not supported yet.
+--progress
+Show progress status while doing offline scrub. (Default)
+NOTE: it's only useful with option --offline.
+--no-progress
+Don't show progress status while doing offline scrub.
+NOTE: it's only useful with option --offline.
 
 *status* [-d] |::
 Show status of a running scrub for the filesystem identified by 'path' or
diff --git a/cmds-scrub.c b/cmds-scrub.c
index 5388fdcf..063b4dfd 100644
--- a/cmds-scrub.c
+++ b/cmds-scrub.c
@@ -36,12 +36,14 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "ctree.h"
 #include "ioctl.h"
 #include "utils.h"
 #include "volumes.h"
 #include "disk-io.h"
+#include "task-utils.h"
 
 #include "commands.h"
 #include "help.h"
@@ -217,6 +219,32 @@ static void add_to_fs_stat(struct btrfs_scrub_progress *p,
_SCRUB_FS_STAT_MIN(ss, finished, fs_stat);
 }
 
+static void *print_offline_status(void *p)
+{
+   struct task_context *ctx = p;
+   const char work_indicator[] = {'.', 'o', 'O', 'o' };
+   uint32_t count = 0;
+
+   task_period_start(ctx->info, 1000 /* 1s */);
+
+   while (1) {
+   printf("Doing offline scrub [%c] [%llu/%llu]\r",
+  work_indicator[count % 4], ctx->cur, ctx->all);
+   count++;
+   fflush(stdout);
+   task_period_wait(ctx->info);
+   }
+   return NULL;
+}
+
+static int print_offline_return(void *p)
+{
+   printf("\n");
+   fflush(stdout);
+
+   return 0;
+}
+
 static void init_fs_stat(struct scrub_fs_stat *fs_stat)
 {
memset(fs_stat, 0, sizeof(*fs_stat));
@@ -1100,7 +1128,7 @@ static const char * const cmd_scrub_resume_usage[];
 
 static int scrub_start(int argc, char **argv, int resume)
 {
-   int fdmnt;
+   int fdmnt = -1;
int prg_fd = -1;
int fdres = -1;
int ret;
@@ -1124,10 +1152,14 @@ static int scrub_start(int argc, char **argv, int 
resume)
int n_start = 0;
int n_skip = 0;
int n_resume = 0;
+   int offline = 0;
+   int progress_set = -1;
struct btrfs_ioctl_fs_info_args fi_args;
struct btrfs_ioctl_dev_info_args *di_args = NULL;
struct scrub_progress *sp = NULL;
struct scrub_fs_stat fs_stat;
+   struct task_context task = {0};
+   struct btrfs_fs_info *fs_info = NULL;
struct timeval tv;
struct sockaddr_un addr = {
.sun_family = AF_UNIX,
@@ -1147,7 +1179,18 @@ static int scrub_start(int argc, char **argv, int resume)
int force = 0;
int nothing_to_resume = 0;
 
-   while ((c = getopt(argc, argv, "BdqrRc:n:f")) != -1) {
+   enum { GETOPT_VAL_OFFLINE = 257,
+  GETOPT_VAL_PROGRESS,
+  GETOPT_VAL_NO_PROGRESS};
+   static const struct option long_options[] = {
+   { "offline", no_argument, NULL, GETOPT_VAL_OFFLINE},
+   { "progress", no_argument, NULL, GETOPT_VAL_PROGRESS},
+   { "no-progress", no_argument, NULL, GETOPT_VAL_NO_PROGRESS},
+   { NULL, 0, NULL, 0}
+   };
+
+   while ((c = getopt_long(argc, argv, 

[PATCH v4 05/20] btrfs-progs: Introduce wrapper to recover raid56 data

2017-05-25 Thread Qu Wenruo
Introduce a wrapper to recover raid56 data.

The logic is the same as the kernel one, but with different interfaces,
since the kernel one cares about performance while in btrfs-progs we don't
care that much.

And the interface is more caller friendly inside btrfs-progs.

Signed-off-by: Qu Wenruo 
---
 kernel-lib/raid56.c | 77 +
 kernel-lib/raid56.h | 11 
 2 files changed, 88 insertions(+)

diff --git a/kernel-lib/raid56.c b/kernel-lib/raid56.c
index e078972b..e3a9339e 100644
--- a/kernel-lib/raid56.c
+++ b/kernel-lib/raid56.c
@@ -280,3 +280,80 @@ int raid6_recov_datap(int nr_devs, size_t stripe_len, int 
dest1, void **data)
}
return 0;
 }
+
+/* Original raid56 recovery wrapper */
+int raid56_recov(int nr_devs, size_t stripe_len, u64 profile, int dest1,
+int dest2, void **data)
+{
+   int min_devs;
+   int ret;
+
+   if (profile & BTRFS_BLOCK_GROUP_RAID5)
+   min_devs = 2;
+   else if (profile & BTRFS_BLOCK_GROUP_RAID6)
+   min_devs = 3;
+   else
+   return -EINVAL;
+   if (nr_devs < min_devs)
+   return -EINVAL;
+
+   /* Nothing to recover */
+   if (dest1 == -1 && dest2 == -1)
+   return 0;
+
+   /* Reorder dest1/2, so only dest2 can be -1  */
+   if (dest1 == -1) {
+   dest1 = dest2;
+   dest2 = -1;
+   } else if (dest2 != -1 && dest1 != -1) {
+   /* Reorder dest1/2, ensure dest2 > dest1 */
+   if (dest1 > dest2) {
+   int tmp;
+
+   tmp = dest2;
+   dest2 = dest1;
+   dest1 = tmp;
+   }
+   }
+
+   if (profile & BTRFS_BLOCK_GROUP_RAID5) {
+   if (dest2 != -1)
+   return 1;
+   return raid5_gen_result(nr_devs, stripe_len, dest1, data);
+   }
+
+   /* RAID6 one dev corrupted case*/
+   if (dest2 == -1) {
+   /* Regenerate P/Q */
+   if (dest1 == nr_devs - 1 || dest1 == nr_devs - 2) {
+   raid6_gen_syndrome(nr_devs, stripe_len, data);
+   return 0;
+   }
+
+   /* Regenerate data from P */
+   return raid5_gen_result(nr_devs - 1, stripe_len, dest1, data);
+   }
+
+   /* P/Q both corrupted */
+   if (dest1 == nr_devs - 2 && dest2 == nr_devs - 1) {
+   raid6_gen_syndrome(nr_devs, stripe_len, data);
+   return 0;
+   }
+
+   /* 2 Data corrupted */
+   if (dest2 < nr_devs - 2)
+   return raid6_recov_data2(nr_devs, stripe_len, dest1, dest2,
+data);
+   /* Data and P*/
+   if (dest2 == nr_devs - 1)
+   return raid6_recov_datap(nr_devs, stripe_len, dest1, data);
+
+   /*
+* Final case, Data and Q, recover data first then regenerate Q
+*/
+   ret = raid5_gen_result(nr_devs - 1, stripe_len, dest1, data);
+   if (ret < 0)
+   return ret;
+   raid6_gen_syndrome(nr_devs, stripe_len, data);
+   return 0;
+}
diff --git a/kernel-lib/raid56.h b/kernel-lib/raid56.h
index 83ac39a1..e06c3ffb 100644
--- a/kernel-lib/raid56.h
+++ b/kernel-lib/raid56.h
@@ -44,4 +44,15 @@ int raid6_recov_data2(int nr_devs, size_t stripe_len, int 
dest1, int dest2,
  void **data);
 /* Recover data and P */
 int raid6_recov_datap(int nr_devs, size_t stripe_len, int dest1, void **data);
+
+/*
+ * Recover raid56 data
+ * @dest1/2 can be -1 to indicate correct data
+ *
+ * Return >0 for unrecoverable case.
+ * Return 0 for recoverable case, And recovered data will be stored into @data
+ * Return <0 for fatal error
+ */
+int raid56_recov(int nr_devs, size_t stripe_len, u64 profile, int dest1,
+int dest2, void **data);
 #endif
-- 
2.13.0



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v4 16/20] btrfs-progs: scrub: Introduce function to recover data parity

2017-05-25 Thread Qu Wenruo
Introduce function, recover_from_parities(), to recover data stripes.

It just wraps raid56_recov() with extra checks on the
scrub_full_stripe structure.

Signed-off-by: Qu Wenruo 
---
 scrub.c | 51 +++
 1 file changed, 51 insertions(+)

diff --git a/scrub.c b/scrub.c
index ef391ad1..3163cacb 100644
--- a/scrub.c
+++ b/scrub.c
@@ -825,3 +825,54 @@ out:
free(ptrs);
return ret;
 }
+
+/*
+ * Try to recover data stripe from P or Q stripe
+ *
+ * Return >0 if it can't be repaired any more.
+ * Return 0 for successful repair or no need to repair at all
+ * Return <0 for fatal error
+ */
+static int recover_from_parities(struct btrfs_fs_info *fs_info,
+ struct btrfs_scrub_progress *scrub_ctx,
+ struct scrub_full_stripe *fstripe)
+{
+   void **ptrs;
+   int nr_stripes = fstripe->nr_stripes;
+   int stripe_len = BTRFS_STRIPE_LEN;
+   int max_tolerance;
+   int i;
+   int ret;
+
+   /* No need to recover */
+   if (!fstripe->nr_corrupted_stripes)
+   return 0;
+
+   /* Already recovered once, no more chance */
+   if (fstripe->recovered)
+   return 1;
+
+   if (fstripe->bg_type & BTRFS_BLOCK_GROUP_RAID5)
+   max_tolerance = 1;
+   else
+   max_tolerance = 2;
+
+   /* Out of repair */
+   if (fstripe->nr_corrupted_stripes > max_tolerance)
+   return 1;
+
+   ptrs = malloc(sizeof(void *) * fstripe->nr_stripes);
+   if (!ptrs)
+   return -ENOMEM;
+
+   /* Construct ptrs */
+   for (i = 0; i < nr_stripes; i++)
+   ptrs[i] = fstripe->stripes[i].data;
+
+   ret = raid56_recov(nr_stripes, stripe_len, fstripe->bg_type,
+   fstripe->corrupted_index[0],
+   fstripe->corrupted_index[1], ptrs);
+   fstripe->recovered = 1;
+   free(ptrs);
+   return ret;
+}
-- 
2.13.0



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v4 15/20] btrfs-progs: extent-tree: Introduce function to check if there is any extent in given range.

2017-05-25 Thread Qu Wenruo
Introduce a new function, btrfs_check_extent_exists(), to check if there
is any extent in the range specified by user.

The parameter can be a large range, and if any extent exists in the
range, it will return >0 (in fact it will return 1).
Or return 0 if no extent is found.

Signed-off-by: Qu Wenruo 
---
 ctree.h   |  2 ++
 extent-tree.c | 60 +++
 2 files changed, 62 insertions(+)

diff --git a/ctree.h b/ctree.h
index 9c999b1f..d3ddf752 100644
--- a/ctree.h
+++ b/ctree.h
@@ -2541,6 +2541,8 @@ int exclude_super_stripes(struct btrfs_root *root,
 u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
   struct btrfs_fs_info *info, u64 start, u64 end);
 u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset);
+int btrfs_check_extent_exists(struct btrfs_fs_info *fs_info, u64 start,
+ u64 len);
 
 /* ctree.c */
 int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2);
diff --git a/extent-tree.c b/extent-tree.c
index b12ee290..17c2c10e 100644
--- a/extent-tree.c
+++ b/extent-tree.c
@@ -4258,3 +4258,63 @@ u64 add_new_free_space(struct btrfs_block_group_cache 
*block_group,
 
return total_added;
 }
+
+/*
+ * Check if there is any extent(both data and metadata) in the range
+ * [@start, @start + @len)
+ *
+ * Return 0 for no extent found.
+ * Return >0 for found extent.
+ * Return <0 for fatal error.
+ */
+int btrfs_check_extent_exists(struct btrfs_fs_info *fs_info, u64 start,
+ u64 len)
+{
+   struct btrfs_path *path;
+   struct btrfs_key key;
+   u64 extent_start;
+   u64 extent_len;
+   int ret;
+
+   path = btrfs_alloc_path();
+   if (!path)
+   return -ENOMEM;
+
+   key.objectid = start + len;
+   key.type = 0;
+   key.offset = 0;
+
+   ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
+   if (ret < 0)
+   goto out;
+   /*
+* Now we're pointing at slot whose key.object >= end, skip to previous
+* extent.
+*/
+   ret = btrfs_previous_extent_item(fs_info->extent_root, path, 0);
+   if (ret < 0)
+   goto out;
+   if (ret > 0) {
+   ret = 0;
+   goto out;
+   }
+   btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+   extent_start = key.objectid;
+   if (key.type == BTRFS_METADATA_ITEM_KEY)
+   extent_len = fs_info->extent_root->nodesize;
+   else
+   extent_len = key.offset;
+
+   /*
+* search_slot() and previous_extent_item() has ensured that our
+* extent_start < start + len, we only need to care extent end.
+*/
+   if (extent_start + extent_len <= start)
+   ret = 0;
+   else
+   ret = 1;
+
+out:
+   btrfs_free_path(path);
+   return ret;
+}
-- 
2.13.0



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v4 14/20] btrfs-progs: scrub: Introduce function to verify parities

2017-05-25 Thread Qu Wenruo
Introduce a new function, verify_parities(), to check whether the parities
match for a full stripe whose data stripes all match their csums.

Caller should fill the scrub_full_stripe structure properly before
calling this function.

Signed-off-by: Qu Wenruo 
---
 scrub.c | 69 +
 1 file changed, 69 insertions(+)

diff --git a/scrub.c b/scrub.c
index 1944f216..ef391ad1 100644
--- a/scrub.c
+++ b/scrub.c
@@ -26,6 +26,7 @@
 #include "disk-io.h"
 #include "utils.h"
 #include "kernel-lib/bitops.h"
+#include "kernel-lib/raid56.h"
 
 /*
  * For parity based profile(RAID56)
@@ -756,3 +757,71 @@ out:
btrfs_free_path(path);
return ret;
 }
+
+/*
+ * Verify parities for RAID56
+ * Caller must fill @fstripe before calling this function
+ *
+ * Return 0 for parities matches.
+ * Return >0 for P or Q mismatch
+ * Return <0 for fatal error
+ */
+static int verify_parities(struct btrfs_fs_info *fs_info,
+  struct btrfs_scrub_progress *scrub_ctx,
+  struct scrub_full_stripe *fstripe)
+{
+   void **ptrs;
+   void *ondisk_p = NULL;
+   void *ondisk_q = NULL;
+   void *buf_p;
+   void *buf_q;
+   int nr_stripes = fstripe->nr_stripes;
+   int stripe_len = BTRFS_STRIPE_LEN;
+   int i;
+   int ret = 0;
+
+   ptrs = malloc(sizeof(void *) * fstripe->nr_stripes);
+   buf_p = malloc(fstripe->stripe_len);
+   buf_q = malloc(fstripe->stripe_len);
+   if (!ptrs || !buf_p || !buf_q) {
+   ret = -ENOMEM;
+   goto out;
+   }
+
+   for (i = 0; i < fstripe->nr_stripes; i++) {
+   struct scrub_stripe *stripe = &fstripe->stripes[i];
+
+   if (stripe->logical == BTRFS_RAID5_P_STRIPE) {
+   ondisk_p = stripe->data;
+   ptrs[i] = buf_p;
+   continue;
+   } else if (stripe->logical == BTRFS_RAID6_Q_STRIPE) {
+   ondisk_q = stripe->data;
+   ptrs[i] = buf_q;
+   continue;
+   } else {
+   ptrs[i] = stripe->data;
+   continue;
+   }
+   }
+   /* RAID6 */
+   if (ondisk_q) {
+   raid6_gen_syndrome(nr_stripes, stripe_len, ptrs);
+
+   if (memcmp(ondisk_q, ptrs[nr_stripes - 1], stripe_len) != 0 ||
+   memcmp(ondisk_p, ptrs[nr_stripes - 2], stripe_len))
+   ret = 1;
+   } else {
+   ret = raid5_gen_result(nr_stripes, stripe_len, nr_stripes - 1,
+   ptrs);
+   if (ret < 0)
+   goto out;
+   if (memcmp(ondisk_p, ptrs[nr_stripes - 1], stripe_len) != 0)
+   ret = 1;
+   }
+out:
+   free(buf_p);
+   free(buf_q);
+   free(ptrs);
+   return ret;
+}
-- 
2.13.0



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v4 04/20] btrfs-progs: raid56: Allow raid6 to recover data and p

2017-05-25 Thread Qu Wenruo
Copied from kernel lib/raid6/recov.c.

Minor modifications include:
- Rename from raid6_datap_recov_intx1() to raid6_recov_datap()
- Rename parameter from faila to dest1

Signed-off-by: Qu Wenruo 
---
 kernel-lib/raid56.c | 41 +
 kernel-lib/raid56.h |  2 ++
 2 files changed, 43 insertions(+)

diff --git a/kernel-lib/raid56.c b/kernel-lib/raid56.c
index dca8f8d4..e078972b 100644
--- a/kernel-lib/raid56.c
+++ b/kernel-lib/raid56.c
@@ -239,3 +239,44 @@ int raid6_recov_data2(int nr_devs, size_t stripe_len, int 
dest1, int dest2,
free(zero_mem2);
return ret;
 }
+
+/*
+ * Raid 6 recover code copied from kernel lib/raid6/recov.c
+ * - rename from raid6_datap_recov_intx1()
+ * - parameter changed from faila to dest1
+ */
+int raid6_recov_datap(int nr_devs, size_t stripe_len, int dest1, void **data)
+{
+   u8 *p, *q, *dq;
+   const u8 *qmul; /* Q multiplier table */
+   char *zero_mem;
+
+   p = (u8 *)data[nr_devs - 2];
+   q = (u8 *)data[nr_devs - 1];
+
+   zero_mem = calloc(1, stripe_len);
+   if (!zero_mem)
+   return -ENOMEM;
+
+   /* Compute syndrome with zero for the missing data page
+  Use the dead data page as temporary storage for delta q */
+   dq = (u8 *)data[dest1];
+   data[dest1] = (void *)zero_mem;
+   data[nr_devs - 1] = dq;
+
+   raid6_gen_syndrome(nr_devs, stripe_len, data);
+
+   /* Restore pointer table */
+   data[dest1]   = dq;
+   data[nr_devs - 1] = q;
+
+   /* Now, pick the proper data tables */
+   qmul  = raid6_gfmul[raid6_gfinv[raid6_gfexp[dest1]]];
+
+   /* Now do it... */
+   while ( stripe_len-- ) {
+   *p++ ^= *dq = qmul[*q ^ *dq];
+   q++; dq++;
+   }
+   return 0;
+}
diff --git a/kernel-lib/raid56.h b/kernel-lib/raid56.h
index 8d64256f..83ac39a1 100644
--- a/kernel-lib/raid56.h
+++ b/kernel-lib/raid56.h
@@ -42,4 +42,6 @@ extern const u8 raid6_gfexi[256]  
__attribute__((aligned(256)));
 /* Recover raid6 with 2 data corrupted */
 int raid6_recov_data2(int nr_devs, size_t stripe_len, int dest1, int dest2,
  void **data);
+/* Recover data and P */
+int raid6_recov_datap(int nr_devs, size_t stripe_len, int dest1, void **data);
 #endif
-- 
2.13.0



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v4 02/20] btrfs-progs: raid56: Introduce tables for RAID6 recovery

2017-05-25 Thread Qu Wenruo
Use kernel RAID6 galois tables for later RAID6 recovery.

The Galois tables file, kernel-lib/tables.c, is generated by a user space
program, mktables.

Galois field tables declaration, in kernel-lib/raid56.h, is completely
copied from kernel.

The mktables.c is copied from kernel with minor header/macro
modification, to ensure the generated tables.c works well in
btrfs-progs.

Signed-off-by: Qu Wenruo 
---
 .gitignore|   2 +
 Makefile  |  13 -
 kernel-lib/mktables.c | 148 ++
 kernel-lib/raid56.h   |  12 
 4 files changed, 173 insertions(+), 2 deletions(-)
 create mode 100644 kernel-lib/mktables.c

diff --git a/.gitignore b/.gitignore
index 43c0ed88..7d7a5482 100644
--- a/.gitignore
+++ b/.gitignore
@@ -35,6 +35,8 @@ btrfs-select-super
 btrfs-calc-size
 btrfs-crc
 btrfstune
+mktables
+kernel-lib/tables.c
 libbtrfs.a
 libbtrfs.so
 libbtrfs.so.0
diff --git a/Makefile b/Makefile
index 92063a90..ba73357b 100644
--- a/Makefile
+++ b/Makefile
@@ -95,7 +95,7 @@ objects = ctree.o disk-io.o kernel-lib/radix-tree.o 
extent-tree.o print-tree.o \
  qgroup.o raid56.o free-space-cache.o kernel-lib/list_sort.o props.o \
  kernel-shared/ulist.o qgroup-verify.o backref.o string-table.o 
task-utils.o \
  inode.o file.o find-root.o free-space-tree.o help.o send-dump.o \
- fsfeatures.o
+ fsfeatures.o kernel-lib/tables.o
 cmds_objects = cmds-subvolume.o cmds-filesystem.o cmds-device.o cmds-scrub.o \
   cmds-inspect.o cmds-balance.o cmds-send.o cmds-receive.o \
   cmds-quota.o cmds-qgroup.o cmds-replace.o cmds-check.o \
@@ -314,6 +314,14 @@ version.h: version.sh version.h.in configure.ac
@echo "[SH] $@"
$(Q)bash ./config.status --silent $@
 
+mktables: kernel-lib/mktables.c
+   @echo "[CC] $@"
+   $(Q)$(CC) $(CFLAGS) $< -o $@
+
+kernel-lib/tables.c: mktables
+   @echo "[TABLE]  $@"
+   $(Q)./mktables > $@ || ($(RM) -f $@ && exit 1)
+
 libbtrfs: $(libs_shared) $(lib_links)
 
 $(libs_shared): $(libbtrfs_objects) $(lib_links) send.h
@@ -503,11 +511,12 @@ clean: $(CLEANDIRS)
$(Q)$(RM) -f -- $(progs) *.o *.o.d \
kernel-lib/*.o kernel-lib/*.o.d \
kernel-shared/*.o kernel-shared/*.o.d \
+   kernel-lib/tables.c\
image/*.o image/*.o.d \
convert/*.o convert/*.o.d \
mkfs/*.o mkfs/*.o.d \
  dir-test ioctl-test quick-test library-test library-test-static \
- btrfs.static mkfs.btrfs.static fssum \
+  mktables btrfs.static mkfs.btrfs.static fssum \
  $(check_defs) \
  $(libs) $(lib_links) \
  $(progs_static) $(progs_extra)
diff --git a/kernel-lib/mktables.c b/kernel-lib/mktables.c
new file mode 100644
index ..85f621fe
--- /dev/null
+++ b/kernel-lib/mktables.c
@@ -0,0 +1,148 @@
+/* -*- linux-c -*- --- *
+ *
+ *   Copyright 2002-2007 H. Peter Anvin - All Rights Reserved
+ *
+ *   This file is part of the Linux kernel, and is made available under
+ *   the terms of the GNU General Public License version 2 or (at your
+ *   option) any later version; incorporated herein by reference.
+ *
+ * --- */
+
+/*
+ * mktables.c
+ *
+ * Make RAID-6 tables.  This is a host user space program to be run at
+ * compile time.
+ */
+
+/*
+ * Btrfs-progs port, with following minor fixes:
+ * 1) Use "kerncompat.h"
+ * 2) Get rid of __KERNEL__ related macros
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+static uint8_t gfmul(uint8_t a, uint8_t b)
+{
+   uint8_t v = 0;
+
+   while (b) {
+   if (b & 1)
+   v ^= a;
+   a = (a << 1) ^ (a & 0x80 ? 0x1d : 0);
+   b >>= 1;
+   }
+
+   return v;
+}
+
+static uint8_t gfpow(uint8_t a, int b)
+{
+   uint8_t v = 1;
+
+   b %= 255;
+   if (b < 0)
+   b += 255;
+
+   while (b) {
+   if (b & 1)
+   v = gfmul(v, a);
+   a = gfmul(a, a);
+   b >>= 1;
+   }
+
+   return v;
+}
+
+int main(int argc, char *argv[])
+{
+   int i, j, k;
+   uint8_t v;
+   uint8_t exptbl[256], invtbl[256];
+
+   printf("#include \"kerncompat.h\"\n");
+
+   /* Compute multiplication table */
+   printf("\nconst u8  __attribute__((aligned(256)))\n"
+   "raid6_gfmul[256][256] =\n"
+   "{\n");
+   for (i = 0; i < 256; i++) {
+   printf("\t{\n");
+   for (j = 0; j < 256; j += 8) {
+   printf("\t\t");
+   for (k = 0; k < 8; k++)
+   printf("0x%02x,%c", gfmul(i, j + k),
+  (k == 7) ? '\n' : ' ');
+

[PATCH v4 03/20] btrfs-progs: raid56: Allow raid6 to recover 2 data stripes

2017-05-25 Thread Qu Wenruo
Copied from kernel lib/raid6/recov.c raid6_2data_recov_intx1() function.
With the following modification:
- Rename to raid6_recov_data2() for shorter name
- s/kfree/free/g modification

Signed-off-by: Qu Wenruo 
---
 Makefile|  4 +--
 raid56.c => kernel-lib/raid56.c | 69 +
 kernel-lib/raid56.h |  5 +++
 3 files changed, 76 insertions(+), 2 deletions(-)
 rename raid56.c => kernel-lib/raid56.c (71%)

diff --git a/Makefile b/Makefile
index ba73357b..df584672 100644
--- a/Makefile
+++ b/Makefile
@@ -92,10 +92,10 @@ CHECKER_FLAGS := -include $(check_defs) -D__CHECKER__ \
 objects = ctree.o disk-io.o kernel-lib/radix-tree.o extent-tree.o print-tree.o 
\
  root-tree.o dir-item.o file-item.o inode-item.o inode-map.o \
  extent-cache.o extent_io.o volumes.o utils.o repair.o \
- qgroup.o raid56.o free-space-cache.o kernel-lib/list_sort.o props.o \
+ qgroup.o free-space-cache.o kernel-lib/list_sort.o props.o \
  kernel-shared/ulist.o qgroup-verify.o backref.o string-table.o 
task-utils.o \
  inode.o file.o find-root.o free-space-tree.o help.o send-dump.o \
- fsfeatures.o kernel-lib/tables.o
+ fsfeatures.o kernel-lib/tables.o kernel-lib/raid56.o
 cmds_objects = cmds-subvolume.o cmds-filesystem.o cmds-device.o cmds-scrub.o \
   cmds-inspect.o cmds-balance.o cmds-send.o cmds-receive.o \
   cmds-quota.o cmds-qgroup.o cmds-replace.o cmds-check.o \
diff --git a/raid56.c b/kernel-lib/raid56.c
similarity index 71%
rename from raid56.c
rename to kernel-lib/raid56.c
index 8c79c456..dca8f8d4 100644
--- a/raid56.c
+++ b/kernel-lib/raid56.c
@@ -28,6 +28,7 @@
 #include "disk-io.h"
 #include "volumes.h"
 #include "utils.h"
+#include "kernel-lib/raid56.h"
 
 /*
  * This is the C data type to use
@@ -170,3 +171,71 @@ int raid5_gen_result(int nr_devs, size_t stripe_len, int 
dest, void **data)
}
return 0;
 }
+
+/*
+ * Raid 6 recovery code copied from kernel lib/raid6/recov.c.
+ * With modifications:
+ * - rename from raid6_2data_recov_intx1
+ * - kfree/free modification for btrfs-progs
+ *
+ * Rebuild the contents of two data stripes in place, using the P and Q
+ * stripes stored at the end of @data.
+ *
+ * @nr_devs:    Number of entries in @data; data[nr_devs - 2] is used as P
+ *              and data[nr_devs - 1] as Q.
+ * @stripe_len: Number of bytes processed in each stripe buffer.
+ * @dest1:      Index of the first data stripe to rebuild.
+ * @dest2:      Index of the second data stripe to rebuild, must be > @dest1.
+ * @data:       Array of stripe buffers.  The buffers at data[dest1] and
+ *              data[dest2] are overwritten with the reconstructed bytes; the
+ *              pointer table itself is restored before returning success.
+ *
+ * Returns 0 on success, -EINVAL when @dest1/@dest2 are not two distinct,
+ * ascending indexes below nr_devs - 2, and -ENOMEM when the temporary zero
+ * buffers cannot be allocated.
+ */
+int raid6_recov_data2(int nr_devs, size_t stripe_len, int dest1, int dest2,
+ void **data)
+{
+   u8 *p, *q, *dp, *dq;
+   u8 px, qx, db;
+   const u8 *pbmul;/* P multiplier table for B data */
+   const u8 *qmul; /* Q multiplier table (for both) */
+   char *zero_mem1, *zero_mem2;
+   int ret = 0;
+
+   /* Early check */
+   if (dest1 < 0 || dest1 >= nr_devs - 2 ||
+   dest2 < 0 || dest2 >= nr_devs - 2 || dest1 >= dest2)
+   return -EINVAL;
+
+   /* Zero-filled stand-ins for the two missing data stripes */
+   zero_mem1 = calloc(1, stripe_len);
+   zero_mem2 = calloc(1, stripe_len);
+   if (!zero_mem1 || !zero_mem2) {
+   /* free(NULL) is a no-op, so both can be released unconditionally */
+   free(zero_mem1);
+   free(zero_mem2);
+   return -ENOMEM;
+   }
+
+   /* Remember the real P and Q buffers before the table is rearranged */
+   p = (u8 *)data[nr_devs - 2];
+   q = (u8 *)data[nr_devs - 1];
+
+   /* Compute syndrome with zero for the missing data pages
+  Use the dead data pages as temporary storage for
+  delta p and delta q */
+   dp = (u8 *)data[dest1];
+   data[dest1] = (void *)zero_mem1;
+   data[nr_devs - 2] = dp;
+   dq = (u8 *)data[dest2];
+   data[dest2] = (void *)zero_mem2;
+   data[nr_devs - 1] = dq;
+
+   raid6_gen_syndrome(nr_devs, stripe_len, data);
+
+   /* Restore pointer table */
+   data[dest1]   = dp;
+   data[dest2]   = dq;
+   data[nr_devs - 2] = p;
+   data[nr_devs - 1] = q;
+
+   /* Now, pick the proper data tables */
+   pbmul = raid6_gfmul[raid6_gfexi[dest2 - dest1]];
+   qmul  = raid6_gfmul[raid6_gfinv[raid6_gfexp[dest1]^raid6_gfexp[dest2]]];
+
+   /* Now do it... */
+   /* One byte per iteration; dp/dq are read as deltas, then overwritten */
+   while ( stripe_len-- ) {
+   px= *p ^ *dp;
+   qx= qmul[*q ^ *dq];
+   *dq++ = db = pbmul[px] ^ qx; /* Reconstructed B */
+   *dp++ = db ^ px; /* Reconstructed A */
+   p++; q++;
+   }
+
+   free(zero_mem1);
+   free(zero_mem2);
+   return ret;
+}
diff --git a/kernel-lib/raid56.h b/kernel-lib/raid56.h
index 030b0afb..8d64256f 100644
--- a/kernel-lib/raid56.h
+++ b/kernel-lib/raid56.h
@@ -37,4 +37,9 @@ extern const u8 raid6_vgfmul[256][32] 
__attribute__((aligned(256)));
 extern const u8 raid6_gfexp[256]  __attribute__((aligned(256)));
 extern const u8 raid6_gfinv[256]  __attribute__((aligned(256)));
 extern const u8 raid6_gfexi[256]  __attribute__((aligned(256)));
+
+
+/* Recover raid6 with 2 data corrupted */
+int raid6_recov_data2(int nr_devs, size_t stripe_len, int dest1, int dest2,
+ void **data);
 #endif
-- 
2.13.0



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  

[PATCH v4 08/20] btrfs-progs: csum: Introduce function to read out data csums

2017-05-25 Thread Qu Wenruo
Introduce a new function: btrfs_read_data_csums(), to read out csums
for sectors in range.

This is quite useful for reading out data csums, so callers don't need to
open-code it.

Signed-off-by: Qu Wenruo 
Signed-off-by: Su Yue 
---
 Makefile |   2 +-
 csum.c   | 136 +++
 ctree.h  |   4 ++
 kerncompat.h |   3 ++
 utils.h  |   6 +++
 5 files changed, 150 insertions(+), 1 deletion(-)
 create mode 100644 csum.c

diff --git a/Makefile b/Makefile
index df584672..e6d7c187 100644
--- a/Makefile
+++ b/Makefile
@@ -95,7 +95,7 @@ objects = ctree.o disk-io.o kernel-lib/radix-tree.o 
extent-tree.o print-tree.o \
  qgroup.o free-space-cache.o kernel-lib/list_sort.o props.o \
  kernel-shared/ulist.o qgroup-verify.o backref.o string-table.o 
task-utils.o \
  inode.o file.o find-root.o free-space-tree.o help.o send-dump.o \
- fsfeatures.o kernel-lib/tables.o kernel-lib/raid56.o
+ fsfeatures.o kernel-lib/tables.o kernel-lib/raid56.o csum.o
 cmds_objects = cmds-subvolume.o cmds-filesystem.o cmds-device.o cmds-scrub.o \
   cmds-inspect.o cmds-balance.o cmds-send.o cmds-receive.o \
   cmds-quota.o cmds-qgroup.o cmds-replace.o cmds-check.o \
diff --git a/csum.c b/csum.c
new file mode 100644
index ..513a6fbd
--- /dev/null
+++ b/csum.c
@@ -0,0 +1,136 @@
+/*
+ * Copyright (C) 2017 Fujitsu.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include "kerncompat.h"
+#include "kernel-lib/bitops.h"
+#include "ctree.h"
+#include "utils.h"
+
+/*
+ * TODO:
+ * 1) Add write support for csum
+ *So we can write new data extents and add csum into csum tree
+ *
+ * Get csums of range[@start, @start + len).
+ *
+ * @start:Start offset, shall be aligned to sectorsize.
+ * @len:  Length, shall be aligned to sectorsize.
+ * @csum_ret: The size of csum_ret shall be @len / sectorsize * csum_size.
+ * @bitmap_ret: Each bit records whether the corresponding sector has a csum.
+ *The size in bytes of bitmap_ret should be
+ *calculate_bitmap_len(csum_ret's size / csum_size).
+ *
+ * Returns 0  on success
+ * Returns >0 on error
+ * Returns <0 on fatal error
+ */
+
+int btrfs_read_data_csums(struct btrfs_fs_info *fs_info, u64 start, u64 len,
+ void *csum_ret, unsigned long *bitmap_ret)
+
+{
+   struct btrfs_path path;
+   struct btrfs_key key;
+   struct btrfs_root *csum_root = fs_info->csum_root;
+   u32 item_offset;
+   u32 item_size;
+   u32 final_offset;
+   u32 final_len;
+   u32 sectorsize = fs_info->tree_root->sectorsize;
+   u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
+   u64 cur_start;
+   u64 cur_end;
+   int found = 0;
+   int ret;
+
+   ASSERT(IS_ALIGNED(start, sectorsize));
+   ASSERT(IS_ALIGNED(len, sectorsize));
+   ASSERT(csum_ret);
+   ASSERT(bitmap_ret);
+
+   memset(bitmap_ret, 0, calculate_bitmap_len(len / sectorsize));
+   btrfs_init_path();
+
+   key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+   key.type = BTRFS_EXTENT_CSUM_KEY;
+   key.offset = start;
+
+   ret = btrfs_search_slot(NULL, csum_root, , , 0, 0);
+   if (ret < 0)
+   goto out;
+   if (ret > 0) {
+   ret = btrfs_previous_item(csum_root, ,
+ BTRFS_EXTENT_CSUM_OBJECTID,
+ BTRFS_EXTENT_CSUM_KEY);
+   if (ret < 0)
+   goto out;
+   }
+   /* The csum tree may be empty. */
+   if (!btrfs_header_nritems(path.nodes[0]))
+   goto next;
+
+   while (1) {
+   btrfs_item_key_to_cpu(path.nodes[0], , path.slots[0]);
+
+   if (!IS_ALIGNED(key.offset, sectorsize)) {
+   error("csum item bytenr %llu is not aligned to %u",
+ key.offset, sectorsize);
+   ret = -EIO;
+   break;
+   }
+   /* exceeds end */
+   if (key.offset >= start + len)
+   break;
+
+   item_offset = btrfs_item_ptr_offset(path.nodes[0],
+   

[PATCH v4 06/20] btrfs-progs: Introduce new btrfs_map_block function which returns more unified result.

2017-05-25 Thread Qu Wenruo
Introduce a new function, __btrfs_map_block_v2().

Unlike the old btrfs_map_block(), which needs different parameters to handle
different RAID profiles, this new function uses a unified btrfs_map_block
structure to handle all RAID profiles in a more meaningful way:

Return physical address along with logical address for each stripe.

For RAID1/Single/DUP (non-striped):
result would be like:
Map block: Logical 128M, Len 10M, Type RAID1, Stripe len 0, Nr_stripes 2
Stripe 0: Logical 128M, Physical X, Len: 10M Dev dev1
Stripe 1: Logical 128M, Physical Y, Len: 10M Dev dev2

Result will be as long as possible, since it's not striped at all.

For RAID0/10 (striped without parity):
Result will be aligned to full stripe size:
Map block: Logical 64K, Len 128K, Type RAID10, Stripe len 64K, Nr_stripes 4
Stripe 0: Logical 64K, Physical X, Len 64K Dev dev1
Stripe 1: Logical 64K, Physical Y, Len 64K Dev dev2
Stripe 2: Logical 128K, Physical Z, Len 64K Dev dev3
Stripe 3: Logical 128K, Physical W, Len 64K Dev dev4

For RAID5/6 (striped with parity and dev-rotation):
Result will be aligned to full stripe size:
Map block: Logical 64K, Len 128K, Type RAID6, Stripe len 64K, Nr_stripes 4
Stripe 0: Logical 64K, Physical X, Len 64K Dev dev1
Stripe 1: Logical 128K, Physical Y, Len 64K Dev dev2
Stripe 2: Logical RAID5_P, Physical Z, Len 64K Dev dev3
Stripe 3: Logical RAID6_Q, Physical W, Len 64K Dev dev4

The new unified layout should be very flexible and can even handle things
like N-way RAID1 (which the old mirror_num based one can't handle well).

Signed-off-by: Qu Wenruo 
---
 volumes.c | 181 ++
 volumes.h |  78 +++
 2 files changed, 259 insertions(+)

diff --git a/volumes.c b/volumes.c
index 8c2ffd92..985e5661 100644
--- a/volumes.c
+++ b/volumes.c
@@ -1597,6 +1597,187 @@ out:
return 0;
 }
 
+/*
+ * Allocate a zero-filled btrfs_map_block sized for one struct
+ * btrfs_map_block plus @num_stripes struct btrfs_map_stripe entries.
+ *
+ * Returns the new block, or NULL if malloc() fails.  Nothing here frees
+ * the allocation, so the caller is responsible for releasing it.
+ */
+static inline struct btrfs_map_block *alloc_map_block(int num_stripes)
+{
+   struct btrfs_map_block *ret;
+   int size;
+
+   /* Header plus one stripe descriptor per stripe */
+   size = sizeof(struct btrfs_map_stripe) * num_stripes +
+   sizeof(struct btrfs_map_block);
+   ret = malloc(size);
+   if (!ret)
+   return NULL;
+   /* Hand back a fully zeroed structure */
+   memset(ret, 0, size);
+   return ret;
+}
+
+static int fill_full_map_block(struct map_lookup *map, u64 start, u64 length,
+  struct btrfs_map_block *map_block)
+{
+   u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
+   u64 bg_start = map->ce.start;
+   u64 bg_end = bg_start + map->ce.size;
+   u64 bg_offset = start - bg_start; /* offset inside the block group */
+   u64 fstripe_logical = 0;/* Full stripe start logical bytenr */
+   u64 fstripe_size = 0;   /* Full stripe logical size */
+   u64 fstripe_phy_off = 0;/* Full stripe offset in each dev */
+   u32 stripe_len = map->stripe_len;
+   int sub_stripes = map->sub_stripes;
+   int data_stripes = nr_data_stripes(map);
+   int dev_rotation;
+   int i;
+
+   map_block->num_stripes = map->num_stripes;
+   map_block->type = profile;
+
+   /*
+* Common full stripe data for stripe based profiles
+*/
+   if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10 |
+  BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
+   fstripe_size = stripe_len * data_stripes;
+   if (sub_stripes)
+   fstripe_size /= sub_stripes;
+   fstripe_logical = bg_offset / fstripe_size * fstripe_size +
+   bg_start;
+   fstripe_phy_off = bg_offset / fstripe_size * stripe_len;
+   }
+
+   switch (profile) {
+   case BTRFS_BLOCK_GROUP_DUP:
+   case BTRFS_BLOCK_GROUP_RAID1:
+   case 0: /* SINGLE */
+   /*
+* None-stripe mode,(Single, DUP and RAID1)
+* Just use offset to fill map_block
+*/
+   map_block->stripe_len = 0;
+   map_block->start = start;
+   map_block->length = min(bg_end, start + length) - start;
+   for (i = 0; i < map->num_stripes; i++) {
+   struct btrfs_map_stripe *stripe;
+
+   stripe = _block->stripes[i];
+
+   stripe->dev = map->stripes[i].dev;
+   stripe->logical = start;
+   stripe->physical = map->stripes[i].physical + bg_offset;
+   stripe->length = map_block->length;
+   }
+   break;
+   case BTRFS_BLOCK_GROUP_RAID10:
+   case BTRFS_BLOCK_GROUP_RAID0:
+   /*
+* Stripe modes without parity(0 and 10)
+* Return the whole full stripe
+*/
+
+   map_block->start = fstripe_logical;
+   map_block->length = fstripe_size;
+