[PATCH] btrfs: add support to search subvolume by rootid and uuid

2017-07-12 Thread Anand Jain
Unless the top level is mounted, there is no way to see the
details of all the subvolumes.  For example:

mount -o subvol=sv1/newsv1 /dev/sdb /btrfs

btrfs su list /btrfs
ID 257 gen 12 top level 5 path sv1
ID 258 gen 9 top level 257 path sv1/snap
ID 259 gen 11 top level 257 path sv1/newsv1

You can't run subvol show for sv1 and sv1/snap, as their paths aren't
accessible to the user unless the top level is mounted.

This patch adds two new options to the existing btrfs subvol show
cli. They are --rootid/-r or --uuid/-u, with this now the user will
be able to look for a subvolume using the rootid OR the uuid.

./btrfs su show -r 257 /btrfs
sv1
Name:   sv1
UUID:   30129358-c69d-3e4a-a662-29509cc69c95
Parent UUID:-
Received UUID:  -
Creation time:  2017-07-11 20:32:57 +0800
Subvolume ID:   257
Generation: 12
Gen at creation:7
Parent ID:  5
Top level ID:   5
Flags:  -
Snapshot(s):
sv1/snap

Signed-off-by: Anand Jain 
---
 btrfs-list.c |  4 +++-
 cmds-subvolume.c | 44 +---
 utils.c  | 52 
 utils.h  |  5 -
 4 files changed, 100 insertions(+), 5 deletions(-)

diff --git a/btrfs-list.c b/btrfs-list.c
index 8eec05ea797f..92a537f425f3 100644
--- a/btrfs-list.c
+++ b/btrfs-list.c
@@ -1582,7 +1582,9 @@ int btrfs_get_subvol(int fd, struct root_info *the_ri)
rbn = rb_next(rbn);
continue;
}
-   if (!comp_entry_with_rootid(the_ri, ri, 0)) {
+
+   if (!comp_entry_with_rootid(the_ri, ri, 0) ||
+   !uuid_compare(the_ri->uuid, ri->uuid)) {
memcpy(the_ri, ri, offsetof(struct root_info, path));
the_ri->path = strdup_or_null(ri->path);
the_ri->name = strdup_or_null(ri->name);
diff --git a/cmds-subvolume.c b/cmds-subvolume.c
index de6204eabeaf..1fa54d1b24cf 100644
--- a/cmds-subvolume.c
+++ b/cmds-subvolume.c
@@ -891,8 +891,11 @@ static int cmd_subvol_find_new(int argc, char **argv)
 }
 
 static const char * const cmd_subvol_show_usage[] = {
-   "btrfs subvolume show ",
+   "btrfs subvolume show [options] |",
"Show more information of the subvolume",
+   "-r|--rootid   rootid of the subvol to show",
+   "-u|--uuid uuid of the subvol to show",
+   "If no option is specified  will be shown.",
NULL
 };
 
@@ -907,8 +910,36 @@ static int cmd_subvol_show(int argc, char **argv)
int fd = -1;
int ret = 1;
DIR *dirstream1 = NULL;
+   int by_rootid = 0;
+   int by_uuid = 0;
+   u64 rootid_arg;
+   u8 uuid_arg[BTRFS_UUID_SIZE];
 
-   clean_args_no_options(argc, argv, cmd_subvol_show_usage);
+   while (1) {
+   int c;
+   static const struct option long_options[] = {
+   { "rootid", required_argument, NULL, 'r'},
+   { "uuid", required_argument, NULL, 'u'},
+   { NULL, 0, NULL, 0 }
+   };
+
+   c = getopt_long(argc, argv, "r:u:", long_options, NULL);
+   if (c < 0)
+   break;
+
+   switch (c) {
+   case 'r':
+   rootid_arg = arg_strtou64(optarg);
+   by_rootid = 1;
+   break;
+   case 'u':
+   uuid_parse(optarg, uuid_arg);
+   by_uuid = 1;
+   break;
+   default:
+   usage(cmd_subvol_show_usage);
+   }
+   }
 
if (check_argc_exact(argc - optind, 1))
usage(cmd_subvol_show_usage);
@@ -921,7 +952,14 @@ static int cmd_subvol_show(int argc, char **argv)
goto out;
}
 
-   ret = get_subvol_info(fullpath, &get_ri);
+   if (by_rootid) {
+   ret = get_subvol_info_by_rootid(fullpath, &get_ri, rootid_arg);
+   } else if (by_uuid) {
+   ret = get_subvol_info_by_uuid(fullpath, &get_ri, uuid_arg);
+   } else {
+   ret = get_subvol_info(fullpath, &get_ri);
+   }
+
if (ret) {
if (ret < 0) {
error("Failed to get subvol info %s: %s",
diff --git a/utils.c b/utils.c
index d2489e70f8d8..250e6cc76cbc 100644
--- a/utils.c
+++ b/utils.c
@@ -2432,6 +2432,58 @@ out:
return ret;
 }
 
+int get_subvol_info_by_rootid(const char *mnt, struct root_info *get_ri, u64 
r_id)
+{
+   int fd;
+   int ret;
+   DIR *dirstream = NULL;
+
+   fd = btrfs_open_dir(mnt, &dirstream, 1);
+   if (fd < 0)
+   return -EINVAL;
+
+   memset(get_ri

Re: [PATCH] btrfs: qgroups: Fix BUG_ON condition

2017-07-12 Thread Qu Wenruo



在 2017年07月12日 14:42, Nikolay Borisov 写道:

The current code was erroneously checking for root_level > BTRFS_MAX_LEVEL. If
we had a root_level of 8 then the check won't trigger and we could
potentially hit a buffer overflow. The correct check should be
root_level >= BTRFS_MAX_LEVEL


Thanks for catching this.

Reviewed-by: Qu Wenruo 



Signed-off-by: Nikolay Borisov 
---
  fs/btrfs/qgroup.c | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 4ce351efe281..3b787915ef31 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1603,7 +1603,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle 
*trans,
struct extent_buffer *eb = root_eb;
struct btrfs_path *path = NULL;
  
-	BUG_ON(root_level < 0 || root_level > BTRFS_MAX_LEVEL);

+   BUG_ON(root_level < 0 || root_level >= BTRFS_MAX_LEVEL);
BUG_ON(root_eb == NULL);
  
  	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))

@@ -2959,7 +2959,7 @@ static int __btrfs_qgroup_release_data(struct inode 
*inode,
if (free && reserved)
return qgroup_free_reserved_data(inode, reserved, start, len);
extent_changeset_init(&changeset);
-   ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
+   ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
start + len -1, EXTENT_QGROUP_RESERVED, &changeset);


I didn't recognize it was trailing whitespace at first.
Nice catch.

Thanks,
Qu


if (ret < 0)
goto out;


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] btrfs-progs: add support to search subvolume by rootid and uuid

2017-07-12 Thread Anand Jain
Unless the top level is mounted, there is no way to see the
details of all the subvolumes.  For example:

mount -o subvol=sv1/newsv1 /dev/sdb /btrfs

btrfs su list /btrfs
ID 257 gen 12 top level 5 path sv1
ID 258 gen 9 top level 257 path sv1/snap
ID 259 gen 11 top level 257 path sv1/newsv1

You can't run subvol show for sv1 and sv1/snap, as their paths aren't
accessible to the user unless the top level is mounted.

This patch adds two new options to the existing btrfs subvol show
cli. They are --rootid/-r or --uuid/-u, with this now the user will
be able to look for a subvolume using the rootid OR the uuid.

./btrfs su show -r 257 /btrfs
sv1
Name:   sv1
UUID:   30129358-c69d-3e4a-a662-29509cc69c95
Parent UUID:-
Received UUID:  -
Creation time:  2017-07-11 20:32:57 +0800
Subvolume ID:   257
Generation: 12
Gen at creation:7
Parent ID:  5
Top level ID:   5
Flags:  -
Snapshot(s):
sv1/snap

Signed-off-by: Anand Jain 
---
 btrfs-list.c |  4 +++-
 cmds-subvolume.c | 44 +---
 utils.c  | 52 
 utils.h  |  5 -
 4 files changed, 100 insertions(+), 5 deletions(-)

diff --git a/btrfs-list.c b/btrfs-list.c
index 8eec05ea797f..92a537f425f3 100644
--- a/btrfs-list.c
+++ b/btrfs-list.c
@@ -1582,7 +1582,9 @@ int btrfs_get_subvol(int fd, struct root_info *the_ri)
rbn = rb_next(rbn);
continue;
}
-   if (!comp_entry_with_rootid(the_ri, ri, 0)) {
+
+   if (!comp_entry_with_rootid(the_ri, ri, 0) ||
+   !uuid_compare(the_ri->uuid, ri->uuid)) {
memcpy(the_ri, ri, offsetof(struct root_info, path));
the_ri->path = strdup_or_null(ri->path);
the_ri->name = strdup_or_null(ri->name);
diff --git a/cmds-subvolume.c b/cmds-subvolume.c
index de6204eabeaf..1fa54d1b24cf 100644
--- a/cmds-subvolume.c
+++ b/cmds-subvolume.c
@@ -891,8 +891,11 @@ static int cmd_subvol_find_new(int argc, char **argv)
 }
 
 static const char * const cmd_subvol_show_usage[] = {
-   "btrfs subvolume show ",
+   "btrfs subvolume show [options] |",
"Show more information of the subvolume",
+   "-r|--rootid   rootid of the subvol to show",
+   "-u|--uuid uuid of the subvol to show",
+   "If no option is specified  will be shown.",
NULL
 };
 
@@ -907,8 +910,36 @@ static int cmd_subvol_show(int argc, char **argv)
int fd = -1;
int ret = 1;
DIR *dirstream1 = NULL;
+   int by_rootid = 0;
+   int by_uuid = 0;
+   u64 rootid_arg;
+   u8 uuid_arg[BTRFS_UUID_SIZE];
 
-   clean_args_no_options(argc, argv, cmd_subvol_show_usage);
+   while (1) {
+   int c;
+   static const struct option long_options[] = {
+   { "rootid", required_argument, NULL, 'r'},
+   { "uuid", required_argument, NULL, 'u'},
+   { NULL, 0, NULL, 0 }
+   };
+
+   c = getopt_long(argc, argv, "r:u:", long_options, NULL);
+   if (c < 0)
+   break;
+
+   switch (c) {
+   case 'r':
+   rootid_arg = arg_strtou64(optarg);
+   by_rootid = 1;
+   break;
+   case 'u':
+   uuid_parse(optarg, uuid_arg);
+   by_uuid = 1;
+   break;
+   default:
+   usage(cmd_subvol_show_usage);
+   }
+   }
 
if (check_argc_exact(argc - optind, 1))
usage(cmd_subvol_show_usage);
@@ -921,7 +952,14 @@ static int cmd_subvol_show(int argc, char **argv)
goto out;
}
 
-   ret = get_subvol_info(fullpath, &get_ri);
+   if (by_rootid) {
+   ret = get_subvol_info_by_rootid(fullpath, &get_ri, rootid_arg);
+   } else if (by_uuid) {
+   ret = get_subvol_info_by_uuid(fullpath, &get_ri, uuid_arg);
+   } else {
+   ret = get_subvol_info(fullpath, &get_ri);
+   }
+
if (ret) {
if (ret < 0) {
error("Failed to get subvol info %s: %s",
diff --git a/utils.c b/utils.c
index d2489e70f8d8..250e6cc76cbc 100644
--- a/utils.c
+++ b/utils.c
@@ -2432,6 +2432,58 @@ out:
return ret;
 }
 
+int get_subvol_info_by_rootid(const char *mnt, struct root_info *get_ri, u64 
r_id)
+{
+   int fd;
+   int ret;
+   DIR *dirstream = NULL;
+
+   fd = btrfs_open_dir(mnt, &dirstream, 1);
+   if (fd < 0)
+   return -EINVAL;
+
+   memset(get_ri

[PATCH] btrfs-progs: fix the path use full_path as provided by the root info

2017-07-12 Thread Anand Jain
This is a kind of preparatory patch for the patch which will add
--rootid and --uuid options for the btrfs subvol show command.

As of now, btrfs subvol show prints the user-provided subvol path
in its output, which is kind of confusing.

btrfs su show /btrfs
/btrfs <--
Name:   

It will be even more confusing when proposed --uuid or --rootid
options are used.

btrfs su show --rootid 258 /btrfs
/btrfs <--
Name:   snap <--
UUID:   9630a45f-e647-4242-bd19-97590b4e20b2
Parent UUID:30129358-c69d-3e4a-a662-29509cc69c95
Received UUID:  -
Creation time:  2017-07-12 12:43:28 +0800
Subvolume ID:   258
Generation: 9
Gen at creation:9
Parent ID:  257
Top level ID:   257
Flags:  -
Snapshot(s):

Now with this patch, it will only show what is provided by the root_info.

btrfs su show --rootid 258 /btrfs
sv1/snap <--
Name:   snap
UUID:   9630a45f-e647-4242-bd19-97590b4e20b2
Parent UUID:30129358-c69d-3e4a-a662-29509cc69c95
Received UUID:  -
Creation time:  2017-07-12 12:43:28 +0800
Subvolume ID:   258
Generation: 9
Gen at creation:9
Parent ID:  257
Top level ID:   257
Flags:  -
Snapshot(s):

Signed-off-by: Anand Jain 
---
 cmds-subvolume.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmds-subvolume.c b/cmds-subvolume.c
index 674a3a51f6f9..de6204eabeaf 100644
--- a/cmds-subvolume.c
+++ b/cmds-subvolume.c
@@ -934,7 +934,7 @@ static int cmd_subvol_show(int argc, char **argv)
}
 
/* print the info */
-   printf("%s\n", fullpath);
+   printf("%s\n", get_ri.full_path);
printf("\tName: \t\t\t%s\n", get_ri.name);
 
if (uuid_is_null(get_ri.uuid))
-- 
2.13.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Btrfs check reports errors, filesystem seems fine

2017-07-12 Thread Qu Wenruo

Sorry for the late reply.

After investigating the dumps, I found the output is quite strange.

1) Mismatching output.
In "btrfs-debug-tree-grep-79177.txt" I found only 79177 as offset for 
INODE_REF is here, while 79177 as objectid for DIR_ITEM/DIR_INDEX is not 
here at all.


While in "btrfs-debug-tree-grep-deprecated-txt.txt" there is the expected 
79177 DIR_ITEM/DIR_INDEX.


Maybe something went wrong in grep, causing it to skip "(79177"?

2) Mismatched hash
The main problem I found is that, for key (79177 DIR_ITEM 54846528), the 
number 54846528 is the hash(crc32c) of filename, and it contains 2 
items, one for "deprecated.txt" and one for "deprecated.sxt".


But we found that 54846528 only matches the hash for "deprecated.txt", 
not "deprecated.sxt".


I think that's the main problem.

BTW, would you please try "btrfs check --mode=lowmem" to see if lowmem 
mode reports similar (well, output may differ) error?


If lowmem mode also reports error on such DIR_ITEM, I'm pretty sure 
that's the problem.


However it may take some time before we can fix it in repair mode.

Thanks,
Qu



在 2017年07月04日 21:24, Filippe LeMarchand 写道:

Sure, here it is:
https://drive.google.com/drive/folders/0B1ax9Am81gx9YjJBVVA0LXRHeGc

In a letter dated Tuesday, July 4, 2017 16:16:36 MSK user Lu Fengqi wrote:

On Mon, Jul 03, 2017 at 08:34:52AM +0800, Qu Wenruo wrote:



At 07/01/2017 07:59 PM, Filippe LeMarchand wrote:

Hello everyone.

I have a btrfs root partition on an Intel 530 SSD, which mounts without errors 
and seems to work fine,
but `btrfs check` gives me the following output (and --repair doesn't remove the 
errors):

enabling repair mode
Checking filesystem on /dev/sda2
UUID: 12c84aa3-ce65-4390-807e-a72cc8a7445e
checking extents
Fixed 0 roots.
checking free space cache
cache and super generation don't match, space cache will be invalidated
checking fs roots
unresolved ref dir 79177 index 0 namelen 14 name deprecated.sxt 
filetype 1 errors 6, no dir index, no inode ref


This means that in dir whose inode number is 79177, it has a child inode
pointer pointing to deprecated.sxt.

But it doesn't have dir index and corresponding inode ref, which is breaking
the cross reference rule of btrfs.

Would you please run the following command to dump needed info for us to
debug?

# btrfs-debug-tree /dev/sda2 | grep 79177 -C 10

and

# btrfs-debug-tree /dev/sda2 | grep deprecated.sxt -C 10

and

# btrfs-debug-tree /dev/sda2 | grep deprecated.txt -C 10


Considering the output has both .txt and .sxt, I think that's the problem.
But such bit-flip should be detected by tree block csum.
I'm not sure what's wrong with it.

Thanks,
Qu


unresolved ref dir 79177 index 417 namelen 14 name deprecated.txt 
filetype 1 errors 1, no dir item
unresolved ref dir 79177 index 0 namelen 14 name deprecated.sxt 
filetype 1 errors 6, no dir index, no inode ref
unresolved ref dir 79177 index 417 namelen 14 name deprecated.txt 
filetype 1 errors 1, no dir item
unresolved ref dir 79177 index 0 namelen 14 name deprecated.sxt 
filetype 1 errors 6, no dir index, no inode ref
unresolved ref dir 79177 index 417 namelen 14 name deprecated.txt 
filetype 1 errors 1, no dir item
unresolved ref dir 79177 index 0 namelen 14 name deprecated.sxt 
filetype 1 errors 6, no dir index, no inode ref
unresolved ref dir 79177 index 417 namelen 14 name deprecated.txt 
filetype 1 errors 1, no dir item
unresolved ref dir 79177 index 0 namelen 14 name deprecated.sxt 
filetype 1 errors 6, no dir index, no inode ref
unresolved ref dir 79177 index 417 namelen 14 name deprecated.txt 
filetype 1 errors 1, no dir item
unresolved ref dir 79177 index 0 namelen 14 name deprecated.sxt 
filetype 1 errors 6, no dir index, no inode ref
unresolved ref dir 79177 index 417 namelen 14 name deprecated.txt 
filetype 1 errors 1, no dir item
unresolved ref dir 79177 index 0 namelen 14 name deprecated.sxt 
filetype 1 errors 6, no dir index, no inode ref
unresolved ref dir 79177 index 417 namelen 14 name deprecated.txt 
filetype 1 errors 1, no dir item
unresolved ref dir 79177 index 0 namelen 14 name deprecated.sxt 
filetype 1 errors 6, no dir index, no inode ref
unresolved ref dir 79177 index 417 namelen 14 name deprecated.txt 
filetype 1 errors 1, no dir item
unresolved ref dir 79177 index 0 namelen 14 name deprecated.sxt 
filetype 1 errors 6, no dir index, no inode ref
unresolved ref dir 79177 index 417 namelen 14 name deprecated.txt 
filetype 1 errors 1, no dir item
checking csums
checking root refs
found 23421812736 bytes used err is 0
total csum bytes: 21531608
total tree bytes: 776650752
total fs tree bytes: 711278592
total extent tree bytes: 36798464
btree space waste bytes: 116002036
file data blocks allocated: 850546470912
   referenced 27611987968

Is it dangerous and what should I do about it?

I also tried --clear-space-cache, but it just removes the line about

Re: Leveldb in google-chrome incompatible with btrfs?

2017-07-12 Thread Cerem Cem ASLAN
The day before I re-checked to see if I could recover any of my files
and it seems that disk is all alive. Nothing missed, nothing deleted.
I run `btrfs scrub`, there are only a few uncorrectable errors (they
were there before the event). I'm using the disk as before for two
days and there is nothing unusual. Quite interesting.

2017-07-07 19:50 GMT+03:00 Marc MERLIN :
> (removing pwnall at chromium.org to cut spam)
>
> On Thu, Jul 06, 2017 at 10:46:08PM -0700, Omar Sandoval wrote:
>> ┌[osandov@vader ~/.config]
>> └$ ls -al google-chrome-busted/**
>> ls: cannot access 'google-chrome-busted/Local State': No such file or 
>> directory
>> google-chrome-busted/Default:
>> ls: cannot access 'google-chrome-busted/Default/Preferences': No such file 
>> or directory
>> ls: cannot access 'google-chrome-busted/Default/.com.google.Chrome.VfAUNx': 
>> No such file or directory
>> total 0
>> drwx-- 1 osandov users 12 Feb  7 16:50 .
>> drwx-- 1 osandov users 14 Feb  7 16:50 ..
>> -? ? ?   ?  ?? .com.google.Chrome.VfAUNx
>> -? ? ?   ?  ?? Preferences
>
> Yeah, that's definitely not chrome's fault :)
>
> On Fri, Jul 07, 2017 at 06:05:39PM +0300, Cerem Cem ASLAN wrote:
>> I was also struggling with this issue for quite some time. Today my 2
>> months old disk is crashed (which really surprises me). It doesn't
>> even being shown by `fdisk -l`.
>>
>> After buying this this disk, I installed a Debian on a BTRFS partition
>> on an LVM partition on a LUKS partition. From that day till today,
>> while doing my work, there was always a video running on Youtube in
>> the background. Sometimes my editor (atom, it also shares a lot with
>> chromium) was slowing down which was making me end up restarting the
>> Chromium (and it was restarting barely with `killall -9 chromium`.
>> Another mentionable event is, 3 days ago my laptop suddenly restarted.
>
> Sorry, I'm not quite making sense out of this.
>
> Marc
> --
> "A mouse is a device used to point at the xterm you want to type in" - A.S.R.
> Microsoft is to operating systems 
>    what McDonalds is to gourmet 
> cooking
> Home page: http://marc.merlins.org/
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] btrfs: Check if tgt_device is not null

2017-07-12 Thread Nikolay Borisov
btrfs_err_in_rcu indiscriminately dereferences tgt_device to access its
->name member in an error path. However, a couple of lines below there is code
which checks whether tgt_device is not NULL. Let's be consistent and check if
the tgt_device is NULL before dereferencing it.

Signed-off-by: Nikolay Borisov 
---
 fs/btrfs/dev-replace.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index bee3edeea7a3..e2a16cb8f7f3 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -541,7 +541,8 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info 
*fs_info,
 src_device->missing ? "" :
 rcu_str_deref(src_device->name),
 src_device->devid,
-rcu_str_deref(tgt_device->name), scrub_ret);
+tgt_device ? rcu_str_deref(tgt_device->name) :
+"", scrub_ret);
btrfs_dev_replace_unlock(dev_replace, 1);
mutex_unlock(&fs_info->chunk_mutex);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
-- 
2.7.4

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v14.4 00/15] Btrfs In-band De-duplication

2017-07-12 Thread Lu Fengqi
This patchset can be fetched from github:
https://github.com/littleroad/linux.git dedupe_20170712

This is just a normal rebase update.
Now the new base is kdave/for-4.13-part1

Normal test cases from auto group exposes no regression, and ib-dedupe
group can pass without problem.

Changelog:
v2:
  Totally reworked to handle multiple backends
v3:
  Fix a stupid but deadly on-disk backend bug
  Add handle for multiple hash on same bytenr corner case to fix abort
  trans error
  Increase dedup rate by enhancing delayed ref handler for both backend.
  Move dedup_add() to run_delayed_ref() time, to fix abort trans error.
  Increase dedup block size up limit to 8M.
v4:
  Add dedup prop for disabling dedup for given files/dirs.
  Merge inmem_search() and ondisk_search() into generic_search() to save
  some code
  Fix another delayed_ref related bug.
  Use the same mutex for both inmem and ondisk backend.
  Move dedup_add() back to btrfs_finish_ordered_io() to increase dedup
  rate.
v5:
  Reuse compress routine for much simpler dedup function.
  Slightly improved performance due to above modification.
  Fix race between dedup enable/disable
  Fix for false ENOSPC report
v6:
  Further enable/disable race window fix.
  Minor format change according to checkpatch.
v7:
  Fix one concurrency bug with balance.
  Slightly modify return value from -EINVAL to -EOPNOTSUPP for
  btrfs_dedup_ioctl() to allow progs to distinguish unsupported commands
  and wrong parameter.
  Rebased to integration-4.6.
v8:
  Rename 'dedup' to 'dedupe'.
  Add support to allow dedupe and compression work at the same time.
  Fix several balance related bugs. Special thanks to Satoru Takeuchi,
  who exposed most of them.
  Small dedupe hit case performance improvement.
v9:
  Re-order the patchset to completely separate pure in-memory and any
  on-disk format change.
  Fold bug fixes into its original patch.
v10:
  Adding back missing bug fix patch.
  Reduce on-disk item size.
  Hide dedupe ioctl under CONFIG_BTRFS_DEBUG.
v11:
  Remove other backend and props support to focus on the framework and
  in-memory backend. Suggested by David.
  Better disable and buffered write race protection.
  Comprehensive fix to dedupe metadata ENOSPC problem.
v12:
  Stateful 'enable' ioctl and new 'reconf' ioctl
  New FORCE flag for enable ioctl to allow stateless ioctl
  Precise error report and extendable ioctl structure.
v12.1
  Rebase to David's for-next-20160704 branch
  Add co-ordinate patch for subpage and dedupe patchset. 
v12.2
  Rebase to David's for-next-20160715 branch
  Add co-ordinate patch for other patchset.
v13
  Rebase to David's for-next-20160906 branch
  Fix a reserved space leak bug, which only frees quota reserved space
  but not space_info->byte_may_use.
v13.1
  Rebase to Chris' for-linux-4.9 branch
v14
  Use generic ENOSPC fix for both compression and dedupe.
v14.1
  Further split ENOSPC fix.
v14.2
  Rebase to v4.11-rc2.
  Co-operate with count_max_extent() to calculate num_extents.
  No longer rely on qgroup fixes.
v14.3
  Rebase to v4.12-rc1
v14.4
  Rebase to kdave/for-4.13-part1

Qu Wenruo (4):
  btrfs: delayed-ref: Add support for increasing data ref under spinlock
  btrfs: dedupe: Inband in-memory only de-duplication implement
  btrfs: relocation: Enhance error handling to avoid BUG_ON
  btrfs: dedupe: Introduce new reconfigure ioctl

Wang Xiaoguang (11):
  btrfs: improve inode's outstanding_extents computation
  btrfs: introduce type based delalloc metadata reserve
  btrfs: Introduce COMPRESS reserve type to fix false enospc for
compression
  btrfs: dedupe: Introduce dedupe framework and its header
  btrfs: dedupe: Introduce function to initialize dedupe info
  btrfs: dedupe: Introduce function to add hash into in-memory tree
  btrfs: dedupe: Introduce function to remove hash from in-memory tree
  btrfs: dedupe: Introduce function to search for an existing hash
  btrfs: dedupe: Implement btrfs_dedupe_calc_hash interface
  btrfs: ordered-extent: Add support for dedupe
  btrfs: dedupe: Add ioctl for inband dedupelication

 fs/btrfs/Makefile|   2 +-
 fs/btrfs/ctree.h |  51 ++-
 fs/btrfs/dedupe.c| 821 +++
 fs/btrfs/dedupe.h| 184 +-
 fs/btrfs/delayed-ref.c   |  35 +-
 fs/btrfs/delayed-ref.h   |  10 +
 fs/btrfs/disk-io.c   |   4 +
 fs/btrfs/extent-tree.c   |  67 +++-
 fs/btrfs/extent_io.c |  63 +++-
 fs/btrfs/extent_io.h |   6 +
 fs/btrfs/file.c  |  26 +-
 fs/btrfs/free-space-cache.c  |   5 +-
 fs/btrfs/inode-map.c |   6 +-
 fs/btrfs/inode.c | 532 +++-
 fs/btrfs/ioctl.c |  99 +-
 fs/btrfs/ordered-data.c  |  46 ++-
 fs/btrfs/ordered-data.h  |  13 +
 fs/btrfs/relocation.c|  52 ++-
 fs/btrfs/sysfs.c |   2 +
 fs/btrfs/tests/inode-tests.c |  15 +-
 include/uapi/linux/btrfs.h   |  55 +++
 21 files changed, 1940 inse

[PATCH v14.4 04/15] btrfs: dedupe: Introduce dedupe framework and its header

2017-07-12 Thread Lu Fengqi
From: Wang Xiaoguang 

Introduce the header for btrfs in-band(write time) de-duplication
framework and needed header.

The new de-duplication framework is going to support 2 different dedupe
methods and 1 dedupe hash.

Signed-off-by: Qu Wenruo 
Signed-off-by: Wang Xiaoguang 
---
 fs/btrfs/ctree.h   |   7 +++
 fs/btrfs/dedupe.h  | 137 -
 fs/btrfs/disk-io.c |   1 +
 include/uapi/linux/btrfs.h |  34 +++
 4 files changed, 177 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 439b6568f13e..965c6615d882 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1118,6 +1118,13 @@ struct btrfs_fs_info {
u32 nodesize;
u32 sectorsize;
u32 stripesize;
+
+   /*
+* Inband de-duplication related structures
+*/
+   unsigned long dedupe_enabled:1;
+   struct btrfs_dedupe_info *dedupe_info;
+   struct mutex dedupe_ioctl_lock;
 };
 
 static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
diff --git a/fs/btrfs/dedupe.h b/fs/btrfs/dedupe.h
index 83ebfe28da9e..5ecc32179a9c 100644
--- a/fs/btrfs/dedupe.h
+++ b/fs/btrfs/dedupe.h
@@ -19,6 +19,139 @@
 #ifndef __BTRFS_DEDUPE__
 #define __BTRFS_DEDUPE__
 
-/* later in-band dedupe will expand this struct */
-struct btrfs_dedupe_hash;
+#include 
+#include 
+#include 
+
+static const int btrfs_hash_sizes[] = { 32 };
+
+/*
+ * For caller outside of dedupe.c
+ *
+ * Different dedupe backends should have their own hash structure
+ */
+struct btrfs_dedupe_hash {
+   u64 bytenr;
+   u32 num_bytes;
+
+   /* last field is a variable length array of dedupe hash */
+   u8 hash[];
+};
+
+struct btrfs_dedupe_info {
+   /* dedupe blocksize */
+   u64 blocksize;
+   u16 backend;
+   u16 hash_algo;
+
+   struct crypto_shash *dedupe_driver;
+
+   /*
+* Use mutex to portect both backends
+* Even for in-memory backends, the rb-tree can be quite large,
+* so mutex is better for such use case.
+*/
+   struct mutex lock;
+
+   /* following members are only used in in-memory backend */
+   struct rb_root hash_root;
+   struct rb_root bytenr_root;
+   struct list_head lru_list;
+   u64 limit_nr;
+   u64 current_nr;
+};
+
+struct btrfs_trans_handle;
+
+static inline int btrfs_dedupe_hash_hit(struct btrfs_dedupe_hash *hash)
+{
+   return (hash && hash->bytenr);
+}
+
+int btrfs_dedupe_hash_size(u16 algo);
+struct btrfs_dedupe_hash *btrfs_dedupe_alloc_hash(u16 algo);
+
+/*
+ * Initial inband dedupe info
+ * Called at dedupe enable time.
+ *
+ * Return 0 for success
+ * Return <0 for any error
+ * (from unsupported param to tree creation error for some backends)
+ */
+int btrfs_dedupe_enable(struct btrfs_fs_info *fs_info,
+   struct btrfs_ioctl_dedupe_args *dargs);
+
+/*
+ * Disable dedupe and invalidate all its dedupe data.
+ * Called at dedupe disable time.
+ *
+ * Return 0 for success
+ * Return <0 for any error
+ * (tree operation error for some backends)
+ */
+int btrfs_dedupe_disable(struct btrfs_fs_info *fs_info);
+
+/*
+ * Get current dedupe status.
+ * Return 0 for success
+ * No possible error yet
+ */
+void btrfs_dedupe_status(struct btrfs_fs_info *fs_info,
+struct btrfs_ioctl_dedupe_args *dargs);
+
+/*
+ * Calculate hash for dedupe.
+ * Caller must ensure [start, start + dedupe_bs) has valid data.
+ *
+ * Return 0 for success
+ * Return <0 for any error
+ * (error from hash codes)
+ */
+int btrfs_dedupe_calc_hash(struct btrfs_fs_info *fs_info,
+  struct inode *inode, u64 start,
+  struct btrfs_dedupe_hash *hash);
+
+/*
+ * Search for duplicated extents by calculated hash
+ * Caller must call btrfs_dedupe_calc_hash() first to get the hash.
+ *
+ * @inode: the inode for we are writing
+ * @file_pos: offset inside the inode
+ * As we will increase extent ref immediately after a hash match,
+ * we need @file_pos and @inode in this case.
+ *
+ * Return > 0 for a hash match, and the extent ref will be
+ * *INCREASED*, and hash->bytenr/num_bytes will record the existing
+ * extent data.
+ * Return 0 for a hash miss. Nothing is done
+ * Return <0 for any error
+ * (tree operation error for some backends)
+ */
+int btrfs_dedupe_search(struct btrfs_fs_info *fs_info,
+   struct inode *inode, u64 file_pos,
+   struct btrfs_dedupe_hash *hash);
+
+/*
+ * Add a dedupe hash into dedupe info
+ * Return 0 for success
+ * Return <0 for any error
+ * (tree operation error for some backends)
+ */
+int btrfs_dedupe_add(struct btrfs_trans_handle *trans,
+struct btrfs_fs_info *fs_info,
+struct btrfs_dedupe_hash *hash);
+
+/*
+ * Remove a dedupe hash from dedupe info
+ * Return 0 for success
+ * Return <0 for any error
+ * (tree operation error for some backends)
+ *
+ * NOTE:

[PATCH v14.4 03/15] btrfs: Introduce COMPRESS reserve type to fix false enospc for compression

2017-07-12 Thread Lu Fengqi
From: Wang Xiaoguang 

When testing btrfs compression, sometimes we got ENOSPC error, though fs
still has much free space, xfstests generic/171, generic/172, generic/173,
generic/174, generic/175 can reveal this bug in my test environment when
compression is enabled.

After some debugging work, we found that it's
btrfs_delalloc_reserve_metadata() which sometimes tries to reserve too
much metadata space, even for very small data range.

In btrfs_delalloc_reserve_metadata(), the number of metadata bytes to
reserve is calculated by the difference between outstanding extents and
reserved extents.
But due to the badly designed drop_outstanding_extent() function, the
difference can become too big and cause problems.

The problem happens in the following flow with compression enabled.

1) Buffered write 128M data with 128K blocksize
   outstanding_extents = 1
   reserved_extents = 1024 (128M / 128K, one blocksize will get one
reserved_extent)

   Note: it's btrfs_merge_extent_hook() to merge outstanding extents.
 But reserved extents are still 1024.

2) Allocate extents for dirty range
   cow_file_range_async() split above large extent into small 128K
   extents.
   Let's assume 2 compressed extents have been split.

   So we have:
   outstanding_extents = 3
   reserved_extents = 1024

   range [0, 256K) has extents allocated

3) One ordered extent get finished
   btrfs_finish_ordered_io()
   |- btrfs_delalloc_release_metadata()
  |- drop_outstanding_extent()

   drop_outstanding_extent() will free *ALL* redundant reserved extents.
   So we have:
   outstanding_extents = 2 (One has finished)
   reserved_extents = 2

4) Continue allocating extents for dirty range
   cow_file_range_async() continue handling the remaining range.

   When the whole 128M range is done and assume no more ordered extents
   have finished.
   outstanding_extents = 1023 (One has finished in Step 3)
   reserved_extents = 2 (*ALL* freed in Step 3)

5) Another buffered write happens to the file
   btrfs_delalloc_reserve_metadata() will calculate metadata space.

   The calculation is:
   meta_to_reserve = (outstanding_extents - reserved_extents) * \
 nodesize * max_tree_level(8) * 2

   If nodesize is 16K, it's 1021 * 16K * 8 * 2, near 256M.
   If nodesize is 64K, it's about 1G.

   That's totally insane.

The fix is to introduce a new reserve type, COMPRESS, to inform the outstanding
extents calculation algorithm, so that it gets the correct outstanding_extents
based on extent size.

So in Step 1), outstanding_extents = 1024 reserved_extents = 1024
Step 2): outstanding_extents = 1024 reserved_extents = 1024
Step 3): outstanding_extents = 1023 reserved_extents = 1023

And in Step 5) we reserve correct amount of metadata space.

Signed-off-by: Wang Xiaoguang 
Signed-off-by: Qu Wenruo 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/ctree.h   |  2 ++
 fs/btrfs/extent-tree.c |  2 ++
 fs/btrfs/extent_io.c   | 61 --
 fs/btrfs/extent_io.h   |  5 +++
 fs/btrfs/file.c|  3 ++
 fs/btrfs/inode.c   | 88 ++
 fs/btrfs/ioctl.c   |  2 ++
 fs/btrfs/relocation.c  |  3 ++
 8 files changed, 151 insertions(+), 15 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index cb3feebc7f63..439b6568f13e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -115,9 +115,11 @@ static inline u32 count_max_extents(u64 size, u64 
max_extent_size)
  */
 enum btrfs_metadata_reserve_type {
BTRFS_RESERVE_NORMAL,
+   BTRFS_RESERVE_COMPRESS,
 };
 
 u64 btrfs_max_extent_size(enum btrfs_metadata_reserve_type reserve_type);
+int inode_need_compress(struct inode *inode);
 
 struct btrfs_mapping_tree {
struct extent_map_tree map_tree;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 81cc85b9710a..892a47b13deb 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5989,6 +5989,8 @@ u64 btrfs_max_extent_size(enum 
btrfs_metadata_reserve_type reserve_type)
 {
if (reserve_type == BTRFS_RESERVE_NORMAL)
return BTRFS_MAX_EXTENT_SIZE;
+   else if (reserve_type == BTRFS_RESERVE_COMPRESS)
+   return SZ_128K;
 
ASSERT(0);
return BTRFS_MAX_EXTENT_SIZE;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 2e6f69908303..c3601069e5b3 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -597,7 +597,7 @@ static int __clear_extent_bit(struct extent_io_tree *tree, 
u64 start, u64 end,
btrfs_debug_check_extent_io_range(tree, start, end);
 
if (bits & EXTENT_DELALLOC)
-   bits |= EXTENT_NORESERVE;
+   bits |= EXTENT_NORESERVE | EXTENT_COMPRESS;
 
if (delete)
bits |= ~EXTENT_CTLBITS;
@@ -736,6 +736,60 @@ static int __clear_extent_bit(struct extent_io_tree *tree, 
u64 start, u64 end,
 
 }
 
+static void adjust_one_outstanding_extent(struct inode *inode, u64 len,
+

[PATCH v14.4 02/15] btrfs: introduce type based delalloc metadata reserve

2017-07-12 Thread Lu Fengqi
From: Wang Xiaoguang 

Introduce type based metadata reserve parameter for delalloc space
reservation/freeing function.

The problem we are going to solve is that btrfs uses a different max extent
size for different mount options.

For compression, the max extent size is 128K, while for non-compress write
it's 128M.
Furthermore, the split/merge extent hooks depend heavily on that max extent
size.

Such situation contributes to quite a lot of false ENOSPC.

So this patch introduces the facility to help solve these false ENOSPC
errors related to different max extent sizes.

Currently only normal 128M extent size is supported. More types will
follow soon.

Signed-off-by: Wang Xiaoguang 
Signed-off-by: Qu Wenruo 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/ctree.h |  40 
 fs/btrfs/extent-tree.c   |  44 +-
 fs/btrfs/file.c  |  20 
 fs/btrfs/free-space-cache.c  |   5 +-
 fs/btrfs/inode-map.c |   6 ++-
 fs/btrfs/inode.c | 108 ---
 fs/btrfs/ioctl.c |  12 +++--
 fs/btrfs/relocation.c|  10 ++--
 fs/btrfs/tests/inode-tests.c |  15 +++---
 9 files changed, 176 insertions(+), 84 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1c52d5a3544a..cb3feebc7f63 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -101,11 +101,24 @@ static const int btrfs_csum_sizes[] = { 4 };
 /*
  * Count how many BTRFS_MAX_EXTENT_SIZE cover the @size
  */
-static inline u32 count_max_extents(u64 size)
+static inline u32 count_max_extents(u64 size, u64 max_extent_size)
 {
-   return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE);
+   return div_u64(size + max_extent_size - 1, max_extent_size);
 }
 
+/*
+ * Type based metadata reserve type
+ * This affects how btrfs reserve metadata space for buffered write.
+ *
+ * This is caused by the different max extent size for normal COW
+ * and compression, and further in-band dedupe
+ */
+enum btrfs_metadata_reserve_type {
+   BTRFS_RESERVE_NORMAL,
+};
+
+u64 btrfs_max_extent_size(enum btrfs_metadata_reserve_type reserve_type);
+
 struct btrfs_mapping_tree {
struct extent_map_tree map_tree;
 };
@@ -2713,8 +2726,6 @@ int btrfs_check_data_free_space(struct inode *inode,
struct extent_changeset **reserved, u64 start, u64 len);
 void btrfs_free_reserved_data_space(struct inode *inode,
struct extent_changeset *reserved, u64 start, u64 len);
-void btrfs_delalloc_release_space(struct inode *inode,
-   struct extent_changeset *reserved, u64 start, u64 len);
 void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
u64 len);
 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
@@ -2729,10 +2740,16 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root 
*root,
 u64 *qgroup_reserved, bool use_global_rsv);
 void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
  struct btrfs_block_rsv *rsv);
-int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes);
-void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes);
+int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
+   enum btrfs_metadata_reserve_type reserve_type);
+void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
+   enum btrfs_metadata_reserve_type reserve_type);
 int btrfs_delalloc_reserve_space(struct inode *inode,
-   struct extent_changeset **reserved, u64 start, u64 len);
+   struct extent_changeset **reserved, u64 start, u64 len,
+   enum btrfs_metadata_reserve_type reserve_type);
+void btrfs_delalloc_release_space(struct inode *inode,
+   struct extent_changeset *reserved, u64 start, u64 len,
+   enum btrfs_metadata_reserve_type reserve_type);
 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
  unsigned short type);
@@ -3174,9 +3191,11 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, 
int delay_iput);
 int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
   int nr);
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
- struct extent_state **cached_state, int dedupe);
+ struct extent_state **cached_state,
+ enum btrfs_metadata_reserve_type reserve_type);
 int btrfs_set_extent_defrag(struct inode *inode, u64 start, u64 end,
-   struct extent_state **cached_state);
+   

[PATCH v14.4 08/15] btrfs: delayed-ref: Add support for increasing data ref under spinlock

2017-07-12 Thread Lu Fengqi
From: Qu Wenruo 

For in-band dedupe, btrfs needs to increase data ref with delayed_ref
locked, so add a new function btrfs_add_delayed_data_ref_lock() to
increase extent ref with delayed_refs already locked.

Signed-off-by: Qu Wenruo 
Reviewed-by: Josef Bacik 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/delayed-ref.c | 35 +++
 fs/btrfs/delayed-ref.h | 10 ++
 2 files changed, 37 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 93ffa898df6d..19110dd19ac9 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -843,6 +843,29 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info 
*fs_info,
 }
 
 /*
+ * Do real delayed data ref insert.
+ * Caller must hold delayed_refs->lock and allocation memory
+ * for dref,head_ref and record.
+ */
+void btrfs_add_delayed_data_ref_locked(struct btrfs_fs_info *fs_info,
+   struct btrfs_trans_handle *trans,
+   struct btrfs_delayed_data_ref *dref,
+   struct btrfs_delayed_ref_head *head_ref,
+   struct btrfs_qgroup_extent_record *qrecord,
+   u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
+   u64 owner, u64 offset, u64 reserved, int action,
+   int *qrecord_inserted_ret, int *old_ref_mod,
+   int *new_ref_mod)
+{
+   head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node,
+   qrecord, bytenr, num_bytes, ref_root, reserved,
+   action, 1, qrecord_inserted_ret, old_ref_mod,
+   new_ref_mod);
+   add_delayed_data_ref(fs_info, trans, head_ref, &dref->node, bytenr,
+   num_bytes, parent, ref_root, owner, offset, action);
+}
+
+/*
  * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref.
  */
 int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
@@ -888,14 +911,10 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info 
*fs_info,
 * insert both the head node and the new ref without dropping
 * the spin lock
 */
-   head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record,
-   bytenr, num_bytes, ref_root, reserved,
-   action, 1, &qrecord_inserted,
-   old_ref_mod, new_ref_mod);
-
-   add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr,
-  num_bytes, parent, ref_root, owner, offset,
-  action);
+   btrfs_add_delayed_data_ref_locked(fs_info, trans, ref, head_ref, record,
+   bytenr, num_bytes, parent, ref_root, owner, offset,
+   reserved, action, &qrecord_inserted, old_ref_mod,
+   new_ref_mod);
spin_unlock(&delayed_refs->lock);
 
if (qrecord_inserted)
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index ce88e4ac5276..dc9761ec139a 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -243,12 +243,22 @@ static inline void btrfs_put_delayed_ref(struct 
btrfs_delayed_ref_node *ref)
}
 }
 
+struct btrfs_qgroup_extent_record;
 int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
   struct btrfs_trans_handle *trans,
   u64 bytenr, u64 num_bytes, u64 parent,
   u64 ref_root, int level, int action,
   struct btrfs_delayed_extent_op *extent_op,
   int *old_ref_mod, int *new_ref_mod);
+void btrfs_add_delayed_data_ref_locked(struct btrfs_fs_info *fs_info,
+   struct btrfs_trans_handle *trans,
+   struct btrfs_delayed_data_ref *dref,
+   struct btrfs_delayed_ref_head *head_ref,
+   struct btrfs_qgroup_extent_record *qrecord,
+   u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
+   u64 owner, u64 offset, u64 reserved, int action,
+   int *qrecord_inserted_ret, int *old_ref_mod,
+   int *new_ref_mod);
 int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
   struct btrfs_trans_handle *trans,
   u64 bytenr, u64 num_bytes,
-- 
2.13.2



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v14.4 15/15] btrfs: dedupe: Introduce new reconfigure ioctl

2017-07-12 Thread Lu Fengqi
From: Qu Wenruo 

Introduce new reconfigure ioctl, and new FORCE flag for in-band dedupe
ioctls.

Now dedupe enable and reconfigure ioctl are stateful.


| Current state |   Ioctl| Next state  |

| Disabled  |  enable| Enabled |
| Enabled   |  enable| Not allowed |
| Enabled   |  reconf| Enabled |
| Enabled   |  disable   | Disabled|
| Disabled  |  disable   | Disabled|
| Disabled  |  reconf| Not allowed |

(While disable is always stateless)

For those who prefer stateless ioctls (myself for example), a new FORCE
flag is introduced.

In FORCE mode, enable/disable is completely stateless.

| Current state |   Ioctl| Next state  |

| Disabled  |  enable| Enabled |
| Enabled   |  enable| Enabled |
| Enabled   |  disable   | Disabled|
| Disabled  |  disable   | Disabled|


Also, re-configure ioctl will only modify specified fields.
Unlike enable, un-specified fields will be filled with default value.

For example:
 # btrfs dedupe enable --block-size 64k /mnt
 # btrfs dedupe reconfigure --limit-hash 1m /mnt
Will lead to:
 dedupe blocksize: 64K
 dedupe hash limit nr: 1m

While for enable:
 # btrfs dedupe enable --force --block-size 64k /mnt
 # btrfs dedupe enable --force --limit-hash 1m /mnt
Will reset blocksize to default value:
 dedupe blocksize: 128K << reset
 dedupe hash limit nr: 1m

Suggested-by: David Sterba 
Signed-off-by: Qu Wenruo 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/dedupe.c  | 131 -
 fs/btrfs/dedupe.h  |  13 +
 fs/btrfs/ioctl.c   |  13 +
 include/uapi/linux/btrfs.h |  11 +++-
 4 files changed, 143 insertions(+), 25 deletions(-)

diff --git a/fs/btrfs/dedupe.c b/fs/btrfs/dedupe.c
index eebbe17c3676..9cafffe45883 100644
--- a/fs/btrfs/dedupe.c
+++ b/fs/btrfs/dedupe.c
@@ -37,6 +37,40 @@ static inline struct inmem_hash *inmem_alloc_hash(u16 algo)
GFP_NOFS);
 }
 
+/*
+ * Copy from current dedupe info to fill dargs.
+ * For reconf case, only fill members which is uninitialized.
+ */
+static void get_dedupe_status(struct btrfs_dedupe_info *dedupe_info,
+ struct btrfs_ioctl_dedupe_args *dargs)
+{
+   int reconf = (dargs->cmd == BTRFS_DEDUPE_CTL_RECONF);
+
+   dargs->status = 1;
+
+   if (!reconf || (reconf && dargs->blocksize == (u64)-1))
+   dargs->blocksize = dedupe_info->blocksize;
+   if (!reconf || (reconf && dargs->backend == (u16)-1))
+   dargs->backend = dedupe_info->backend;
+   if (!reconf || (reconf && dargs->hash_algo == (u16)-1))
+   dargs->hash_algo = dedupe_info->hash_algo;
+
+   /*
+* For re-configure case, if not modifying limit,
+* therir limit will be set to 0, unlike other fields
+*/
+   if (!reconf || !(dargs->limit_nr || dargs->limit_mem)) {
+   dargs->limit_nr = dedupe_info->limit_nr;
+   dargs->limit_mem = dedupe_info->limit_nr *
+   (sizeof(struct inmem_hash) +
+btrfs_hash_sizes[dedupe_info->hash_algo]);
+   }
+
+   /* current_nr doesn't makes sense for reconfig case */
+   if (!reconf)
+   dargs->current_nr = dedupe_info->current_nr;
+}
+
 void btrfs_dedupe_status(struct btrfs_fs_info *fs_info,
 struct btrfs_ioctl_dedupe_args *dargs)
 {
@@ -53,15 +87,7 @@ void btrfs_dedupe_status(struct btrfs_fs_info *fs_info,
return;
}
mutex_lock(&dedupe_info->lock);
-   dargs->status = 1;
-   dargs->blocksize = dedupe_info->blocksize;
-   dargs->backend = dedupe_info->backend;
-   dargs->hash_algo = dedupe_info->hash_algo;
-   dargs->limit_nr = dedupe_info->limit_nr;
-   dargs->limit_mem = dedupe_info->limit_nr *
-   (sizeof(struct inmem_hash) +
-btrfs_hash_sizes[dedupe_info->hash_algo]);
-   dargs->current_nr = dedupe_info->current_nr;
+   get_dedupe_status(dedupe_info, dargs);
mutex_unlock(&dedupe_info->lock);
memset(dargs->__unused, -1, sizeof(dargs->__unused));
 }
@@ -110,17 +136,50 @@ static int init_dedupe_info(struct btrfs_dedupe_info 
**ret_info,
 static int check_dedupe_parameter(struct btrfs_fs_info *fs_info,
  struct btrfs_ioctl_dedupe_args *dargs)
 {
-   u64 blocksize = dargs->blocksize;
-   u64 limit_nr = dargs->limit_nr;
-   u64 limit_mem = dargs->limit_mem;
-   u16 hash_algo = dargs->hash_algo;
-   u8 backend = dargs->backend;
+   struct btrfs_dedupe_info *dedupe_info = fs_info->dedupe_info;
+
+   u64 blocksize;
+   u64

[PATCH v14.4 11/15] btrfs: ordered-extent: Add support for dedupe

2017-07-12 Thread Lu Fengqi
From: Wang Xiaoguang 

Add ordered-extent support for dedupe.

Note, current ordered-extent support only supports non-compressed source
extent.
Support for compressed source extent will be added later.

Signed-off-by: Qu Wenruo 
Signed-off-by: Wang Xiaoguang 
Reviewed-by: Josef Bacik 
---
 fs/btrfs/ordered-data.c | 46 ++
 fs/btrfs/ordered-data.h | 13 +
 2 files changed, 55 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index a3aca495e33e..c78aa7d0104f 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -26,6 +26,7 @@
 #include "extent_io.h"
 #include "disk-io.h"
 #include "compression.h"
+#include "dedupe.h"
 
 static struct kmem_cache *btrfs_ordered_extent_cache;
 
@@ -184,7 +185,8 @@ static inline struct rb_node *tree_search(struct 
btrfs_ordered_inode_tree *tree,
  */
 static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
  u64 start, u64 len, u64 disk_len,
- int type, int dio, int compress_type)
+ int type, int dio, int compress_type,
+ struct btrfs_dedupe_hash *hash)
 {
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -205,6 +207,33 @@ static int __btrfs_add_ordered_extent(struct inode *inode, 
u64 file_offset,
entry->inode = igrab(inode);
entry->compress_type = compress_type;
entry->truncated_len = (u64)-1;
+   entry->hash = NULL;
+   /*
+* A hash hit means we have already incremented the extents delayed
+* ref.
+* We must handle this even if another process is trying to
+* turn off dedupe, otherwise we will leak a reference.
+*/
+   if (hash && (hash->bytenr || root->fs_info->dedupe_enabled)) {
+   struct btrfs_dedupe_info *dedupe_info;
+
+   dedupe_info = root->fs_info->dedupe_info;
+   if (WARN_ON(dedupe_info == NULL)) {
+   kmem_cache_free(btrfs_ordered_extent_cache,
+   entry);
+   return -EINVAL;
+   }
+   entry->hash = btrfs_dedupe_alloc_hash(dedupe_info->hash_algo);
+   if (!entry->hash) {
+   kmem_cache_free(btrfs_ordered_extent_cache, entry);
+   return -ENOMEM;
+   }
+   entry->hash->bytenr = hash->bytenr;
+   entry->hash->num_bytes = hash->num_bytes;
+   memcpy(entry->hash->hash, hash->hash,
+  btrfs_hash_sizes[dedupe_info->hash_algo]);
+   }
+
if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
set_bit(type, &entry->flags);
 
@@ -250,15 +279,23 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 
file_offset,
 {
return __btrfs_add_ordered_extent(inode, file_offset, start, len,
  disk_len, type, 0,
- BTRFS_COMPRESS_NONE);
+ BTRFS_COMPRESS_NONE, NULL);
 }
 
+int btrfs_add_ordered_extent_dedupe(struct inode *inode, u64 file_offset,
+  u64 start, u64 len, u64 disk_len, int type,
+  struct btrfs_dedupe_hash *hash)
+{
+   return __btrfs_add_ordered_extent(inode, file_offset, start, len,
+ disk_len, type, 0,
+ BTRFS_COMPRESS_NONE, hash);
+}
 int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
 u64 start, u64 len, u64 disk_len, int type)
 {
return __btrfs_add_ordered_extent(inode, file_offset, start, len,
  disk_len, type, 1,
- BTRFS_COMPRESS_NONE);
+ BTRFS_COMPRESS_NONE, NULL);
 }
 
 int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
@@ -267,7 +304,7 @@ int btrfs_add_ordered_extent_compress(struct inode *inode, 
u64 file_offset,
 {
return __btrfs_add_ordered_extent(inode, file_offset, start, len,
  disk_len, type, 0,
- compress_type);
+ compress_type, NULL);
 }
 
 /*
@@ -578,6 +615,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent 
*entry)
list_del(&sum->list);
kfree(sum);
}
+   kfree(entry->hash);
kmem_cache_free(btrfs_ordered_extent_cache, entry);
}
 }
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 56c4c0ee6381..ed2bbb5a7b94 100644
--- 

[PATCH v14.4 13/15] btrfs: dedupe: Add ioctl for inband dedupelication

2017-07-12 Thread Lu Fengqi
From: Wang Xiaoguang 

Add ioctl interface for inband deduplication, which includes:
1) enable
2) disable
3) status

And a pseudo RO compat flag, to imply that btrfs now supports inband
dedup.
However we don't add any ondisk format change, it's just a pseudo RO
compat flag.

All these ioctl interfaces are state-less, which means the caller doesn't need
to care about the previous dedupe state before calling them, and only needs to
care about the final desired state.

For example, if the user wants to enable dedupe with a specified block size and
limit, just fill the ioctl structure and call the enable ioctl.
No need to check if dedupe is already running.

These ioctls will handle things like re-configure or disable quite well.

Also, for invalid parameters, the enable ioctl interface will set the field
of the first encountered invalid parameter to (-1) to inform the caller.
While for limit_nr/limit_mem, the value will be (0).

Signed-off-by: Qu Wenruo 
Signed-off-by: Wang Xiaoguang 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/dedupe.c  | 50 ++
 fs/btrfs/dedupe.h  | 17 
 fs/btrfs/disk-io.c |  3 +++
 fs/btrfs/ioctl.c   | 67 ++
 fs/btrfs/sysfs.c   |  2 ++
 include/uapi/linux/btrfs.h | 12 -
 6 files changed, 145 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/dedupe.c b/fs/btrfs/dedupe.c
index 6bdcff625533..eebbe17c3676 100644
--- a/fs/btrfs/dedupe.c
+++ b/fs/btrfs/dedupe.c
@@ -37,6 +37,35 @@ static inline struct inmem_hash *inmem_alloc_hash(u16 algo)
GFP_NOFS);
 }
 
+void btrfs_dedupe_status(struct btrfs_fs_info *fs_info,
+struct btrfs_ioctl_dedupe_args *dargs)
+{
+   struct btrfs_dedupe_info *dedupe_info = fs_info->dedupe_info;
+
+   if (!fs_info->dedupe_enabled || !dedupe_info) {
+   dargs->status = 0;
+   dargs->blocksize = 0;
+   dargs->backend = 0;
+   dargs->hash_algo = 0;
+   dargs->limit_nr = 0;
+   dargs->current_nr = 0;
+   memset(dargs->__unused, -1, sizeof(dargs->__unused));
+   return;
+   }
+   mutex_lock(&dedupe_info->lock);
+   dargs->status = 1;
+   dargs->blocksize = dedupe_info->blocksize;
+   dargs->backend = dedupe_info->backend;
+   dargs->hash_algo = dedupe_info->hash_algo;
+   dargs->limit_nr = dedupe_info->limit_nr;
+   dargs->limit_mem = dedupe_info->limit_nr *
+   (sizeof(struct inmem_hash) +
+btrfs_hash_sizes[dedupe_info->hash_algo]);
+   dargs->current_nr = dedupe_info->current_nr;
+   mutex_unlock(&dedupe_info->lock);
+   memset(dargs->__unused, -1, sizeof(dargs->__unused));
+}
+
 static int init_dedupe_info(struct btrfs_dedupe_info **ret_info,
struct btrfs_ioctl_dedupe_args *dargs)
 {
@@ -416,6 +445,27 @@ static void unblock_all_writers(struct btrfs_fs_info 
*fs_info)
percpu_up_write(sb->s_writers.rw_sem + SB_FREEZE_WRITE - 1);
 }
 
+int btrfs_dedupe_cleanup(struct btrfs_fs_info *fs_info)
+{
+   struct btrfs_dedupe_info *dedupe_info;
+
+   fs_info->dedupe_enabled = 0;
+   /* same as disable */
+   smp_wmb();
+   dedupe_info = fs_info->dedupe_info;
+   fs_info->dedupe_info = NULL;
+
+   if (!dedupe_info)
+   return 0;
+
+   if (dedupe_info->backend == BTRFS_DEDUPE_BACKEND_INMEMORY)
+   inmem_destroy(dedupe_info);
+
+   crypto_free_shash(dedupe_info->dedupe_driver);
+   kfree(dedupe_info);
+   return 0;
+}
+
 int btrfs_dedupe_disable(struct btrfs_fs_info *fs_info)
 {
struct btrfs_dedupe_info *dedupe_info;
diff --git a/fs/btrfs/dedupe.h b/fs/btrfs/dedupe.h
index 3a15fc2069b9..f224b7d00cb3 100644
--- a/fs/btrfs/dedupe.h
+++ b/fs/btrfs/dedupe.h
@@ -109,6 +109,15 @@ static inline struct btrfs_dedupe_hash 
*btrfs_dedupe_alloc_hash(u16 algo)
 int btrfs_dedupe_enable(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_dedupe_args *dargs);
 
+
+/*
+ * Get inband dedupe info
+ * Since it needs to access different backends' hash size, which
+ * is not exported, we need such simple function.
+ */
+void btrfs_dedupe_status(struct btrfs_fs_info *fs_info,
+struct btrfs_ioctl_dedupe_args *dargs);
+
 /*
  * Disable dedupe and invalidate all its dedupe data.
  * Called at dedupe disable time.
@@ -120,12 +129,10 @@ int btrfs_dedupe_enable(struct btrfs_fs_info *fs_info,
 int btrfs_dedupe_disable(struct btrfs_fs_info *fs_info);
 
 /*
- * Get current dedupe status.
- * Return 0 for success
- * No possible error yet
+ * Cleanup current btrfs_dedupe_info
+ * Called in umount time
  */
-void btrfs_dedupe_status(struct btrfs_fs_info *fs_info,
-struct btrfs_ioctl_dedupe_args *dargs);
+int btrfs_dedupe_cleanup(struct btrfs_fs_info *fs_info);
 
 /*
  * Calculate hash for dedupe.
diff --git a/fs/btrfs/disk-io.c b/fs/b

[PATCH v14.4 06/15] btrfs: dedupe: Introduce function to add hash into in-memory tree

2017-07-12 Thread Lu Fengqi
From: Wang Xiaoguang 

Introduce static function inmem_add() to add hash into in-memory tree.
And now we can implement the btrfs_dedupe_add() interface.

Signed-off-by: Qu Wenruo 
Signed-off-by: Wang Xiaoguang 
Reviewed-by: Josef Bacik 
---
 fs/btrfs/dedupe.c | 151 ++
 1 file changed, 151 insertions(+)

diff --git a/fs/btrfs/dedupe.c b/fs/btrfs/dedupe.c
index fbb2121c3736..d2acdccef944 100644
--- a/fs/btrfs/dedupe.c
+++ b/fs/btrfs/dedupe.c
@@ -28,6 +28,14 @@ struct inmem_hash {
u8 hash[];
 };
 
+static inline struct inmem_hash *inmem_alloc_hash(u16 algo)
+{
+   if (WARN_ON(algo >= ARRAY_SIZE(btrfs_hash_sizes)))
+   return NULL;
+   return kzalloc(sizeof(struct inmem_hash) + btrfs_hash_sizes[algo],
+   GFP_NOFS);
+}
+
 static int init_dedupe_info(struct btrfs_dedupe_info **ret_info,
struct btrfs_ioctl_dedupe_args *dargs)
 {
@@ -179,3 +187,146 @@ int btrfs_dedupe_disable(struct btrfs_fs_info *fs_info)
/* Place holder for bisect, will be implemented in later patches */
return 0;
 }
+
+static int inmem_insert_hash(struct rb_root *root,
+struct inmem_hash *hash, int hash_len)
+{
+   struct rb_node **p = &root->rb_node;
+   struct rb_node *parent = NULL;
+   struct inmem_hash *entry = NULL;
+
+   while (*p) {
+   parent = *p;
+   entry = rb_entry(parent, struct inmem_hash, hash_node);
+   if (memcmp(hash->hash, entry->hash, hash_len) < 0)
+   p = &(*p)->rb_left;
+   else if (memcmp(hash->hash, entry->hash, hash_len) > 0)
+   p = &(*p)->rb_right;
+   else
+   return 1;
+   }
+   rb_link_node(&hash->hash_node, parent, p);
+   rb_insert_color(&hash->hash_node, root);
+   return 0;
+}
+
+static int inmem_insert_bytenr(struct rb_root *root,
+  struct inmem_hash *hash)
+{
+   struct rb_node **p = &root->rb_node;
+   struct rb_node *parent = NULL;
+   struct inmem_hash *entry = NULL;
+
+   while (*p) {
+   parent = *p;
+   entry = rb_entry(parent, struct inmem_hash, bytenr_node);
+   if (hash->bytenr < entry->bytenr)
+   p = &(*p)->rb_left;
+   else if (hash->bytenr > entry->bytenr)
+   p = &(*p)->rb_right;
+   else
+   return 1;
+   }
+   rb_link_node(&hash->bytenr_node, parent, p);
+   rb_insert_color(&hash->bytenr_node, root);
+   return 0;
+}
+
+static void __inmem_del(struct btrfs_dedupe_info *dedupe_info,
+   struct inmem_hash *hash)
+{
+   list_del(&hash->lru_list);
+   rb_erase(&hash->hash_node, &dedupe_info->hash_root);
+   rb_erase(&hash->bytenr_node, &dedupe_info->bytenr_root);
+
+   if (!WARN_ON(dedupe_info->current_nr == 0))
+   dedupe_info->current_nr--;
+
+   kfree(hash);
+}
+
+/*
+ * Insert a hash into in-memory dedupe tree
+ * Will remove exceeding last recent use hash.
+ *
+ * If the hash mathced with existing one, we won't insert it, to
+ * save memory
+ */
+static int inmem_add(struct btrfs_dedupe_info *dedupe_info,
+struct btrfs_dedupe_hash *hash)
+{
+   int ret = 0;
+   u16 algo = dedupe_info->hash_algo;
+   struct inmem_hash *ihash;
+
+   ihash = inmem_alloc_hash(algo);
+
+   if (!ihash)
+   return -ENOMEM;
+
+   /* Copy the data out */
+   ihash->bytenr = hash->bytenr;
+   ihash->num_bytes = hash->num_bytes;
+   memcpy(ihash->hash, hash->hash, btrfs_hash_sizes[algo]);
+
+   mutex_lock(&dedupe_info->lock);
+
+   ret = inmem_insert_bytenr(&dedupe_info->bytenr_root, ihash);
+   if (ret > 0) {
+   kfree(ihash);
+   ret = 0;
+   goto out;
+   }
+
+   ret = inmem_insert_hash(&dedupe_info->hash_root, ihash,
+   btrfs_hash_sizes[algo]);
+   if (ret > 0) {
+   /*
+* We only keep one hash in tree to save memory, so if
+* hash conflicts, free the one to insert.
+*/
+   rb_erase(&ihash->bytenr_node, &dedupe_info->bytenr_root);
+   kfree(ihash);
+   ret = 0;
+   goto out;
+   }
+
+   list_add(&ihash->lru_list, &dedupe_info->lru_list);
+   dedupe_info->current_nr++;
+
+   /* Remove the last dedupe hash if we exceed limit */
+   while (dedupe_info->current_nr > dedupe_info->limit_nr) {
+   struct inmem_hash *last;
+
+   last = list_entry(dedupe_info->lru_list.prev,
+ struct inmem_hash, lru_list);
+   __inmem_del(dedupe_info, last);
+   }
+out:
+   mutex_unlock(&dedupe_info->lock);
+   return 0;
+}
+
+int btr

[PATCH v14.4 01/15] btrfs: improve inode's outstanding_extents computation

2017-07-12 Thread Lu Fengqi
From: Wang Xiaoguang 

This issue was revealed by modifying BTRFS_MAX_EXTENT_SIZE(128MB) to 64KB,
When modifying BTRFS_MAX_EXTENT_SIZE(128MB) to 64KB, fsstress test often
gets these warnings from btrfs_destroy_inode():
WARN_ON(BTRFS_I(inode)->outstanding_extents);
WARN_ON(BTRFS_I(inode)->reserved_extents);

Simple test program below can reproduce this issue steadily.
Note: you need to modify BTRFS_MAX_EXTENT_SIZE to 64KB to have test,
otherwise there won't be such WARNING.
#include 
#include 
#include 
#include 
#include 

int main(void)
{
int fd;
char buf[68 *1024];

memset(buf, 0, 68 * 1024);
fd = open("testfile", O_CREAT | O_EXCL | O_RDWR);
pwrite(fd, buf, 68 * 1024, 64 * 1024);
return;
}

When BTRFS_MAX_EXTENT_SIZE is 64KB, and buffered data range is:
64KB            128K    132KB
|---------------|-------|
        64 + 4KB

1) for above data range, btrfs_delalloc_reserve_metadata() will reserve
metadata and set BTRFS_I(inode)->outstanding_extents to 2.
(68KB + 64KB - 1) / 64KB == 2

Outstanding_extents: 2

2) then btrfs_dirty_page() will be called to dirty pages and set
EXTENT_DELALLOC flag. In this case, btrfs_set_bit_hook() will be called
twice.
The 1st set_bit_hook() call will set DELALLOC flag for the first 64K.
64KB            128KB
|---------------|
  64KB DELALLOC
Outstanding_extents: 2

Set_bit_hooks() uses FIRST_DELALLOC flag to avoid re-increase
outstanding_extents counter.
So for 1st set_bit_hooks() call, it won't modify outstanding_extents,
it's still 2.

Then FIRST_DELALLOC flag is *CLEARED*.

3) 2nd btrfs_set_bit_hook() call.
Because FIRST_DELALLOC have been cleared by previous set_bit_hook(),
btrfs_set_bit_hook() will increase BTRFS_I(inode)->outstanding_extents by
one, so now BTRFS_I(inode)->outstanding_extents is 3.
64KB            128KB         132KB
|---------------|-------------|
  64K DELALLOC    4K DELALLOC
Outstanding_extents: 3

But the correct outstanding_extents number should be 2, not 3.
The 2nd btrfs_set_bit_hook() call just screwed up this, and leads to the
WARN_ON().

Normally, we can solve it by only increasing outstanding_extents in
set_bit_hook().
But the problem is for delalloc_reserve/release_metadata(), we only have
a 'length' parameter, and calculate in-accurate outstanding_extents.
If we only rely on set_bit_hook(), release_metadata() will screw things up
as it will decrease by an inaccurate number.

So the fix we use is:
1) Increase *INACCURATE* outstanding_extents at delalloc_reserve_meta
   Just as a place holder.
2) Increase *accurate* outstanding_extents at set_bit_hooks()
   This is the real increaser.
3) Decrease *INACCURATE* outstanding_extents before returning
   This makes outstanding_extents to correct value.

For 128M BTRFS_MAX_EXTENT_SIZE, due to limitation of
__btrfs_buffered_write(), each iteration will only handle about 2MB
data.
So btrfs_dirty_pages() won't need to handle cases cross 2 extents.

Signed-off-by: Wang Xiaoguang 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/ctree.h |  2 ++
 fs/btrfs/inode.c | 63 ++--
 fs/btrfs/ioctl.c |  6 ++
 3 files changed, 61 insertions(+), 10 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 5bdd36664421..1c52d5a3544a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3175,6 +3175,8 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info 
*fs_info, int delay_iput,
   int nr);
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
  struct extent_state **cached_state, int dedupe);
+int btrfs_set_extent_defrag(struct inode *inode, u64 start, u64 end,
+   struct extent_state **cached_state);
 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
 struct btrfs_root *new_root,
 struct btrfs_root *parent_root,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5d3c6ac960fd..e644f936449b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1601,6 +1601,9 @@ static void btrfs_split_extent_hook(void *private_data,
if (!(orig->state & EXTENT_DELALLOC))
return;
 
+   if (btrfs_is_free_space_inode(BTRFS_I(inode)))
+   return;
+
size = orig->end - orig->start + 1;
if (size > BTRFS_MAX_EXTENT_SIZE) {
u32 num_extents;
@@ -1641,6 +1644,9 @@ static void btrfs_merge_extent_hook(void *private_data,
if (!(other->state & EXTENT_DELALLOC))
return;
 
+   if (btrfs_is_fr

[PATCH v14.4 14/15] btrfs: relocation: Enhance error handling to avoid BUG_ON

2017-07-12 Thread Lu Fengqi
From: Qu Wenruo 

Since the introduction of the btrfs dedupe tree, it's possible that balance can
race with dedupe disabling.

When this happens, dedupe_enabled will make btrfs_get_fs_root() return
PTR_ERR(-ENOENT).
But due to a bug in the error handling branch, when this happens
backref_cache->nr_nodes is increased but the node is neither added to
backref_cache nor is nr_nodes decreased,
causing a BUG_ON() in backref_cache_cleanup().

[ 2611.668810] [ cut here ]
[ 2611.669946] kernel BUG at
/home/sat/ktest/linux/fs/btrfs/relocation.c:243!
[ 2611.670572] invalid opcode:  [#1] SMP
[ 2611.686797] Call Trace:
[ 2611.687034]  []
btrfs_relocate_block_group+0x1b3/0x290 [btrfs]
[ 2611.687706]  []
btrfs_relocate_chunk.isra.40+0x47/0xd0 [btrfs]
[ 2611.688385]  [] btrfs_balance+0xb22/0x11e0 [btrfs]
[ 2611.688966]  [] btrfs_ioctl_balance+0x391/0x3a0
[btrfs]
[ 2611.689587]  [] btrfs_ioctl+0x1650/0x2290 [btrfs]
[ 2611.690145]  [] ? lru_cache_add+0x3a/0x80
[ 2611.690647]  [] ?
lru_cache_add_active_or_unevictable+0x4c/0xc0
[ 2611.691310]  [] ? handle_mm_fault+0xcd4/0x17f0
[ 2611.691842]  [] ? cp_new_stat+0x153/0x180
[ 2611.692342]  [] ? __vma_link_rb+0xfd/0x110
[ 2611.692842]  [] ? vma_link+0xb9/0xc0
[ 2611.693303]  [] do_vfs_ioctl+0xa1/0x5a0
[ 2611.693781]  [] ? __do_page_fault+0x1b4/0x400
[ 2611.694310]  [] SyS_ioctl+0x41/0x70
[ 2611.694758]  [] entry_SYSCALL_64_fastpath+0x12/0x71
[ 2611.695331] Code: ff 48 8b 45 bf 49 83 af a8 05 00 00 01 49 89 87 a0
05 00 00 e9 2e fd ff ff b8 f4 ff ff ff e9 e4 fb ff ff 0f 0b 0f 0b 0f 0b
0f 0b <0f> 0b 0f 0b 41 89 c6 e9 b8 fb ff ff e8 9e a6 e8 e0 4c 89 e7 44
[ 2611.697870] RIP  []
relocate_block_group+0x741/0x7a0 [btrfs]
[ 2611.698818]  RSP 

This patch will call remove_backref_node() in error handling branch, and
cache the returned -ENOENT in relocate_tree_block() and continue
balancing.

Reported-by: Satoru Takeuchi 
Signed-off-by: Qu Wenruo 
---
 fs/btrfs/relocation.c | 22 +-
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 02a4b989a577..51e7640590ec 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -889,6 +889,13 @@ struct backref_node *build_backref_tree(struct 
reloc_control *rc,
root = read_fs_root(rc->extent_root->fs_info, key.offset);
if (IS_ERR(root)) {
err = PTR_ERR(root);
+   /*
+* Don't forget to cleanup current node.
+* As it may not be added to backref_cache but nr_node
+* increased.
+* This will cause BUG_ON() in backref_cache_cleanup().
+*/
+   remove_backref_node(&rc->backref_cache, cur);
goto out;
}
 
@@ -3051,14 +3058,21 @@ int relocate_tree_blocks(struct btrfs_trans_handle 
*trans,
}
 
rb_node = rb_first(blocks);
-   while (rb_node) {
+   for (rb_node = rb_first(blocks); rb_node; rb_node = rb_next(rb_node)) {
block = rb_entry(rb_node, struct tree_block, rb_node);
 
node = build_backref_tree(rc, &block->key,
  block->level, block->bytenr);
if (IS_ERR(node)) {
+   /*
+* The root(dedupe tree yet) of the tree block is
+* going to be freed and can't be reached.
+* Just skip it and continue balancing.
+*/
+   if (PTR_ERR(node) == -ENOENT)
+   continue;
err = PTR_ERR(node);
-   goto out;
+   break;
}
 
ret = relocate_tree_block(trans, rc, node, &block->key,
@@ -3066,11 +3080,9 @@ int relocate_tree_blocks(struct btrfs_trans_handle 
*trans,
if (ret < 0) {
if (ret != -EAGAIN || rb_node == rb_first(blocks))
err = ret;
-   goto out;
+   break;
}
-   rb_node = rb_next(rb_node);
}
-out:
err = finish_pending_nodes(trans, rc, path, err);
 
 out_free_path:
-- 
2.13.2



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v14.4 05/15] btrfs: dedupe: Introduce function to initialize dedupe info

2017-07-12 Thread Lu Fengqi
From: Wang Xiaoguang 

Add generic function to initialize dedupe info.

Signed-off-by: Qu Wenruo 
Signed-off-by: Wang Xiaoguang 
Reviewed-by: Josef Bacik 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/Makefile  |   2 +-
 fs/btrfs/dedupe.c  | 181 +
 fs/btrfs/dedupe.h  |  13 +++-
 include/uapi/linux/btrfs.h |   4 +-
 4 files changed, 196 insertions(+), 4 deletions(-)
 create mode 100644 fs/btrfs/dedupe.c

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 128ce17a80b0..1b8c627cddb6 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -9,7 +9,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o 
root-tree.o dir-item.o \
   export.o tree-log.o free-space-cache.o zlib.o lzo.o \
   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
   reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
-  uuid-tree.o props.o hash.o free-space-tree.o
+  uuid-tree.o props.o hash.o free-space-tree.o dedupe.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/dedupe.c b/fs/btrfs/dedupe.c
new file mode 100644
index ..fbb2121c3736
--- /dev/null
+++ b/fs/btrfs/dedupe.c
@@ -0,0 +1,181 @@
+/*
+ * Copyright (C) 2016 Fujitsu.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#include "ctree.h"
+#include "dedupe.h"
+#include "btrfs_inode.h"
+#include "transaction.h"
+#include "delayed-ref.h"
+
+struct inmem_hash {
+   struct rb_node hash_node;
+   struct rb_node bytenr_node;
+   struct list_head lru_list;
+
+   u64 bytenr;
+   u32 num_bytes;
+
+   u8 hash[];
+};
+
+static int init_dedupe_info(struct btrfs_dedupe_info **ret_info,
+   struct btrfs_ioctl_dedupe_args *dargs)
+{
+   struct btrfs_dedupe_info *dedupe_info;
+
+   dedupe_info = kzalloc(sizeof(*dedupe_info), GFP_NOFS);
+   if (!dedupe_info)
+   return -ENOMEM;
+
+   dedupe_info->hash_algo = dargs->hash_algo;
+   dedupe_info->backend = dargs->backend;
+   dedupe_info->blocksize = dargs->blocksize;
+   dedupe_info->limit_nr = dargs->limit_nr;
+
+   /* only support SHA256 yet */
+   dedupe_info->dedupe_driver = crypto_alloc_shash("sha256", 0, 0);
+   if (IS_ERR(dedupe_info->dedupe_driver)) {
+   int ret;
+
+   ret = PTR_ERR(dedupe_info->dedupe_driver);
+   kfree(dedupe_info);
+   return ret;
+   }
+
+   dedupe_info->hash_root = RB_ROOT;
+   dedupe_info->bytenr_root = RB_ROOT;
+   dedupe_info->current_nr = 0;
+   INIT_LIST_HEAD(&dedupe_info->lru_list);
+   mutex_init(&dedupe_info->lock);
+
+   *ret_info = dedupe_info;
+   return 0;
+}
+
+/*
+ * Helper to check if parameters are valid.
+ * The first invalid field will be set to (-1), to inform the user which
+ * parameter is invalid.
+ * Except for dargs->limit_nr or dargs->limit_mem: in that case 0 will be
+ * returned to inform the user, since any limit value except 0 is valid.
+ */
+static int check_dedupe_parameter(struct btrfs_fs_info *fs_info,
+ struct btrfs_ioctl_dedupe_args *dargs)
+{
+   u64 blocksize = dargs->blocksize;
+   u64 limit_nr = dargs->limit_nr;
+   u64 limit_mem = dargs->limit_mem;
+   u16 hash_algo = dargs->hash_algo;
+   u8 backend = dargs->backend;
+
+   /*
+* Set all reserved fields to -1, allow user to detect
+* unsupported optional parameters.
+*/
+   memset(dargs->__unused, -1, sizeof(dargs->__unused));
+   if (blocksize > BTRFS_DEDUPE_BLOCKSIZE_MAX ||
+   blocksize < BTRFS_DEDUPE_BLOCKSIZE_MIN ||
+   blocksize < fs_info->sectorsize ||
+   !is_power_of_2(blocksize) ||
+   blocksize < PAGE_SIZE) {
+   dargs->blocksize = (u64)-1;
+   return -EINVAL;
+   }
+   if (hash_algo >= ARRAY_SIZE(btrfs_hash_sizes)) {
+   dargs->hash_algo = (u16)-1;
+   return -EINVAL;
+   }
+   if (backend >= BTRFS_DEDUPE_BACKEND_COUNT) {
+   dargs->backend = (u8)-1;
+   return -EINVAL;
+   }
+
+   /* Backend specific check */
+   if (backend == BTRFS_DEDUPE_BACKEND_INMEMORY) {
+   /* only one limit is accepted for enable*/
+   if (dargs->limit_nr && dargs->limit_mem) {
+   dargs->limit_nr = 0;
+   dargs->limit_mem = 0;
+  

[PATCH v14.4 09/15] btrfs: dedupe: Introduce function to search for an existing hash

2017-07-12 Thread Lu Fengqi
From: Wang Xiaoguang 

Introduce static function inmem_search() to handle the job for in-memory
hash tree.

The trick is, we must ensure the delayed ref head is not being run at
the time we search for the hash.

With inmem_search(), we can implement the btrfs_dedupe_search()
interface.

Signed-off-by: Qu Wenruo 
Signed-off-by: Wang Xiaoguang 
Reviewed-by: Josef Bacik 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/dedupe.c | 189 ++
 1 file changed, 189 insertions(+)

diff --git a/fs/btrfs/dedupe.c b/fs/btrfs/dedupe.c
index 762960edb251..890a08bf4152 100644
--- a/fs/btrfs/dedupe.c
+++ b/fs/btrfs/dedupe.c
@@ -16,6 +16,7 @@
 #include "btrfs_inode.h"
 #include "transaction.h"
 #include "delayed-ref.h"
+#include "qgroup.h"
 
 struct inmem_hash {
struct rb_node hash_node;
@@ -450,3 +451,191 @@ int btrfs_dedupe_disable(struct btrfs_fs_info *fs_info)
kfree(dedupe_info);
return 0;
 }
+
+/*
+ * Caller must ensure the corresponding ref head is not being run.
+ */
+static struct inmem_hash *
+inmem_search_hash(struct btrfs_dedupe_info *dedupe_info, u8 *hash)
+{
+   struct rb_node **p = &dedupe_info->hash_root.rb_node;
+   struct rb_node *parent = NULL;
+   struct inmem_hash *entry = NULL;
+   u16 hash_algo = dedupe_info->hash_algo;
+   int hash_len = btrfs_hash_sizes[hash_algo];
+
+   while (*p) {
+   parent = *p;
+   entry = rb_entry(parent, struct inmem_hash, hash_node);
+
+   if (memcmp(hash, entry->hash, hash_len) < 0) {
+   p = &(*p)->rb_left;
+   } else if (memcmp(hash, entry->hash, hash_len) > 0) {
+   p = &(*p)->rb_right;
+   } else {
+   /* Found, need to re-add it to LRU list head */
+   list_del(&entry->lru_list);
+   list_add(&entry->lru_list, &dedupe_info->lru_list);
+   return entry;
+   }
+   }
+   return NULL;
+}
+
+static int inmem_search(struct btrfs_dedupe_info *dedupe_info,
+   struct inode *inode, u64 file_pos,
+   struct btrfs_dedupe_hash *hash)
+{
+   int ret;
+   struct btrfs_root *root = BTRFS_I(inode)->root;
+   struct btrfs_trans_handle *trans;
+   struct btrfs_delayed_ref_root *delayed_refs;
+   struct btrfs_delayed_ref_head *head;
+   struct btrfs_delayed_ref_head *insert_head;
+   struct btrfs_delayed_data_ref *insert_dref;
+   struct btrfs_qgroup_extent_record *insert_qrecord = NULL;
+   struct inmem_hash *found_hash;
+   int free_insert = 1;
+   int qrecord_inserted = 0;
+   u64 bytenr;
+   u32 num_bytes;
+
+   insert_head = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
+   if (!insert_head)
+   return -ENOMEM;
+   insert_head->extent_op = NULL;
+   insert_dref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS);
+   if (!insert_dref) {
+   kmem_cache_free(btrfs_delayed_ref_head_cachep, insert_head);
+   return -ENOMEM;
+   }
+   if (test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) &&
+   is_fstree(root->root_key.objectid)) {
+   insert_qrecord = kmalloc(sizeof(*insert_qrecord), GFP_NOFS);
+   if (!insert_qrecord) {
+   kmem_cache_free(btrfs_delayed_ref_head_cachep,
+   insert_head);
+   kmem_cache_free(btrfs_delayed_data_ref_cachep,
+   insert_dref);
+   return -ENOMEM;
+   }
+   }
+
+   trans = btrfs_join_transaction(root);
+   if (IS_ERR(trans)) {
+   ret = PTR_ERR(trans);
+   goto free_mem;
+   }
+
+again:
+   mutex_lock(&dedupe_info->lock);
+   found_hash = inmem_search_hash(dedupe_info, hash->hash);
+   /* If we don't find a duplicated extent, just return. */
+   if (!found_hash) {
+   ret = 0;
+   goto out;
+   }
+   bytenr = found_hash->bytenr;
+   num_bytes = found_hash->num_bytes;
+
+   delayed_refs = &trans->transaction->delayed_refs;
+
+   spin_lock(&delayed_refs->lock);
+   head = btrfs_find_delayed_ref_head(&trans->transaction->delayed_refs,
+  bytenr);
+   if (!head) {
+   /*
+* We can safely insert a new delayed_ref as long as we
+* hold delayed_refs->lock.
+* Only need to use atomic inc_extent_ref()
+*/
+   btrfs_add_delayed_data_ref_locked(root->fs_info, trans,
+   insert_dref, insert_head, insert_qrecord,
+   bytenr, num_bytes, 0, root->root_key.objectid,
+   btrfs_ino(BTRFS_I(inode)), file_pos, 0,
+  

[PATCH v14.4 10/15] btrfs: dedupe: Implement btrfs_dedupe_calc_hash interface

2017-07-12 Thread Lu Fengqi
From: Wang Xiaoguang 

Unlike the in-memory or on-disk dedupe methods, only the SHA256 hash
method is supported so far, so implement the btrfs_dedupe_calc_hash()
interface using SHA256.

Signed-off-by: Qu Wenruo 
Signed-off-by: Wang Xiaoguang 
Reviewed-by: Josef Bacik 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/dedupe.c | 47 +++
 1 file changed, 47 insertions(+)

diff --git a/fs/btrfs/dedupe.c b/fs/btrfs/dedupe.c
index 890a08bf4152..6bdcff625533 100644
--- a/fs/btrfs/dedupe.c
+++ b/fs/btrfs/dedupe.c
@@ -639,3 +639,50 @@ int btrfs_dedupe_search(struct btrfs_fs_info *fs_info,
}
return ret;
 }
+
+int btrfs_dedupe_calc_hash(struct btrfs_fs_info *fs_info,
+  struct inode *inode, u64 start,
+  struct btrfs_dedupe_hash *hash)
+{
+   int i;
+   int ret;
+   struct page *p;
+   struct btrfs_dedupe_info *dedupe_info = fs_info->dedupe_info;
+   struct crypto_shash *tfm = dedupe_info->dedupe_driver;
+   u64 dedupe_bs;
+   u64 sectorsize = fs_info->sectorsize;
+
+   SHASH_DESC_ON_STACK(sdesc, tfm);
+
+   if (!fs_info->dedupe_enabled || !hash)
+   return 0;
+
+   if (WARN_ON(dedupe_info == NULL))
+   return -EINVAL;
+
+   WARN_ON(!IS_ALIGNED(start, sectorsize));
+
+   dedupe_bs = dedupe_info->blocksize;
+
+   sdesc->tfm = tfm;
+   sdesc->flags = 0;
+   ret = crypto_shash_init(sdesc);
+   if (ret)
+   return ret;
+   for (i = 0; sectorsize * i < dedupe_bs; i++) {
+   char *d;
+
+   p = find_get_page(inode->i_mapping,
+ (start >> PAGE_SHIFT) + i);
+   if (WARN_ON(!p))
+   return -ENOENT;
+   d = kmap(p);
+   ret = crypto_shash_update(sdesc, d, sectorsize);
+   kunmap(p);
+   put_page(p);
+   if (ret)
+   return ret;
+   }
+   ret = crypto_shash_final(sdesc, hash->hash);
+   return ret;
+}
-- 
2.13.2



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v14.4 07/15] btrfs: dedupe: Introduce function to remove hash from in-memory tree

2017-07-12 Thread Lu Fengqi
From: Wang Xiaoguang 

Introduce static function inmem_del() to remove a hash from the in-memory
dedupe tree.
And implement the btrfs_dedupe_del() and btrfs_dedupe_disable() interfaces.

Also for btrfs_dedupe_disable(), add new functions to wait for existing
writers and block incoming writers to eliminate all possible races.

Cc: Mark Fasheh 
Signed-off-by: Qu Wenruo 
Signed-off-by: Wang Xiaoguang 
---
 fs/btrfs/dedupe.c | 132 +++---
 1 file changed, 126 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/dedupe.c b/fs/btrfs/dedupe.c
index d2acdccef944..762960edb251 100644
--- a/fs/btrfs/dedupe.c
+++ b/fs/btrfs/dedupe.c
@@ -182,12 +182,6 @@ int btrfs_dedupe_enable(struct btrfs_fs_info *fs_info,
return ret;
 }
 
-int btrfs_dedupe_disable(struct btrfs_fs_info *fs_info)
-{
-   /* Place holder for bisect, will be implemented in later patches */
-   return 0;
-}
-
 static int inmem_insert_hash(struct rb_root *root,
 struct inmem_hash *hash, int hash_len)
 {
@@ -330,3 +324,129 @@ int btrfs_dedupe_add(struct btrfs_trans_handle *trans,
return inmem_add(dedupe_info, hash);
return -EINVAL;
 }
+
+static struct inmem_hash *
+inmem_search_bytenr(struct btrfs_dedupe_info *dedupe_info, u64 bytenr)
+{
+   struct rb_node **p = &dedupe_info->bytenr_root.rb_node;
+   struct rb_node *parent = NULL;
+   struct inmem_hash *entry = NULL;
+
+   while (*p) {
+   parent = *p;
+   entry = rb_entry(parent, struct inmem_hash, bytenr_node);
+
+   if (bytenr < entry->bytenr)
+   p = &(*p)->rb_left;
+   else if (bytenr > entry->bytenr)
+   p = &(*p)->rb_right;
+   else
+   return entry;
+   }
+
+   return NULL;
+}
+
+/* Delete a hash from in-memory dedupe tree */
+static int inmem_del(struct btrfs_dedupe_info *dedupe_info, u64 bytenr)
+{
+   struct inmem_hash *hash;
+
+   mutex_lock(&dedupe_info->lock);
+   hash = inmem_search_bytenr(dedupe_info, bytenr);
+   if (!hash) {
+   mutex_unlock(&dedupe_info->lock);
+   return 0;
+   }
+
+   __inmem_del(dedupe_info, hash);
+   mutex_unlock(&dedupe_info->lock);
+   return 0;
+}
+
+/* Remove a dedupe hash from dedupe tree */
+int btrfs_dedupe_del(struct btrfs_trans_handle *trans,
+struct btrfs_fs_info *fs_info, u64 bytenr)
+{
+   struct btrfs_dedupe_info *dedupe_info = fs_info->dedupe_info;
+
+   if (!fs_info->dedupe_enabled)
+   return 0;
+
+   if (WARN_ON(dedupe_info == NULL))
+   return -EINVAL;
+
+   if (dedupe_info->backend == BTRFS_DEDUPE_BACKEND_INMEMORY)
+   return inmem_del(dedupe_info, bytenr);
+   return -EINVAL;
+}
+
+static void inmem_destroy(struct btrfs_dedupe_info *dedupe_info)
+{
+   struct inmem_hash *entry, *tmp;
+
+   mutex_lock(&dedupe_info->lock);
+   list_for_each_entry_safe(entry, tmp, &dedupe_info->lru_list, lru_list)
+   __inmem_del(dedupe_info, entry);
+   mutex_unlock(&dedupe_info->lock);
+}
+
+/*
+ * Helper function to wait and block all incoming writers
+ *
+ * Use rw_sem introduced for freeze to wait/block writers.
+ * So during the block time, no new write will happen, so we can
+ * do something quite safe, especially helpful for dedupe disable,
+ * as it affects buffered writes.
+ */
+static void block_all_writers(struct btrfs_fs_info *fs_info)
+{
+   struct super_block *sb = fs_info->sb;
+
+   percpu_down_write(sb->s_writers.rw_sem + SB_FREEZE_WRITE - 1);
+   down_write(&sb->s_umount);
+}
+
+static void unblock_all_writers(struct btrfs_fs_info *fs_info)
+{
+   struct super_block *sb = fs_info->sb;
+
+   up_write(&sb->s_umount);
+   percpu_up_write(sb->s_writers.rw_sem + SB_FREEZE_WRITE - 1);
+}
+
+int btrfs_dedupe_disable(struct btrfs_fs_info *fs_info)
+{
+   struct btrfs_dedupe_info *dedupe_info;
+   int ret;
+
+   dedupe_info = fs_info->dedupe_info;
+
+   if (!dedupe_info)
+   return 0;
+
+   /* Don't allow disable status change in RO mount */
+   if (fs_info->sb->s_flags & MS_RDONLY)
+   return -EROFS;
+
+   /*
+* Wait for all unfinished writers and block further writers.
+* Then sync the whole fs so all current write will go through
+* dedupe, and all later write won't go through dedupe.
+*/
+   block_all_writers(fs_info);
+   ret = sync_filesystem(fs_info->sb);
+   fs_info->dedupe_enabled = 0;
+   fs_info->dedupe_info = NULL;
+   unblock_all_writers(fs_info);
+   if (ret < 0)
+   return ret;
+
+   /* now we are OK to clean up everything */
+   if (dedupe_info->backend == BTRFS_DEDUPE_BACKEND_INMEMORY)
+   inmem_destroy(dedupe_info);
+
+   crypto_free_shash(dedupe_info->dedupe_driver);
+   kfree(d

[PATCH v14.4 12/15] btrfs: dedupe: Inband in-memory only de-duplication implement

2017-07-12 Thread Lu Fengqi
From: Qu Wenruo 

Core implementation of inband de-duplication.
It reuses the async_cow_start() facility to calculate the dedupe hash,
and uses the dedupe hash to do inband de-duplication at extent level.

The work flow is as below:
1) Run delalloc range for an inode
2) Calculate hash for the delalloc range at the unit of dedupe_bs
3) For hash match(duplicated) case, just increase source extent ref
   and insert file extent.
   For hash mismatch case, go through the normal cow_file_range()
   fallback, and add hash into dedupe_tree.
   Compress for hash miss case is not supported yet.

The current implementation stores all dedupe hashes in an in-memory
rb-tree, with LRU behavior to control the limit.

Signed-off-by: Wang Xiaoguang 
Signed-off-by: Qu Wenruo 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/ctree.h   |   4 +-
 fs/btrfs/dedupe.h  |  18 +++
 fs/btrfs/extent-tree.c |  33 -
 fs/btrfs/extent_io.c   |  10 +-
 fs/btrfs/extent_io.h   |   1 +
 fs/btrfs/file.c|   3 +
 fs/btrfs/inode.c   | 329 -
 fs/btrfs/ioctl.c   |   1 +
 fs/btrfs/relocation.c  |  17 +++
 9 files changed, 350 insertions(+), 66 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 965c6615d882..337d9b7cc4a3 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -116,9 +116,11 @@ static inline u32 count_max_extents(u64 size, u64 
max_extent_size)
 enum btrfs_metadata_reserve_type {
BTRFS_RESERVE_NORMAL,
BTRFS_RESERVE_COMPRESS,
+   BTRFS_RESERVE_DEDUPE,
 };
 
-u64 btrfs_max_extent_size(enum btrfs_metadata_reserve_type reserve_type);
+u64 btrfs_max_extent_size(struct btrfs_inode *inode,
+ enum btrfs_metadata_reserve_type reserve_type);
 int inode_need_compress(struct inode *inode);
 
 struct btrfs_mapping_tree {
diff --git a/fs/btrfs/dedupe.h b/fs/btrfs/dedupe.h
index 8311ee13ca83..3a15fc2069b9 100644
--- a/fs/btrfs/dedupe.h
+++ b/fs/btrfs/dedupe.h
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include 
+#include "btrfs_inode.h"
 
 static const int btrfs_hash_sizes[] = { 32 };
 
@@ -63,6 +64,23 @@ struct btrfs_dedupe_info {
 
 struct btrfs_trans_handle;
 
+static inline u64 btrfs_dedupe_blocksize(struct btrfs_inode *inode)
+{
+   struct btrfs_fs_info *fs_info = inode->root->fs_info;
+
+   return fs_info->dedupe_info->blocksize;
+}
+
+static inline int inode_need_dedupe(struct inode *inode)
+{
+   struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+
+   if (!fs_info->dedupe_enabled)
+   return 0;
+
+   return 1;
+}
+
 static inline int btrfs_dedupe_hash_hit(struct btrfs_dedupe_hash *hash)
 {
return (hash && hash->bytenr);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 892a47b13deb..05acf9bc11df 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -38,6 +38,7 @@
 #include "math.h"
 #include "sysfs.h"
 #include "qgroup.h"
+#include "dedupe.h"
 
 #undef SCRAMBLE_DELAYED_REFS
 
@@ -2429,6 +2430,7 @@ static int run_one_delayed_ref(struct btrfs_trans_handle 
*trans,
 
if (btrfs_delayed_ref_is_head(node)) {
struct btrfs_delayed_ref_head *head;
+
/*
 * we've hit the end of the chain and we were supposed
 * to insert this extent into the tree.  But, it got
@@ -2453,6 +2455,18 @@ static int run_one_delayed_ref(struct btrfs_trans_handle 
*trans,
btrfs_pin_extent(fs_info, node->bytenr,
 node->num_bytes, 1);
if (head->is_data) {
+   /*
+* If insert_reserved is given, it means
+* a new extent is reserved, then deleted
+* in one tran, and inc/dec get merged to 0.
+*
+* In this case, we need to remove its dedupe
+* hash.
+*/
+   ret = btrfs_dedupe_del(trans, fs_info,
+  node->bytenr);
+   if (ret < 0)
+   return ret;
ret = btrfs_del_csums(trans, fs_info,
  node->bytenr,
  node->num_bytes);
@@ -5916,7 +5930,7 @@ static unsigned drop_outstanding_extent(struct 
btrfs_inode *inode,
unsigned drop_inode_space = 0;
unsigned dropped_extents = 0;
unsigned num_extents;
-   u64 max_extent_size = btrfs_max_extent_size(reserve_type);
+   u64 max_extent_size = btrfs_max_extent_size(inode, reserve_type);
 
num_extents = count_max_extents(num_bytes, max_extent_size);
ASSERT(num_extents);
@@ -5985,15 +5999,17 @@ static u64 calc_csum_metadata_size(struct btrfs_inod

RE: HELLO

2017-07-12 Thread selvi

I am Ms.Ella Golan, I am the Executive Vice President Banking Division with 
FIRST INTERNATIONAL BANK OF ISRAEL LTD (FIBI). 

I am getting in touch with you regarding an extremely important and urgent 
matter. If you would oblige me the opportunity, 

I shall provide you with details upon your response.

Faithfully,
Ms.Ella Golan
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Btrfs check reports errors, filesystem seems fine

2017-07-12 Thread Filippe LeMarchand
> Maybe something wrong in grep happened which skip "(79177" ?
Yes, my bad. Now I used grep -E "\(79177| 79177" pattern, file on GDrive 
updated.

And btrfs check --mode=lowmem gives this:

checking extents
ERROR: extent[1609877700608, 94208] referencer count mismatch (root: 260, 
owner: 61720, offset: 6742016) wanted: 2, have: 5
ERROR: extent[1630301675520, 39583744] referencer count mismatch (root: 260, 
owner: 5847554, offset: 0) wanted: 36, have: 114
ERROR: extent[1658646986752, 10551296] referencer count mismatch (root: 274, 
owner: 283675, offset: 0) wanted: 2, have: 5
ERROR: extent[1672239132672, 84381696] referencer count mismatch (root: 274, 
owner: 2521382, offset: 0) wanted: 21, have: 25
ERROR: errors found in extent allocation tree or chunk allocation
checking free space cache
checking fs roots
ERROR: root 4546 DIR_ITEM[79177 54846528] relative INODE_REF missing namelen 14 
filename deprecated.sxt filetype 1
ERROR: root 4546 INODE REF[4222342 79177] and DIR_ITEM[79177 54846528] mismatch 
namelen 14 filename deprecated.txt filetype 1
ERROR: root 5134 DIR_ITEM[79177 54846528] relative INODE_REF missing namelen 14 
filename deprecated.sxt filetype 1
ERROR: errors found in fs roots
Checking filesystem on /dev/sda2
UUID: 12c84aa3-ce65-4390-807e-a72cc8a7445e
found 153429872640 bytes used, error(s) found
total csum bytes: 121991672
total tree bytes: 1940160512
total fs tree bytes: 1683767296
total extent tree bytes: 103841792
btree space waste bytes: 310722480
file data blocks allocated: 842455031808
 referenced 159286636544

In a letter from Wednesday, July 12, 2017 10:15:18 MSK user Qu Wenruo wrote:
> Sorry for the late reply.
> 
> After investigating the dumps, I found the output is quite strange.
> 
> 1) Mismatching output.
> In "btrfs-debug-tree-grep-79177.txt" I found only 79177 as offset for 
> INODE_REF is here, while 79177 as objectid for DIR_ITEM/DIR_INDEX is not 
> here at all.
> 
> While in "btrfs-debug-tree-grep-deprecated-txt.txt" there is the expected 
> 79177 DIR_ITEM/DIR_INDEX.
> 
> Maybe something wrong in grep happened which skip "(79177" ?
> 
> 2) Mismatched hash
> The main problem I found is that, for key (79177 DIR_ITEM 54846528), the 
> number 54846528 is the hash(crc32c) of filename, and it contains 2 
> items, one for "deprecated.txt" and one for "deprecated.sxt".
> 
> But we found that 54846528 only matches the hash for "deprecated.txt", 
> not "deprecated.sxt".
> 
> I think that's the main problem.
> 
> BTW, would you please try "btrfs check --mode=lowmem" to see if lowmem 
> mode reports similar (well, output may differ) error?
> 
> If lowmem mode also reports error on such DIR_ITEM, I'm pretty sure 
> that's the problem.
> 
> However it may take some time before we can fix it in repair mode.
> 
> Thanks,
> Qu
> 
> 
> 
> 在 2017年07月04日 21:24, Filippe LeMarchand 写道:
> > Sure, here it is:
> > https://drive.google.com/drive/folders/0B1ax9Am81gx9YjJBVVA0LXRHeGc
> > 
> > In a letter dated Tuesday, July 4, 2017 16:16:36 MSK user Lu Fengqi wrote:
> >> On Mon, Jul 03, 2017 at 08:34:52AM +0800, Qu Wenruo wrote:
> >>>
> >>>
> >>> At 07/01/2017 07:59 PM, Filippe LeMarchand wrote:
>  Hello everyone.
> 
>  I have a btrfs root partition on an Intel 530 SSD, which mounts without 
>  errors and seems to work fine,
>  but `btrfs check` gives me the following output (and --repair doesn't 
>  remove the errors):
> 
>  enabling repair mode
>  Checking filesystem on /dev/sda2
>  UUID: 12c84aa3-ce65-4390-807e-a72cc8a7445e
>  checking extents
>  Fixed 0 roots.
>  checking free space cache
>  cache and super generation don't match, space cache will be invalidated
>  checking fs roots
>   unresolved ref dir 79177 index 0 namelen 14 name deprecated.sxt 
>  filetype 1 errors 6, no dir index, no inode ref
> >>>
> >>> This means that the dir whose inode number is 79177 has a child inode
> >>> pointer pointing to deprecated.sxt.
> >>>
> >>> But it doesn't have dir index and corresponding inode ref, which is 
> >>> breaking
> >>> the cross reference rule of btrfs.
> >>>
> >>> Would you please run the following command to dump needed info for us to
> >>> debug?
> >>>
> >>> # btrfs-debug-tree /dev/sda2 | grep 79177 -C 10
> >>>
> >>> and
> >>>
> >>> # btrfs-debug-tree /dev/sda2 | grep deprecated.sxt -C 10
> >>>
> >>> and
> >>>
> >>> # btrfs-debug-tree /dev/sda2 | grep deprecated.txt -C 10
> >>>
> >>>
> >>> Considering the output has both .txt and .sxt, I think that's the problem.
> >>> But such bit-flip should be detected by tree block csum.
> >>> I'm not sure what's wrong with it.
> >>>
> >>> Thanks,
> >>> Qu
> >>>
>   unresolved ref dir 79177 index 417 namelen 14 name deprecated.txt 
>  filetype 1 errors 1, no dir item
>   unresolved ref dir 79177 index 0 namelen 14 name deprecated.sxt 
>  filetype 1 errors 6, no dir index, no inode ref
>   unresolved ref dir 79177 index 417 namelen 14 name deprecated.txt 
>  filetype 1 errors 1

Re: Btrfs check reports errors, filesystem seems fine

2017-07-12 Thread Qu Wenruo



On 2017年07月12日 19:12, Filippe LeMarchand wrote:

Maybe something wrong in grep happened which skip "(79177" ?

Yes, my bad. Now I used grep -E "\(79177| 79177" pattern, file on GDrive 
updated.


It looks much better, thanks.



And btrfs check --mode=lowmem gives this:

checking extents
ERROR: extent[1609877700608, 94208] referencer count mismatch (root: 260, 
owner: 61720, offset: 6742016) wanted: 2, have: 5
ERROR: extent[1630301675520, 39583744] referencer count mismatch (root: 260, 
owner: 5847554, offset: 0) wanted: 36, have: 114
ERROR: extent[1658646986752, 10551296] referencer count mismatch (root: 274, 
owner: 283675, offset: 0) wanted: 2, have: 5
ERROR: extent[1672239132672, 84381696] referencer count mismatch (root: 274, 
owner: 2521382, offset: 0) wanted: 21, have: 25
ERROR: errors found in extent allocation tree or chunk allocation


This looks very much like an exposed lowmem mode bug.
Feel free to ignore these errors from the extent tree; they are just false 
alerts.



checking free space cache
checking fs roots
ERROR: root 4546 DIR_ITEM[79177 54846528] relative INODE_REF missing namelen 14 
filename deprecated.sxt filetype 1


The error report is much better than original mode, and that's what I need.

Now I can wipe out all other noise as we know exactly which tree and 
which DIR_ITEM/INODE_REF is causing the problem.


Would you please update the dump result with "-t 4546" passed to 
btrfs-debug-tree like:


# btrfs-debug-tree -t 4546 | grep 79177

Only "-t 4546" is added, to only dump the result of subvolume 4546.
As always, all 3 grep results (2 "deprecated" and one 79177) need to be 
updated.


And it seems that my previous assumption is still right for this case.
If it's caused by kernel, your dump would definitely help us to locate 
the problem.



ERROR: root 4546 INODE REF[4222342 79177] and DIR_ITEM[79177 54846528] mismatch 
namelen 14 filename deprecated.txt filetype 1
ERROR: root 5134 DIR_ITEM[79177 54846528] relative INODE_REF missing namelen 14 
filename deprecated.sxt filetype 1


Also for root 5134 please.

Thanks,
Qu


ERROR: errors found in fs roots
Checking filesystem on /dev/sda2
UUID: 12c84aa3-ce65-4390-807e-a72cc8a7445e
found 153429872640 bytes used, error(s) found
total csum bytes: 121991672
total tree bytes: 1940160512
total fs tree bytes: 1683767296
total extent tree bytes: 103841792
btree space waste bytes: 310722480
file data blocks allocated: 842455031808
  referenced 159286636544

In a letter from Wednesday, July 12, 2017 10:15:18 MSK user Qu Wenruo wrote:

Sorry for the late reply.

After investigating the dumps, I found the output is quite strange.

1) Mismatching output.
In "btrfs-debug-tree-grep-79177.txt" I found only 79177 as offset for
INODE_REF is here, while 79177 as objectid for DIR_ITEM/DIR_INDEX is not
here at all.

While in "btrfs-debug-tree-grep-deprecated-txt.txt" there is the expected
79177 DIR_ITEM/DIR_INDEX.

Maybe something wrong in grep happened which skip "(79177" ?

2) Mismatched hash
The main problem I found is that, for key (79177 DIR_ITEM 54846528), the
number 54846528 is the hash(crc32c) of filename, and it contains 2
items, one for "deprecated.txt" and one for "deprecated.sxt".

But we found that 54846528 only matches the hash for "deprecated.txt",
not "deprecated.sxt".

I think that's the main problem.

BTW, would you please try "btrfs check --mode=lowmem" to see if lowmem
mode reports similar (well, output may differ) error?

If lowmem mode also reports error on such DIR_ITEM, I'm pretty sure
that's the problem.

However it may take some time before we can fix it in repair mode.

Thanks,
Qu



在 2017年07月04日 21:24, Filippe LeMarchand 写道:

Sure, here it is:
https://drive.google.com/drive/folders/0B1ax9Am81gx9YjJBVVA0LXRHeGc

In a letter dated Tuesday, July 4, 2017 16:16:36 MSK user Lu Fengqi wrote:

On Mon, Jul 03, 2017 at 08:34:52AM +0800, Qu Wenruo wrote:



At 07/01/2017 07:59 PM, Filippe LeMarchand wrote:

Hello everyone.

I have a btrfs root partition on an Intel 530 SSD, which mounts without errors 
and seems to work fine,
but `btrfs check` gives me the following output (and --repair doesn't remove 
the errors):

enabling repair mode
Checking filesystem on /dev/sda2
UUID: 12c84aa3-ce65-4390-807e-a72cc8a7445e
checking extents
Fixed 0 roots.
checking free space cache
cache and super generation don't match, space cache will be invalidated
checking fs roots
unresolved ref dir 79177 index 0 namelen 14 name deprecated.sxt 
filetype 1 errors 6, no dir index, no inode ref


This means that the dir whose inode number is 79177 has a child inode
pointer pointing to deprecated.sxt.

But it doesn't have dir index and corresponding inode ref, which is breaking
the cross reference rule of btrfs.

Would you please run the following command to dump needed info for us to
debug?

# btrfs-debug-tree /dev/sda2 | grep 79177 -C 10

and

# btrfs-debug-tree /dev/sda2 | grep deprecated.sxt -C 10

and

# btrfs-debug-tree /dev/sda2 | grep deprecated.txt -C 10


Considering 

Re: Btrfs check reports errors, filesystem seems fine

2017-07-12 Thread Filippe LeMarchand
Done, files added to same GDrive folder with corresponding names.
If it matters, subvol 4546 is my root filesystem (r/w snapshot created with 
snapper rollback), and 5134 is its snapshot.

In a letter dated Wednesday, July 12, 2017 15:44:52 MSK user Qu Wenruo wrote:
> 
> On 2017年07月12日 19:12, Filippe LeMarchand wrote:
> >> Maybe something wrong in grep happened which skip "(79177" ?
> > Yes, my bad. Now I used grep -E "\(79177| 79177" pattern, file on GDrive 
> > updated.
> 
> It looks much better, thanks.
> 
> > 
> > And btrfs check --mode=lowmem gives this:
> > 
> > checking extents
> > ERROR: extent[1609877700608, 94208] referencer count mismatch (root: 260, 
> > owner: 61720, offset: 6742016) wanted: 2, have: 5
> > ERROR: extent[1630301675520, 39583744] referencer count mismatch (root: 
> > 260, owner: 5847554, offset: 0) wanted: 36, have: 114
> > ERROR: extent[1658646986752, 10551296] referencer count mismatch (root: 
> > 274, owner: 283675, offset: 0) wanted: 2, have: 5
> > ERROR: extent[1672239132672, 84381696] referencer count mismatch (root: 
> > 274, owner: 2521382, offset: 0) wanted: 21, have: 25
> > ERROR: errors found in extent allocation tree or chunk allocation
> 
> Looks much like an exposed lowmem mode bug.
> Feel free to ignore these errors from the extent tree; they are just false
> alarms.
> 
> > checking free space cache
> > checking fs roots
> > ERROR: root 4546 DIR_ITEM[79177 54846528] relative INODE_REF missing 
> > namelen 14 filename deprecated.sxt filetype 1
> 
> The error report is much better than original mode, and that's what I need.
> 
> Now I can wipe out all other noise as we know exactly which tree and 
> which DIR_ITEM/INODE_REF is causing the problem.
> 
> Would you please update the dump result with "-t 4546" passed to 
> btrfs-debug-tree like:
> 
> # btrfs-debug-tree -t 4546 | grep 79177
> 
> Only "-t 4546" is added, to only dump the result of subvolume 4546.
> As always, all 3 grep results (2 "deprecated" and one 79177) need to be 
> updated.
> 
> And it seems that my previous assumption is still right for this case.
> If it's caused by kernel, your dump would definitely help us to locate 
> the problem.
> 
> > ERROR: root 4546 INODE REF[4222342 79177] and DIR_ITEM[79177 54846528] 
> > mismatch namelen 14 filename deprecated.txt filetype 1
> > ERROR: root 5134 DIR_ITEM[79177 54846528] relative INODE_REF missing 
> > namelen 14 filename deprecated.sxt filetype 1
> 
> Also for root 5134 please.
> 
> Thanks,
> Qu
> 
> > ERROR: errors found in fs roots
> > Checking filesystem on /dev/sda2
> > UUID: 12c84aa3-ce65-4390-807e-a72cc8a7445e
> > found 153429872640 bytes used, error(s) found
> > total csum bytes: 121991672
> > total tree bytes: 1940160512
> > total fs tree bytes: 1683767296
> > total extent tree bytes: 103841792
> > btree space waste bytes: 310722480
> > file data blocks allocated: 842455031808
> >   referenced 159286636544
> > 
> > In a letter from Wednesday, July 12, 2017 10:15:18 MSK user Qu Wenruo wrote:
> >> Sorry for the late reply.
> >>
> >> After investigating the dumps, I found the output is quite strange.
> >>
> >> 1) Mismatching output.
> >> In "btrfs-debug-tree-grep-79177.txt" I found only 79177 as offset for
> >> INODE_REF is here, while 79177 as objectid for DIR_ITEM/DIR_INDEX is not
> >> here at all.
> >>
> >> While in "btrfs-debug-tree-grep-deprecated-txt.txt" there is the expected
> >> 79177 DIR_ITEM/DIR_INDEX.
> >>
> >> Maybe something wrong in grep happened which skip "(79177" ?
> >>
> >> 2) Mismatched hash
> >> The main problem I found is that, for key (79177 DIR_ITEM 54846528), the
> >> number 54846528 is the hash(crc32c) of filename, and it contains 2
> >> items, one for "deprecated.txt" and one for "deprecated.sxt".
> >>
> >> But we found that 54846528 only matches the hash for "deprecated.txt",
> >> not "deprecated.sxt".
> >>
> >> I think that's the main problem.
> >>
> >> BTW, would you please try "btrfs check --mode=lowmem" to see if lowmem
> >> mode reports similar (well, output may differ) error?
> >>
> >> If lowmem mode also reports error on such DIR_ITEM, I'm pretty sure
> >> that's the problem.
> >>
> >> However it may take some time before we can fix it in repair mode.
> >>
> >> Thanks,
> >> Qu
> >>
> >>
> >>
> >> 在 2017年07月04日 21:24, Filippe LeMarchand 写道:
> >>> Sure, here it is:
> >>> https://drive.google.com/drive/folders/0B1ax9Am81gx9YjJBVVA0LXRHeGc
> >>>
> >>> In a letter dated Tuesday, July 4, 2017 16:16:36 MSK user Lu Fengqi wrote:
>  On Mon, Jul 03, 2017 at 08:34:52AM +0800, Qu Wenruo wrote:
> >
> >
> > At 07/01/2017 07:59 PM, Filippe LeMarchand wrote:
> >> Hello everyone.
> >>
> >> I have a btrfs root partition on an Intel 530 SSD, which mounts without
> >> errors and seems to work fine,
> >> but `btrfs check` gives me the following output (and --repair doesn't
> >> remove the errors):
> >>
> >> enabling repair mode
> >> Checking filesystem on /dev/sda2
> >> UUID: 12c84aa3-ce65-4390-8

Re: [PATCH] btrfs: qgroups: Fix BUG_ON condition

2017-07-12 Thread David Sterba
On Wed, Jul 12, 2017 at 03:09:42PM +0800, Qu Wenruo wrote:
> 
> 
> 在 2017年07月12日 14:42, Nikolay Borisov 写道:
> > The current code was erroneously checking for root_level > BTRFS_MAX_LEVEL. 
> > If
> > we had a root_level of 8 then the check won't trigger and we could
> > potentially hit a buffer overflow. The correct check should be
> > root_level >= BTRFS_MAX_LEVEL
> 
> Thanks for catching this.
> 
> Reviewed-by: Qu Wenruo 
> 
> > 
> > Signed-off-by: Nikolay Borisov 
> > ---
> >   fs/btrfs/qgroup.c | 4 ++--
> >   1 file changed, 2 insertions(+), 2 deletions(-)
> > 
> > diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
> > index 4ce351efe281..3b787915ef31 100644
> > --- a/fs/btrfs/qgroup.c
> > +++ b/fs/btrfs/qgroup.c
> > @@ -1603,7 +1603,7 @@ int btrfs_qgroup_trace_subtree(struct 
> > btrfs_trans_handle *trans,
> > struct extent_buffer *eb = root_eb;
> > struct btrfs_path *path = NULL;
> >   
> > -   BUG_ON(root_level < 0 || root_level > BTRFS_MAX_LEVEL);
> > +   BUG_ON(root_level < 0 || root_level >= BTRFS_MAX_LEVEL);
> > BUG_ON(root_eb == NULL);
> >   
> > if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
> > @@ -2959,7 +2959,7 @@ static int __btrfs_qgroup_release_data(struct inode 
> > *inode,
> > if (free && reserved)
> > return qgroup_free_reserved_data(inode, reserved, start, len);
> > extent_changeset_init(&changeset);
> > -   ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
> > +   ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
> > start + len -1, EXTENT_QGROUP_RESERVED, &changeset);
> 
> I didn't recognize that it's a trailing whitespace at first.

The original code is from you, so please configure your editor to
highlight trailing whitespace. Whitespace damage happens; git am warns
about that but git cherry-pick does not.

> Nice catch.

So before we start seeing patches that fix random whitespace in
unrelated code: please don't do that.

As you wrote, it was not obvious that there was no change on the line,
this just slowed down reading the patch.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs: qgroups: Fix BUG_ON condition

2017-07-12 Thread Nikolay Borisov


On 12.07.2017 16:42, David Sterba wrote:
> On Wed, Jul 12, 2017 at 03:09:42PM +0800, Qu Wenruo wrote:
>>
>>
>> 在 2017年07月12日 14:42, Nikolay Borisov 写道:
>>> The current code was erroneously checking for root_level > BTRFS_MAX_LEVEL. 
>>> If
>>> we had a root_level of 8 then the check won't trigger and we could
>>> potentially hit a buffer overflow. The correct check should be
>>> root_level >= BTRFS_MAX_LEVEL
>>
>> Thanks for catching this.
>>
>> Reviewed-by: Qu Wenruo 
>>
>>>
>>> Signed-off-by: Nikolay Borisov 
>>> ---
>>>   fs/btrfs/qgroup.c | 4 ++--
>>>   1 file changed, 2 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
>>> index 4ce351efe281..3b787915ef31 100644
>>> --- a/fs/btrfs/qgroup.c
>>> +++ b/fs/btrfs/qgroup.c
>>> @@ -1603,7 +1603,7 @@ int btrfs_qgroup_trace_subtree(struct 
>>> btrfs_trans_handle *trans,
>>> struct extent_buffer *eb = root_eb;
>>> struct btrfs_path *path = NULL;
>>>   
>>> -   BUG_ON(root_level < 0 || root_level > BTRFS_MAX_LEVEL);
>>> +   BUG_ON(root_level < 0 || root_level >= BTRFS_MAX_LEVEL);
>>> BUG_ON(root_eb == NULL);
>>>   
>>> if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
>>> @@ -2959,7 +2959,7 @@ static int __btrfs_qgroup_release_data(struct inode 
>>> *inode,
>>> if (free && reserved)
>>> return qgroup_free_reserved_data(inode, reserved, start, len);
>>> extent_changeset_init(&changeset);
>>> -   ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
>>> +   ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
>>> start + len -1, EXTENT_QGROUP_RESERVED, &changeset);
>>
>> I didn't recognize that it's a trailing whitespace at first.
> 
> The original code is from you, so please configure your editor to
> highlight trailing whitespace. Whitespace damage happens; git am warns
> about that but git cherry-pick does not.
> 
>> Nice catch.
> 
> So before we start seeing patches that fix random whitespace in
> unrelated code: please don't do that.
> 
> As you wrote, it was not obvious that there was no change on the line,
> this just slowed down reading the patch.

I didn't intentionally fix this, I've configured vi so as to
automatically do this. There is also whitespace damage on a particular
line in extent-tree.c and every time I submit a patch that touches this
file I explicitly have to omit that particular hunk.

How would you feel about me sending a patch fixing those 2 whitespace
damages?

> 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs: qgroups: Fix BUG_ON condition

2017-07-12 Thread David Sterba
On Wed, Jul 12, 2017 at 04:50:20PM +0300, Nikolay Borisov wrote:
> > As you wrote, it was not obvious that there was no change on the line,
> > this just slowed down reading the patch.
> 
> I didn't intentionally fix this, I've configured vi so as to
> automatically do this. There is also whitespace damage on a particular
> line in extent-tree.c and every time I submit a patch that touches this
> file I explicitly have to omit that particular hunk.
> 
> How would you feel about me sending a patch fixing those 2 whitespace
> damages?

Can't you fix your editor not to auto-correct? :)
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs: qgroups: Fix BUG_ON condition

2017-07-12 Thread Qu Wenruo



On 2017年07月12日 21:42, David Sterba wrote:

On Wed, Jul 12, 2017 at 03:09:42PM +0800, Qu Wenruo wrote:



在 2017年07月12日 14:42, Nikolay Borisov 写道:

The current code was erroneously checking for root_level > BTRFS_MAX_LEVEL. If
we had a root_level of 8 then the check won't trigger and we could
potentially hit a buffer overflow. The correct check should be
root_level >= BTRFS_MAX_LEVEL


Thanks for catching this.

Reviewed-by: Qu Wenruo 



Signed-off-by: Nikolay Borisov 
---
   fs/btrfs/qgroup.c | 4 ++--
   1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 4ce351efe281..3b787915ef31 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1603,7 +1603,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle 
*trans,
struct extent_buffer *eb = root_eb;
struct btrfs_path *path = NULL;
   
-	BUG_ON(root_level < 0 || root_level > BTRFS_MAX_LEVEL);

+   BUG_ON(root_level < 0 || root_level >= BTRFS_MAX_LEVEL);
BUG_ON(root_eb == NULL);
   
   	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))

@@ -2959,7 +2959,7 @@ static int __btrfs_qgroup_release_data(struct inode 
*inode,
if (free && reserved)
return qgroup_free_reserved_data(inode, reserved, start, len);
extent_changeset_init(&changeset);
-   ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
+   ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
start + len -1, EXTENT_QGROUP_RESERVED, &changeset);


I didn't recognize that it's a trailing whitespace at first.


The original code is from you, so please configure your editor to
highlight trailing whitespace. Whitespace damage happens; git am warns
about that but git cherry-pick does not.


Well, I should make send-email run checkpatch automatically.

Sometimes I forget to run checkpatch manually, which causes such damage.
Really sorry for that.

Thanks,
Qu




Nice catch.


So before we start seeing patches that fix random whitespace in
unrelated code: please don't do that.

As you wrote, it was not obvious that there was no change on the line,
this just slowed down reading the patch.


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs: qgroups: Fix BUG_ON condition

2017-07-12 Thread Qu Wenruo



On 2017年07月12日 21:51, David Sterba wrote:

On Wed, Jul 12, 2017 at 04:50:20PM +0300, Nikolay Borisov wrote:

As you wrote, it was not obvious that there was no change on the line,
this just slowed down reading the patch.


I didn't intentionally fix this, I've configured vi so as to
automatically do this. There is also whitespace damage on a particular
line in extent-tree.c and every time I submit a patch that touches this
file I explicitly have to omit that particular hunk.

How would you feel about me sending a patch fixing those 2 whitespace
damages?


Can't you fix your editor not to auto-correct? :)


At least for trailing whitespace, there are only 4 instances in v4.12,
including this one.


Why not fix them all in one patch so we don't need to bother with them any longer?

Thanks,
Qu
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] Btrfs: report errors when checksum is not found

2017-07-12 Thread David Sterba
On Tue, Jul 11, 2017 at 02:43:16PM -0600, Liu Bo wrote:
> When btrfs fails the checksum check, it'll fill the whole page with
> "1".

One could ask, why is the page filled with 1s. Brought by commit
07157aacb1ecd394a54949 from 2007, without mentioning any justification.
I'm more inclined to revisit this behaviour and drop it eventually.

> However, if %csum_expected is 0 (which means there is no checksum), then
> for some unknown reason, we just pretend that the read is correct, so
> userspace would be confused about the dilemma that read is successful but
> getting a page with all content being "1".

Here 'no checksum' means that no checksum was found but was expected,
right? An EIO would fail the read, I don't see a reason why the page
needs to be "zeroed". The contents would be inaccessible anyway.

> This can happen due to a bug in btrfs-convert.
> 
> This fixes it by always returning errors if checksum doesn't match.

Independent of the above, this fix makes sense.

Reviewed-by: David Sterba 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs: Remove never reached code

2017-07-12 Thread David Sterba
The subject is too generic, and the changelog could say something about
the btrfs_panic semantics. Otherwise the change is ok, previously there
was just BUG_ON, then if -> btrfs_panic and after moving the kfree after
the print, the return has been added, but this is just redundant.

On Wed, Jul 12, 2017 at 09:13:58AM +0300, Nikolay Borisov wrote:
> Signed-off-by: Nikolay Borisov 
> ---
>  fs/btrfs/relocation.c | 2 --
>  1 file changed, 2 deletions(-)
> 
> diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
> index 65661d1aae4e..1a532bb72eab 100644
> --- a/fs/btrfs/relocation.c
> +++ b/fs/btrfs/relocation.c
> @@ -1308,8 +1308,6 @@ static int __must_check __add_reloc_root(struct 
> btrfs_root *root)
>   btrfs_panic(fs_info, -EEXIST,
>   "Duplicate root found for start=%llu while 
> inserting into relocation tree",
>   node->bytenr);
> - kfree(node);
> - return -EEXIST;
>   }
>  
>   list_add_tail(&root->root_list, &rc->reloc_roots);
> -- 
> 2.7.4
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs: Remove redundant code

2017-07-12 Thread David Sterba
On Wed, Jul 12, 2017 at 09:32:15AM +0300, Nikolay Borisov wrote:
> insert_into_bitmap has only one caller which always allocates the info struct
> passed. As such, remove any NULL checks for info and also remove code
> to allocate info in case it was NULL.
> 
> Signed-off-by: Nikolay Borisov 
> ---
>  fs/btrfs/free-space-cache.c | 13 +
>  1 file changed, 1 insertion(+), 12 deletions(-)
> 
> diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
> index c5e6180cdb8c..fd24fb99d6dc 100644
> --- a/fs/btrfs/free-space-cache.c
> +++ b/fs/btrfs/free-space-cache.c
> @@ -2101,7 +2101,7 @@ static int insert_into_bitmap(struct 
> btrfs_free_space_ctl *ctl,
>   goto again;
>  
>  new_bitmap:
> - if (info && info->bitmap) {
> + if (info->bitmap) {
>   add_new_bitmap(ctl, info, offset);
>   added = 1;
>   info = NULL;

What if we reach this point, go back to label "again:", come back to
this check again, then info would be NULL and dereferencing info->bitmap
would crash. Then the below code is still required.

> @@ -2109,17 +2109,6 @@ static int insert_into_bitmap(struct 
> btrfs_free_space_ctl *ctl,
>   } else {
>   spin_unlock(&ctl->tree_lock);
>  
> - /* no pre-allocated info, allocate a new one */
> - if (!info) {
> - info = kmem_cache_zalloc(btrfs_free_space_cachep,
> -  GFP_NOFS);
> - if (!info) {
> - spin_lock(&ctl->tree_lock);
> - ret = -ENOMEM;
> - goto out;
> - }
> - }
> -
>   /* allocate the bitmap */
>   info->bitmap = kzalloc(PAGE_SIZE, GFP_NOFS);
>   spin_lock(&ctl->tree_lock);
> -- 
> 2.7.4
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs: Check if tgt_device is not null

2017-07-12 Thread David Sterba
On Wed, Jul 12, 2017 at 11:39:22AM +0300, Nikolay Borisov wrote:
> btrfs_err_in_rcu indiscriminately dereferences tgt_device to access its
> ->name member in an error path. However, couple of lines below there is code
> which checks whether tgt_device is not NULL. Let's be consistent and check if
> the tgt_device is NULL before dereferencing it.

The question is if tgt_device can be really NULL. From what I see I
don't think so. The target device is the one we're writing to, so we've
used it through the entire dev-replace. Source device can be null if
we're replacing a missing device.

> Signed-off-by: Nikolay Borisov 
> ---
>  fs/btrfs/dev-replace.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
> index bee3edeea7a3..e2a16cb8f7f3 100644
> --- a/fs/btrfs/dev-replace.c
> +++ b/fs/btrfs/dev-replace.c
> @@ -541,7 +541,8 @@ static int btrfs_dev_replace_finishing(struct 
> btrfs_fs_info *fs_info,
>src_device->missing ? "" :
>rcu_str_deref(src_device->name),
>src_device->devid,
> -  rcu_str_deref(tgt_device->name), scrub_ret);
> +  tgt_device ? rcu_str_deref(tgt_device->name) :
> +  "", scrub_ret);
>   btrfs_dev_replace_unlock(dev_replace, 1);
>   mutex_unlock(&fs_info->chunk_mutex);
>   mutex_unlock(&fs_info->fs_devices->device_list_mutex);
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 08/13] btrfs: convert prelimary reference tracking to use rbtrees

2017-07-12 Thread David Sterba
On Tue, Jul 11, 2017 at 05:12:27PM -0600, Edmund Nadolski wrote:
> 
> 
> On 07/11/2017 09:15 AM, David Sterba wrote:
> > On Wed, Jun 28, 2017 at 09:57:00PM -0600, Edmund Nadolski wrote:
> >> It's been known for a while that the use of multiple lists
> >> that are periodically merged was an algorithmic problem within
> >> btrfs.  There are several workloads that don't complete in any
> >> reasonable amount of time (e.g. btrfs/130) and others that cause
> >> soft lockups.
> >>
> >> The solution is to use a pair of rbtrees that do insertion merging
> >> for both indirect and direct refs, with the former converting
> >> refs into the latter.  The result is a btrfs/130 workload that
> >> used to take several hours now takes about half of that. This
> >> runtime still isn't acceptable and a future patch will address that
> >> by moving the rbtrees higher in the stack so the lookups can be
> >> shared across multiple calls to find_parent_nodes.
> >>
> >> Signed-off-by: Edmund Nadolski 
> >> Signed-off-by: Jeff Mahoney 
> > 
> > I've bisected to this patch, the self-tests run at module load time
> > fail:
> > 
> > tests/qgroup-tests.c:272
> > 
> > 270 if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID,
> > 271 nodesize, nodesize)) {
> > 272 test_msg("Qgroup counts didn't match expected 
> > values\n");
> > 273 return -EINVAL;
> > 274 }
> > 
> >  245 int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 
> > qgroupid,
> >  246u64 rfer, u64 excl)
> >  247 {
> >  248 struct btrfs_qgroup *qgroup;
> >  249
> >  250 qgroup = find_qgroup_rb(fs_info, qgroupid);
> >  251 if (!qgroup)
> >  252 return -EINVAL;
> >  253 if (qgroup->rfer != rfer || qgroup->excl != excl)
> >  254 return -EINVAL;
> >  255 return 0;
> >  256 }
> > 
> > the second if fails, with 0 != 4096 || 0 != 4096
> > 
> > Tested branch was current for-next-test (top commit
> > 8d73f8348287a3d3be10795f45d313f63cdcd72c), with
> > CONFIG_BTRFS_FS_RUN_SANITY_TESTS=y
> 
> This looks like a consequence of an existing check in 
> __resolve_indirect_ref():
> 
>   if (btrfs_is_testing(fs_info)) {
>   srcu_read_unlock(&fs_info->subvol_srcu, index);
>   ret = -ENOENT;
>   goto out;
>   }
> 
> The existing code simply leaves the ref on the pref list, to be picked up 
> later
> in find_parent_nodes(), which will ulist_add() an entry onto the roots list 
> for
> it.  The patch otoh when it sees -ENOENT just frees the ref so no entry is
> ever added to the ulist.
> 
> The patch can be fixed to behave similarly to the existing code by
> inserting the ref into the direct tree instead of freeing it.  This seems
> a bit odd since technically the ref isn't actually 'resolved'. Considering
> that this code path is really just a special case for the sanity check when
> the fs_info is in a BTRFS_FS_STATE_DUMMY_FS_INFO state, perhaps that's not
> too great a concern. Thoughts?

Yeah, this has been introduced by d9ee522ba3ab51b7e3c6d and it wants to
take some shortcuts for the self-tests. I'm concerned because the module
does not load when the self-tests are enabled.

So at this moment I'd take simpler approach and work around it so we can
continue testing and then fix it properly (either populate the trees or
add more exceptions for the self-tests).
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v4 0/6] Chunk level degradable check

2017-07-12 Thread David Sterba
On Wed, Jun 28, 2017 at 01:43:29PM +0800, Qu Wenruo wrote:
> The patchset can be fetched from my github repo:
> https://github.com/adam900710/linux/tree/degradable
> 
> The patchset is based on David's for-4.13-part1 branch.
> 
> Btrfs currently uses num_tolerated_disk_barrier_failures to do global
> check for tolerated missing device.
> 
> Although the one-size-fits-all solution is quite safe, it's too strict
> if data and metadata have different duplication levels.
> 
> For example, if one uses Single data and RAID1 metadata on 2 disks, it
> means any missing device will make it impossible to mount the fs
> degraded.
> 
> But in fact, sometimes all single chunks may be on the existing
> device, and in that case we should allow it to be mounted rw degraded.
> 
> Such case can be easily reproduced using the following script:
>  # mkfs.btrfs -f -m raid1 -d single /dev/sdb /dev/sdc
>  # wipefs -f /dev/sdc
>  # mount /dev/sdb -o degraded,rw

I've seen wider testing coverage in replies to the previous patchset
iterations. Can we have that added to fstests?

I'm going to add this patchset to the devel queue (ie. not a separate
for-next branch anymore).
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs: Prevent possible ERR_PTR() dereference

2017-07-12 Thread David Sterba
On Tue, Jul 11, 2017 at 10:29:49PM +0300, Nikolay Borisov wrote:
> 
> 
> On 11.07.2017 20:24, David Sterba wrote:
> > On Tue, Jul 11, 2017 at 04:55:51PM +0300, Nikolay Borisov wrote:
> >> In btrfs_full_stripe_len/btrfs_is_parity_mirror we have similar code which
> >> gets the chunk map for a particular range via get_chunk_map. However,
> >> get_chunk_map can return an ERR_PTR value and while the 2 callers do catch
> >> this with a WARN_ON they then proceed to indiscriminately dereference the
> >> extent map. This of course leads to a crash. Fix the offenders by making 
> >> the
> >> dereference conditional on IS_ERR.
> > 
> > While the code makes it better, the whole callchain should be fixed. The
> > WARN_ON used to be a BUG_ON and the error handling was absent, and still
> > is. Although it's unlikely to see the warnings from that, I'd rather see
> > it fixed properly. The direct caller of btrfs_full_stripe_len will be
> > able to handle it.
> 
> What should be returned in case we can't find the chunk_map -EINVAL ?

Returning what get_chunk_map returns seems ok in btrfs_full_stripe_len
(compared to other callers of get_chunk_map that may interpret a failure in a
different way).

But EINVAL is IMO wrong as it's more like the ENOENT when the mapping is
missing or EUCLEAN when the mapping looks incorrect.

The failure of btrfs_full_stripe_len in btrfs_create_block_group_cache
could be best handled if we return ERR_PTR instead of assuming ENOMEM in
all of its callers (btrfs_read_block_groups and btrfs_make_block_group).
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 00/19] fs_info refactor part 2 (disk-io.h and volumes.h based)

2017-07-12 Thread David Sterba
On Tue, Jul 11, 2017 at 05:11:16PM +0800, Qu Wenruo wrote:
> Any comment?

Patches applied, sorry for the delay.

> 在 2017年06月13日 17:19, Qu Wenruo 写道:
> > This is the part 2 patchset to refactor btrfs_root usage to
> > btrfs_fs_info.
> >
> > The most obvious advantage is to make function calls a little shorter
> > and less confusing.
> > Function calls like btrfs_alloc_data_chunk() need the caller to pass a
> > btrfs_root parameter, while the parameter should always be
> > fs_info->extent_root; grabbing it inside that function is much safer
> > than allowing the user to pass in a random root.
> >
> > Another advantage is to make it easier to sync some headers with kernel.
> > In fact, when I'm not sure if I should convert one function, I normally
> > check kernel code to find the parameter of kernel equivalent.
> > So when we are going to sync (part of) the headers with kernel, it will
> > reduce the work.
> >
> > Part 2 focuses on the exported functions, mainly in disk-io.h and
> > volumes.h.
> > Now most tree blocks/extents read/write, device scan/read, chunk
> > allocation/initialization functions are all using fs_info.
> >
> > I'm quite sure there are a lot of remaining functions to be refactored,
> > but I prefer to do it step by step.

Yes, this would be better.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [proposal] making filesystem tools more machine friendly

2017-07-12 Thread Richard W.M. Jones

libguestfs could really use structured output from more of the command
line tools.  Particularly:

 - all the ext4 tools
 - all the xfs tools
 - all the btrfs tools
 - parted

and more.  See also:

  https://github.com/libguestfs/libguestfs/tree/master/daemon

A dbus service would not be useful.

Rich.

-- 
Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones
Read my programming and virtualization blog: http://rwmj.wordpress.com
virt-builder quickly builds VMs from scratch
http://libguestfs.org/virt-builder.1.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs: Check if tgt_device is not null

2017-07-12 Thread Nikolay Borisov


On 12.07.2017 18:03, David Sterba wrote:
> On Wed, Jul 12, 2017 at 11:39:22AM +0300, Nikolay Borisov wrote:
>> btrfs_err_in_rcu indiscriminately dereferences tgt_device to access its
>> ->name member in an error path. However, couple of lines below there is code
>> which checks whether tgt_device is not NULL. Let's be consistent and check if
>> the tgt_device is NULL before dereferencing it.
> 
> The question is if tgt_device can be really NULL. From what I see I
> don't think so. The target device is the one we're writing to, so we've
> used it through the entire dev-replace. Source device can be null if
> we're replacing a missing device.

So in this case the tgt_device NULL check a couple of lines below can be
removed.

> 
>> Signed-off-by: Nikolay Borisov 
>> ---
>>  fs/btrfs/dev-replace.c | 3 ++-
>>  1 file changed, 2 insertions(+), 1 deletion(-)
>>
>> diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
>> index bee3edeea7a3..e2a16cb8f7f3 100644
>> --- a/fs/btrfs/dev-replace.c
>> +++ b/fs/btrfs/dev-replace.c
>> @@ -541,7 +541,8 @@ static int btrfs_dev_replace_finishing(struct 
>> btrfs_fs_info *fs_info,
>>   src_device->missing ? "" :
>>   rcu_str_deref(src_device->name),
>>   src_device->devid,
>> - rcu_str_deref(tgt_device->name), scrub_ret);
>> + tgt_device ? rcu_str_deref(tgt_device->name) :
>> + "", scrub_ret);
>>  btrfs_dev_replace_unlock(dev_replace, 1);
>>  mutex_unlock(&fs_info->chunk_mutex);
>>  mutex_unlock(&fs_info->fs_devices->device_list_mutex);
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] Btrfs: report errors when checksum is not found

2017-07-12 Thread Liu Bo
On Wed, Jul 12, 2017 at 04:40:36PM +0200, David Sterba wrote:
> On Tue, Jul 11, 2017 at 02:43:16PM -0600, Liu Bo wrote:
> > When btrfs fails the checksum check, it'll fill the whole page with
> > "1".
> 
> One could ask, why is the page filled with 1s. Brought by commit
> 07157aacb1ecd394a54949 from 2007, without mentioning any justification.
> I'm more inclined to revisit this behaviour and drop it eventually.
> 
> > However, if %csum_expected is 0 (which means there is no checksum), then
> > for some unknown reason, we just pretend that the read is correct, so
> > userspace would be confused about the dilemma that read is successful but
> > getting a page with all content being "1".
> 
> Here 'no checksum' means that no checksum was found but was expected,
> right?

Yes, no checksum was found.

> An EIO would fail the read, I don't see a reason why the page
> needs to be "zeroed". The contents would be inaccessible anyway.
>

Right, resetting page's content is needed when we return 0 instead of
-EIO.  I guess it was introduced for testing.  So yes, I'm glad to
remove that part, will do in a v2.

> > This can happen due to a bug in btrfs-convert.
> > 
> > This fixes it by always returning errors if checksum doesn't match.
> 
> Independent of the above, this fix makes sense.
> 
> Reviewed-by: David Sterba 

Thank you for the comments.

Thanks,

-liubo
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Lock between userspace and btrfs-cleaner on extent_buffer

2017-07-12 Thread Sargun Dhillon
On Thu, Jun 29, 2017 at 11:49 AM, Jeff Mahoney  wrote:
> On 6/29/17 2:46 PM, Sargun Dhillon wrote:
>> On Thu, Jun 29, 2017 at 11:42 AM, Jeff Mahoney  wrote:
>>> On 6/28/17 6:02 PM, Sargun Dhillon wrote:
 On Wed, Jun 28, 2017 at 2:55 PM, Jeff Mahoney  wrote:
> On 6/27/17 5:12 PM, Jeff Mahoney wrote:
>> On 6/13/17 9:05 PM, Sargun Dhillon wrote:
>>> On Thu, Jun 8, 2017 at 11:34 AM, Sargun Dhillon  
>>> wrote:
 I have a deadlock caught in the wild between two processes --
 btrfs-cleaner, and userspace process (Docker). Here, you can see both
 of the backtraces. btrfs-cleaner is trying to get a lock on
 9859d360caf0, which is owned by Docker's pid. Docker on the other
 hand is trying to get a lock on 9859dc0f0578, which is owned by
 btrfs-cleaner's Pid.

 This is on vanilla 4.11.3 without much workload. The background
 workload was basically starting and stopping Docker with a medium
 sized image like ubuntu:latest with sleep 5. So, snapshot creation,
 destruction. And there's some stuff that's logging to btrfs.
>>
>> Hi Sargun -
>>
>> We hit this bug in testing last week.  I have a patch that I've written
>> up and have run under your reproducer for a while.  So far it hasn't
>> hit.  I'll post it shortly and CC you.  It does depend lightly on the
>> rbtree code, though.  Since we'll want this fix for -stable, I'll write
>> up a version for that too.
>
> After thinking about it a bit more, I think my patch just happens to
> make it less likely to hit but would ultimately degrade into a livelock
> where it was a deadlock previously.  I was just trylocking and
> requeuing, so both threads are allowed to do other work and maybe even
> finish but ultimately if there's a true deadlock it'll hit anyway.
>
> -Jeff
>
 Does it make sense to spend the time on making it so that
 btrfs-cleaner has abortable operations, and the ability to abort if
 the root deletion either takes too long, or if it receives a signal?
 Although, such a case may result in a livelock, to me it seems like a
 lot less bad than deadlocking.
>>>
>>>
>>> For now, reverting:
>>>
>>> commit fb235dc06fac9eaa4408ade9c8b20d45d63c89b7
>>> Author: Qu Wenruo 
>>> Date:   Wed Feb 15 10:43:03 2017 +0800
>>>
>>> btrfs: qgroup: Move half of the qgroup accounting time out of commit
>>> trans
>>>
>>> ... should do the trick.
>>>
>>> -Jeff
>>>
>> I thought it was this as well, but we still saw lock-ups even after
>> reverting this change on 4.11. They were rarer, but we still saw
>> issues with locked up btrfs-transactions. It may have been due to a
>> different issue. If you want, I can try to revert this, and run a
>> workload on it to see where the exact lock-up is?
>
> Yeah, I'd be interested in those results.
>
> -Jeff
>
>
> --
> Jeff Mahoney
> SUSE Labs
>
Thanks Jeff,
Upon further analysis, it looks like rolling this back fixed the
btrfs-cleaner lock up, but now we're seeing a different hard lockup,
where num_writers on the current transaction gets stuck at 2.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] btrfs-progs: Enable ThreadSanitizer, using D=tsan.

2017-07-12 Thread Adam Buchbinder
Tested with clang-3.9.

Signed-off-by: Adam Buchbinder 
---
 Makefile | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/Makefile b/Makefile
index 81598df..8948301 100644
--- a/Makefile
+++ b/Makefile
@@ -17,6 +17,7 @@
 #  abort   - call abort() on first error (dumps core)
 #  all - shortcut for all of the above
 #  asan- enable address sanitizer compiler feature
+#  tsan- enable thread sanitizer compiler feature
 #  ubsan   - undefined behaviour sanitizer compiler feature
 #  bcheck  - extended build checks
 #   W=123  build with warnings (default: off)
@@ -157,6 +158,11 @@ ifneq (,$(findstring asan,$(D)))
   DEBUG_CFLAGS_INTERNAL += -fsanitize=address
 endif
 
+ifneq (,$(findstring tsan,$(D)))
+  DEBUG_CFLAGS_INTERNAL += -fsanitize=thread -fPIE
+  LD_FLAGS += -fsanitize=thread -ltsan -pie
+endif
+
 ifneq (,$(findstring ubsan,$(D)))
   DEBUG_CFLAGS_INTERNAL += -fsanitize=undefined
 endif
-- 
2.13.2.932.g7449e964c-goog

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] btrfs-progs: Fix data races in btrfs-image.

2017-07-12 Thread Adam Buchbinder
Making the code data-race safe requires that reads *and* writes
happen under a mutex lock, if any of the accesses are writes. See
Dmitri Vyukov, "Benign data races: what could possibly go wrong?"
for more details.

The fix here was to put most of the main loop of restore_worker
under a mutex lock.

This race was detected using fsck-tests/012-leaf-corruption.

==
WARNING: ThreadSanitizer: data race
  Write of size 4 by main thread:
#0 add_cluster btrfs-progs/image/main.c:1931
#1 restore_metadump btrfs-progs/image/main.c:2566
#2 main btrfs-progs/image/main.c:2859

  Previous read of size 4 by thread T6:
#0 restore_worker btrfs-progs/image/main.c:1720

  Location is stack of main thread.

  Thread T6 (running) created by main thread at:
#0 pthread_create 
#1 mdrestore_init btrfs-progs/image/main.c:1868
#2 restore_metadump btrfs-progs/image/main.c:2534
#3 main btrfs-progs/image/main.c:2859

SUMMARY: ThreadSanitizer: data race btrfs-progs/image/main.c:1931 in
add_cluster

Signed-off-by: Adam Buchbinder 
---
 image/main.c | 16 +---
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/image/main.c b/image/main.c
index 1eca414..a5d01d8 100644
--- a/image/main.c
+++ b/image/main.c
@@ -1715,14 +1715,15 @@ static void *restore_worker(void *data)
}
async = list_entry(mdres->list.next, struct async_work, list);
list_del_init(&async->list);
-   pthread_mutex_unlock(&mdres->mutex);
 
if (mdres->compress_method == COMPRESS_ZLIB) {
size = compress_size; 
+   pthread_mutex_unlock(&mdres->mutex);
ret = uncompress(buffer, (unsigned long *)&size,
 async->buffer, async->bufsize);
+   pthread_mutex_lock(&mdres->mutex);
if (ret != Z_OK) {
-   error("decompressiion failed with %d", ret);
+   error("decompression failed with %d", ret);
err = -EIO;
}
outbuf = buffer;
@@ -1798,7 +1799,6 @@ error:
if (!mdres->multi_devices && async->start == 
BTRFS_SUPER_INFO_OFFSET)
write_backup_supers(outfd, outbuf);
 
-   pthread_mutex_lock(&mdres->mutex);
if (err && !mdres->error)
mdres->error = err;
mdres->num_items--;
@@ -1899,7 +1899,7 @@ static int fill_mdres_info(struct mdrestore_struct *mdres,
ret = uncompress(buffer, (unsigned long *)&size,
 async->buffer, async->bufsize);
if (ret != Z_OK) {
-   error("decompressiion failed with %d", ret);
+   error("decompression failed with %d", ret);
free(buffer);
return -EIO;
}
@@ -1928,7 +1928,9 @@ static int add_cluster(struct meta_cluster *cluster,
u32 i, nritems;
int ret;
 
+   pthread_mutex_lock(&mdres->mutex);
mdres->compress_method = header->compress;
+   pthread_mutex_unlock(&mdres->mutex);
 
bytenr = le64_to_cpu(header->bytenr) + BLOCK_SIZE;
nritems = le32_to_cpu(header->nritems);
@@ -2171,7 +2173,7 @@ static int search_for_chunk_blocks(struct 
mdrestore_struct *mdres,
continue;
}
error(
-   "unknown state after reading cluster at %llu, probably crrupted data",
+   "unknown state after reading cluster at %llu, probably corrupted data",
cluster_bytenr);
ret = -EIO;
break;
@@ -2220,7 +2222,7 @@ static int search_for_chunk_blocks(struct 
mdrestore_struct *mdres,
 (unsigned long *)&size, tmp,
 bufsize);
if (ret != Z_OK) {
-   error("decompressiion failed with %d",
+   error("decompression failed with %d",
ret);
ret = -EIO;
break;
@@ -2340,7 +2342,7 @@ static int build_chunk_tree(struct mdrestore_struct 
*mdres,
ret = uncompress(tmp, (unsigned long *)&size,
 buffer, le32_to_cpu(item->size));
if (ret != Z_OK) {
-   error("decompressiion failed with %d", ret);
+   error("decompression failed with %d", ret);
free(buffer);
free(tmp);
return -EIO;
-- 
2.13.2.932.g7449e964c-goog

--
To uns

[PATCH] btrfs-progs: Use '-t btrfs' mount option in tests.

2017-07-12 Thread Adam Buchbinder
Without it, mount (at least from util-linux 2.20.1) tries (and
fails) to mount some filesystems as NTFS.

Signed-off-by: Adam Buchbinder 
---
 tests/common | 2 +-
 tests/fsck-tests/012-leaf-corruption/test.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/common b/tests/common
index 7ad436e..bed6009 100644
--- a/tests/common
+++ b/tests/common
@@ -387,7 +387,7 @@ run_check_mount_test_dev()
_fail "Invalid \$TEST_MNT: $TEST_MNT"
}
 
-   run_check $SUDO_HELPER mount $loop_opt "$@" "$TEST_DEV" "$TEST_MNT"
+   run_check $SUDO_HELPER mount -t btrfs $loop_opt "$@" "$TEST_DEV" 
"$TEST_MNT"
 }
 
 run_check_umount_test_dev()
diff --git a/tests/fsck-tests/012-leaf-corruption/test.sh 
b/tests/fsck-tests/012-leaf-corruption/test.sh
index 43b0e6d..fc10a4f 100755
--- a/tests/fsck-tests/012-leaf-corruption/test.sh
+++ b/tests/fsck-tests/012-leaf-corruption/test.sh
@@ -90,7 +90,7 @@ check_inode()
 check_leaf_corrupt_no_data_ext()
 {
image=$1
-   $SUDO_HELPER mount -o loop "$image" -o ro "$TEST_MNT"
+   $SUDO_HELPER mount -o loop -t btrfs "$image" -o ro "$TEST_MNT"
 
i=0
while [ $i -lt ${#leaf_no_data_ext_list[@]} ]; do
-- 
2.13.2.932.g7449e964c-goog

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] btrfs-progs: Fix data race in btrfs-convert.

2017-07-12 Thread Adam Buchbinder
The status display was reading the state while the task was updating
it. Use a mutex to prevent the race.

This race was detected using ThreadSanitizer and
misc-tests/005-convert-progress-thread-crash.

==
WARNING: ThreadSanitizer: data race
  Write of size 8 by main thread:
#0 ext2_copy_inodes btrfs-progs/convert/source-ext2.c:853
#1 copy_inodes btrfs-progs/convert/main.c:145
#2 do_convert btrfs-progs/convert/main.c:1297
#3 main btrfs-progs/convert/main.c:1924

  Previous read of size 8 by thread T1:
#0 print_copied_inodes btrfs-progs/convert/main.c:124

  Location is stack of main thread.

  Thread T1 (running) created by main thread at:
#0 pthread_create 
#1 task_start btrfs-progs/task-utils.c:50
#2 do_convert btrfs-progs/convert/main.c:1295
#3 main btrfs-progs/convert/main.c:1924

SUMMARY: ThreadSanitizer: data race
btrfs-progs/convert/source-ext2.c:853 in ext2_copy_inodes

Signed-off-by: Adam Buchbinder 
---
 convert/main.c| 12 ++--
 convert/source-ext2.c |  3 +++
 convert/source-fs.h   |  3 +++
 3 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/convert/main.c b/convert/main.c
index c56382e..c9c1fd4 100644
--- a/convert/main.c
+++ b/convert/main.c
@@ -88,6 +88,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 #include "ctree.h"
@@ -119,10 +120,12 @@ static void *print_copied_inodes(void *p)
task_period_start(priv->info, 1000 /* 1s */);
while (1) {
count++;
+   pthread_mutex_lock(&priv->mutex);
printf("copy inodes [%c] [%10llu/%10llu]\r",
   work_indicator[count % 4],
-  (unsigned long long)priv->cur_copy_inodes,
-  (unsigned long long)priv->max_copy_inodes);
+  (u64)priv->cur_copy_inodes,
+  (u64)priv->max_copy_inodes);
+   pthread_mutex_unlock(&priv->mutex);
fflush(stdout);
task_period_wait(priv->info);
}
@@ -1286,6 +1289,11 @@ static int do_convert(const char *devname, u32 
convert_flags, u32 nodesize,
}
 
printf("creating btrfs metadata");
+   ret = pthread_mutex_init(&ctx.mutex, NULL);
+   if (ret) {
+   error("failed to init mutex: %d", ret);
+   goto fail;
+   }
ctx.max_copy_inodes = (cctx.inodes_count - cctx.free_inodes_count);
ctx.cur_copy_inodes = 0;
 
diff --git a/convert/source-ext2.c b/convert/source-ext2.c
index 38c3cd3..4bce4b3 100644
--- a/convert/source-ext2.c
+++ b/convert/source-ext2.c
@@ -18,6 +18,7 @@
 
 #include "kerncompat.h"
 #include 
+#include 
 #include "disk-io.h"
 #include "transaction.h"
 #include "utils.h"
@@ -850,7 +851,9 @@ static int ext2_copy_inodes(struct btrfs_convert_context 
*cctx,
ret = ext2_copy_single_inode(trans, root,
objectid, ext2_fs, ext2_ino,
&ext2_inode, convert_flags);
+   pthread_mutex_lock(&p->mutex);
p->cur_copy_inodes++;
+   pthread_mutex_unlock(&p->mutex);
if (ret)
return ret;
if (trans->blocks_used >= 4096) {
diff --git a/convert/source-fs.h b/convert/source-fs.h
index ca32d15..7ae6edd 100644
--- a/convert/source-fs.h
+++ b/convert/source-fs.h
@@ -17,6 +17,8 @@
 #ifndef __BTRFS_CONVERT_SOURCE_FS_H__
 #define __BTRFS_CONVERT_SOURCE_FS_H__
 
+#include 
+
 #include "kerncompat.h"
 
 #define CONV_IMAGE_SUBVOL_OBJECTID BTRFS_FIRST_FREE_OBJECTID
@@ -37,6 +39,7 @@ extern const struct simple_range btrfs_reserved_ranges[3];
 struct task_info;
 
 struct task_ctx {
+   pthread_mutex_t mutex;
u64 max_copy_inodes;
u64 cur_copy_inodes;
struct task_info *info;
-- 
2.13.2.932.g7449e964c-goog

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs-progs: Enable ThreadSanitizer, using D=tsan.

2017-07-12 Thread David Sterba
On Wed, Jul 12, 2017 at 01:04:49PM -0700, Adam Buchbinder wrote:
> Tested with clang-3.9.
> 
> Signed-off-by: Adam Buchbinder 
> ---
>  Makefile | 6 ++
>  1 file changed, 6 insertions(+)
> 
> diff --git a/Makefile b/Makefile
> index 81598df..8948301 100644
> --- a/Makefile
> +++ b/Makefile
> @@ -17,6 +17,7 @@
>  #  abort   - call abort() on first error (dumps core)
>  #  all - shortcut for all of the above
>  #  asan- enable address sanitizer compiler feature
> +#  tsan- enable thread sanitizer compiler feature
>  #  ubsan   - undefined behaviour sanitizer compiler feature
>  #  bcheck  - extended build checks
>  #   W=123  build with warnings (default: off)
> @@ -157,6 +158,11 @@ ifneq (,$(findstring asan,$(D)))
>DEBUG_CFLAGS_INTERNAL += -fsanitize=address
>  endif
>  
> +ifneq (,$(findstring tsan,$(D)))
> +  DEBUG_CFLAGS_INTERNAL += -fsanitize=thread -fPIE
> +  LD_FLAGS += -fsanitize=thread -ltsan -pie

Why do you need to set PIE here? Is it necessary for tsan?

> +endif
> +
>  ifneq (,$(findstring ubsan,$(D)))
>DEBUG_CFLAGS_INTERNAL += -fsanitize=undefined
>  endif
> -- 
> 2.13.2.932.g7449e964c-goog
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs-progs: Use '-t btrfs' mount option in tests.

2017-07-12 Thread David Sterba
On Wed, Jul 12, 2017 at 01:05:22PM -0700, Adam Buchbinder wrote:
> Without it, mount (at least from util-linux 2.20.1) tries (and
> fails) to mount some filesystems as NTFS.
> 
> Signed-off-by: Adam Buchbinder 

Applied, thanks.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs-progs: Fix data race in btrfs-convert.

2017-07-12 Thread David Sterba
On Wed, Jul 12, 2017 at 01:05:38PM -0700, Adam Buchbinder wrote:
> The status display was reading the state while the task was updating
> it. Use a mutex to prevent the race.
> 
> This race was detected using ThreadSanitizer and
> misc-tests/005-convert-progress-thread-crash.
> 
> ==
> WARNING: ThreadSanitizer: data race
>   Write of size 8 by main thread:
> #0 ext2_copy_inodes btrfs-progs/convert/source-ext2.c:853
> #1 copy_inodes btrfs-progs/convert/main.c:145
> #2 do_convert btrfs-progs/convert/main.c:1297
> #3 main btrfs-progs/convert/main.c:1924
> 
>   Previous read of size 8 by thread T1:
> #0 print_copied_inodes btrfs-progs/convert/main.c:124
> 
>   Location is stack of main thread.
> 
>   Thread T1 (running) created by main thread at:
> #0 pthread_create 
> #1 task_start btrfs-progs/task-utils.c:50
> #2 do_convert btrfs-progs/convert/main.c:1295
> #3 main btrfs-progs/convert/main.c:1924
> 
> SUMMARY: ThreadSanitizer: data race
> btrfs-progs/convert/source-ext2.c:853 in ext2_copy_inodes
> 
> Signed-off-by: Adam Buchbinder 

Thanks, patch applied, with some minor modifications.

> ---
>  convert/main.c| 12 ++--
>  convert/source-ext2.c |  3 +++
>  convert/source-fs.h   |  3 +++
>  3 files changed, 16 insertions(+), 2 deletions(-)
> 
> diff --git a/convert/main.c b/convert/main.c
> index c56382e..c9c1fd4 100644
> --- a/convert/main.c
> +++ b/convert/main.c
> @@ -88,6 +88,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  
>  #include "ctree.h"
> @@ -119,10 +120,12 @@ static void *print_copied_inodes(void *p)
>   task_period_start(priv->info, 1000 /* 1s */);
>   while (1) {
>   count++;
> + pthread_mutex_lock(&priv->mutex);
>   printf("copy inodes [%c] [%10llu/%10llu]\r",
>  work_indicator[count % 4],
> -(unsigned long long)priv->cur_copy_inodes,
> -(unsigned long long)priv->max_copy_inodes);
> +(u64)priv->cur_copy_inodes,
> +(u64)priv->max_copy_inodes);

This needs to be unsigned long long to match %llu. We know that u64 will
always be equivalent, so there should not be any problem. With u64, the
compiler tends to warn. The cast is not necessary in kernel code, but I
don't know what magic has caused that, so we still use the ULL type cast
in progs.

> + pthread_mutex_unlock(&priv->mutex);
>   fflush(stdout);
>   task_period_wait(priv->info);
>   }
> @@ -1286,6 +1289,11 @@ static int do_convert(const char *devname, u32 
> convert_flags, u32 nodesize,
>   }
>  
>   printf("creating btrfs metadata");
> + ret = pthread_mutex_init(&ctx.mutex, NULL);
> + if (ret) {
> + error("failed to init mutex: %d", ret);
> + goto fail;
> + }
>   ctx.max_copy_inodes = (cctx.inodes_count - cctx.free_inodes_count);
>   ctx.cur_copy_inodes = 0;
>  
> diff --git a/convert/source-ext2.c b/convert/source-ext2.c
> index 38c3cd3..4bce4b3 100644
> --- a/convert/source-ext2.c
> +++ b/convert/source-ext2.c
> @@ -18,6 +18,7 @@
>  
>  #include "kerncompat.h"
>  #include 
> +#include 
>  #include "disk-io.h"
>  #include "transaction.h"
>  #include "utils.h"
> @@ -850,7 +851,9 @@ static int ext2_copy_inodes(struct btrfs_convert_context 
> *cctx,
>   ret = ext2_copy_single_inode(trans, root,
>   objectid, ext2_fs, ext2_ino,
>   &ext2_inode, convert_flags);
> + pthread_mutex_lock(&p->mutex);
>   p->cur_copy_inodes++;
> + pthread_mutex_unlock(&p->mutex);
>   if (ret)
>   return ret;
>   if (trans->blocks_used >= 4096) {
> diff --git a/convert/source-fs.h b/convert/source-fs.h
> index ca32d15..7ae6edd 100644
> --- a/convert/source-fs.h
> +++ b/convert/source-fs.h
> @@ -17,6 +17,8 @@
>  #ifndef __BTRFS_CONVERT_SOURCE_FS_H__
>  #define __BTRFS_CONVERT_SOURCE_FS_H__
>  
> +#include 
> +
>  #include "kerncompat.h"

This is really minor, kerncompat should be always included first due to
potential clashes in type definitions.

>  #define CONV_IMAGE_SUBVOL_OBJECTID BTRFS_FIRST_FREE_OBJECTID
> @@ -37,6 +39,7 @@ extern const struct simple_range btrfs_reserved_ranges[3];
>  struct task_info;
>  
>  struct task_ctx {
> + pthread_mutex_t mutex;
>   u64 max_copy_inodes;
>   u64 cur_copy_inodes;
>   struct task_info *info;
> -- 
> 2.13.2.932.g7449e964c-goog
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs-progs: Fix data races in btrfs-image.

2017-07-12 Thread David Sterba
On Wed, Jul 12, 2017 at 01:05:10PM -0700, Adam Buchbinder wrote:
> Making the code data-race safe requires that reads *and* writes
> happen under a mutex lock, if any of the accesses are writes. See
> Dmitri Vyukov, "Benign data races: what could possibly go wrong?"
> for more details.
> 
> The fix here was to put most of the main loop of restore_worker
> under a mutex lock.
> 
> This race was detected using fsck-tests/012-leaf-corruption.
> 
> ==
> WARNING: ThreadSanitizer: data race
>   Write of size 4 by main thread:
> #0 add_cluster btrfs-progs/image/main.c:1931
> #1 restore_metadump btrfs-progs/image/main.c:2566
> #2 main btrfs-progs/image/main.c:2859
> 
>   Previous read of size 4 by thread T6:
> #0 restore_worker btrfs-progs/image/main.c:1720
> 
>   Location is stack of main thread.
> 
>   Thread T6 (running) created by main thread at:
> #0 pthread_create 
> #1 mdrestore_init btrfs-progs/image/main.c:1868
> #2 restore_metadump btrfs-progs/image/main.c:2534
> #3 main btrfs-progs/image/main.c:2859
> 
> SUMMARY: ThreadSanitizer: data race btrfs-progs/image/main.c:1931 in
> add_cluster
> 
> Signed-off-by: Adam Buchbinder 

Applied, thanks.

> ---
>  image/main.c | 16 +---
>  1 file changed, 9 insertions(+), 7 deletions(-)
> 
> diff --git a/image/main.c b/image/main.c
> index 1eca414..a5d01d8 100644
> --- a/image/main.c
> +++ b/image/main.c
> @@ -1715,14 +1715,15 @@ static void *restore_worker(void *data)
>   }
>   async = list_entry(mdres->list.next, struct async_work, list);
>   list_del_init(&async->list);
> - pthread_mutex_unlock(&mdres->mutex);
>  
>   if (mdres->compress_method == COMPRESS_ZLIB) {
>   size = compress_size; 
> + pthread_mutex_unlock(&mdres->mutex);
>   ret = uncompress(buffer, (unsigned long *)&size,
>async->buffer, async->bufsize);
> + pthread_mutex_lock(&mdres->mutex);
>   if (ret != Z_OK) {
> - error("decompressiion failed with %d", ret);
> + error("decompression failed with %d", ret);

The typo fixes belong to a separate patch, as they fix a different
problem. We stick to the same style as in kernel "one patch per logical
change", so I'll split them to another patch as it's fairly trivial.

>   err = -EIO;
>   }
>   outbuf = buffer;
> @@ -1798,7 +1799,6 @@ error:
>   if (!mdres->multi_devices && async->start == 
> BTRFS_SUPER_INFO_OFFSET)
>   write_backup_supers(outfd, outbuf);
>  
> - pthread_mutex_lock(&mdres->mutex);
>   if (err && !mdres->error)
>   mdres->error = err;
>   mdres->num_items--;
> @@ -1899,7 +1899,7 @@ static int fill_mdres_info(struct mdrestore_struct 
> *mdres,
>   ret = uncompress(buffer, (unsigned long *)&size,
>async->buffer, async->bufsize);
>   if (ret != Z_OK) {
> - error("decompressiion failed with %d", ret);
> + error("decompression failed with %d", ret);
>   free(buffer);
>   return -EIO;
>   }
> @@ -1928,7 +1928,9 @@ static int add_cluster(struct meta_cluster *cluster,
>   u32 i, nritems;
>   int ret;
>  
> + pthread_mutex_lock(&mdres->mutex);
>   mdres->compress_method = header->compress;
> + pthread_mutex_unlock(&mdres->mutex);
>  
>   bytenr = le64_to_cpu(header->bytenr) + BLOCK_SIZE;
>   nritems = le32_to_cpu(header->nritems);
> @@ -2171,7 +2173,7 @@ static int search_for_chunk_blocks(struct 
> mdrestore_struct *mdres,
>   continue;
>   }
>   error(
> - "unknown state after reading cluster at %llu, probably crrupted data",
> + "unknown state after reading cluster at %llu, probably corrupted data",
>   cluster_bytenr);
>   ret = -EIO;
>   break;
> @@ -2220,7 +2222,7 @@ static int search_for_chunk_blocks(struct 
> mdrestore_struct *mdres,
>(unsigned long *)&size, tmp,
>bufsize);
>   if (ret != Z_OK) {
> - error("decompressiion failed with %d",
> + error("decompression failed with %d",
>   ret);
>   ret = -EIO;
>   break;
> @@ -2340,7 +2342,7 @@ static int build_chunk_tree(struct mdrestore_struct 
> *mdres,
>   ret = uncompress(tmp, (unsign

Re: [PATCH] btrfs-progs: fix the path use full_path as provided by the root info

2017-07-12 Thread David Sterba
On Wed, Jul 12, 2017 at 03:20:27PM +0800, Anand Jain wrote:
> This is a kind of preparatory patch for the patch which will add
> --rootid and --uuid options for the btrfs subvol show command.
> 
> As of now btrfs subvol show is using the external user provided subvol
> path to show in the output. Which is kind of confusing.
> 
> btrfs su show /btrfs
> /btrfs <--
>   Name:   
> 
> It will be even more confusing when proposed --uuid or --rootid
> options are used.
> 
> btrfs su show --rootid 258 /btrfs
> /btrfs <--
>   Name:   snap <--
>   UUID:   9630a45f-e647-4242-bd19-97590b4e20b2
>   Parent UUID:30129358-c69d-3e4a-a662-29509cc69c95
>   Received UUID:  -
>   Creation time:  2017-07-12 12:43:28 +0800
>   Subvolume ID:   258
>   Generation: 9
>   Gen at creation:9
>   Parent ID:  257
>   Top level ID:   257
>   Flags:  -
>   Snapshot(s):
> 
> Now with this patch, it will only show what is provided by the root_info.
> 
> btrfs su show --rootid 258 /btrfs
> sv1/snap <--
>   Name:   snap
>   UUID:   9630a45f-e647-4242-bd19-97590b4e20b2
>   Parent UUID:30129358-c69d-3e4a-a662-29509cc69c95
>   Received UUID:  -
>   Creation time:  2017-07-12 12:43:28 +0800
>   Subvolume ID:   258
>   Generation: 9
>   Gen at creation:9
>   Parent ID:  257
>   Top level ID:   257
>   Flags:  -
>   Snapshot(s):
> 
> Signed-off-by: Anand Jain 

Applied, thanks.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 00/13] use rbtrees for preliminary backrefs

2017-07-12 Thread Edmund Nadolski
This patch series attempts to improve the performance of backref
searches by changing the prelim_refs implementation to use
rbtrees instead of lists.  This also aims to reduce the soft
lockup occurrences that can result when a backref search consumes
too much cpu time.

Test runs of btrfs/130 show an improvement in the overall
run time of the test (shown below in seconds) as a function of
the number of extents:

nr_extents:    256     512     640    1024    2048
            +-------+-------+-------+-------+-------
 unpatched:     20     186     375    2204   40419
   patched:     12      93     203    1060   22007

(Note, the current default value for nr_extents in btrfs/130 is
4096, which takes a very long time to complete.)

Changes for v3:

Patch 08/13:
 - Update changelog and comments for third rbtree.
 - Fixed issue in resolve_indirect_refs() which prevented
   module load when sanity checking was enabled.

Patch 10/13:
 - Fix TP_printk_btrfs format string per coding standards.

Changes for v2:

Patch 06/13:
 - Added changelog description.

Patch 07/13:
 - Updated changelog description.
 - Removed 'TODO' comment.

Patch 08/13:
 - Added code for proper iteration of missing keys. This adds
   a third rbtree (.indirect_missing_keys in struct preftrees)
   plus the requisite code in add_prelim_ref(), add_missing_keys(),
   resolve_indirect_refs(), and find_parent_nodes().
 - Rename release_pref() to free_pref().
 - Replace WARN() with BUG_ON().
 - Remove 'TODO' comments and the unused 'merge_mode' enum.

The other patches have no functional changes. Some have diff
context changes due to the above modifications.

Edmund Nadolski (6):
  btrfs: btrfs_check_shared should manage its own transaction
  btrfs: remove ref_tree implementation from backref.c
  btrfs: convert prelimary reference tracking to use rbtrees
  btrfs: add cond_resched() calls when resolving backrefs
  btrfs: allow backref search checks for shared extents
  btrfs: clean up extraneous computations in add_delayed_refs

Jeff Mahoney (7):
  btrfs: struct-funcs, constify readers
  btrfs: constify tracepoint arguments
  btrfs: backref, constify some arguments
  btrfs: backref, add unode_aux_to_inode_list helper
  btrfs: backref, cleanup __ namespace abuse
  btrfs: add a node counter to each of the rbtrees
  btrfs: backref, add tracepoints for prelim_ref insertion and merging

 fs/btrfs/async-thread.c  |6 +-
 fs/btrfs/async-thread.h  |6 +-
 fs/btrfs/backref.c   | 1072 ++
 fs/btrfs/backref.h   |   16 +-
 fs/btrfs/btrfs_inode.h   |4 +-
 fs/btrfs/ctree.h |  128 ++---
 fs/btrfs/extent_io.c |   46 +-
 fs/btrfs/extent_io.h |   19 +-
 fs/btrfs/struct-funcs.c  |9 +-
 fs/btrfs/super.c |1 +
 include/trace/events/btrfs.h |  300 +++-
 11 files changed, 772 insertions(+), 835 deletions(-)

-- 
2.10.2

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 08/13] btrfs: convert prelimary reference tracking to use rbtrees

2017-07-12 Thread Edmund Nadolski
It's been known for a while that the use of multiple lists
that are periodically merged was an algorithmic problem within
btrfs.  There are several workloads that don't complete in any
reasonable amount of time (e.g. btrfs/130) and others that cause
soft lockups.

The solution is to use a set of rbtrees that do insertion merging
for both indirect and direct refs, with the former converting
refs into the latter.  The result is that a btrfs/130 workload that
used to take several hours now takes about half of that. This
runtime still isn't acceptable and a future patch will address that
by moving the rbtrees higher in the stack so the lookups can be
shared across multiple calls to find_parent_nodes.

Signed-off-by: Edmund Nadolski 
Signed-off-by: Jeff Mahoney 
---
 fs/btrfs/backref.c | 441 ++---
 1 file changed, 284 insertions(+), 157 deletions(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 6cac5ab..1edb107 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -26,11 +26,6 @@
 #include "delayed-ref.h"
 #include "locking.h"
 
-enum merge_mode {
-   MERGE_IDENTICAL_KEYS = 1,
-   MERGE_IDENTICAL_PARENTS,
-};
-
 /* Just an arbitrary number so we can be sure this happened */
 #define BACKREF_FOUND_SHARED 6
 
@@ -129,7 +124,7 @@ static int find_extent_in_eb(const struct extent_buffer *eb,
  * this structure records all encountered refs on the way up to the root
  */
 struct prelim_ref {
-   struct list_head list;
+   struct rb_node rbnode;
u64 root_id;
struct btrfs_key key_for_search;
int level;
@@ -139,6 +134,18 @@ struct prelim_ref {
u64 wanted_disk_byte;
 };
 
+struct preftree {
+   struct rb_root root;
+};
+
+#define PREFTREE_INIT  { .root = RB_ROOT }
+
+struct preftrees {
+   struct preftree direct;/* BTRFS_SHARED_[DATA|BLOCK]_REF_KEY */
+   struct preftree indirect;  /* BTRFS_[TREE_BLOCK|EXTENT_DATA]_REF_KEY */
+   struct preftree indirect_missing_keys;
+};
+
 static struct kmem_cache *btrfs_prelim_ref_cache;
 
 int __init btrfs_prelim_ref_init(void)
@@ -158,6 +165,108 @@ void btrfs_prelim_ref_exit(void)
kmem_cache_destroy(btrfs_prelim_ref_cache);
 }
 
+static void free_pref(struct prelim_ref *ref)
+{
+   kmem_cache_free(btrfs_prelim_ref_cache, ref);
+}
+
+/*
+ * Return 0 when both refs are for the same block (and can be merged).
+ * A -1 return indicates ref1 is a 'lower' block than ref2, while 1
+ * indicates a 'higher' block.
+ */
+static int prelim_ref_compare(struct prelim_ref *ref1,
+ struct prelim_ref *ref2)
+{
+   if (ref1->level < ref2->level)
+   return -1;
+   if (ref1->level > ref2->level)
+   return 1;
+   if (ref1->root_id < ref2->root_id)
+   return -1;
+   if (ref1->root_id > ref2->root_id)
+   return 1;
+   if (ref1->key_for_search.type < ref2->key_for_search.type)
+   return -1;
+   if (ref1->key_for_search.type > ref2->key_for_search.type)
+   return 1;
+   if (ref1->key_for_search.objectid < ref2->key_for_search.objectid)
+   return -1;
+   if (ref1->key_for_search.objectid > ref2->key_for_search.objectid)
+   return 1;
+   if (ref1->key_for_search.offset < ref2->key_for_search.offset)
+   return -1;
+   if (ref1->key_for_search.offset > ref2->key_for_search.offset)
+   return 1;
+   if (ref1->parent < ref2->parent)
+   return -1;
+   if (ref1->parent > ref2->parent)
+   return 1;
+
+   return 0;
+}
+
+/*
+ * Add @newref to the @root rbtree, merging identical refs.
+ *
+ * Callers should assumed that newref has been freed after calling.
+ */
+static void prelim_ref_insert(struct preftree *preftree,
+ struct prelim_ref *newref)
+{
+   struct rb_root *root;
+   struct rb_node **p;
+   struct rb_node *parent = NULL;
+   struct prelim_ref *ref;
+   int result;
+
+   root = &preftree->root;
+   p = &root->rb_node;
+
+   while (*p) {
+   parent = *p;
+   ref = rb_entry(parent, struct prelim_ref, rbnode);
+   result = prelim_ref_compare(ref, newref);
+   if (result < 0) {
+   p = &(*p)->rb_left;
+   } else if (result > 0) {
+   p = &(*p)->rb_right;
+   } else {
+   /* Identical refs, merge them and free @newref */
+   struct extent_inode_elem *eie = ref->inode_list;
+
+   while (eie && eie->next)
+   eie = eie->next;
+
+   if (!eie)
+   ref->inode_list = newref->inode_list;
+   else
+   eie->next = newref->inode_list;
+   ref->count += newref->count;
+  

[PATCH v3 11/13] btrfs: add cond_resched() calls when resolving backrefs

2017-07-12 Thread Edmund Nadolski
Since backref resolution is CPU-intensive, the cond_resched calls
should help alleviate soft lockup occurrences.

Signed-off-by: Edmund Nadolski 
Signed-off-by: Jeff Mahoney 
---
 fs/btrfs/backref.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 19c9e92..c1882e5 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -660,6 +660,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info 
*fs_info,
prelim_ref_insert(fs_info, &preftrees->direct, ref);
 
ulist_reinit(parents);
+   cond_resched();
}
 out:
ulist_free(parents);
@@ -702,6 +703,7 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
btrfs_tree_read_unlock(eb);
free_extent_buffer(eb);
prelim_ref_insert(fs_info, &preftrees->indirect, ref);
+   cond_resched();
}
return 0;
 }
@@ -1243,6 +1245,7 @@ static int find_parent_nodes(struct btrfs_trans_handle 
*trans,
}
eie = NULL;
}
+   cond_resched();
}
 
 out:
-- 
2.10.2

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 09/13] btrfs: add a node counter to each of the rbtrees

2017-07-12 Thread Edmund Nadolski
From: Jeff Mahoney 

This patch adds counters to each of the rbtrees so that we can tell
how large they are growing for a given workload.  These counters
will be exported by tracepoints in the next patch.

Signed-off-by: Jeff Mahoney 
---
 fs/btrfs/backref.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 1edb107..2e452264 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -136,9 +136,10 @@ struct prelim_ref {
 
 struct preftree {
struct rb_root root;
+   unsigned int count;
 };
 
-#define PREFTREE_INIT  { .root = RB_ROOT }
+#define PREFTREE_INIT  { .root = RB_ROOT, .count = 0 }
 
 struct preftrees {
struct preftree direct;/* BTRFS_SHARED_[DATA|BLOCK]_REF_KEY */
@@ -248,6 +249,7 @@ static void prelim_ref_insert(struct preftree *preftree,
}
}
 
+   preftree->count++;
rb_link_node(&newref->rbnode, parent, p);
rb_insert_color(&newref->rbnode, root);
 }
@@ -265,6 +267,7 @@ static void prelim_release(struct preftree *preftree)
free_pref(ref);
 
preftree->root = RB_ROOT;
+   preftree->count = 0;
 }
 
 /*
@@ -607,6 +610,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info 
*fs_info,
}
 
rb_erase(&ref->rbnode, &preftrees->indirect.root);
+   preftrees->indirect.count--;
 
if (ref->count == 0) {
free_pref(ref);
-- 
2.10.2

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 13/13] btrfs: clean up extraneous computations in add_delayed_refs

2017-07-12 Thread Edmund Nadolski
Repeating the same computation in multiple places is not
necessary.

Signed-off-by: Edmund Nadolski 
Signed-off-by: Jeff Mahoney 
---
 fs/btrfs/backref.c | 30 +-
 1 file changed, 13 insertions(+), 17 deletions(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 35ac0bd..e62704a 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -769,7 +769,7 @@ static int add_delayed_refs(const struct btrfs_fs_info 
*fs_info,
struct btrfs_key key;
struct btrfs_key tmp_op_key;
struct btrfs_key *op_key = NULL;
-   int sgn;
+   int count;
int ret = 0;
 
if (extent_op && extent_op->update_key) {
@@ -788,15 +788,15 @@ static int add_delayed_refs(const struct btrfs_fs_info 
*fs_info,
WARN_ON(1);
continue;
case BTRFS_ADD_DELAYED_REF:
-   sgn = 1;
+   count = node->ref_mod;
break;
case BTRFS_DROP_DELAYED_REF:
-   sgn = -1;
+   count = node->ref_mod * -1;
break;
default:
BUG_ON(1);
}
-   *total_refs += (node->ref_mod * sgn);
+   *total_refs += count;
switch (node->type) {
case BTRFS_TREE_BLOCK_REF_KEY: {
/* NORMAL INDIRECT METADATA backref */
@@ -805,9 +805,8 @@ static int add_delayed_refs(const struct btrfs_fs_info 
*fs_info,
ref = btrfs_delayed_node_to_tree_ref(node);
ret = add_indirect_ref(fs_info, preftrees, ref->root,
   &tmp_op_key, ref->level + 1,
-  node->bytenr,
-  node->ref_mod * sgn,
-  sc, GFP_ATOMIC);
+  node->bytenr, count, sc,
+  GFP_ATOMIC);
break;
}
case BTRFS_SHARED_BLOCK_REF_KEY: {
@@ -816,9 +815,8 @@ static int add_delayed_refs(const struct btrfs_fs_info 
*fs_info,
 
ref = btrfs_delayed_node_to_tree_ref(node);
 
-   ret = add_direct_ref(fs_info, preftrees,
-ref->level + 1, ref->parent,
-node->bytenr, node->ref_mod * sgn,
+   ret = add_direct_ref(fs_info, preftrees, ref->level + 1,
+ref->parent, node->bytenr, count,
 sc, GFP_ATOMIC);
break;
}
@@ -841,9 +839,8 @@ static int add_delayed_refs(const struct btrfs_fs_info 
*fs_info,
}
 
ret = add_indirect_ref(fs_info, preftrees, ref->root,
-  &key, 0, node->bytenr,
-  node->ref_mod * sgn,
-  sc, GFP_ATOMIC);
+  &key, 0, node->bytenr, count, sc,
+  GFP_ATOMIC);
break;
}
case BTRFS_SHARED_DATA_REF_KEY: {
@@ -852,10 +849,9 @@ static int add_delayed_refs(const struct btrfs_fs_info 
*fs_info,
 
ref = btrfs_delayed_node_to_data_ref(node);
 
-   ret = add_direct_ref(fs_info, preftrees, 0,
-ref->parent, node->bytenr,
-node->ref_mod * sgn,
-sc, GFP_ATOMIC);
+   ret = add_direct_ref(fs_info, preftrees, 0, ref->parent,
+node->bytenr, count, sc,
+GFP_ATOMIC);
break;
}
default:
-- 
2.10.2

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 12/13] btrfs: allow backref search checks for shared extents

2017-07-12 Thread Edmund Nadolski
When called with a struct share_check, find_parent_nodes()
will detect a shared extent and immediately return with
BACKREF_FOUND_SHARED.

Signed-off-by: Edmund Nadolski 
Signed-off-by: Jeff Mahoney 
---
 fs/btrfs/backref.c | 164 +
 1 file changed, 115 insertions(+), 49 deletions(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index c1882e5..35ac0bd 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -135,6 +135,25 @@ struct preftrees {
struct preftree indirect_missing_keys;
 };
 
+/*
+ * Checks for a shared extent during backref search.
+ *
+ * The share_count tracks prelim_refs (direct and indirect) having a
+ * ref->count >0:
+ *  - incremented when a ref->count transitions to >0
+ *  - decremented when a ref->count transitions to <1
+ */
+struct share_check {
+   u64 root_objectid;
+   u64 inum;
+   int share_count;
+};
+
+static inline int extent_is_shared(struct share_check *sc)
+{
+   return (sc && sc->share_count > 1) ? BACKREF_FOUND_SHARED : 0;
+}
+
 static struct kmem_cache *btrfs_prelim_ref_cache;
 
 int __init btrfs_prelim_ref_init(void)
@@ -195,14 +214,26 @@ static int prelim_ref_compare(struct prelim_ref *ref1,
return 0;
 }
 
+void update_share_count(struct share_check *sc, int oldcount, int newcount)
+{
+   if ((!sc) || (oldcount == 0 && newcount < 1))
+   return;
+
+   if (oldcount > 0 && newcount < 1)
+   sc->share_count--;
+   else if (oldcount < 1 && newcount > 0)
+   sc->share_count++;
+}
+
 /*
  * Add @newref to the @root rbtree, merging identical refs.
  *
- * Callers should assumed that newref has been freed after calling.
+ * Callers should assume that newref has been freed after calling.
  */
 static void prelim_ref_insert(const struct btrfs_fs_info *fs_info,
  struct preftree *preftree,
- struct prelim_ref *newref)
+ struct prelim_ref *newref,
+ struct share_check *sc)
 {
struct rb_root *root;
struct rb_node **p;
@@ -234,12 +265,20 @@ static void prelim_ref_insert(const struct btrfs_fs_info 
*fs_info,
eie->next = newref->inode_list;
trace_btrfs_prelim_ref_merge(fs_info, ref, newref,
 preftree->count);
+   /*
+* A delayed ref can have newref->count < 0.
+* The ref->count is updated to follow any
+* BTRFS_[ADD|DROP]_DELAYED_REF actions.
+*/
+   update_share_count(sc, ref->count,
+  ref->count + newref->count);
ref->count += newref->count;
free_pref(newref);
return;
}
}
 
+   update_share_count(sc, 0, newref->count);
preftree->count++;
trace_btrfs_prelim_ref_insert(fs_info, newref, NULL, preftree->count);
rb_link_node(&newref->rbnode, parent, p);
@@ -303,7 +342,8 @@ static void prelim_release(struct preftree *preftree)
 static int add_prelim_ref(const struct btrfs_fs_info *fs_info,
  struct preftree *preftree, u64 root_id,
  const struct btrfs_key *key, int level, u64 parent,
- u64 wanted_disk_byte, int count, gfp_t gfp_mask)
+ u64 wanted_disk_byte, int count,
+ struct share_check *sc, gfp_t gfp_mask)
 {
struct prelim_ref *ref;
 
@@ -348,31 +388,32 @@ static int add_prelim_ref(const struct btrfs_fs_info 
*fs_info,
ref->count = count;
ref->parent = parent;
ref->wanted_disk_byte = wanted_disk_byte;
-   prelim_ref_insert(fs_info, preftree, ref);
-
-   return 0;
+   prelim_ref_insert(fs_info, preftree, ref, sc);
+   return extent_is_shared(sc);
 }
 
 /* direct refs use root == 0, key == NULL */
 static int add_direct_ref(const struct btrfs_fs_info *fs_info,
  struct preftrees *preftrees, int level, u64 parent,
- u64 wanted_disk_byte, int count, gfp_t gfp_mask)
+ u64 wanted_disk_byte, int count,
+ struct share_check *sc, gfp_t gfp_mask)
 {
return add_prelim_ref(fs_info, &preftrees->direct, 0, NULL, level,
- parent, wanted_disk_byte, count, gfp_mask);
+ parent, wanted_disk_byte, count, sc, gfp_mask);
 }
 
 /* indirect refs use parent == 0 */
 static int add_indirect_ref(const struct btrfs_fs_info *fs_info,
struct preftrees *preftrees, u64 root_id,
const struct btrfs_key *key, int level,
-   u64 wanted_disk_byte, int co

[PATCH v3 10/13] btrfs: backref, add tracepoints for prelim_ref insertion and merging

2017-07-12 Thread Edmund Nadolski
From: Jeff Mahoney 

This patch adds a tracepoint event for prelim_ref insertion and
merging.  For each, the ref being inserted or merged and the count
of tree nodes is issued.

Signed-off-by: Jeff Mahoney 
---
 fs/btrfs/backref.c   | 119 ++-
 fs/btrfs/backref.h   |  12 +
 fs/btrfs/super.c |   1 +
 include/trace/events/btrfs.h |  58 +
 4 files changed, 132 insertions(+), 58 deletions(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 2e452264..19c9e92 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -26,6 +26,8 @@
 #include "delayed-ref.h"
 #include "locking.h"
 
+#include 
+
 /* Just an arbitrary number so we can be sure this happened */
 #define BACKREF_FOUND_SHARED 6
 
@@ -120,20 +122,6 @@ static int find_extent_in_eb(const struct extent_buffer 
*eb,
return 0;
 }
 
-/*
- * this structure records all encountered refs on the way up to the root
- */
-struct prelim_ref {
-   struct rb_node rbnode;
-   u64 root_id;
-   struct btrfs_key key_for_search;
-   int level;
-   int count;
-   struct extent_inode_elem *inode_list;
-   u64 parent;
-   u64 wanted_disk_byte;
-};
-
 struct preftree {
struct rb_root root;
unsigned int count;
@@ -212,7 +200,8 @@ static int prelim_ref_compare(struct prelim_ref *ref1,
  *
  * Callers should assumed that newref has been freed after calling.
  */
-static void prelim_ref_insert(struct preftree *preftree,
+static void prelim_ref_insert(const struct btrfs_fs_info *fs_info,
+ struct preftree *preftree,
  struct prelim_ref *newref)
 {
struct rb_root *root;
@@ -243,6 +232,8 @@ static void prelim_ref_insert(struct preftree *preftree,
ref->inode_list = newref->inode_list;
else
eie->next = newref->inode_list;
+   trace_btrfs_prelim_ref_merge(fs_info, ref, newref,
+preftree->count);
ref->count += newref->count;
free_pref(newref);
return;
@@ -250,6 +241,7 @@ static void prelim_ref_insert(struct preftree *preftree,
}
 
preftree->count++;
+   trace_btrfs_prelim_ref_insert(fs_info, newref, NULL, preftree->count);
rb_link_node(&newref->rbnode, parent, p);
rb_insert_color(&newref->rbnode, root);
 }
@@ -308,7 +300,8 @@ static void prelim_release(struct preftree *preftree)
  * additional information that's available but not required to find the parent
  * block might help in merging entries to gain some speed.
  */
-static int add_prelim_ref(struct preftree *preftree, u64 root_id,
+static int add_prelim_ref(const struct btrfs_fs_info *fs_info,
+ struct preftree *preftree, u64 root_id,
  const struct btrfs_key *key, int level, u64 parent,
  u64 wanted_disk_byte, int count, gfp_t gfp_mask)
 {
@@ -355,28 +348,30 @@ static int add_prelim_ref(struct preftree *preftree, u64 
root_id,
ref->count = count;
ref->parent = parent;
ref->wanted_disk_byte = wanted_disk_byte;
-   prelim_ref_insert(preftree, ref);
+   prelim_ref_insert(fs_info, preftree, ref);
 
return 0;
 }
 
 /* direct refs use root == 0, key == NULL */
-static int add_direct_ref(struct preftrees *preftrees, int level, u64 parent,
+static int add_direct_ref(const struct btrfs_fs_info *fs_info,
+ struct preftrees *preftrees, int level, u64 parent,
  u64 wanted_disk_byte, int count, gfp_t gfp_mask)
 {
-   return add_prelim_ref(&preftrees->direct, 0, NULL, level, parent,
- wanted_disk_byte, count, gfp_mask);
+   return add_prelim_ref(fs_info, &preftrees->direct, 0, NULL, level,
+ parent, wanted_disk_byte, count, gfp_mask);
 }
 
 /* indirect refs use parent == 0 */
-static int add_indirect_ref(struct preftrees *preftrees, u64 root_id,
+static int add_indirect_ref(const struct btrfs_fs_info *fs_info,
+   struct preftrees *preftrees, u64 root_id,
const struct btrfs_key *key, int level,
u64 wanted_disk_byte, int count, gfp_t gfp_mask)
 {
struct preftree *tree = &preftrees->indirect;
if (!key)
tree = &preftrees->indirect_missing_keys;
-   return add_prelim_ref(tree, root_id, key, level, 0,
+   return add_prelim_ref(fs_info, tree, root_id, key, level, 0,
  wanted_disk_byte, count, gfp_mask);
 }
 
@@ -630,7 +625,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info 
*fs_info,
 * and return directly.
 */
if (err == -ENOENT) {
-

Re: [PATCH] btrfs-progs: add support to search subvolume by rootid and uuid

2017-07-12 Thread David Sterba
On Wed, Jul 12, 2017 at 03:20:50PM +0800, Anand Jain wrote:
> Unless the top level is mounted there is no way to know the
> details of all the subvolume.  For example:
> 
> mount -o subvol=sv1/newsv1 /dev/sdb /btrfs
> 
> btrfs su list /btrfs
> ID 257 gen 12 top level 5 path sv1
> ID 258 gen 9 top level 257 path sv1/snap
> ID 259 gen 11 top level 257 path sv1/newsv1
> 
> You can't subvol show for sv1 and sv1/snap as its paths aren't
> accessible to the user unless the its top level is mounted.
> 
> This patch adds two new options to the existing btrfs subvol show
> cli. They are --rootid/-r or --uuid/-u, with this now the user will
> be able to look for a subvolume using the rootid OR the uuid.

Sounds good.

> ./btrfs su show -r 257 /btrfs
> sv1
>   Name:   sv1
>   UUID:   30129358-c69d-3e4a-a662-29509cc69c95
>   Parent UUID:-
>   Received UUID:  -
>   Creation time:  2017-07-11 20:32:57 +0800
>   Subvolume ID:   257
>   Generation: 12
>   Gen at creation:7
>   Parent ID:  5
>   Top level ID:   5
>   Flags:  -
>   Snapshot(s):
>   sv1/snap
> 
> Signed-off-by: Anand Jain 
> ---
>  btrfs-list.c |  4 +++-
>  cmds-subvolume.c | 44 +---
>  utils.c  | 52 
>  utils.h  |  5 -
>  4 files changed, 100 insertions(+), 5 deletions(-)
> 
> diff --git a/btrfs-list.c b/btrfs-list.c
> index 8eec05ea797f..92a537f425f3 100644
> --- a/btrfs-list.c
> +++ b/btrfs-list.c
> @@ -1582,7 +1582,9 @@ int btrfs_get_subvol(int fd, struct root_info *the_ri)
>   rbn = rb_next(rbn);
>   continue;
>   }
> - if (!comp_entry_with_rootid(the_ri, ri, 0)) {
> +
> + if (!comp_entry_with_rootid(the_ri, ri, 0) ||
> + !uuid_compare(the_ri->uuid, ri->uuid)) {
>   memcpy(the_ri, ri, offsetof(struct root_info, path));
>   the_ri->path = strdup_or_null(ri->path);
>   the_ri->name = strdup_or_null(ri->name);
> diff --git a/cmds-subvolume.c b/cmds-subvolume.c
> index de6204eabeaf..1fa54d1b24cf 100644
> --- a/cmds-subvolume.c
> +++ b/cmds-subvolume.c
> @@ -891,8 +891,11 @@ static int cmd_subvol_find_new(int argc, char **argv)
>  }
>  
>  static const char * const cmd_subvol_show_usage[] = {
> - "btrfs subvolume show ",
> + "btrfs subvolume show [options] |",
>   "Show more information of the subvolume",
> + "-r|--rootid   rootid of the subvol to show",
> + "-u|--uuid uuid of the subvol to show",
> + "If no option is specified  will be shown.",
>   NULL
>  };
>  
> @@ -907,8 +910,36 @@ static int cmd_subvol_show(int argc, char **argv)
>   int fd = -1;
>   int ret = 1;
>   DIR *dirstream1 = NULL;
> + int by_rootid = 0;
> + int by_uuid = 0;
> + u64 rootid_arg;
> + u8 uuid_arg[BTRFS_UUID_SIZE];
>  
> - clean_args_no_options(argc, argv, cmd_subvol_show_usage);
> + while (1) {
> + int c;
> + static const struct option long_options[] = {
> + { "rootid", required_argument, NULL, 'r'},
> + { "uuid", required_argument, NULL, 'u'},
> + { NULL, 0, NULL, 0 }
> + };
> +
> + c = getopt_long(argc, argv, "r:u:", long_options, NULL);
> + if (c < 0)
> + break;
> +
> + switch (c) {
> + case 'r':
> + rootid_arg = arg_strtou64(optarg);
> + by_rootid = 1;
> + break;
> + case 'u':
> + uuid_parse(optarg, uuid_arg);
> + by_uuid = 1;
> + break;
> + default:
> + usage(cmd_subvol_show_usage);
> + }
> + }
>  
>   if (check_argc_exact(argc - optind, 1))
>   usage(cmd_subvol_show_usage);
> @@ -921,7 +952,14 @@ static int cmd_subvol_show(int argc, char **argv)
>   goto out;
>   }
>  
> - ret = get_subvol_info(fullpath, &get_ri);
> + if (by_rootid) {
> + ret = get_subvol_info_by_rootid(fullpath, &get_ri, rootid_arg);
> + } else if (by_uuid) {
> + ret = get_subvol_info_by_uuid(fullpath, &get_ri, uuid_arg);
> + } else {
> + ret = get_subvol_info(fullpath, &get_ri);
> + }

Here rootid takes precedence if there are both options specified. I
think this should be handled as invalid syntax. If the rootid and uuid
do not match, it's not well defined and can cause silent breakage.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.ker

Re: [PATCH] btrfs-progs: Fix missing internal deps in tests.

2017-07-12 Thread David Sterba
On Mon, Jul 10, 2017 at 02:29:08PM -0700, Adam Buchbinder wrote:
> Doing a straight 'make test' would fail because some misc and fsck
> tests require particular tools to already be built. Add dependencies
> at the Makefile and shell-script level.
> 
> Signed-off-by: Adam Buchbinder 

Applied, thanks.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs-progs: Tighten integer types in print-tree.

2017-07-12 Thread David Sterba
On Mon, Jul 10, 2017 at 02:29:09PM -0700, Adam Buchbinder wrote:
> There are likely more places where the wrong size types are used, but
> these tripped Clang's warnings because they eventually get passed to
> printf.
> 
> Signed-off-by: Adam Buchbinder 

Applied, thanks.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Patch v2] Btrfs-progs: fix infinite loop in find_free_extent

2017-07-12 Thread David Sterba
On Mon, Jun 26, 2017 at 11:34:41AM -0600, Liu Bo wrote:
> If the found %ins is crossing a stripe len, ie. BTRFS_STRIPE_LEN, we'd
> search again with a stripe-aligned %search_start.  The current code
> calculates %search_start by adding a wrong offset, in order to fix it, the
> start position of the block group should be taken, otherwise, it'll end up
> with looking at the same block group forever.
> 
> Cc: David Sterba 
> Signed-off-by: Liu Bo 

Applied, thanks.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs-progs: Fix an infinite loop in btrfs_next_bg

2017-07-12 Thread David Sterba
On Fri, Jun 09, 2017 at 11:09:35AM -0700, Justin Maggard wrote:
> I've run into a couple filesystems where btrfs-find-root would spin
> indefinitely.
> 
> If the first cache extent start location is 0, we end up in an infinite
> loop in btrfs_next_bg().  Fix it by checking for that situation, and
> jumping to the next bg if necessary.
> 
> Fixes: e2e0dae9 (btrfs-progs: volume: Fix a bug causing btrfs-find-root to 
> skip first chunk)
> Signed-off-by: Justin Maggard 

Applied, thanks.  Do you have a minimal image for testing?
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] Btrfs: report errors when checksum is not found

2017-07-12 Thread Liu Bo
On Wed, Jul 12, 2017 at 11:46:29AM -0600, Liu Bo wrote:
> On Wed, Jul 12, 2017 at 04:40:36PM +0200, David Sterba wrote:
> > On Tue, Jul 11, 2017 at 02:43:16PM -0600, Liu Bo wrote:
> > > When btrfs fails the checksum check, it'll fill the whole page with
> > > "1".
> > 
> > One could ask, why is the page filled with 1s. Brought by commit
> > 07157aacb1ecd394a54949 from 2007, without mentioning any justification.
> > I'm more inclined to revisit this behaviour and drop it eventually.
> > 
> > > However, if %csum_expected is 0 (which means there is no checksum), then
> > > for some unknown reason, we just pretend that the read is correct, so
> > > userspace would be confused about the dilemma that read is successful but
> > > getting a page with all content being "1".
> > 
> > Here 'no checksum' means that no checksum was found but was expected,
> > right?
> 
> Yes, no checksum was found.
> 
> > An EIO would fail the read, I don't see a reason why the page
> > needs to be "zeroed". The contents would be inaccessible anyway.
> >
> 
> Right, resetting page's content is needed when we return 0 instead of
> -EIO.  I guess it was introduced for testing.  So yes, I'm glad to
> remove that part, will do in a v2.
>

Since this __readpage_endio_check() is also called by directIO's
btrfs_retry_endio(), in the dio case, userspace can read out the page
content.

For that reason, I think we would have to keep it and return errors to
userspace.

Thanks,

-liubo

> > > This can happen due to a bug in btrfs-convert.
> > > 
> > > This fixes it by always returning errors if checksum doesn't match.
> > 
> > Independent of the above, this fix makes sense.
> > 
> > Reviewed-by: David Sterba 
> 
> Thank you for the comments.
> 
> Thanks,
> 
> -liubo
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH V2] btrfs-progs: add support to search subvolume by rootid and uuid

2017-07-12 Thread Anand Jain
Unless the top level is mounted there is no way to know the
details of all the subvolumes.  For example:

mount -o subvol=sv1/newsv1 /dev/sdb /btrfs

btrfs su list /btrfs
ID 257 gen 12 top level 5 path sv1
ID 258 gen 9 top level 257 path sv1/snap
ID 259 gen 11 top level 257 path sv1/newsv1

You can't run subvol show for sv1 and sv1/snap, as their paths aren't
accessible to the user unless their top level is mounted.

This patch adds two new options to the existing btrfs subvol show
cli: --rootid/-r and --uuid/-u. With these, the user will be able
to look for a subvolume using the rootid OR the uuid.

./btrfs su show -r 257 /btrfs
sv1
Name:   sv1
UUID:   30129358-c69d-3e4a-a662-29509cc69c95
Parent UUID:-
Received UUID:  -
Creation time:  2017-07-11 20:32:57 +0800
Subvolume ID:   257
Generation: 12
Gen at creation:7
Parent ID:  5
Top level ID:   5
Flags:  -
Snapshot(s):
sv1/snap

Signed-off-by: Anand Jain 
---
v2: Check if both -u and -r are set by the user and fail if so.

 btrfs-list.c |  4 +++-
 cmds-subvolume.c | 47 ---
 utils.c  | 52 
 utils.h  |  5 -
 4 files changed, 103 insertions(+), 5 deletions(-)

diff --git a/btrfs-list.c b/btrfs-list.c
index 8eec05ea797f..92a537f425f3 100644
--- a/btrfs-list.c
+++ b/btrfs-list.c
@@ -1582,7 +1582,9 @@ int btrfs_get_subvol(int fd, struct root_info *the_ri)
rbn = rb_next(rbn);
continue;
}
-   if (!comp_entry_with_rootid(the_ri, ri, 0)) {
+
+   if (!comp_entry_with_rootid(the_ri, ri, 0) ||
+   !uuid_compare(the_ri->uuid, ri->uuid)) {
memcpy(the_ri, ri, offsetof(struct root_info, path));
the_ri->path = strdup_or_null(ri->path);
the_ri->name = strdup_or_null(ri->name);
diff --git a/cmds-subvolume.c b/cmds-subvolume.c
index de6204eabeaf..de5e0da0ff48 100644
--- a/cmds-subvolume.c
+++ b/cmds-subvolume.c
@@ -891,8 +891,11 @@ static int cmd_subvol_find_new(int argc, char **argv)
 }
 
 static const char * const cmd_subvol_show_usage[] = {
-   "btrfs subvolume show ",
+   "btrfs subvolume show [options] |",
"Show more information of the subvolume",
+   "-r|--rootid   rootid of the subvol to show",
+   "-u|--uuid uuid of the subvol to show",
+   "If no option is specified  will be shown.",
NULL
 };
 
@@ -907,12 +910,43 @@ static int cmd_subvol_show(int argc, char **argv)
int fd = -1;
int ret = 1;
DIR *dirstream1 = NULL;
+   int by_rootid = 0;
+   int by_uuid = 0;
+   u64 rootid_arg;
+   u8 uuid_arg[BTRFS_UUID_SIZE];
 
-   clean_args_no_options(argc, argv, cmd_subvol_show_usage);
+   while (1) {
+   int c;
+   static const struct option long_options[] = {
+   { "rootid", required_argument, NULL, 'r'},
+   { "uuid", required_argument, NULL, 'u'},
+   { NULL, 0, NULL, 0 }
+   };
+
+   c = getopt_long(argc, argv, "r:u:", long_options, NULL);
+   if (c < 0)
+   break;
+
+   switch (c) {
+   case 'r':
+   rootid_arg = arg_strtou64(optarg);
+   by_rootid = 1;
+   break;
+   case 'u':
+   uuid_parse(optarg, uuid_arg);
+   by_uuid = 1;
+   break;
+   default:
+   usage(cmd_subvol_show_usage);
+   }
+   }
 
if (check_argc_exact(argc - optind, 1))
usage(cmd_subvol_show_usage);
 
+   if (by_rootid && by_uuid)
+   usage(cmd_subvol_show_usage);
+
memset(&get_ri, 0, sizeof(get_ri));
fullpath = realpath(argv[optind], NULL);
if (!fullpath) {
@@ -921,7 +955,14 @@ static int cmd_subvol_show(int argc, char **argv)
goto out;
}
 
-   ret = get_subvol_info(fullpath, &get_ri);
+   if (by_rootid) {
+   ret = get_subvol_info_by_rootid(fullpath, &get_ri, rootid_arg);
+   } else if (by_uuid) {
+   ret = get_subvol_info_by_uuid(fullpath, &get_ri, uuid_arg);
+   } else {
+   ret = get_subvol_info(fullpath, &get_ri);
+   }
+
if (ret) {
if (ret < 0) {
error("Failed to get subvol info %s: %s",
diff --git a/utils.c b/utils.c
index d2489e70f8d8..250e6cc76cbc 100644
--- a/utils.c
+++ b/utils.c
@@ -2432,6 +2432,58 @@ out:
return ret;
 }
 
+int ge

Re: [PATCH] btrfs-progs: Enable ThreadSanitizer, using D=tsan.

2017-07-12 Thread David Sterba
On Wed, Jul 12, 2017 at 03:45:46PM -0700, Adam Buchbinder wrote:
> On Wed, Jul 12, 2017 at 2:51 PM, David Sterba  wrote:
> >
> > On Wed, Jul 12, 2017 at 01:04:49PM -0700, Adam Buchbinder wrote:
> > > Tested with clang-3.9.
> > >
> > > Signed-off-by: Adam Buchbinder 
> > > ---
> > >  Makefile | 6 ++
> > >  1 file changed, 6 insertions(+)
> > >
> > > diff --git a/Makefile b/Makefile
> > > index 81598df..8948301 100644
> > > --- a/Makefile
> > > +++ b/Makefile
> > > @@ -17,6 +17,7 @@
> > >  #  abort   - call abort() on first error (dumps core)
> > >  #  all - shortcut for all of the above
> > >  #  asan- enable address sanitizer compiler feature
> > > +#  tsan- enable thread sanitizer compiler feature
> > >  #  ubsan   - undefined behaviour sanitizer compiler
> feature
> > >  #  bcheck  - extended build checks
> > >  #   W=123  build with warnings (default: off)
> > > @@ -157,6 +158,11 @@ ifneq (,$(findstring asan,$(D)))
> > >DEBUG_CFLAGS_INTERNAL += -fsanitize=address
> > >  endif
> > >
> > > +ifneq (,$(findstring tsan,$(D)))
> > > +  DEBUG_CFLAGS_INTERNAL += -fsanitize=thread -fPIE
> > > +  LD_FLAGS += -fsanitize=thread -ltsan -pie
> >
> > Why do you need to set PIE here? Is it necessary for tsan?
> 
> Yes; see https://clang.llvm.org/docs/ThreadSanitizer.html:
> "Non-position-independent executables are not supported."

Thanks, patch applied.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH V2] btrfs-progs: add support to search subvolume by rootid and uuid

2017-07-12 Thread David Sterba
On Thu, Jul 13, 2017 at 06:47:11AM +0800, Anand Jain wrote:
> Unless the top level is mounted there is no way to know the
> details of all the subvolume.  For example:
> 
> mount -o subvol=sv1/newsv1 /dev/sdb /btrfs
> 
> btrfs su list /btrfs
> ID 257 gen 12 top level 5 path sv1
> ID 258 gen 9 top level 257 path sv1/snap
> ID 259 gen 11 top level 257 path sv1/newsv1
> 
> You can't subvol show for sv1 and sv1/snap as its paths aren't
> accessible to the user unless its top level is mounted.
> 
> This patch adds two new options to the existing btrfs subvol show
> cli. They are --rootid/-r or --uuid/-u, with this now the user will
> be able to look for a subvolume using the rootid OR the uuid.
> 
> ./btrfs su show -r 257 /btrfs
> sv1
>   Name:   sv1
>   UUID:   30129358-c69d-3e4a-a662-29509cc69c95
>   Parent UUID:-
>   Received UUID:  -
>   Creation time:  2017-07-11 20:32:57 +0800
>   Subvolume ID:   257
>   Generation: 12
>   Gen at creation:7
>   Parent ID:  5
>   Top level ID:   5
>   Flags:  -
>   Snapshot(s):
>   sv1/snap
> 
> Signed-off-by: Anand Jain 
> ---
> v2: Check if both -u and -r are set by the user and fail if so.

Applied, thanks. I did some minor updates to the help text.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] Btrfs: report errors when checksum is not found

2017-07-12 Thread David Sterba
On Wed, Jul 12, 2017 at 03:35:43PM -0600, Liu Bo wrote:
> On Wed, Jul 12, 2017 at 11:46:29AM -0600, Liu Bo wrote:
> > On Wed, Jul 12, 2017 at 04:40:36PM +0200, David Sterba wrote:
> > > On Tue, Jul 11, 2017 at 02:43:16PM -0600, Liu Bo wrote:
> > > > When btrfs fails the checksum check, it'll fill the whole page with
> > > > "1".
> > > 
> > > One could ask, why is the page filled with 1s. Brought by commit
> > > 07157aacb1ecd394a54949 from 2007, without mentioning any justification.
> > > I'm more inclined to revisit this behaviour and drop it eventually.
> > > 
> > > > However, if %csum_expected is 0 (which means there is no checksum), then
> > > > for some unknown reason, we just pretend that the read is correct, so
> > > > userspace would be confused about the dilemma that read is successful 
> > > > but
> > > > getting a page with all content being "1".
> > > 
> > > Here 'no checksum' means that no checksum was found but was expected,
> > > right?
> > 
> > Yes, no checksum was found.
> > 
> > > An EIO would fail the read, I don't see a reason why the page
> > > needs to be "zeroed". The contents would be inaccessible anyway.
> > >
> > 
> > Right, resetting page's content is needed when we return 0 instead of
> > -EIO.  I guess it was introduced for testing.  So yes, I'm glad to
> > remove that part, will do in a v2.
> >
> 
> Since this __readpage_endio_check() is also called by directIO's
> btrfs_retry_endio(), in the dio case, userspace can read out the page
> content.
> 
> For that reason, I think we would have to keep it and return errors to
> userspace.

We can decide what to do in case of the error:

a) this is what we read from the disk and is presumably the expected
   content yet with some corruptions. Ie. "this is what we found but EIO
   tells you that it's not valid, you may still salvage some data"

b) if the page contents is random and possibly contains sensitive data
   of somebody else, then it should be zeroed.

Although zeroing will be completely safe, I know people are asking about
a way to get the data even if the checksum does not match (other than
manually locating the data on the device). This would need to audit the
call paths leading to __readpage_endio_check.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v4 0/6] Chunk level degradable check

2017-07-12 Thread Qu Wenruo



On 2017年07月12日 23:24, David Sterba wrote:

On Wed, Jun 28, 2017 at 01:43:29PM +0800, Qu Wenruo wrote:

The patchset can be fetched from my github repo:
https://github.com/adam900710/linux/tree/degradable

The patchset is based on David's for-4.13-part1 branch.

Btrfs currently uses num_tolerated_disk_barrier_failures to do global
check for tolerated missing device.

Although the one-size-fit-all solution is quite safe, it's too strict
if data and metadata have different duplication levels.

For example, if one uses Single data and RAID1 metadata for 2 disks, it
means any missing device will make the fs unable to be degraded
mounted.

But in fact, some times all single chunks may be in the existing
device and in that case, we should allow it to be rw degraded mounted.

Such case can be easily reproduced using the following script:
  # mkfs.btrfs -f -m raid1 -d sing /dev/sdb /dev/sdc
  # wipefs -f /dev/sdc
  # mount /dev/sdb -o degraded,rw


I've seen wider testing coverage in replies to the previous patchset
iterations. Can we have that added to fstests?


I'm completely OK to add fstests test case.
While the concern is still the same: we need better wrapper to detect 
chunk layout.


Or we can only have static test case to test chunk-level degradable check.

Thanks,
Qu



I'm going to add this patchset to the devel queue (ie. not a separate
for-next branch anymore).
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 13/13] btrfs: clean up extraneous computations in add_delayed_refs

2017-07-12 Thread David Sterba
On Wed, Jul 12, 2017 at 04:20:11PM -0600, Edmund Nadolski wrote:
> Repeating the same computation in multiple places is not
> necessary.
> 
> Signed-off-by: Edmund Nadolski 
> Signed-off-by: Jeff Mahoney 

Reviewed-by: David Sterba 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 11/13] btrfs: add cond_resched() calls when resolving backrefs

2017-07-12 Thread David Sterba
On Wed, Jul 12, 2017 at 04:20:09PM -0600, Edmund Nadolski wrote:
> Since backref resolution is CPU-intensive, the cond_resched calls
> should help alleviate soft lockup occurrences.
> 
> Signed-off-by: Edmund Nadolski 
> Signed-off-by: Jeff Mahoney 

Reviewed-by: David Sterba 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 10/13] btrfs: backref, add tracepoints for prelim_ref insertion and merging

2017-07-12 Thread David Sterba
On Wed, Jul 12, 2017 at 04:20:08PM -0600, Edmund Nadolski wrote:
> From: Jeff Mahoney 
> 
> This patch adds a tracepoint event for prelim_ref insertion and
> merging.  For each, the ref being inserted or merged and the count
> of tree nodes is issued.
> 
> Signed-off-by: Jeff Mahoney 

Reviewed-by: David Sterba 

I've changed "key=[%llu %u %llu]" -> "key=[%llu,%u,%llu]" and updated
the wiki.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v4 0/6] Chunk level degradable check

2017-07-12 Thread David Sterba
On Mon, Jul 10, 2017 at 09:11:50PM +0300, Dmitrii Tcvetkov wrote:
> Tested on top of current mainline master (commit
> af3c8d98508d37541d4bf57f13a984a7f73a328c). Didn't find any
> regressions.

Thanks for testing.

If anybody wants to get their Tested-by in the patches, please let me
know, I'll add the tags (can be done until rc6). Your efforts are
appreciated so you get the credit in the undying history log of linux
kernel.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 09/13] btrfs: add a node counter to each of the rbtrees

2017-07-12 Thread David Sterba
On Wed, Jul 12, 2017 at 04:20:07PM -0600, Edmund Nadolski wrote:
> From: Jeff Mahoney 
> 
> This patch adds counters to each of the rbtrees so that we can tell
> how large they are growing for a given workload.  These counters
> will be exported by tracepoints in the next patch.
> 
> Signed-off-by: Jeff Mahoney 

Reviewed-by: David Sterba 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v4 0/6] Chunk level degradable check

2017-07-12 Thread Adam Borowski
On Thu, Jul 13, 2017 at 02:50:10AM +0200, David Sterba wrote:
> On Mon, Jul 10, 2017 at 09:11:50PM +0300, Dmitrii Tcvetkov wrote:
> > Tested on top of current mainline master (commit
> > af3c8d98508d37541d4bf57f13a984a7f73a328c). Didn't find any
> > regressions.

I've retested this yet again.  No regressions as well.

> Thanks for testing.
> 
> If anybody wants to get their Tested-by in the patches, please let me
> know, I'll add the tags (can be done until rc6). Your efforts are
> appreciated so you get the credit in the undying history log of linux
> kernel.

Heh.  Let's not play such games but finally get this patch set in, it's by
far the biggest problem for multi-device.

Any issues with degraded mounts which I reported before are not related to
this patch set, I merely found them at the time of testing.


Meow!
-- 
⢀⣴⠾⠻⢶⣦⠀ 
⣾⠁⢠⠒⠀⣿⡁ A dumb species has no way to open a tuna can.
⢿⡄⠘⠷⠚⠋⠀ A smart species invents a can opener.
⠈⠳⣄ A master species delegates.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: BTRFS: error (device dm-2) in btrfs_run_delayed_refs:2960: errno=-17 Object already exists

2017-07-12 Thread Marc MERLIN
On Tue, Jul 11, 2017 at 09:48:12AM -0700, Marc MERLIN wrote:
> On Tue, Jul 11, 2017 at 10:00:40AM -0600, Chris Murphy wrote:
> > > ---[ end trace feb4b95c83ac065f ]---
> > > BTRFS: error (device dm-2) in btrfs_run_delayed_refs:2960: errno=-17 
> > > Object already exists
> > > BTRFS info (device dm-2): forced readonly
> >
> > You've already had this same traceback, not sure whether it's the same
> > file system or not, but it was 4.7.2 kernel.
>
> You have better memory than me. I'll admit that I'm kind of overwhelmed
> by all the time I'm currently spending/wasting on btrfs recovery and
> that came almost out of nowhere and hit me in 3 different places :-/

Ok, I'm on 4.9.36 and same problem :(

This is on an otherwise ok working filesystem that comes back clean 
on btrfs check (although I haven't done lowmem but last time I tried lowmem it
reported problems that apparently weren't really problems)

Dear devs, what does this error mean exactly and what should I do about it 
besides
ignoring it and remounting my FS read-write?
On the plus side thanks for both
1) showing which device the error is on
2) not crashing the system :)

WARNING: CPU: 6 PID: 3730 at fs/btrfs/extent-tree.c:2967 
btrfs_run_delayed_refs+0xbd/0x1be
BTRFS: Transaction aborted (error -17)
CPU: 0 PID: 3730 Comm: btrfs-cleaner Tainted: G U  W   
4.9.36-amd64-preempt-sysrq-20170

Hardware name: System manufacturer System Product Name/P8H67-M PRO, BIOS 3904 
04/27/2013
 b55c679bfc88 8239b00b b55c679bfcd8 
 b55c679bfcc8 82066769 0b97679bfd48 a07f61a5eaa0
 a086f217c800 ffef a086ad8b5a90 03a0
Call Trace:
 [] dump_stack+0x61/0x7d
 [] __warn+0xc2/0xdd
 [] warn_slowpath_fmt+0x5a/0x76
 [] btrfs_run_delayed_refs+0xbd/0x1be
 [] ? walk_up_tree+0x87/0x10f
 [] btrfs_should_end_transaction+0x54/0x5d
 [] btrfs_drop_snapshot+0x380/0x65c
 [] ? btrfs_kill_all_delayed_nodes+0x5f/0xd7
 [] ? _raw_spin_lock+0x15/0x17
 [] ? btrfs_delete_unused_bgs+0x326/0x369
 [] btrfs_clean_one_deleted_snapshot+0xce/0xdc
 [] cleaner_kthread+0xaf/0x17c
 [] ? btrfs_need_cleaner_sleep.isra.25+0x2c/0x2c
 [] kthread+0xd1/0xd9
 [] ? init_completion+0x24/0x24
 [] ? do_fast_syscall_32+0xb7/0xfe
 [] ret_from_fork+0x25/0x30
---[ end trace 59fd1c9a379f73bc ]---
BTRFS: error (device dm-2) in btrfs_run_delayed_refs:2967: errno=-17 Object 
already exists
BTRFS info (device dm-2): forced readonly
-- 
"A mouse is a device used to point at the xterm you want to type in" - A.S.R.
Microsoft is to operating systems 
   what McDonalds is to gourmet cooking
Home page: http://marc.merlins.org/
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: My Second Email to You, Pls Reply Me

2017-07-12 Thread Makl Na
Hello Dear,

How are you doing? I hope you are doing well. I am writing as I have written to 
you previously without any response from you. I hope all is well with you.I 
will appreciate if you will acknowledge your receipt of this mail.

Thank you and have a good day.

Miss Naya

Please Write Me at My Private e-mail which i used to send you the previous 
e-mail( sgtmarkd2...@lycos.com )




 KILL Mail Shield Gateway scanned 


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] btrfs-progs: add support to sort by topid

2017-07-12 Thread Anand Jain
Users generally organize subvols and snapshots based on the subvol
directory hierarchy, so providing the ability to sort them by topid would
help. Thanks.

Signed-off-by: Anand Jain 
---
 btrfs-list.c | 18 ++
 btrfs-list.h |  1 +
 cmds-subvolume.c |  8 
 3 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/btrfs-list.c b/btrfs-list.c
index 92a537f425f3..733e22bc2524 100644
--- a/btrfs-list.c
+++ b/btrfs-list.c
@@ -188,11 +188,28 @@ static int comp_entry_with_path(struct root_info *entry1,
return is_descending ? -ret : ret;
 }
 
+static int comp_entry_with_topid(struct root_info *entry1,
+   struct root_info *entry2,
+   int is_descending)
+{
+   int ret;
+
+   if (entry1->top_id > entry2->top_id)
+   ret = 1;
+   else if (entry1->top_id < entry2->top_id)
+   ret = -1;
+   else
+   ret = 0;
+
+   return is_descending ? -ret : ret;
+}
+
 static btrfs_list_comp_func all_comp_funcs[] = {
[BTRFS_LIST_COMP_ROOTID]= comp_entry_with_rootid,
[BTRFS_LIST_COMP_OGEN]  = comp_entry_with_ogen,
[BTRFS_LIST_COMP_GEN]   = comp_entry_with_gen,
[BTRFS_LIST_COMP_PATH]  = comp_entry_with_path,
+   [BTRFS_LIST_COMP_TOPID] = comp_entry_with_topid,
 };
 
 static char *all_sort_items[] = {
@@ -200,6 +217,7 @@ static char *all_sort_items[] = {
[BTRFS_LIST_COMP_OGEN]  = "ogen",
[BTRFS_LIST_COMP_GEN]   = "gen",
[BTRFS_LIST_COMP_PATH]  = "path",
+   [BTRFS_LIST_COMP_TOPID] = "topid",
[BTRFS_LIST_COMP_MAX]   = NULL,
 };
 
diff --git a/btrfs-list.h b/btrfs-list.h
index 6e5fc7784fb1..9a3869a452a8 100644
--- a/btrfs-list.h
+++ b/btrfs-list.h
@@ -152,6 +152,7 @@ enum btrfs_list_comp_enum {
BTRFS_LIST_COMP_OGEN,
BTRFS_LIST_COMP_GEN,
BTRFS_LIST_COMP_PATH,
+   BTRFS_LIST_COMP_TOPID,
BTRFS_LIST_COMP_MAX,
 };
 
diff --git a/cmds-subvolume.c b/cmds-subvolume.c
index de5e0da0ff48..17b680d26e86 100644
--- a/cmds-subvolume.c
+++ b/cmds-subvolume.c
@@ -416,10 +416,10 @@ static const char * const cmd_subvol_list_usage[] = {
"-C [+|-]value",
" filter the subvolumes by ogeneration",
" (+value: >= value; -value: <= value; value: = value)",
-   "--sort=gen,ogen,rootid,path",
-   " list the subvolume in order of gen, ogen, rootid or path",
-   " you also can add '+' or '-' in front of each items.",
-   " (+:ascending, -:descending, ascending default)",
+   "--sort=gen,ogen,rootid,path,topid",
+   " list the subvolumes in order of gen, ogen, rootid, path",
+   " or topid. You also can add '+' or '-' in front of each",
+   " items. (+:ascending, -:descending, ascending default)",
NULL,
 };
 
-- 
2.13.1

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


degraded raid scribbling upon wrong device

2017-07-12 Thread Adam Borowski
Hi!
Here's a set of test cases, two of them in some cases seem to scribble upon
the wrong device:

* deg-mid-missing
* deg-last-replaced (not on the innocent "re")
* but never deg-last-missing

When all goes ok, there are no errors other than wrong generation on the
re-added disk (expected).   When it goes bad, there's a lot of corruption.
In all cases, though, the "Device missing:" field is wrong.

I'm not yet sure how to trigger this, perhaps someone would have a clue?

8:30am, hitting the sack, will try again tomorrow.


Meow!
-- 
⢀⣴⠾⠻⢶⣦⠀ 
⣾⠁⢠⠒⠀⣿⡁ A dumb species has no way to open a tuna can.
⢿⡄⠘⠷⠚⠋⠀ A smart species invents a can opener.
⠈⠳⣄ A master species delegates.
#!/bin/sh
# Reproducer: build a four-device btrfs raid1 filesystem on loop devices,
# mount it degraded with one of the middle backing files (rc) left out,
# write to it, then re-attach all four devices and scrub.
# Presumably this is the "deg-mid-missing" case named in the accompanying
# mail -- confirm against the original attachment name.
# Must run as root; it detaches ALL loop devices on the host (losetup -D).

# Abort on the first failing command and echo every command as it runs.
set -e
set -x

# Start from a clean state. "||:" swallows the umount failure when nothing
# is mounted, so "set -e" does not stop the script here.
umount /mnt/vol1 ||:
losetup -D

# Create four backing files: one 1 MiB block written at a 4095 MiB offset
# gives each file a length of 4 GiB.
dd if=/dev/zero bs=1048576 count=1 seek=4095 of=ra
dd if=/dev/zero bs=1048576 count=1 seek=4095 of=rb
dd if=/dev/zero bs=1048576 count=1 seek=4095 of=rc
dd if=/dev/zero bs=1048576 count=1 seek=4095 of=rd

# raid1 for both data and metadata across all four files.
mkfs.btrfs -draid1 -mraid1 ra rb rc rd

# Phase 1: attach all four devices, populate the filesystem, unmount.
# NOTE(review): the mounts below assume "ra" lands on /dev/loop0 because the
# loop devices were all detached just before -- verify on the test host.
losetup -D
losetup -f ra
losetup -f rb
losetup -f rc
losetup -f rd
sleep 1
mount /dev/loop0 /mnt/vol1
cp -pr /bin /mnt/vol1
btrfs fi sync /mnt/vol1
btrfs fi us /mnt/vol1
umount /mnt/vol1

# Phase 2: re-attach without rc, mount degraded and write new data.
losetup -D
losetup -f ra
losetup -f rb
losetup -f rd
sleep 1
mount -odegraded /dev/loop0 /mnt/vol1
btrfs fi us /mnt/vol1
# NOTE(review): "count=" has no value in this copy; dd will reject it and,
# with "set -e", the script stops here. The number was presumably lost when
# the attachment was archived -- restore it before running.
dd if=/dev/zero of=/mnt/vol1/foo bs=1048576 count=
umount /mnt/vol1

# Phase 3: bring all four devices back and scrub in the foreground (-B).
losetup -D
losetup -f ra
losetup -f rb
losetup -f rc
losetup -f rd
sleep 1
mount /dev/loop0 /mnt/vol1
btrfs scrub start -B /mnt/vol1
#!/bin/sh
# Reproducer: build a four-device btrfs raid1 filesystem on loop devices,
# mount it degraded with the last backing file (rd) left out, write to it,
# then re-attach all four devices and scrub.
# Presumably this is the "deg-last-missing" case named in the accompanying
# mail -- confirm against the original attachment name.
# Must run as root; it detaches ALL loop devices on the host (losetup -D).

# Abort on the first failing command and echo every command as it runs.
set -e
set -x

# Start from a clean state. "||:" swallows the umount failure when nothing
# is mounted, so "set -e" does not stop the script here.
umount /mnt/vol1 ||:
losetup -D

# Create four backing files: one 1 MiB block written at a 4095 MiB offset
# gives each file a length of 4 GiB.
dd if=/dev/zero bs=1048576 count=1 seek=4095 of=ra
dd if=/dev/zero bs=1048576 count=1 seek=4095 of=rb
dd if=/dev/zero bs=1048576 count=1 seek=4095 of=rc
dd if=/dev/zero bs=1048576 count=1 seek=4095 of=rd

# raid1 for both data and metadata across all four files.
mkfs.btrfs -draid1 -mraid1 ra rb rc rd

# Phase 1: attach all four devices, populate the filesystem, unmount.
# NOTE(review): the mounts below assume "ra" lands on /dev/loop0 because the
# loop devices were all detached just before -- verify on the test host.
losetup -D
losetup -f ra
losetup -f rb
losetup -f rc
losetup -f rd
sleep 1
mount /dev/loop0 /mnt/vol1
cp -pr /bin /mnt/vol1
btrfs fi sync /mnt/vol1
btrfs fi us /mnt/vol1
umount /mnt/vol1

# Phase 2: re-attach without rd, mount degraded and write new data.
losetup -D
losetup -f ra
losetup -f rb
losetup -f rc
sleep 1
mount -odegraded /dev/loop0 /mnt/vol1
btrfs fi us /mnt/vol1
# NOTE(review): "count=" has no value in this copy; dd will reject it and,
# with "set -e", the script stops here. The number was presumably lost when
# the attachment was archived -- restore it before running.
dd if=/dev/zero of=/mnt/vol1/foo bs=1048576 count=
umount /mnt/vol1

# Phase 3: bring all four devices back and scrub in the foreground (-B).
losetup -D
losetup -f ra
losetup -f rb
losetup -f rc
losetup -f rd
sleep 1
mount /dev/loop0 /mnt/vol1
btrfs scrub start -B /mnt/vol1
#!/bin/sh
# Reproducer: build a four-device btrfs raid1 filesystem on loop devices,
# mount it degraded with the last backing file (rd) swapped for a blank
# file (re) that was never part of the filesystem, write to it, then
# re-attach the original four devices and scrub.
# Presumably this is the "deg-last-replaced" case named in the accompanying
# mail -- confirm against the original attachment name.
# Must run as root; it detaches ALL loop devices on the host (losetup -D).

# Abort on the first failing command and echo every command as it runs.
set -e
set -x

# Start from a clean state. "||:" swallows the umount failure when nothing
# is mounted, so "set -e" does not stop the script here.
umount /mnt/vol1 ||:
losetup -D

# Create five backing files: one 1 MiB block written at a 4095 MiB offset
# gives each file a length of 4 GiB. "re" is the blank stand-in device.
dd if=/dev/zero bs=1048576 count=1 seek=4095 of=ra
dd if=/dev/zero bs=1048576 count=1 seek=4095 of=rb
dd if=/dev/zero bs=1048576 count=1 seek=4095 of=rc
dd if=/dev/zero bs=1048576 count=1 seek=4095 of=rd
dd if=/dev/zero bs=1048576 count=1 seek=4095 of=re

# raid1 for both data and metadata; "re" is deliberately not included.
mkfs.btrfs -draid1 -mraid1 ra rb rc rd

# Phase 1: attach the four member devices, populate the filesystem, unmount.
# NOTE(review): the mounts below assume "ra" lands on /dev/loop0 because the
# loop devices were all detached just before -- verify on the test host.
losetup -D
losetup -f ra
losetup -f rb
losetup -f rc
losetup -f rd
sleep 1
mount /dev/loop0 /mnt/vol1
cp -pr /bin /mnt/vol1
btrfs fi sync /mnt/vol1
btrfs fi us /mnt/vol1
umount /mnt/vol1

# Phase 2: attach "re" in the slot where "rd" was, mount degraded and write.
losetup -D
losetup -f ra
losetup -f rb
losetup -f rc
losetup -f re
sleep 1
mount -odegraded /dev/loop0 /mnt/vol1
btrfs fi us /mnt/vol1
# NOTE(review): "count=" has no value in this copy; dd will reject it and,
# with "set -e", the script stops here. The number was presumably lost when
# the attachment was archived -- restore it before running.
dd if=/dev/zero of=/mnt/vol1/foo bs=1048576 count=
umount /mnt/vol1

# Phase 3: bring the original four devices back and scrub in the
# foreground (-B).
losetup -D
losetup -f ra
losetup -f rb
losetup -f rc
losetup -f rd
sleep 1
mount /dev/loop0 /mnt/vol1
btrfs scrub start -B /mnt/vol1