[PATCH] btrfs-progs: convert: Add support to rollback new convert image

2016-05-30 Thread Qu Wenruo
The new btrfs-convert is less restrictive about metadata chunk allocation.
Since may_rollback() still enforces the strict 1:1 chunk mapping check for
all chunks, it refuses to roll back some images created by the new convert.

Add a new per-extent check for new convert images so that they can be
rolled back.

Signed-off-by: Qu Wenruo 
---
 btrfs-convert.c | 79 -
 1 file changed, 78 insertions(+), 1 deletion(-)

diff --git a/btrfs-convert.c b/btrfs-convert.c
index f776a4f..c28dae6 100644
--- a/btrfs-convert.c
+++ b/btrfs-convert.c
@@ -2315,6 +2315,65 @@ fail:
return -1;
 }
 
+/*
+ * Check if a non 1:1 mapped chunk can be rolled back.
+ * For new convert it's allowed, while for old convert it is not.
+ */
+static int may_rollback_chunk(struct btrfs_fs_info *fs_info, u64 bytenr)
+{
+   struct btrfs_block_group_cache *bg;
+   struct btrfs_key key;
+   struct btrfs_path path;
+   struct btrfs_root *extent_root = fs_info->extent_root;
+   u64 bg_start;
+   u64 bg_end;
+   int ret;
+
+   bg = btrfs_lookup_first_block_group(fs_info, bytenr);
+   if (!bg)
+   return -ENOENT;
+   bg_start = bg->key.objectid;
+   bg_end = bg->key.objectid + bg->key.offset;
+
+   key.objectid = bg_end;
+   key.type = BTRFS_METADATA_ITEM_KEY;
+   key.offset = 0;
+   btrfs_init_path(&path);
+
+   ret = btrfs_search_slot(NULL, extent_root, &key, &path, 0, 0);
+   if (ret < 0)
+   return ret;
+
+   while (1) {
+   struct btrfs_extent_item *ei;
+
+   ret = btrfs_previous_extent_item(extent_root, &path, bg_start);
+   if (ret > 0) {
+   ret = 0;
+   break;
+   }
+   if (ret < 0)
+   break;
+
+   btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
+   if (key.type == BTRFS_METADATA_ITEM_KEY)
+   continue;
+   /* Now it's EXTENT_ITEM_KEY only */
+   ei = btrfs_item_ptr(path.nodes[0], path.slots[0],
+   struct btrfs_extent_item);
+   /*
+* Found a data extent, which means this chunk comes from
+* old convert and must follow the 1:1 mapping.
+*/
+   if (btrfs_extent_flags(path.nodes[0], ei) & 
BTRFS_EXTENT_FLAG_DATA) {
+   ret = -EINVAL;
+   break;
+   }
+   }
+   btrfs_release_path(&path);
+   return ret;
+}
+
 static int may_rollback(struct btrfs_root *root)
 {
struct btrfs_fs_info *info = root->fs_info;
@@ -2351,8 +2410,26 @@ static int may_rollback(struct btrfs_root *root)
physical = multi->stripes[0].physical;
kfree(multi);
 
-   if (num_stripes != 1 || physical != bytenr)
+   if (num_stripes != 1) {
+   error("Num stripes for bytenr %llu is not 1\n",
+ bytenr);
goto fail;
+   }
+
+   /*
+* Extra check for new convert: metadata chunks from new
+* convert have much more freedom than old convert and do
+* not need to follow the 1:1 mapping.
+*/
+   if (physical != bytenr) {
+   /*
+* Check if it's a metadata chunk that contains only
+* metadata extents
+*/
+   ret = may_rollback_chunk(info, bytenr);
+   if (ret < 0)
+   goto fail;
+   }
 next:
bytenr += length;
if (bytenr >= total_bytes)
-- 
2.8.3





Re: [PATCH] btrfs-progs: tests: run rollback after conversion

2016-05-30 Thread Qu Wenruo

Problem located.

may_rollback() is still the old code, while for new convert's metadata it's
completely OK if its physical position differs from its logical address,
unlike the strict old condition.

I'll add more checks to ensure a new convert image can pass the may_rollback() check.

Thanks,
Qu

Qu Wenruo wrote on 2016/05/31 09:13 +0800:



David Sterba wrote on 2016/05/30 17:56 +0200:

Hi Qu,

the convert patchset does not pass a rollback test, fails in the case of
32k nodesize. There's not much info why, just 'rollback failed'.

The branch that passes is 'test-rollback', it's current devel without
the convert and low-mem fsck patchsets.



Pretty strange.

I manually tested an ext4 filesystem filled with my /etc, using 32K
nodesize and the default incompat flags; both convert and rollback
succeeded without problem.

The same goes for convert-tests.sh.
All tests passed here, just as when we developed the patchset.

The commit head I tested is 2a7c68a4e46f4713e746d6e977e9c4cf27913ce3.

Do you have more info on the situation where the rollback fails?
Like the commit head and test method (I assume it's the rollback test
from btrfs-progs)?

Thanks,
Qu







[PATCH] fstests: btrfs: add test for qgroup handle de-refer

2016-05-30 Thread Lu Fengqi
Test whether qgroup can handle extent de-reference during reallocation.
Although the current qgroup code can handle it, we still need to prevent
any regression that may break it.

Signed-off-by: Lu Fengqi 
---
 common/rc   |  4 +--
 tests/btrfs/028 | 91 +
 tests/btrfs/028.out |  2 ++
 tests/btrfs/group   |  1 +
 4 files changed, 96 insertions(+), 2 deletions(-)
 create mode 100755 tests/btrfs/028
 create mode 100644 tests/btrfs/028.out

diff --git a/common/rc b/common/rc
index 51092a0..650d198 100644
--- a/common/rc
+++ b/common/rc
@@ -3284,9 +3284,9 @@ _btrfs_get_profile_configs()
 # stress btrfs by running balance operation in a loop
 _btrfs_stress_balance()
 {
-   local btrfs_mnt=$1
+   local options=$@
while true; do
-   $BTRFS_UTIL_PROG balance start $btrfs_mnt
+   $BTRFS_UTIL_PROG balance start $options
done
 }
 
diff --git a/tests/btrfs/028 b/tests/btrfs/028
new file mode 100755
index 000..6e0ad36
--- /dev/null
+++ b/tests/btrfs/028
@@ -0,0 +1,91 @@
+#! /bin/bash
+# FS QA Test 028
+#
+#---
+# Copyright (c) 2016 Fujitsu. All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it would be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write the Free Software Foundation,
+# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+#---
+#
+
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+
+here=`pwd`
+tmp=/tmp/$$
+status=1   # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+_cleanup()
+{
+   cd /
+   rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+
+# remove previous $seqres.full before test
+rm -f $seqres.full
+
+# real QA test starts here
+_supported_fs btrfs
+_supported_os Linux
+_require_scratch
+
+# Currently in btrfs the node/leaf size can not be smaller than the page
+# size (but it can be greater than the page size). So use the largest
+# supported node/leaf size (64Kb) so that the test can run on any platform
+# that Linux supports.
+_scratch_mkfs "--nodesize 64k"
+_scratch_mount
+
+_run_btrfs_util_prog quota enable $SCRATCH_MNT
+_run_btrfs_util_prog quota rescan -w $SCRATCH_MNT
+
+# Increase the probability of operations that de-reference extents, and
+# decrease the others.
+args=`_scale_fsstress_args -z \
+   -f write=10 -f unlink=10 \
+   -f creat=10 -f fsync=10 \
+   -f fsync=10 -n 10 -p 2 \
+   -d $SCRATCH_MNT/stress_dir`
+echo "Run fsstress $args" >>$seqres.full
+$FSSTRESS_PROG $args >/dev/null 2>&1 &
+fsstress_pid=$!
+
+echo "Start balance" >>$seqres.full
+_btrfs_stress_balance -d $SCRATCH_MNT >/dev/null 2>&1 &
+balance_pid=$!
+
+# 30s is enough to trigger the bug
+sleep $((30*$TIME_FACTOR))
+kill $fsstress_pid $balance_pid
+wait
+
+# killing _btrfs_stress_balance does not stop a running balance, so call
+# btrfs balance cancel to cancel a running or paused balance.
+$BTRFS_UTIL_PROG balance cancel $SCRATCH_MNT &> /dev/null
+
+rm -rf $SCRATCH_MNT/*
+_run_btrfs_util_prog filesystem sync $SCRATCH_MNT
+units=`_btrfs_qgroup_units`
+$BTRFS_UTIL_PROG qgroup show $units $SCRATCH_MNT | $SED_PROG -n '/[0-9]/p' | \
+   $AWK_PROG '{print $2" "$3}'
+
+# success, all done
+status=0
+exit
diff --git a/tests/btrfs/028.out b/tests/btrfs/028.out
new file mode 100644
index 000..69b68bf
--- /dev/null
+++ b/tests/btrfs/028.out
@@ -0,0 +1,2 @@
+QA output created by 028
+65536 65536
diff --git a/tests/btrfs/group b/tests/btrfs/group
index da0e27f..35ecf59 100644
--- a/tests/btrfs/group
+++ b/tests/btrfs/group
@@ -30,6 +30,7 @@
 025 auto quick send clone
 026 auto quick compress prealloc
 027 auto replace
+028 auto qgroup balance
 029 auto quick clone
 030 auto quick send
 031 auto quick subvol clone
-- 
2.5.5





Re: btrfs filesystem keeps allocating new chunks for no apparent reason

2016-05-30 Thread Qu Wenruo



Hans van Kranenburg wrote on 2016/05/06 23:28 +0200:

Hi,

I've got a mostly inactive btrfs filesystem inside a virtual machine
somewhere that shows interesting behaviour: while no interesting disk
activity is going on, btrfs keeps allocating new chunks, a GiB at a time.

A picture, telling more than 1000 words:
https://syrinx.knorrie.org/~knorrie/btrfs/keep/btrfs_usage_ichiban.png
(when the amount of allocated/unused goes down, I did a btrfs balance)


Nice picture.
Really better than 1000 words.

AFAIK, the problem may be caused by fragmentation.

I even saw some early prototypes in the code to allow btrfs to allocate
smaller extents than requested.

(E.g. the caller needs a 2M extent, but btrfs returns two 1M extents)

But it's still a prototype and it seems no one is really working on it now.

So when btrfs writes new data, for example about 16M, it needs to allocate
a 16M contiguous extent, and if it can't find a large enough free range,
it creates a new data chunk.


Besides the already awesome chunk-level usage picture, I hope there is
info about extent-level allocation to confirm my assumption.


You could dump it by calling "btrfs-debug-tree -t 2 <device>".
It's normally recommended to do this unmounted, but it's still possible to
run it on a mounted filesystem, although the result is not 100% accurate then.
(Then I'd better find a good way to draw a picture of allocated/unallocated
space and how fragmented the chunks are.)


Thanks,
Qu


Linux ichiban 4.5.0-0.bpo.1-amd64 #1 SMP Debian 4.5.1-1~bpo8+1
(2016-04-20) x86_64 GNU/Linux

# btrfs fi show /
Label: none  uuid: 9881fc30-8f69-4069-a8c8-c057b842b0c4
Total devices 1 FS bytes used 6.17GiB
devid1 size 20.00GiB used 16.54GiB path /dev/xvda

# btrfs fi df /
Data, single: total=15.01GiB, used=5.16GiB
System, single: total=32.00MiB, used=16.00KiB
Metadata, single: total=1.50GiB, used=1.01GiB
GlobalReserve, single: total=144.00MiB, used=0.00B

I'm a bit puzzled, since I haven't seen this happening on other
filesystems that use 4.4 or 4.5 kernels.

If I dump the allocated chunks and their % usage, it's clear that the
last 6 new added ones have a usage of only a few percent.

dev item devid 1 total bytes 21474836480 bytes used 17758683136
chunk vaddr 12582912 type 1 stripe 0 devid 1 offset 12582912 length
8388608 used 4276224 used_pct 50
chunk vaddr 1103101952 type 1 stripe 0 devid 1 offset 2185232384 length
1073741824 used 433127424 used_pct 40
chunk vaddr 3250585600 type 1 stripe 0 devid 1 offset 4332716032 length
1073741824 used 764391424 used_pct 71
chunk vaddr 9271508992 type 1 stripe 0 devid 1 offset 12079595520 length
1073741824 used 270704640 used_pct 25
chunk vaddr 12492734464 type 1 stripe 0 devid 1 offset 13153337344
length 1073741824 used 866574336 used_pct 80
chunk vaddr 13566476288 type 1 stripe 0 devid 1 offset 11005853696
length 1073741824 used 1028059136 used_pct 95
chunk vaddr 14640218112 type 1 stripe 0 devid 1 offset 3258974208 length
1073741824 used 762466304 used_pct 71
chunk vaddr 26250051584 type 1 stripe 0 devid 1 offset 19595788288
length 1073741824 used 114982912 used_pct 10
chunk vaddr 31618760704 type 1 stripe 0 devid 1 offset 15300820992
length 1073741824 used 488902656 used_pct 45
chunk vaddr 32692502528 type 4 stripe 0 devid 1 offset 5406457856 length
268435456 used 209272832 used_pct 77
chunk vaddr 32960937984 type 4 stripe 0 devid 1 offset 5943328768 length
268435456 used 251199488 used_pct 93
chunk vaddr 33229373440 type 4 stripe 0 devid 1 offset 7419723776 length
268435456 used 248709120 used_pct 92
chunk vaddr 33497808896 type 4 stripe 0 devid 1 offset 8896118784 length
268435456 used 247791616 used_pct 92
chunk vaddr 33766244352 type 4 stripe 0 devid 1 offset 8627683328 length
268435456 used 93061120 used_pct 34
chunk vaddr 34303115264 type 2 stripe 0 devid 1 offset 6748635136 length
33554432 used 16384 used_pct 0
chunk vaddr 34336669696 type 1 stripe 0 devid 1 offset 16374562816
length 1073741824 used 105054208 used_pct 9
chunk vaddr 35410411520 type 1 stripe 0 devid 1 offset 20971520 length
1073741824 used 10899456 used_pct 1
chunk vaddr 36484153344 type 1 stripe 0 devid 1 offset 1094713344 length
1073741824 used 441778176 used_pct 41
chunk vaddr 37557895168 type 4 stripe 0 devid 1 offset 5674893312 length
268435456 used 33439744 used_pct 12
chunk vaddr 37826330624 type 1 stripe 0 devid 1 offset 9164554240 length
1073741824 used 32096256 used_pct 2
chunk vaddr 38900072448 type 1 stripe 0 devid 1 offset 14227079168
length 1073741824 used 40140800 used_pct 3
chunk vaddr 39973814272 type 1 stripe 0 devid 1 offset 17448304640
length 1073741824 used 58093568 used_pct 5
chunk vaddr 41047556096 type 1 stripe 0 devid 1 offset 18522046464
length 1073741824 used 119701504 used_pct 11

The only things this host does is
 1) being a webserver for a small internal debian packages repository
 2) running low-volume mailman with a few lists, no archive-gzipping
mega cronjobs or anything enabled.
 3) some little legacy php thingies

Re: [PATCH] btrfs-progs: tests: run rollback after conversion

2016-05-30 Thread Qu Wenruo



David Sterba wrote on 2016/05/30 17:56 +0200:

Hi Qu,

the convert patchset does not pass a rollback test, fails in the case of
32k nodesize. There's not much info why, just 'rollback failed'.

The branch that passes is 'test-rollback', it's current devel without
the convert and low-mem fsck patchsets.



Pretty strange.

I manually tested an ext4 filesystem filled with my /etc, using 32K
nodesize and the default incompat flags; both convert and rollback
succeeded without problem.


The same goes for convert-tests.sh.
All tests passed here, just as when we developed the patchset.

The commit head I tested is 2a7c68a4e46f4713e746d6e977e9c4cf27913ce3.

Do you have more info on the situation where the rollback fails?
Like the commit head and test method (I assume it's the rollback test
from btrfs-progs)?


Thanks,
Qu




Re: [PATCH] btrfs-progs: tests: run rollback after conversion

2016-05-30 Thread Qu Wenruo

Thanks for the report.

We will check it and fix it soon.

Thanks,
Qu

David Sterba wrote on 2016/05/30 17:56 +0200:

Hi Qu,

the convert patchset does not pass a rollback test, fails in the case of
32k nodesize. There's not much info why, just 'rollback failed'.

The branch that passes is 'test-rollback', it's current devel without
the convert and low-mem fsck patchsets.







Re: [PATCH] btrfs: fix check_shared for fiemap ioctl

2016-05-30 Thread Qu Wenruo



David Sterba wrote on 2016/05/30 17:15 +0200:

On Fri, May 27, 2016 at 09:39:53AM +0800, Qu Wenruo wrote:

Any comment?

This patch not only fixes the submitted generic/352[1] and generic/353[2]
test cases, but also introduces a much better structure and design for
later backref walk use.

Instead of using a list and doing an O(n^3)~O(n^4) iteration for the fiemap
ioctl on a reflinked (deduped) file, the SHARED flag check is now only
O(n)~O(nlogn), which is enough to pass generic/352.


This is a good improvement, though there's potentially hidden cost in
the allocations and maintaining the temporary structures. Do you have
actual performance numbers?



Test case generic/352 is already the proof.

For a 1G file whose file extents all point to the same 128K file extent,
fiemap will just soft lockup during the backref walk.

Even for case (*), where it doesn't hang, it takes about 3~4 seconds to do
check_shared() for one extent.



With this patch, check_shared() returns immediately if it finds any other
delayed ref/extent ref, making the whole fiemap ioctl super fast for the
deduped file case.


*: Two 1G files inside the same subvolume, pointing to the same 128K
file extent, will not cause a soft lockup.



We could use such a patch to rework the current backref walk code, which
does unneeded list iteration instead of a faster rb-tree search, but that
would be too aggressive, so we want to do it step by step, starting from
these bugs exposed by in-band dedupe.


Thanks,
Qu




Re: btrfs filesystem keeps allocating new chunks for no apparent reason

2016-05-30 Thread Duncan
Hans van Kranenburg posted on Mon, 30 May 2016 23:18:20 +0200 as
excerpted:

>> Snip the dump, but curious as a user (not a dev) what command you used.
>> Presumably one of the debug commands which I'm not particularly
>> familiar with, but I wasn't aware it was even possible.
> 
> It's the output of a little programming exercise calling the search
> ioctl from python. https://github.com/knorrie/btrfs-heatmap
> 
> While using balance I got interested in knowing where balance got the
> information from to find how much % a chunk is used. I want to see that
> list in advance, so I can see what -dusage the most effective would be.
> My munin graphs show the stacked total value, which does not give you an
> idea about how badly the unused space is fragmented over already
> allocated chunks.
> 
> So, with some help of Hugo on IRC to get started, I ended up with this
> PoC, which can create nice movies of your data moving around over the
> physical space of the filesystem over time, like this one:
> 
> https://syrinx.knorrie.org/~knorrie/btrfs/heatmap.gif
> 
> Seeing the chunk allocator work its way around the two devices, choosing
> the one with the most free space, and reusing the gaps left by balance
> is super interesting. :-]

Very cool indeed.  Reminds me of the nice eye candy dynamic graphicals 
that MS defrag had back in 9x times.  (I've no idea what they have now as 
I've been off the platform for a decade and a half now.)

I may have to play with it a bit, when I have more time (I'm moving in a 
couple days...).

-- 
Duncan - List replies preferred.   No HTML msgs.
"Every nonfree program has a lord, a master --
and if you use the program, he is your master."  Richard Stallman



Re: btrfs filesystem keeps allocating new chunks for no apparent reason

2016-05-30 Thread Hans van Kranenburg

On 05/30/2016 09:55 PM, Duncan wrote:

Hans van Kranenburg posted on Mon, 30 May 2016 13:07:26 +0200 as
excerpted:

[Please don't post "upside down".  Reply in context under the quoted
point, here the whole post, you're replying to.  It makes further replies
in context far easier. =:^)  I've pasted your update at the bottom here.]


Sure, thanks.


On 05/06/2016 11:28 PM, Hans van Kranenburg wrote:


I've got a mostly inactive btrfs filesystem inside a virtual machine
somewhere that shows interesting behaviour: while no interesting disk
activity is going on, btrfs keeps allocating new chunks, a GiB at a
time.

A picture, telling more than 1000 words:
https://syrinx.knorrie.org/~knorrie/btrfs/keep/btrfs_usage_ichiban.png
(when the amount of allocated/unused goes down, I did a btrfs balance)


Agreed, that shows something strange going on.


Linux ichiban 4.5.0-0.bpo.1-amd64 #1 SMP Debian 4.5.1-1~bpo8+1
(2016-04-20) x86_64 GNU/Linux


So the kernel is/was current...


Running a slightly newer one now:

Linux ichiban 4.5.0-0.bpo.2-amd64 #1 SMP Debian 4.5.4-1~bpo8+1 
(2016-05-13) x86_64



# btrfs fi show /
Label: none  uuid: 9881fc30-8f69-4069-a8c8-c057b842b0c4
  Total devices 1 FS bytes used 6.17GiB
  devid1 size 20.00GiB used 16.54GiB path /dev/xvda

# btrfs fi df /
Data, single: total=15.01GiB, used=5.16GiB
System, single: total=32.00MiB, used=16.00KiB
Metadata, single: total=1.50GiB, used=1.01GiB
GlobalReserve, single: total=144.00MiB, used=0.00B

I'm a bit puzzled, since I haven't seen this happening on other
filesystems that use 4.4 or 4.5 kernels.


Nor have I, either reported (save for you) or personally.


If I dump the allocated chunks and their % usage, it's clear that the
last 6 new added ones have a usage of only a few percent.


Snip the dump, but curious as a user (not a dev) what command you used.
Presumably one of the debug commands which I'm not particularly familiar
with, but I wasn't aware it was even possible.


It's the output of a little programming exercise calling the search 
ioctl from python. https://github.com/knorrie/btrfs-heatmap


While using balance I got interested in knowing where balance got the 
information from to find how much % a chunk is used. I want to see that 
list in advance, so I can see what -dusage the most effective would be. 
My munin graphs show the stacked total value, which does not give you an 
idea about how badly the unused space is fragmented over already 
allocated chunks.


So, with some help of Hugo on IRC to get started, I ended up with this 
PoC, which can create nice movies of your data moving around over the 
physical space of the filesystem over time, like this one:


https://syrinx.knorrie.org/~knorrie/btrfs/heatmap.gif

Seeing the chunk allocator work its way around the two devices, choosing 
the one with the most free space, and reusing the gaps left by balance 
is super interesting. :-]



The only things this host does is
   1) being a webserver for a small internal debian packages repository
   2) running low-volume mailman with a few lists, no archive-gzipping
mega cronjobs or anything enabled.
   3) some little legacy php thingies

Interesting fact is that most of the 1GiB increases happen at the same
time as cron.daily runs. However, there's only a few standard things in
there. An occasional package upgrade by unattended-upgrade, or some
logrotate. The total contents of /var/log/ together is only 66MB...
Graphs show only less than about 100 MB reads/writes in total around
this time.


The cron.daily timing is interesting.  I'll come back to that below.


Well, it obviously has a very large sign saying "LOOK HERE" directly
next to it, yes.



As you can see in the graph the amount of used space is even
decreasing, because I cleaned up a bunch of old packages in the
repository, and still, btrfs keeps allocating new data chunks like a
hungry beast.

Why would this happen?



since it got any followup and since I'm bold enough to bump it one more
time... :)

I really don't understand the behaviour I described. Does it ring a bell
with anyone? This system is still allocating new 1GB data chunks every 1
or 2 days without using them at all, and I have to use balance every
week to get them away again.


Honestly I can only guess, and it's a new guess I didn't think of the
first time around, thus my lack of response the first time around.  But
lacking anyone else replying with better theories, given that I do have a
guess, I might as well put it out there.

Is it possible something in that daily cron allocates/writes a large but
likely sparse file, perhaps a gig or more, probably fsyncing to lock the
large size in place, then truncates it to actual size, which might be
only a few kilobytes?

That sort of behavior could at least in theory trigger the behavior you
describe, tho not being a dev and not being a Linux filesystem behavior
expert by any means, I'm admittedly fuzzy on exactly what details might
translate that theory into the reality you're seeing.

Re: Runaway SLAB usage by 'bio' during 'device replace'

2016-05-30 Thread Duncan
Chris Johnson posted on Mon, 30 May 2016 11:48:02 -0700 as excerpted:

> I have a RAID6 array that had a failed HDD. The drive failed completely
> and has been removed from the system. I'm running a 'device replace'
> operation with a new disk. The array is ~20TB so this will take a few
> days.

This isn't a direct answer to your issue as I'm a user and list regular, 
not a dev, and that's beyond me, but it's something you need to know, if 
you don't already...

Btrfs raid56 mode remains for the time being in general negatively-
recommended, except specifically for testing with throw-away data, due to 
two critical but not immediately data destroying bugs, one related to 
serial device replacement, the other to balance restriping.  They may or 
may not be related to each other, as neither one has been fully traced.

The serial replace bug has to do with replacing multiple devices, one at 
a time.  The first replace appears to work fine by all visible measures, 
but apparently doesn't return the array to full working condition after 
all, because an attempt to replace a second device fails, and can bring 
down the filesystem.  Unfortunately it doesn't always happen, and due to 
the size of devices these days, working arrays tend to be multi-TB 
monsters that take time to get to this point, so all we have at this 
point is multiple reports of the same issue, but no real way to reproduce 
it.  I believe but am not sure that the problem can occur regardless of 
whether btrfs replace or device add/delete was used.

The restriping bug has to do with restriping to a different width, either 
manually doing a filtered balance after adding devices, or automatically, 
as triggered by btrfs device delete.  Again, multiple reports but not 
nailed down to anything specifically reproducible yet.  The problem here 
is that the restripes, while apparently producing correct results, can 
inexplicably take an order of magnitude (or worse) longer than they 
should.  What one might expect to take hours takes over a week, and on 
the big arrays that might be expected to take 2-3 days, months.

The problem, again, isn't correctness, but the fact that over such long 
periods, the risk of device loss is increased, and if the array was 
already being reshaped/rebalanced to repair loss of one device, loss of 
another device may kill it.

Neither of these bugs affects normal runtime operation, but both are
critical enough with regard to what people normally use parity-raid for
(being able to take the loss of one device, or two with raid6, and repair
the array to get back to normal operation) that raid56 remains negatively
recommended for anything but testing with throw-away data, until after
these bugs can be fully traced and fixed.


Your particular issue doesn't appear to be directly related to either of 
the above.  In fact, I know I've seen patches recently having to do with 
memory leaks that may well fix your problem (tho you'd have to be running 
4.6 at least to have them at this point, and perhaps even 4.7-rc1).

But given the situation, either be sure you have backups and are prepared 
to use them if the array goes south on you due to failed or impractical 
device replacement, or switch to something other than btrfs raid56 mode.  
Btrfs redundancy-raid (raid1 and raid10) are more mature and tested, and 
thus may be options if they fit your filesystem space and device layout 
needs.  Alternatively, btrfs (or other filesystems) on top of dm/md-raid 
may be an option, tho you obviously lose some features of btrfs that 
way.  And of course zfs is the closest btrfs-comparable that's reasonably 
mature and may be an option, tho there are licensing and hardware issues 
(it likes lots of memory on linux due to double-caching of some elements 
as its caching scheme doesn't work well with that of linux, and ecc 
memory is very strongly recommended) if using it on linux.

I'd suggest giving btrfs raid56 another few kernel releases, six months 
to a year, and then check back.  I'd hope the bugs can be properly traced 
and fixed within a couple kernel cycles, so four months or so, but I 
prefer a few cycles to stabilize with no known critical bugs, before I 
recommend it (I was getting close to recommending it after the last known 
critical bug was fixed in 4.1, when these came up), which puts the 
projected timeframe at 8-12 months, before I could really consider raid56 
mode as reasonably stable as btrfs in general, which is to say, 
stabilizing, but not yet fully stable, so even then, the standard admin 
backup rule that if you don't have backups you consider the data to be 
worth less than the time/resources/hassle to do those backups, still 
applies more strongly than it would to a fully mature filesystem.

-- 
Duncan - List replies preferred.   No HTML msgs.
"Every nonfree program has a lord, a master --
and if you use the program, he is your master."  Richard Stallman


Re: btrfs filesystem keeps allocating new chunks for no apparent reason

2016-05-30 Thread Duncan
Hans van Kranenburg posted on Mon, 30 May 2016 13:07:26 +0200 as
excerpted:

[Please don't post "upside down".  Reply in context under the quoted 
point, here the whole post, you're replying to.  It makes further replies 
in context far easier. =:^)  I've pasted your update at the bottom here.]

> On 05/06/2016 11:28 PM, Hans van Kranenburg wrote:
>>
>> I've got a mostly inactive btrfs filesystem inside a virtual machine
>> somewhere that shows interesting behaviour: while no interesting disk
>> activity is going on, btrfs keeps allocating new chunks, a GiB at a
>> time.
>>
>> A picture, telling more than 1000 words:
>> https://syrinx.knorrie.org/~knorrie/btrfs/keep/btrfs_usage_ichiban.png
>> (when the amount of allocated/unused goes down, I did a btrfs balance)

Agreed, that shows something strange going on.

>> Linux ichiban 4.5.0-0.bpo.1-amd64 #1 SMP Debian 4.5.1-1~bpo8+1
>> (2016-04-20) x86_64 GNU/Linux

So the kernel is/was current...

>> # btrfs fi show /
>> Label: none  uuid: 9881fc30-8f69-4069-a8c8-c057b842b0c4
>>  Total devices 1 FS bytes used 6.17GiB
>>  devid1 size 20.00GiB used 16.54GiB path /dev/xvda
>>
>> # btrfs fi df /
>> Data, single: total=15.01GiB, used=5.16GiB
>> System, single: total=32.00MiB, used=16.00KiB
>> Metadata, single: total=1.50GiB, used=1.01GiB
>> GlobalReserve, single: total=144.00MiB, used=0.00B
>>
>> I'm a bit puzzled, since I haven't seen this happening on other
>> filesystems that use 4.4 or 4.5 kernels.

Nor have I, either reported (save for you) or personally.

>> If I dump the allocated chunks and their % usage, it's clear that the
>> last 6 new added ones have a usage of only a few percent.

Snip the dump, but curious as a user (not a dev) what command you used.  
Presumably one of the debug commands which I'm not particularly familiar 
with, but I wasn't aware it was even possible.

>> The only things this host does is
>>   1) being a webserver for a small internal debian packages repository
>>   2) running low-volume mailman with a few lists, no archive-gzipping
>> mega cronjobs or anything enabled.
>>   3) some little legacy php thingies
>>
>> Interesting fact is that most of the 1GiB increases happen at the same
>> time as cron.daily runs. However, there's only a few standard things in
>> there. An occasional package upgrade by unattended-upgrade, or some
>> logrotate. The total contents of /var/log/ together is only 66MB...
>> Graphs show only less than about 100 MB reads/writes in total around
>> this time.

The cron.daily timing is interesting.  I'll come back to that below.

>> As you can see in the graph the amount of used space is even
>> decreasing, because I cleaned up a bunch of old packages in the
>> repository, and still, btrfs keeps allocating new data chunks like a
>> hungry beast.
>>
>> Why would this happen?

> since it got any followup and since I'm bold enough to bump it one more
> time... :)
> 
> I really don't understand the behaviour I described. Does it ring a bell
> with anyone? This system is still allocating new 1GB data chunks every 1
> or 2 days without using them at all, and I have to use balance every
> week to get them away again.

Honestly I can only guess, and it's a new guess I didn't think of the 
first time around, thus my lack of response the first time around.  But 
lacking anyone else replying with better theories, given that I do have a 
guess, I might as well put it out there.

Is it possible something in that daily cron allocates/writes a large but 
likely sparse file, perhaps a gig or more, probably fsyncing to lock the 
large size in place, then truncates it to actual size, which might be 
only a few kilobytes?

That sort of behavior could at least in theory trigger the behavior you 
describe, tho not being a dev and not being a Linux filesystem behavior 
expert by any means, I'm admittedly fuzzy on exactly what details might 
translate that theory into the reality you're seeing.
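
Spelled out, the pattern I'm guessing at is roughly the following tiny C
program (a made-up illustration only; the path, sizes and the 1 GiB figure
are invented, not taken from any real cron job).  Running something like it
against the filesystem and watching btrfs fi df before and after each step
would help confirm or rule the theory out:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	static char buf[1 << 20];		/* 1 MiB of junk per write */
	const char *path = "/tmp/cron-scratch-file";	/* hypothetical name */
	int fd, i;

	fd = open(path, O_CREAT | O_WRONLY | O_TRUNC, 0644);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	memset(buf, 0xaa, sizeof(buf));

	/* write ~1 GiB, forcing the allocator to find room for it */
	for (i = 0; i < 1024; i++) {
		if (write(fd, buf, sizeof(buf)) != sizeof(buf)) {
			perror("write");
			return 1;
		}
	}
	fsync(fd);		/* commit the full 1 GiB size */
	ftruncate(fd, 4096);	/* then shrink it back to a few KiB */
	fsync(fd);
	close(fd);
	return 0;
}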


In any event, my usual "brute force" approach to such mysteries is to 
bisect the problem space down until I know where the issue is.

First, try rescheduling your cron.daily run to a different time, and see 
if the behavior follows it, thus specifically tying it to something in 
that run.

Second, try either running all tasks it runs manually, checking which one 
triggers the problem, or if you have too many tasks for that to be 
convenient, split them into cron.daily1 and cron.daily2, scheduled at 
different times, bisecting the problem by seeing which one the behavior 
follows.

Repeat as needed until you've discovered the culprit, then examine 
exactly what it's doing to the filesystem.

And please report your results.  Besides satisfying my own personal 
curiosity, there's a fair chance someone else will have the same issue at 
some point and either post their own question, or discover this thread 
via google or whatever.

-- 
Duncan - List replies preferred.   No HTML msgs.
"Every nonfree program has a lord, a master --
and if you use the program, he is your master."  Richard Stallman

Runaway SLAB usage by 'bio' during 'device replace'

2016-05-30 Thread Chris Johnson
I have a RAID6 array that had a failed HDD. The drive failed
completely and has been removed from the system. I'm running a 'device
replace' operation with a new disk. The array is ~20TB so this will
take a few days.

Yesterday the system crashed hard with OOM errors about 24 hours into
the replace. Rebooting after the crash and remounting the array
automatically resumed the replace where it left off.

Today I kept a close eye on it and have watched the memory usage creep
up slowly.

htop says this is user process memory (green bar) but shows no user
processes using this much memory

free says this is almost entirely cached/buffered memory that is
taking up the space.

slabtop reveals that there is a highly unusual amount of SLAB going to
'bio' which has to do with block allocation apparently. slabtop output
is attached.

'sync && echo 3 > /proc/sys/vm/drop_caches' clears the high usage
(~4GB) from dentry but 'bio' does not release any (11GB) memory and
continues to grow slowly.

This is running the Rockstor distro based on CentOS. The system has 16GB of RAM.

Kernel: 4.4.5-1.el7.elrepo.x86_64
btrfs-progs: 4.4.1

Kernel messages aren't showing anything of note during the replace
until it starts throwing out OOM errors.

I would like to collect enough information for a useful bug report
here, but I also can't babysit this rebuild during the work week and
reboot it once a day for OOM crashes. Should I cancel the replace
operation and use 'dev delete missing' instead? Will using 'delete
missing' cause any problem if it's done after a partially completed
and canceled replace?
# slabtop -o -s=a
 Active / Total Objects (% used): 33431432 / 33664160 (99.3%)
 Active / Total Slabs (% used)  : 1346736 / 1346736 (100.0%)
 Active / Total Caches (% used) : 78 / 114 (68.4%)
 Active / Total Size (% used)   : 10512136.19K / 10737701.80K (97.9%)
 Minimum / Average / Maximum Object : 0.01K / 0.32K / 15.62K

  OBJS ACTIVE  USE OBJ SIZE  SLABS OBJ/SLAB CACHE SIZE NAME   
32493650 32492775  99%0.31K 1299746   25  10397968K bio-1   
   
323505 323447  99%0.19K  15405   21 61620K dentry 
176680 176680 100%0.07K   3155   56 12620K btrfs_free_space   
118208  41288  34%0.12K   3694   32 14776K kmalloc-128
 94528  43378  45%0.25K   2954   32 23632K kmalloc-256
 91872  41682  45%0.50K   2871   32 45936K kmalloc-512
 83048  39031  46%4.00K  103818332192K kmalloc-4096   
 69049  69049 100%0.27K   2381   29 19048K btrfs_extent_buffer
 46872  46385  98%0.57K   1674   28 26784K radix_tree_node
 23460  23460 100%0.12K690   34  2760K kernfs_node_cache  
 17536  17536 100%0.98K548   32 17536K btrfs_inode
 16380  16007  97%0.14K585   28  2340K btrfs_path 
 12444  11635  93%0.08K244   51   976K Acpi-State 
 12404  12404 100%0.55K443   28  7088K inode_cache
 11648  10851  93%0.06K182   64   728K kmalloc-64 
 10404   5716  54%0.08K204   51   816K btrfs_extent_state 
  8954   8703  97%0.18K407   22  1628K vm_area_struct 
  5888   4946  84%0.03K 46  128   184K kmalloc-32 
  5632   5632 100%0.01K 11  51244K kmalloc-8  
  5049   4905  97%0.08K 99   51   396K anon_vma   
  4352   4352 100%0.02K 17  25668K kmalloc-16 
  3723   3723 100%0.05K 51   73   204K Acpi-Parse 
  3230   3230 100%0.05K 38   85   152K ftrace_event_field 
  3213   2949  91%0.19K153   21   612K kmalloc-192
  3120   3090  99%0.61K120   26  1920K proc_inode_cache   
  2814   2814 100%0.09K 67   42   268K kmalloc-96 
  1984   1510  76%1.00K 62   32  1984K kmalloc-1024   
  1904   1904 100%0.07K 34   56   136K Acpi-Operand   
  1472   1472 100%0.09K 32   46   128K trace_event_file   
  1224   1224 100%0.04K 12  10248K Acpi-Namespace 
  1152   1152 100%0.64K 48   24   768K shmem_inode_cache  
   592581  98%2.00K 37   16  1184K kmalloc-2048   
   528457  86%0.36K 24   22   192K blkdev_requests
   462355  76%0.38K 22   21   176K mnt_cache  
   450433  96%1.06K 15   30   480K signal_cache   
   429429 100%0.20K 11   3988K btrfs_delayed_ref_head 
   420420 100%2.05K 28   15   896K idr_layer_cache
   408408 100%0.04K  4  102

Re: [PATCH] btrfs-progs: make btrfs-image restore to support dup

2016-05-30 Thread David Sterba
On Thu, May 26, 2016 at 05:43:00PM +0800, Lu Fengqi wrote:
> Previously btrfs-image restore would set the chunk items to have 1 stripe,
> even if the chunk is dup. If you use btrfsck on the restored file system,
> some dev_extents will not find any related chunk stripe, and the
> bytes-used of the dev_item will not equal the dev_extents' total_bytes.
> This patch stores an additional physical offset just for the dup case when
> building the in-memory chunk tree.
> Currently, btrfsck on the restored file system is only clean for single and
> dup; raid* support should be added in the future.
> 
> Signed-off-by: Lu Fengqi 
> ---
>  btrfs-image.c | 143 
> +++---
>  1 file changed, 97 insertions(+), 46 deletions(-)
> 
> diff --git a/btrfs-image.c b/btrfs-image.c
> index 8a1b799..d121951 100644
> --- a/btrfs-image.c
> +++ b/btrfs-image.c
> @@ -68,6 +68,12 @@ struct meta_cluster {
>  struct fs_chunk {
>   u64 logical;
>   u64 physical;
> + /* physical_dup only stores the additional physical offset for
> +  * BTRFS_BLOCK_GROUP_DUP; currently restore only supports single and dup.
> +  * TODO: modify this structure and the functions related to it
> +  * to support raid*

What does it do in case of RAID? Can we do runtime checks and report
potential problems? btrfs-image on multiple devices was always somewhat
tricky, so I'll merge the patch.
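
Something along these lines could serve as such a runtime check (a hedged
sketch only; the profile mask and message wording are mine, not part of the
patch), placed next to the chunk handling quoted further down, so the user
at least knows why btrfsck may complain about a restored RAID filesystem:

	/* sketch: warn when the chunk uses a RAID profile that restore
	 * does not rebuild correctly yet */
	if (type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
		    BTRFS_BLOCK_GROUP_RAID10 | BTRFS_BLOCK_GROUP_RAID5 |
		    BTRFS_BLOCK_GROUP_RAID6))
		fprintf(stderr,
			"warning: chunk at %llu uses a RAID profile, restore only handles single and dup correctly\n",
			(unsigned long long)key.offset);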

> +  */
> + u64 physical_dup;
>   u64 bytes;
>   struct rb_node l;
>   struct rb_node p;
> @@ -290,7 +296,8 @@ static struct rb_node *tree_search(struct rb_root *root,
>   return NULL;
>  }
>  
> -static u64 logical_to_physical(struct mdrestore_struct *mdres, u64 logical, 
> u64 *size)
> +static u64 logical_to_physical(struct mdrestore_struct *mdres, u64 logical,
> +u64 *size, u64 *physical_dup)
>  {
>   struct fs_chunk *fs_chunk;
>   struct rb_node *entry;
> @@ -312,6 +319,14 @@ static u64 logical_to_physical(struct mdrestore_struct 
> *mdres, u64 logical, u64
>   BUG();
>   offset = search.logical - fs_chunk->logical;
>  
> + if (physical_dup) {
> + /* only in dup case, physical_dup is not equal to 0 */
> + if (fs_chunk->physical_dup)
> + *physical_dup = fs_chunk->physical_dup + offset;
> + else
> + *physical_dup = 0;
> + }
> +
>   *size = min(*size, fs_chunk->bytes + fs_chunk->logical - logical);
>   return fs_chunk->physical + offset;
>  }
> @@ -1451,20 +1466,26 @@ static int update_super(struct mdrestore_struct 
> *mdres, u8 *buffer)
>   cur += sizeof(*disk_key);
>  
>   if (key.type == BTRFS_CHUNK_ITEM_KEY) {
> - u64 physical, size = 0;
> + u64 type, physical, physical_dup, size = 0;
>  
>   chunk = (struct btrfs_chunk *)ptr;
>   old_num_stripes = btrfs_stack_chunk_num_stripes(chunk);
>   chunk = (struct btrfs_chunk *)write_ptr;
>  
>   memmove(write_ptr, ptr, sizeof(*chunk));
> - btrfs_set_stack_chunk_num_stripes(chunk, 1);
>   btrfs_set_stack_chunk_sub_stripes(chunk, 0);
> - btrfs_set_stack_chunk_type(chunk,
> -BTRFS_BLOCK_GROUP_SYSTEM);
> + type = btrfs_stack_chunk_type(chunk);
> + if (type & BTRFS_BLOCK_GROUP_DUP) {
> + new_array_size += sizeof(struct btrfs_stripe);
> + write_ptr += sizeof(struct btrfs_stripe);
> + } else {
> + btrfs_set_stack_chunk_num_stripes(chunk, 1);
> + btrfs_set_stack_chunk_type(chunk,
> + BTRFS_BLOCK_GROUP_SYSTEM);
> + }
>   chunk->stripe.devid = super->dev_item.devid;
>   physical = logical_to_physical(mdres, key.offset,
> -  &size);
> +  &size, &physical_dup);
>   if (size != (u64)-1)
>   btrfs_set_stack_stripe_offset(&chunk->stripe,
> physical);
> @@ -1573,41 +1594,47 @@ static int fixup_chunk_tree_block(struct 
> mdrestore_struct *mdres,
>   goto next;
>  
>   for (i = 0; i < btrfs_header_nritems(eb); i++) {
> - struct btrfs_chunk chunk;
> + struct btrfs_chunk *chunk;
>   struct btrfs_key key;
> - u64 type, physical, size = (u64)-1;
> + u64 type, physical, physical_dup, size = (u64)-1;
>  
>   btrfs_item_key_to_cpu(eb, &key, i);
> 

Re: [PATCH 3/5] Btrfs: self-tests: Support non-4k page size

2016-05-30 Thread Feifei Xu



On 2016/5/30 21:55, David Sterba wrote:

On Sun, May 29, 2016 at 01:17:34PM +0800, Fei Fei Xu wrote:

There are more instances of the pointed style issues, please fix all of
them. As the changes do not affect functionality I'll add the patches to
for-next, but I'm expecting a v2.

Thanks, I will send out v2 soon according to all above comments.

What's the base of the patchset? Does not apply cleanly on the current
integration (ie. what's in master).

It is based on the master branch, commit 2f7c3a18a2dc79ddf7b, which is no
longer up to date.

I will send out a v2 soon to fix this.

Thanks
Feifei









Re: [PATCH] btrfs-progs: tests: run rollback after conversion

2016-05-30 Thread David Sterba
Hi Qu,

the convert patchset does not pass a rollback test, fails in the case of
32k nodesize. There's not much info why, just 'rollback failed'.

The branch that passes is 'test-rollback', it's current devel without
the convert and low-mem fsck patchsets.


[PATCH] btrfs-progs: tests: run rollback after conversion

2016-05-30 Thread David Sterba
Signed-off-by: David Sterba 
---
 tests/convert-tests.sh | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/convert-tests.sh b/tests/convert-tests.sh
index 3fce8915f94a..2806b921544b 100755
--- a/tests/convert-tests.sh
+++ b/tests/convert-tests.sh
@@ -138,6 +138,9 @@ convert_test() {
run_check_stdout $SUDO_HELPER md5sum -c $CHECKSUMTMP |
grep -q 'FAILED' && _fail "file validation failed."
run_check_umount_test_dev
+
+   run_check $TOP/btrfs-convert --rollback $TEST_DEV
+   run_check fsck -n -t ext2,ext3,ext4 $TEST_DEV
 }
 
 if ! [ -z "$TEST" ]; then
-- 
2.7.1



Re: [PATCH] btrfs: fix check_shared for fiemap ioctl

2016-05-30 Thread David Sterba
On Mon, May 16, 2016 at 11:23:50AM +0800, Lu Fengqi wrote:
> +/*
> + * ref_root is used as the root of the ref tree that hold a collection
> + * of unique references.
> + */
> +struct ref_root {
> + /*
> +  * the unique_refs represents the number of ref_nodes with a positive
> +  * count stored in the tree. Even if a ref_node(the count is greater
> +  * than one) is added, the unique_refs will only increase one.
> +  */
> + unsigned int unique_refs;
> +
> + struct rb_root rb_root;

The rb_root could be moved to the beginning so the offsetof() magic will
not generate an extra offset into the structure.

> +};
> +
> +/* ref_node is used to store a unique reference to the ref tree. */
> +struct ref_node {
> + /* for NORMAL_REF, otherwise all these fields should be set to 0 */
> + u64 root_id;
> + u64 object_id;
> + u64 offset;
> +
> + /* for SHARED_REF, otherwise parent field should be set to 0 */
> + u64 parent;
> +
> + /* ref to the ref_mod of btrfs_delayed_ref_node(delayed-ref.h) */
> + int ref_mod;
> +
> + struct rb_node rb_node;

Same here (move to the beginning)

> +};
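
As a sketch of that reordering (same fields as in the patch, only the member
order changes), moving rb_node to the front makes rb_entry()/container_of()
a plain cast because the offset is zero:

struct ref_node {
	struct rb_node rb_node;

	/* for NORMAL_REF, otherwise all these fields should be set to 0 */
	u64 root_id;
	u64 object_id;
	u64 offset;

	/* for SHARED_REF, otherwise parent field should be set to 0 */
	u64 parent;

	/* ref to the ref_mod of btrfs_delayed_ref_node (delayed-ref.h) */
	int ref_mod;
};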
> +
> +/* dynamically allocate and initialize a ref_root */
> +static struct ref_root *ref_root_alloc(gfp_t gfp_mask)
> +{
> + struct ref_root *ref_tree;
> +
> + ref_tree = kmalloc(sizeof(*ref_tree), gfp_mask);

Drop the gfp_mask and make it GFP_KERNEL

> + if (!ref_tree)
> + return NULL;
> +
> + ref_tree->rb_root = RB_ROOT;
> + ref_tree->unique_refs = 0;
> +
> + return ref_tree;
> +}
> +
> +/* free all node in the ref tree, and reinit ref_root */

   nodes

> +static void ref_root_fini(struct ref_root *ref_tree)
> +{
> + struct ref_node *node;
> + struct rb_node *next;
> +
> + while ((next = rb_first(&ref_tree->rb_root)) != NULL) {
> + node = rb_entry(next, struct ref_node, rb_node);
> + rb_erase(next, &ref_tree->rb_root);
> + kfree(node);

This could be slow as rb_erase has to do the rb-tree rotations. Can we
do a post-order traversal and just free the nodes?
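
A minimal sketch of that post-order variant, assuming the ref_node/ref_root
definitions from this patch (rbtree_postorder_for_each_entry_safe() from
linux/rbtree.h visits children before their parent, so each node can be
freed without any rebalancing):

static void ref_root_fini(struct ref_root *ref_tree)
{
	struct ref_node *node, *next;

	/* post-order walk: no rb_erase(), hence no rotations while freeing */
	rbtree_postorder_for_each_entry_safe(node, next,
					     &ref_tree->rb_root, rb_node)
		kfree(node);

	ref_tree->rb_root = RB_ROOT;
	ref_tree->unique_refs = 0;
}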

> + }
> +
> + ref_tree->rb_root = RB_ROOT;
> + ref_tree->unique_refs = 0;
> +}
> +
> +/* free dynamically allocated ref_root */
> +static void ref_root_free(struct ref_root *ref_tree)
> +{
> + if (!ref_tree)
> + return;
> +
> + ref_root_fini(ref_tree);
> + kfree(ref_tree);
> +}
> +
> +/*
> + * search ref_node with (root_id, object_id, offset, parent) in the tree
> + *
> + * if found, the pointer of the ref_node will be returned;
> + * if not found, NULL will be returned and pos will point to the rb_node for
> + * insert, pos_parent will point to pos'parent for insert;
> +*/
> +static struct ref_node *__ref_tree_search(struct ref_root *ref_tree,
> +   struct rb_node ***pos,
> +   struct rb_node **pos_parent,
> +   u64 root_id, u64 object_id,
> +   u64 offset, u64 parent)
> +{
> + struct ref_node *cur = NULL;
> +
> + *pos = &ref_tree->rb_root.rb_node;
> +
> + while (**pos) {
> + *pos_parent = **pos;
> + cur = rb_entry(*pos_parent, struct ref_node, rb_node);
> +
> + if (cur->root_id < root_id) {
> + *pos = &(**pos)->rb_right;
> + continue;
> + } else if (cur->root_id > root_id) {
> + *pos = &(**pos)->rb_left;
> + continue;
> + }
> +
> + if (cur->object_id < object_id) {
> + *pos = &(**pos)->rb_right;
> + continue;
> + } else if (cur->object_id > object_id) {
> + *pos = &(**pos)->rb_left;
> + continue;
> + }
> +
> + if (cur->offset < offset) {
> + *pos = &(**pos)->rb_right;
> + continue;
> + } else if (cur->offset > offset) {
> + *pos = &(**pos)->rb_left;
> + continue;
> + }
> +
> + if (cur->parent < parent) {
> + *pos = &(**pos)->rb_right;
> + continue;
> + } else if (cur->parent > parent) {
> + *pos = &(**pos)->rb_left;
> + continue;
> + }
> +
> + return cur;
> + }
> +
> + return NULL;
> +}
> +
> +/*
> + * insert a ref_node to the ref tree
> + * @pos used to specify the position to insert
> + * @pos_parent used to specify pos's parent
> + *
> + * success, return 0;
> + * ref_node already exists, return -EEXIST;
> +*/
> +static int ref_tree_insert(struct ref_root *ref_tree, struct rb_node **pos,
> +struct rb_node *pos_parent, struct ref_node *ins)
> +{
> + struct rb_node **p = NULL;
> + struct rb_node *parent = NULL;
> + 

[PATCH 2/2] Btrfs: fix race between device replace and read repair

2016-05-30 Thread fdmanana
From: Filipe Manana 

While we are finishing a device replace operation we can have a concurrent
task trying to do a read repair operation, in which case it will call
btrfs_map_block() to get a struct btrfs_bio which can have a stripe that
points to the source device of the device replace operation. This allows
for the read repair task to dereference the stripe's device pointer after
the device replace operation has freed the source device, resulting in
an invalid memory access. This is similar to the problem solved by my
previous patch in the same series and named "Btrfs: fix race between
device replace and discard".

So fix this by surrounding the call to btrfs_map_block() and the code
that uses the returned struct btrfs_bio with calls to
btrfs_bio_counter_inc_blocked() and btrfs_bio_counter_dec(), giving the
proper serialization with the finishing phase of the device replace
operation.

Signed-off-by: Filipe Manana 
---
 fs/btrfs/extent_io.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 3cd5782..6e953de 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2025,9 +2025,16 @@ int repair_io_failure(struct inode *inode, u64 start, 
u64 length, u64 logical,
bio->bi_iter.bi_size = 0;
map_length = length;
 
+   /*
+* Avoid races with device replace and make sure our bbio has devices
+* associated to its stripes that don't go away while we are doing the
+* read repair operation.
+*/
+   btrfs_bio_counter_inc_blocked(fs_info);
ret = btrfs_map_block(fs_info, WRITE, logical,
  &map_length, &bbio, mirror_num);
if (ret) {
+   btrfs_bio_counter_dec(fs_info);
bio_put(bio);
return -EIO;
}
@@ -2037,6 +2044,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 
length, u64 logical,
dev = bbio->stripes[mirror_num-1].dev;
btrfs_put_bbio(bbio);
if (!dev || !dev->bdev || !dev->writeable) {
+   btrfs_bio_counter_dec(fs_info);
bio_put(bio);
return -EIO;
}
@@ -2045,6 +2053,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 
length, u64 logical,
 
if (btrfsic_submit_bio_wait(WRITE_SYNC, bio)) {
/* try to remap that extent elsewhere? */
+   btrfs_bio_counter_dec(fs_info);
bio_put(bio);
btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
return -EIO;
@@ -2054,6 +2063,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 
length, u64 logical,
"read error corrected: ino %llu off %llu (dev %s sector %llu)",
  btrfs_ino(inode), start,
  rcu_str_deref(dev->name), sector);
+   btrfs_bio_counter_dec(fs_info);
bio_put(bio);
return 0;
 }
-- 
2.7.0.rc3



[PATCH 1/2] Btrfs: fix race between device replace and discard

2016-05-30 Thread fdmanana
From: Filipe Manana 

While we are finishing a device replace operation, we can make a discard
operation (fs mounted with -o discard) do an invalid memory access like
the one reported by the following trace:

[ 3206.384654] general protection fault:  [#1] PREEMPT SMP
[ 3206.387520] Modules linked in: dm_mod btrfs crc32c_generic xor raid6_pq 
acpi_cpufreq tpm_tis psmouse tpm ppdev sg parport_pc evdev i2c_piix4 parport
processor serio_raw i2c_core pcspkr button loop autofs4 ext4 crc16 jbd2 mbcache 
sr_mod cdrom ata_generic sd_mod virtio_scsi ata_piix libata virtio_pci
virtio_ring scsi_mod e1000 virtio floppy [last unloaded: btrfs]
[ 3206.388595] CPU: 14 PID: 29194 Comm: fsstress Not tainted 
4.6.0-rc7-btrfs-next-29+ #1
[ 3206.388595] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by 
qemu-project.org 04/01/2014
[ 3206.388595] task: 88017ace0100 ti: 880171b98000 task.ti: 
880171b98000
[ 3206.388595] RIP: 0010:[]  [] 
blkdev_issue_discard+0x5c/0x2a7
[ 3206.388595] RSP: 0018:880171b9bb80  EFLAGS: 00010246
[ 3206.388595] RAX: 880171b9bc28 RBX: 0090d000 RCX: 
[ 3206.388595] RDX: 82fa1b48 RSI: 8179f46c RDI: 82fa1b48
[ 3206.388595] RBP: 880171b9bcc0 R08:  R09: 0001
[ 3206.388595] R10: 880171b9bce0 R11: 0090f000 R12: 880171b9bbe8
[ 3206.388595] R13: 0010 R14: 4868 R15: 6b6b6b6b6b6b6b6b
[ 3206.388595] FS:  7f6182e4e700() GS:88023fdc() 
knlGS:
[ 3206.388595] CS:  0010 DS:  ES:  CR0: 80050033
[ 3206.388595] CR2: 7f617c2bbb18 CR3: 00017ad9c000 CR4: 06e0
[ 3206.388595] Stack:
[ 3206.388595]  4878  02400040 

[ 3206.388595]   880171b9bbe8 880171b9bbb0 
880171b9bbb0
[ 3206.388595]  880171b9bbc0 880171b9bbc0 880171b9bbd0 
880171b9bbd0
[ 3206.388595] Call Trace:
[ 3206.388595]  [] btrfs_issue_discard+0x12f/0x143 [btrfs]
[ 3206.388595]  [] ? btrfs_issue_discard+0x12f/0x143 [btrfs]
[ 3206.388595]  [] btrfs_discard_extent+0x87/0xde [btrfs]
[ 3206.388595]  [] btrfs_finish_extent_commit+0xb2/0x1df 
[btrfs]
[ 3206.388595]  [] ? __mutex_unlock_slowpath+0x150/0x15b
[ 3206.388595]  [] btrfs_commit_transaction+0x7fc/0x980 
[btrfs]
[ 3206.388595]  [] ? __mutex_unlock_slowpath+0x150/0x15b
[ 3206.388595]  [] btrfs_sync_file+0x38f/0x428 [btrfs]
[ 3206.388595]  [] vfs_fsync_range+0x8c/0x9e
[ 3206.388595]  [] vfs_fsync+0x1c/0x1e
[ 3206.388595]  [] do_fsync+0x31/0x4a
[ 3206.388595]  [] SyS_fsync+0x10/0x14
[ 3206.388595]  [] entry_SYSCALL_64_fastpath+0x18/0xa8
[ 3206.388595]  [] ? time_hardirqs_off+0x9/0x14
[ 3206.388595]  [] ? trace_hardirqs_off_caller+0x1f/0xaa

This happens because when we call btrfs_map_block() from
btrfs_discard_extent() to get a btrfs_bio structure, the device replace
operation has not finished yet, but before we use the device of one of the
stripes from the returned btrfs_bio structure, the device object is freed.

This is illustrated by the following diagram.

          CPU 1                                          CPU 2

 btrfs_dev_replace_start()

 (...)

 btrfs_dev_replace_finishing()

   btrfs_start_transaction()
   btrfs_commit_transaction()

   (...)

                                              btrfs_sync_file()
                                                btrfs_start_transaction()

                                                (...)

                                                btrfs_commit_transaction()
                                                  btrfs_finish_extent_commit()
                                                    btrfs_discard_extent()
                                                      btrfs_map_block()
                                                        --> returns a struct btrfs_bio
                                                            with a stripe that has a
                                                            device field pointing to
                                                            source device of the replace
                                                            operation (the device that
                                                            is being replaced)

   mutex_lock(&uuid_mutex)
   mutex_lock(&fs_info->fs_devices->device_list_mutex)
   mutex_lock(&fs_info->chunk_mutex)

   btrfs_dev_replace_update_device_in_mapping_tree()
     --> iterates the mapping tree and for each
         extent map that has a stripe pointing to
         the source device, it updates the stripe
         to point to the target device instead

   btrfs_rm_dev_replace_blocked()
     --> waits for fs_info->bio_counter to go down

Re: [PATCH] btrfs: fix check_shared for fiemap ioctl

2016-05-30 Thread David Sterba
On Fri, May 27, 2016 at 09:39:53AM +0800, Qu Wenruo wrote:
> Any comment?
> 
> This patch not only fixes the submitted generic/352[1] and generic/353[2]
> test cases, but also introduces a much better structure and design for
> later backref walk use.
> 
> Instead of using a list and doing an O(n^3)~O(n^4) iteration for the fiemap
> ioctl on a reflinked (deduped) file, the SHARED flag check is now only
> O(n)~O(nlogn), which is enough to pass generic/352.

This is a good improvement, though there's potentially hidden cost in
the allocations and maintaining the temporary structures. Do you have
actual performance numbers?


Re: [PATCH 3/5] Btrfs: self-tests: Support non-4k page size

2016-05-30 Thread David Sterba
On Sun, May 29, 2016 at 01:17:34PM +0800, Fei Fei Xu wrote:
> > There are more instances of the pointed-out style issues, please fix all of
> > them. As the changes do not affect functionality I'll add the patches to
> > for-next, but I'm expecting a v2.
> Thanks, I will send out v2 soon according to all above comments.

What's the base of the patchset? Does not apply cleanly on the current
integration (ie. what's in master).


Re: [PATCH] btrfs-progs: tests: add 020-extent-ref-cases

2016-05-30 Thread David Sterba
On Mon, May 30, 2016 at 10:58:13AM +0800, Lu Fengqi wrote:
> In order to confirm that btrfsck supports checking a variety of refs, add
> the following cases:
> * keyed_block_ref
> * keyed_data_ref
> * shared_block_ref
> * shared_data_ref
> * no_inline_ref (an extent item without an inline ref)
> * no_skinny_ref
> 
> Signed-off-by: Lu Fengqi 

Applied, thanks. The tests currently fail even in 'devel'; which patches
are supposed to address the errors?


Re: [PATCH] btrfs,vfs: allow FILE_EXTENT_SAME on a file opened ro

2016-05-30 Thread Adam Borowski
On Sat, May 28, 2016 at 08:56:39PM -0400, Zygo Blaxell wrote:
> On Sun, May 29, 2016 at 02:21:03AM +0200, Adam Borowski wrote:
> > In any case, this patch doesn't introduce any cases not already triggerable
> > by root.
> 
> It allows non-root to trigger cases that previously could only be
> triggered by root.

Only the proposed "ro is enough" variant does.  The patch, as written,
requires write permission on the inode, thus alleviating your concerns:
* mangling the contents of dstfile: the user has rw access so he can do that
  already
* triggering a bug: the user could have opened dstfile rw (like duperemove
  currently does)
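
For reference, a minimal userspace sketch of how the FILE_EXTENT_SAME
(dedupe) ioctl under discussion is driven; file names, offsets and lengths
are made up for illustration. With the patch as written, opening dstfile
read-only works as long as the caller has write permission on the inode;
without the patch, dstfile has to be opened read-write.

/*
 * Illustrative only: dedupe the first 128KiB of srcfile into dstfile.
 * The ioctl is issued on the source fd; destinations go in info[].
 */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>

int main(void)
{
	int src = open("srcfile", O_RDONLY);
	int dst = open("dstfile", O_RDONLY);	/* ro destination, the case under discussion */
	struct btrfs_ioctl_same_args *args;

	if (src < 0 || dst < 0) {
		perror("open");
		return 1;
	}

	args = calloc(1, sizeof(*args) +
			 sizeof(struct btrfs_ioctl_same_extent_info));
	args->logical_offset = 0;		/* range in srcfile */
	args->length = 128 * 1024;
	args->dest_count = 1;
	args->info[0].fd = dst;
	args->info[0].logical_offset = 0;	/* range in dstfile */

	if (ioctl(src, BTRFS_IOC_FILE_EXTENT_SAME, args) < 0)
		perror("BTRFS_IOC_FILE_EXTENT_SAME");
	else
		printf("deduped %llu bytes, status %d\n",
		       (unsigned long long)args->info[0].bytes_deduped,
		       args->info[0].status);

	free(args);
	return 0;
}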


Meow!
-- 
An imaginary friend squared is a real enemy.


Re: [1/1 v2] String and comment review: Fix typos; fix a couple of mandatory grammatical issues for clarity.

2016-05-30 Thread David Sterba
On Fri, May 27, 2016 at 07:13:35AM -0400, Nicholas D Steeves wrote:
> On 24 May 2016 at 06:50, David Sterba  wrote:
> > Sending typo fixes for all user visible text is OK and welcome anytime.
> >
> >> if I find any, and a strings &
> >> comments review for both -progs and kernel twice a year, where one
> >> review is part of preparing for an LTS kernel.
> >
> > Typos in comments can be done once in a year I think.
> 
> Thank you for the clarification.  When would be the best time?  While
> preparing for an LTS kernel, or a certain amount of time before one?
> Would you like me to skip 4.10 this year and stick to doing it in the
> month of May?

Tough question :) I think anything post 2017 is ok, but I also hope there
won't be many typos.


Re: btrfs filesystem keeps allocating new chunks for no apparent reason

2016-05-30 Thread Hans van Kranenburg

Hi,

since it hasn't gotten any followup and since I'm bold enough to bump it
one more time... :)


I really don't understand the behaviour I described. Does it ring a bell 
with anyone? This system is still allocating new 1GB data chunks every 1 
or 2 days without using them at all, and I have to use balance every 
week to get them away again.


Hans

On 05/06/2016 11:28 PM, Hans van Kranenburg wrote:

Hi,

I've got a mostly inactive btrfs filesystem inside a virtual machine
somewhere that shows interesting behaviour: while no interesting disk
activity is going on, btrfs keeps allocating new chunks, a GiB at a time.

A picture, telling more than 1000 words:
https://syrinx.knorrie.org/~knorrie/btrfs/keep/btrfs_usage_ichiban.png
(when the amount of allocated/unused goes down, I did a btrfs balance)

Linux ichiban 4.5.0-0.bpo.1-amd64 #1 SMP Debian 4.5.1-1~bpo8+1
(2016-04-20) x86_64 GNU/Linux

# btrfs fi show /
Label: none  uuid: 9881fc30-8f69-4069-a8c8-c057b842b0c4
 Total devices 1 FS bytes used 6.17GiB
 devid1 size 20.00GiB used 16.54GiB path /dev/xvda

# btrfs fi df /
Data, single: total=15.01GiB, used=5.16GiB
System, single: total=32.00MiB, used=16.00KiB
Metadata, single: total=1.50GiB, used=1.01GiB
GlobalReserve, single: total=144.00MiB, used=0.00B

I'm a bit puzzled, since I haven't seen this happening on other
filesystems that use 4.4 or 4.5 kernels.

If I dump the allocated chunks and their % usage, it's clear that the
last 6 newly added ones have a usage of only a few percent.

dev item devid 1 total bytes 21474836480 bytes used 17758683136
chunk vaddr 12582912 type 1 stripe 0 devid 1 offset 12582912 length
8388608 used 4276224 used_pct 50
chunk vaddr 1103101952 type 1 stripe 0 devid 1 offset 2185232384 length
1073741824 used 433127424 used_pct 40
chunk vaddr 3250585600 type 1 stripe 0 devid 1 offset 4332716032 length
1073741824 used 764391424 used_pct 71
chunk vaddr 9271508992 type 1 stripe 0 devid 1 offset 12079595520 length
1073741824 used 270704640 used_pct 25
chunk vaddr 12492734464 type 1 stripe 0 devid 1 offset 13153337344
length 1073741824 used 866574336 used_pct 80
chunk vaddr 13566476288 type 1 stripe 0 devid 1 offset 11005853696
length 1073741824 used 1028059136 used_pct 95
chunk vaddr 14640218112 type 1 stripe 0 devid 1 offset 3258974208 length
1073741824 used 762466304 used_pct 71
chunk vaddr 26250051584 type 1 stripe 0 devid 1 offset 19595788288
length 1073741824 used 114982912 used_pct 10
chunk vaddr 31618760704 type 1 stripe 0 devid 1 offset 15300820992
length 1073741824 used 488902656 used_pct 45
chunk vaddr 32692502528 type 4 stripe 0 devid 1 offset 5406457856 length
268435456 used 209272832 used_pct 77
chunk vaddr 32960937984 type 4 stripe 0 devid 1 offset 5943328768 length
268435456 used 251199488 used_pct 93
chunk vaddr 33229373440 type 4 stripe 0 devid 1 offset 7419723776 length
268435456 used 248709120 used_pct 92
chunk vaddr 33497808896 type 4 stripe 0 devid 1 offset 8896118784 length
268435456 used 247791616 used_pct 92
chunk vaddr 33766244352 type 4 stripe 0 devid 1 offset 8627683328 length
268435456 used 93061120 used_pct 34
chunk vaddr 34303115264 type 2 stripe 0 devid 1 offset 6748635136 length
33554432 used 16384 used_pct 0
chunk vaddr 34336669696 type 1 stripe 0 devid 1 offset 16374562816
length 1073741824 used 105054208 used_pct 9
chunk vaddr 35410411520 type 1 stripe 0 devid 1 offset 20971520 length
1073741824 used 10899456 used_pct 1
chunk vaddr 36484153344 type 1 stripe 0 devid 1 offset 1094713344 length
1073741824 used 441778176 used_pct 41
chunk vaddr 37557895168 type 4 stripe 0 devid 1 offset 5674893312 length
268435456 used 33439744 used_pct 12
chunk vaddr 37826330624 type 1 stripe 0 devid 1 offset 9164554240 length
1073741824 used 32096256 used_pct 2
chunk vaddr 38900072448 type 1 stripe 0 devid 1 offset 14227079168
length 1073741824 used 40140800 used_pct 3
chunk vaddr 39973814272 type 1 stripe 0 devid 1 offset 17448304640
length 1073741824 used 58093568 used_pct 5
chunk vaddr 41047556096 type 1 stripe 0 devid 1 offset 18522046464
length 1073741824 used 119701504 used_pct 11

The only things this host does are:
  1) being a webserver for a small internal debian packages repository
  2) running low-volume mailman with a few lists, no archive-gzipping
mega cronjobs or anything like that enabled
  3) running some little legacy php thingies

An interesting fact is that most of the 1GiB increases happen at the same
time as cron.daily runs. However, there are only a few standard things in
there: an occasional package upgrade by unattended-upgrade, or some
logrotate. The total contents of /var/log/ together are only 66MB...
Graphs show less than about 100 MB of reads/writes in total around
this time.

As you can see in the graph the amount of used space is even decreasing,
because I cleaned up a bunch of old packages in the repository, and
still, btrfs keeps allocating new data chunks like a hungry beast.

Why would this happen?

Hans van Kranenburg

Re: [PATCH] Improve balance performance when qgroups are turned on

2016-05-30 Thread Qu Wenruo



Mark Fasheh wrote on 2016/05/26 17:18 -0700:

The btrfs balance operation is significantly slower when qgroups are
enabled. To the best of my knowledge, a balance shouldn't have an effect on
qgroups counts (extents are not changing between subvolumes), so we don't
need to actually run the qgroup code when we balance.


This assumption is questionable.

When balancing, it's true we will set the chunk to ro, so new
*allocation* won't happen in that chunk.

However, we can still de-reference an extent during balance.

If that happens and we skip the qgroup accounting, corruption happens:
the extent, both before and after balance, won't go through qgroup, so
its de-reference won't be accounted.


The following quick test script has already spotted the problem:
--
#!/bin/bash

dev=/dev/sdb5
mnt=/mnt/test
fsstress=/home/adam/xfstests/ltp/fsstress

fsstress_work() {
$fsstress -d $mnt -n 10 -p 2 \
-z -f write=10 -f unlink=10 -f creat=10 \
-f fsync=20 -f sync=20
}

balance_work() {
while true; do
btrfs balance start -d $mnt &> /dev/null
done
}

umount $dev &> /dev/null
mkfs.btrfs -f $dev
mount $dev $mnt

btrfs quota en $mnt
btrfs quota rescan -w $mnt

fsstress_work &
fsstress_pid=$!

balance_work &
balance_pid=$!

sleep 30

kill $fsstress_pid
killall fsstress
kill $balance_pid &> /dev/null

wait

btrfs balance cancel $mnt &> /dev/null

rm $mnt/* -rf
sync
btrfs sub sync $mnt
btrfs qgroup show -prce $mnt
--

The result is not stable with your patch.
Sometimes several kilobytes are shown, as explained above.

Without your patch, the final qgroup number is stable at 16KiB.

The xfstest case will follow soon.

Qu



Since there's only one thread doing balance at a time, it's easy to record
that thread on the fs_info and check it inside qgroup_insert_dirty_extent().
If we're the balance thread, we drop the qgroup record instead of inserting
it.
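
A minimal sketch of the check this describes, for readers following along;
this is illustrative only and not the actual qgroup.c hunk of the patch.
It assumes the fs_info->balance_thread field the patch adds, and returns
the record so the caller frees it the same way it already does when an
identical record exists.

/*
 * Sketch only: drop qgroup records created by the balance thread instead
 * of queuing them for accounting.  Returning the record makes the caller
 * kfree() it, as it does for duplicates.
 */
struct btrfs_qgroup_extent_record *
btrfs_qgroup_insert_dirty_extent(struct btrfs_fs_info *fs_info,
				 struct btrfs_delayed_ref_root *delayed_refs,
				 struct btrfs_qgroup_extent_record *record)
{
	if (fs_info->balance_thread == current)
		return record;

	/* ... existing rb-tree insertion of @record, unchanged ... */
	return NULL;
}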

Here are some sample numbers before and after this patch. The example fs
below is 22 gigabytes in size and was creating by copying /usr and /boot
from my test machine (a few times).

Balance with qgroups enabled, before patch:
# time btrfs balance start --full-balance /btrfs
Done, had to relocate 26 out of 26 chunks

real3m7.515s
user0m0.002s
sys 2m0.852s

Balance with qgroups enabled, after patch:
# time btrfs balance start --full-balance /btrfs
Done, had to relocate 26 out of 26 chunks

real2m2.806s
user0m0.000s
sys 0m54.174s

Signed-off-by: Mark Fasheh 
---
 fs/btrfs/ctree.h   | 1 +
 fs/btrfs/delayed-ref.c | 2 +-
 fs/btrfs/disk-io.c | 1 +
 fs/btrfs/extent-tree.c | 2 +-
 fs/btrfs/qgroup.c  | 6 +-
 fs/btrfs/qgroup.h  | 3 ++-
 fs/btrfs/volumes.c | 4 
 7 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index bfe4a33..994f19a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1748,6 +1748,7 @@ struct btrfs_fs_info {
atomic_t balance_cancel_req;
struct btrfs_balance_control *balance_ctl;
wait_queue_head_t balance_wait_q;
+   struct task_struct *balance_thread;

unsigned data_chunk_allocations;
unsigned metadata_ratio;
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 914ac13..81e9b92 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -606,7 +606,7 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
qrecord->num_bytes = num_bytes;
qrecord->old_roots = NULL;

-   qexisting = btrfs_qgroup_insert_dirty_extent(delayed_refs,
+   qexisting = btrfs_qgroup_insert_dirty_extent(fs_info, delayed_refs,
 qrecord);
if (qexisting)
kfree(qrecord);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 4545e2e..0bbdf808 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2236,6 +2236,7 @@ static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
atomic_set(&fs_info->balance_cancel_req, 0);
fs_info->balance_ctl = NULL;
init_waitqueue_head(&fs_info->balance_wait_q);
+   fs_info->balance_thread = NULL;
 }

 static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index e2287c7..33c784c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -8195,7 +8195,7 @@ static int record_one_subtree_extent(struct btrfs_trans_handle *trans,

delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
-   if (btrfs_qgroup_insert_dirty_extent(delayed_refs, qrecord))
+   if (btrfs_qgroup_insert_dirty_extent(root->fs_info, delayed_refs, qrecord))
kfree(qrecord);
spin_unlock(&delayed_refs->lock);

diff --git a/fs/btrfs/qgroup.c