[PATCH 2/3] btrfs: cow_file_range() num_bytes and disk_num_bytes are same

2016-12-05 Thread Anand Jain
This patch deletes the local variable disk_num_bytes, as its value
is the same as num_bytes in the function cow_file_range().

Signed-off-by: Anand Jain 
---
 fs/btrfs/inode.c | 11 ---
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 96e5f8a49d4c..79f073e94f2d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -944,7 +944,6 @@ static noinline int cow_file_range(struct inode *inode,
u64 alloc_hint = 0;
u64 num_bytes;
unsigned long ram_size;
-   u64 disk_num_bytes;
u64 cur_alloc_size;
u64 blocksize = root->sectorsize;
struct btrfs_key ins;
@@ -960,7 +959,6 @@ static noinline int cow_file_range(struct inode *inode,
 
num_bytes = ALIGN(end - start + 1, blocksize);
num_bytes = max(blocksize,  num_bytes);
-   disk_num_bytes = num_bytes;
 
/* if this is a small write inside eof, kick off defrag */
if (num_bytes < SZ_64K &&
@@ -989,16 +987,16 @@ static noinline int cow_file_range(struct inode *inode,
}
}
 
-   BUG_ON(disk_num_bytes >
+   BUG_ON(num_bytes >
   btrfs_super_total_bytes(root->fs_info->super_copy));
 
alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
 
-   while (disk_num_bytes > 0) {
+   while (num_bytes > 0) {
unsigned long op;
 
-   cur_alloc_size = disk_num_bytes;
+   cur_alloc_size = num_bytes;
ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
   root->sectorsize, 0, alloc_hint,
   &ins, 1, 1);
@@ -1055,7 +1053,7 @@ static noinline int cow_file_range(struct inode *inode,
 
btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
 
-   if (disk_num_bytes < cur_alloc_size)
+   if (num_bytes < cur_alloc_size)
break;
 
/* we're not doing compressed IO, don't unlock the first
@@ -1073,7 +1071,6 @@ static noinline int cow_file_range(struct inode *inode,
 delalloc_end, locked_page,
 EXTENT_LOCKED | EXTENT_DELALLOC,
 op);
-   disk_num_bytes -= cur_alloc_size;
num_bytes -= cur_alloc_size;
alloc_hint = ins.objectid + ins.offset;
start += cur_alloc_size;
-- 
2.10.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/3] btrfs: use BTRFS_COMPRESS_NONE to specify no compression

2016-12-05 Thread Anand Jain
Signed-off-by: Anand Jain 
---
 fs/btrfs/inode.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8e3a5a266917..96e5f8a49d4c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -540,7 +540,7 @@ static noinline void compress_file_range(struct inode 
*inode,
 * to make an uncompressed inline extent.
 */
ret = cow_file_range_inline(root, inode, start, end,
-   0, 0, NULL);
+   0, BTRFS_COMPRESS_NONE, NULL);
} else {
/* try making a compressed inline extent */
ret = cow_file_range_inline(root, inode, start, end,
@@ -969,8 +969,8 @@ static noinline int cow_file_range(struct inode *inode,
 
if (start == 0) {
/* lets try to make an inline extent */
-   ret = cow_file_range_inline(root, inode, start, end, 0, 0,
-   NULL);
+   ret = cow_file_range_inline(root, inode, start, end, 0,
+   BTRFS_COMPRESS_NONE, NULL);
if (ret == 0) {
extent_clear_unlock_delalloc(inode, start, end,
 delalloc_end, NULL,
-- 
2.10.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/3] Misc fixes mostly cleanup

2016-12-05 Thread Anand Jain
A set of unrelated miscellaneous cleanup patches.

Anand Jain (3):
  btrfs: use BTRFS_COMPRESS_NONE to specify no compression
  btrfs: cow_file_range() num_bytes and disk_num_bytes are same
  btrfs: consolidate auto defrag kick off policies

 fs/btrfs/inode.c | 44 ++--
 1 file changed, 26 insertions(+), 18 deletions(-)

-- 
2.10.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/3] btrfs: consolidate auto defrag kick off policies

2016-12-05 Thread Anand Jain
As of now, writes inside EOF that are smaller than 64k for non-compressed
extents and 16k for compressed extents are considered candidates
for auto defrag. Put both checks together in one place.
---
 fs/btrfs/inode.c | 27 +++
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 79f073e94f2d..b157575166c6 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -388,6 +388,22 @@ static inline int inode_need_compress(struct inode *inode)
return 0;
 }
 
+static inline void inode_should_defrag(struct inode *inode,
+   u64 start, u64 end, u64 num_bytes, int comp_type)
+{
+   u64 small_write = SZ_64K;
+   if (comp_type)
+   small_write = SZ_16K;
+
+   if (!num_bytes)
+   num_bytes = end - start + 1;
+
+   /* If this is a small write inside eof, kick off a defrag */
+   if (num_bytes < small_write &&
+   (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
+   btrfs_add_inode_defrag(NULL, inode);
+}
+
 /*
  * we create compressed extents in two phases.  The first
  * phase compresses a range of pages that have already been
@@ -429,10 +445,7 @@ static noinline void compress_file_range(struct inode 
*inode,
int compress_type = root->fs_info->compress_type;
int redirty = 0;
 
-   /* if this is a small write inside eof, kick off a defrag */
-   if ((end - start + 1) < SZ_16K &&
-   (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
-   btrfs_add_inode_defrag(NULL, inode);
+   inode_should_defrag(inode, start, end, 0, compress_type);
 
actual_end = min_t(u64, isize, end + 1);
 again:
@@ -960,10 +973,8 @@ static noinline int cow_file_range(struct inode *inode,
num_bytes = ALIGN(end - start + 1, blocksize);
num_bytes = max(blocksize,  num_bytes);
 
-   /* if this is a small write inside eof, kick off defrag */
-   if (num_bytes < SZ_64K &&
-   (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
-   btrfs_add_inode_defrag(NULL, inode);
+   inode_should_defrag(inode, start, end, num_bytes,
+   BTRFS_COMPRESS_NONE);
 
if (start == 0) {
/* lets try to make an inline extent */
-- 
2.10.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] btrfs-progs: recursive defrag cleanup duplicate code

2016-12-05 Thread Anand Jain
Signed-off-by: Anand Jain 
---
 cmds-filesystem.c | 20 ++--
 1 file changed, 6 insertions(+), 14 deletions(-)

diff --git a/cmds-filesystem.c b/cmds-filesystem.c
index 41623f3183a8..ecac37edf936 100644
--- a/cmds-filesystem.c
+++ b/cmds-filesystem.c
@@ -1136,21 +1136,13 @@ static int cmd_filesystem_defrag(int argc, char **argv)
close_file_or_dir(fd, dirstream);
continue;
}
-   if (recursive) {
-   if (S_ISDIR(st.st_mode)) {
-   ret = nftw(argv[i], defrag_callback, 10,
+   if (recursive && S_ISDIR(st.st_mode)) {
+   ret = nftw(argv[i], defrag_callback, 10,
FTW_MOUNT | FTW_PHYS);
-   if (ret == ENOTTY)
-   exit(1);
-   /* errors are handled in the callback */
-   ret = 0;
-   } else {
-   if (defrag_global_verbose)
-   printf("%s\n", argv[i]);
-   ret = do_defrag(fd, defrag_global_fancy_ioctl,
-   &defrag_global_range);
-   e = errno;
-   }
+   if (ret == ENOTTY)
+   exit(1);
+   /* errors are handled in the callback */
+   ret = 0;
} else {
if (defrag_global_verbose)
printf("%s\n", argv[i]);
-- 
2.10.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] recursive defrag cleanup

2016-12-05 Thread Anand Jain
The command,
   btrfs fi defrag -v /btrfs
 does nothing, it won't defrag the files under /btrfs as user
 may expect. The command with recursive option
   btrfs fi defrag -vr /btrfs
 would defrag all the files under /btrfs including files in
 its sub directories.

 While attempting to fix this, the patch below (1/1) provides
 a cleanup. The actual fix is still pending because, to my understanding,
 nftw() cannot provide the list of files in a directory without
 also including the files under its subdirectories.

Anand Jain (1):
  btrfs-progs: recursive defrag cleanup remove duplicate code

 cmds-filesystem.c | 20 ++--
 1 file changed, 6 insertions(+), 14 deletions(-)

-- 
2.10.0

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/2] btrfs-progs: Correct value printed by assertions/BUG_ON/WARN_ON

2016-12-05 Thread Qu Wenruo



At 12/06/2016 10:51 AM, Goldwyn Rodrigues wrote:



On 12/05/2016 08:03 PM, Qu Wenruo wrote:

BTW, the DISABLE_BACKTRACE branch seems quite different from backtrace one.

#define BUG_ON(c) assert_trace(#c, __FILE__, __func__, __LINE__, (long)(c))
#define WARN_ON(c) warning_trace(#c, __FILE__, __func__, __LINE__,
(long)(c))
#define ASSERT(c) assert_trace(#c, __FILE__, __func__, __LINE__,
(long)!(c))
#define BUG() assert_trace(NULL, __FILE__, __func__, __LINE__, 1)
#else
#define BUG_ON(c) assert(!(c))
#define WARN_ON(c) warning_trace(#c, __FILE__, __func__, __LINE__,
(long)(c))
#define ASSERT(c) assert(!(c))
#define BUG() assert(0)

Condition of BUG_ON/ASSERT/BUG are all logical notted for
DISABLE_BACKTRACE.
While WARN_ON() of both branch are the same condition.


WARN_ON is using warning_trace as opposed to assert, and that is the
reason it is not notted.



This seems quite confusing to me.

Any idea to make it more straightforward?



I just kept it the same as before. warning_trace was using an extra
variable, trace, which was not needed because the print_trace was
already in ifndefs.


I mean, better make the condition the same for both BUG/BUG_ON/ASSERT.
So that we don't need to manually logical not the condition.

For example:
#define ASSERT(c) assert_trace(#c, __FILE__, __func__, __LINE__,(long)(c))
and
#define ASSERT(c) assert((c))

This looks much more straightforward, and easier to expose bug at review 
time.


Thanks,
Qu




If you are talking about keeping WARN_ON outside of ifndef, yes, that
will reduce the code further by another line.




--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 02/14] btrfs-progs: check: introduce function to find dir_item

2016-12-05 Thread Qu Wenruo



At 11/02/2016 11:21 PM, David Sterba wrote:

On Wed, Sep 21, 2016 at 11:15:52AM +0800, Qu Wenruo wrote:

From: Lu Fengqi 

Introduce a new function find_dir_item() to find DIR_ITEM for the given
key, and check it with the specified INODE_REF/INODE_EXTREF match.

Signed-off-by: Lu Fengqi 
Signed-off-by: Qu Wenruo 
---
 cmds-check.c | 140 +++
 1 file changed, 140 insertions(+)

diff --git a/cmds-check.c b/cmds-check.c
index 998ba63..4e25804 100644
--- a/cmds-check.c
+++ b/cmds-check.c
@@ -3848,6 +3848,146 @@ out:
return err;
 }

+#define ROOT_DIR_ERROR (1<<1)/* bad root_dir */
+#define DIR_ITEM_MISSING   (1<<2)/* DIR_ITEM not found */
+#define DIR_ITEM_MISMATCH  (1<<3)/* DIR_ITEM found but not match */


What's the reason for another definition of the error codes? They're
mostly copied from the I_ERR_* counterparts. I'd rather have one set of
error codes.


I tried to merge them into a 32bit error bits.

But it turns out that I_ERR and REF_ERR have already taken 28 bits.
For an int type, we only have 4 extra bits.

All fs tree level error code are OK to merge.
But extent tree level errors, including extent ref/backref error bits, 
and tree block level errors, like bad key type in current content or bad 
item size, have no corresponding bits.


These bits are already over 4 bits.


Yes, we can expand the error bit to u64, but that will be a huge 
modification for both the original mode and lowmem mode.



What about letting me separate the lowmem code from cmds-check.c into 
check/lowmem.c and using the current error when you're going to create 
check/ directory?


It should be OK to merge all lowmem error bits into one int type, but 
not possible to do it with original mode error bits without expanding 
the int type.
Since we have different error bit standard, this leads to completely 
different usage on these bits.

A lot of lowmem bits won't be used by original mode and vice versa.

Thanks,
Qu


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/2] btrfs-progs: Correct value printed by assertions/BUG_ON/WARN_ON

2016-12-05 Thread Goldwyn Rodrigues


On 12/05/2016 08:03 PM, Qu Wenruo wrote:
> BTW, the DISABLE_BACKTRACE branch seems quite different from backtrace one.
> 
> #define BUG_ON(c) assert_trace(#c, __FILE__, __func__, __LINE__, (long)(c))
> #define WARN_ON(c) warning_trace(#c, __FILE__, __func__, __LINE__,
> (long)(c))
> #define ASSERT(c) assert_trace(#c, __FILE__, __func__, __LINE__,
> (long)!(c))
> #define BUG() assert_trace(NULL, __FILE__, __func__, __LINE__, 1)
> #else
> #define BUG_ON(c) assert(!(c))
> #define WARN_ON(c) warning_trace(#c, __FILE__, __func__, __LINE__,
> (long)(c))
> #define ASSERT(c) assert(!(c))
> #define BUG() assert(0)
> 
> Condition of BUG_ON/ASSERT/BUG are all logical notted for
> DISABLE_BACKTRACE.
> While WARN_ON() of both branch are the same condition.

WARN_ON is using warning_trace as opposed to assert, and that is the
reason it is not notted.

> 
> This seems quite confusing to me.
> 
> Any idea to make it more straightforward?
> 

I just kept it the same as before. warning_trace was using an extra
variable, trace, which was not needed because the print_trace was
already in ifndefs.

If you are talking about keeping WARN_ON outside of ifndef, yes, that
will reduce the code further by another line.

-- 
Goldwyn
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/2] btrfs-progs: Correct value printed by assertions/BUG_ON/WARN_ON

2016-12-05 Thread Qu Wenruo

BTW, the DISABLE_BACKTRACE branch seems quite different from backtrace one.

#define BUG_ON(c) assert_trace(#c, __FILE__, __func__, __LINE__, (long)(c))
#define WARN_ON(c) warning_trace(#c, __FILE__, __func__, __LINE__, 
(long)(c))

#define ASSERT(c) assert_trace(#c, __FILE__, __func__, __LINE__, (long)!(c))
#define BUG() assert_trace(NULL, __FILE__, __func__, __LINE__, 1)
#else
#define BUG_ON(c) assert(!(c))
#define WARN_ON(c) warning_trace(#c, __FILE__, __func__, __LINE__, 
(long)(c))

#define ASSERT(c) assert(!(c))
#define BUG() assert(0)

Condition of BUG_ON/ASSERT/BUG are all logical notted for DISABLE_BACKTRACE.
While WARN_ON() of both branch are the same condition.

This seems quite confusing to me.

Any idea to make it more straightforward?

Thanks,
Qu

At 12/05/2016 07:38 PM, Goldwyn Rodrigues wrote:

Hi Qu,

Yes, the assert for ifdef BTRFS_DISABLE_BACKTRACE is not correct. The
condition should not have a not(!).

Thanks for reporting.

On 12/05/2016 01:10 AM, Qu Wenruo wrote:

Hi, Goldwyn and David,

This patch seems to cause btrfs test case 023 to fail.

Bisect leads me to this patch.


$ ./btrfs check ~/quota_balance_loop_backref.raw.restored
Checking filesystem on /home/adam/quota_balance_loop_backref.raw.restored
UUID: c33c5ce3-3ad9-4320-9201-c337c04e0051
checking extents
btrfs: cmds-check.c:12284: build_roots_info_cache: Assertion `!(ret ==
0)' failed.
Aborted (core dumped)


And gdb backtrace:
#0  0x76fd204f in raise () from /usr/lib/libc.so.6
#1  0x76fd347a in abort () from /usr/lib/libc.so.6
#2  0x76fcaea7 in __assert_fail_base () from /usr/lib/libc.so.6
#3  0x76fcaf52 in __assert_fail () from /usr/lib/libc.so.6
#4  0x00440426 in build_roots_info_cache (info=0x6f43c0) at
cmds-check.c:12284
#5  0x00440945 in repair_root_items (info=0x6f43c0) at
cmds-check.c:12412
#6  0x004418c3 in cmd_check (argc=2, argv=0x7fffe100) at
cmds-check.c:12892
#7  0x0040a74c in main (argc=2, argv=0x7fffe100) at btrfs.c:301


For frame 4:
(gdb) frame 4
#4  0x00440426 in build_roots_info_cache (info=0x6f43c0) at
cmds-check.c:12284
12284ASSERT(ret == 0);
(gdb) list
12279rii->cache_extent.start = root_id;
12280rii->cache_extent.size = 1;
12281rii->level = (u8)-1;
12282entry = &rii->cache_extent;
12283ret = insert_cache_extent(roots_info_cache, entry);
12284ASSERT(ret == 0);
12285} else {
12286rii = container_of(entry, struct root_item_info,
12287   cache_extent);
12288}
(gdb) print ret
$1 = 0

For me, ASSERT(ret == 0) seems quite safe and common here.
Didn't the patch change the ASSERT() behavior?

Thanks,
Qu

At 11/30/2016 12:24 AM, Goldwyn Rodrigues wrote:

From: Goldwyn Rodrigues 

The values passed to BUG_ON/WARN_ON are negated(!) and printed, which
results in printing the value zero for each bug/warning. For example:
volumes.c:988: btrfs_alloc_chunk: Assertion `ret` failed, value 0

This is not useful. Instead, change it to print the value of the parameter
passed to BUG_ON()/WARN_ON(). The value needed to be changed to long
to accommodate pointers being passed.

Also, consolidated assert() and BUG() into ifndef.

Signed-off-by: Goldwyn Rodrigues 
---
 kerncompat.h | 35 +++
 1 file changed, 15 insertions(+), 20 deletions(-)

diff --git a/kerncompat.h b/kerncompat.h
index ed9a042..9bd25bd 100644
--- a/kerncompat.h
+++ b/kerncompat.h
@@ -88,39 +88,36 @@ static inline void print_trace(void)
 }

 static inline void assert_trace(const char *assertion, const char
*filename,
-  const char *func, unsigned line, int val)
+  const char *func, unsigned line, long val)
 {
-if (val)
+if (!val)
 return;
 if (assertion)
-fprintf(stderr, "%s:%d: %s: Assertion `%s` failed, value %d\n",
+fprintf(stderr, "%s:%d: %s: Assertion `%s` failed, value %ld\n",
 filename, line, func, assertion, val);
 else
-fprintf(stderr, "%s:%d: %s: Assertion failed, value %d.\n",
+fprintf(stderr, "%s:%d: %s: Assertion failed, value %ld.\n",
 filename, line, func, val);
 print_trace();
 abort();
 exit(1);
 }

-#define BUG() assert_trace(NULL, __FILE__, __func__, __LINE__, 0)
-#else
-#define BUG() assert(0)
 #endif

 static inline void warning_trace(const char *assertion, const char
*filename,
-  const char *func, unsigned line, int val,
+  const char *func, unsigned line, long val,
   int trace)
 {
-if (val)
+if (!val)
 return;
 if (assertion)
 fprintf(stderr,
-"%s:%d: %s: Warning: assertion `%s` failed, value %d\n",
+"%s:%d: %s: Warning: assertion `%s` failed, value %ld\n",
 filename, line, func, assertion, 

Re: crc32c_le performance hit

2016-12-05 Thread Chris Murphy
On Mon, Dec 5, 2016 at 8:46 AM, Chris Mason  wrote:
> On 12/04/2016 04:28 PM, Chris Murphy wrote:
>>
>> 4.8.11-300.fc25.x86_64
>>
>> I'm currently doing a btrfs send/receive and I'm seeing a rather large
>> hit for crc32c, bigger than aes-ni (the volume is on dm crypt), using
>> perf top.
>>
>>   14.03%  btrfs[.] __crc32c_le
>>   10.50%  [kernel] [k] _aesni_enc4
>>
>
> This is surprising, although send/recv does do a lot of small crc runs. What
> is the overall CPU usage?

Before issuing btrfs send, btrfs receive:
%Cpu(s):  0.2 us,  0.2 sy,  0.0 ni, 99.5 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st

During:
%Cpu(s):  5.1 us, 18.2 sy,  0.0 ni, 35.0 id, 40.2 wa,  0.7 hi,  0.8 si,  0.0 st

Full output
https://paste.fedoraproject.org/500097/

Lots of waiting, but most of that 18% hit is coming from the two btrfs
commands themselves. The same thing doesn't happen with a btrfs scrub,
which also has to compute crc's albeit just once.




>Maybe pin btrfs to a single CPU and use mpstat to
> see how hot that one CPU is.  If we're 14% of a CPU running at 100%, that's
> a big deal.  If we're 14% of a CPU running at 5%, we safely ignore it.

I'm not sure how to do this, but if all btrfs processes were pinned to
a single core, based on the more than dozen active processes I see in
top during send/receive, it would definitely soak all of that core.


>
>>
>> Complete output is here for 1 month:
>>
>> https://urldefense.proofpoint.com/v2/url?u=https-3A__paste.fedoraproject.org_498914_=DgIBaQ=5VD0RTtNlTh3ycd41b3MUw=9QPtTAxcitoznaWRKKHoEQ=YXgqrDMWJP2u2reRh04tnYz4CrO5SPUhSHd1cF2OcR0=axoqoaw0ZkYXdDxlTODLOfzqEK7uIJHqMsBjpkyIw5o=
>>
>> I don't remember crc32's taking this much CPU before, so it seems like
>> a regression but offhand I don't know when it started.
>>
>> [chris@f25s ~]$ dmesg | grep crc32
>> [4.226700] Btrfs loaded, crc32c=crc32c-intel
>
>
> At least we know you're using the intel accelerated one.  Every time someone
> posts this dmesg output to the list, I owe Jeff another beer.


Glad I can increase your beer liability. :-D



-- 
Chris Murphy
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] generic/35[67]: disable swapfile tests on Btrfs

2016-12-05 Thread Omar Sandoval
From: Omar Sandoval 

Btrfs doesn't support swapfiles (yet?), so generic/356 fails
erroneously, and generic/357 only passes by accident. Let's add a
_require_scratch_swapfile helper and add it to these tests.

Signed-off-by: Omar Sandoval 
---
I have some code enabling swapfiles for Btrfs [1], but there's some ABBA
deadlock issues with i_rwsem and mmap_sem on swap-over-NFS that I
haven't had time to sort out. In the meantime, let's just skip these
tests.

1: https://github.com/osandov/linux/tree/btrfs-swap

 common/rc | 22 ++
 tests/generic/356 |  1 +
 tests/generic/357 |  1 +
 3 files changed, 24 insertions(+)

diff --git a/common/rc b/common/rc
index 2719b23..d863e56 100644
--- a/common/rc
+++ b/common/rc
@@ -1790,6 +1790,28 @@ _require_odirect()
rm -f $testfile 2>&1 > /dev/null
 }
 
+# Check that the filesystem supports swapfiles
+_require_scratch_swapfile()
+{
+   _require_scratch
+
+   _scratch_mkfs >/dev/null
+   _scratch_mount
+
+   # Minimum size for mkswap is 10 pages
+   local size=$(($(get_page_size) * 10))
+
+   _pwrite_byte 0x61 0 "$size" "$SCRATCH_MNT/swap" >/dev/null 2>&1
+   mkswap "$SCRATCH_MNT/swap" >/dev/null 2>&1
+   if ! swapon "$SCRATCH_MNT/swap" >/dev/null 2>&1; then
+   _scratch_unmount
+   _notrun "swapfiles are not supported"
+   fi
+
+   swapoff "$SCRATCH_MNT/swap" >/dev/null 2>&1
+   _scratch_unmount
+}
+
 # Check that a fs has enough free space (in 1024b blocks)
 #
 _require_fs_space()
diff --git a/tests/generic/356 b/tests/generic/356
index 6bb90c0..51eeb65 100755
--- a/tests/generic/356
+++ b/tests/generic/356
@@ -44,6 +44,7 @@ _cleanup()
 
 # real QA test starts here
 _supported_os Linux
+_require_scratch_swapfile
 _require_scratch_reflink
 _require_cp_reflink
 
diff --git a/tests/generic/357 b/tests/generic/357
index 439b314..0dd0c10 100755
--- a/tests/generic/357
+++ b/tests/generic/357
@@ -44,6 +44,7 @@ _cleanup()
 
 # real QA test starts here
 _supported_os Linux
+_require_scratch_swapfile
 _require_scratch_reflink
 _require_cp_reflink
 
-- 
2.10.2

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [SOLVED] Re: system hangs due to qgroups

2016-12-05 Thread Qu Wenruo



At 12/05/2016 10:43 PM, Marc Joliet wrote:

On Monday 05 December 2016 12:01:28 Marc Joliet wrote:

This seems to be a NULL pointer bug in qgroup relocation fix.



The latest fix (not merged yet) should address it.



You could try the for-next-20161125 branch from David to fix it:
https://github.com/kdave/btrfs-devel/tree/for-next-20161125


OK, I'll try that, thanks!  I just have to wait for it to finish cloning...


[...]

And for your recovery, I'd suggest to install an Archlinux into a USB
HDD or USB stick, and compile David's branch and install it into the USB
HDD.



Then use the USB storage as rescue tool to mount the fs, which should do
RW mount with or without skip_balance mount option.
So you could disable quota then.


OK, I'll try that, thanks!


Excellent, thank you, that worked!  My laptop is working normally again.  I'll
keep an eye on it, but so far two balance operations ran normally (that is,
they completed within a few minutes and without hanging the system).

(Specifically, since I didn't find out how to get a different kernel onto the
Arch USB stick, I simply installed the kernel on my desktop, then did
everything from an initramfs emergency shell, then moved the SSD back into the
laptop.)

Thanks, everyone!


Glad that helped.

I just forgot that you're using gentoo, not archlinux, and kernel 
install script won't work for archlinux.


Anyway, I'm glad that works for you.

BTW, if you haven't disabled quota yet, would you please give a report on
how many qgroups you have?

And how much is the CPU spinning during balancing with quota enabled?

This would help us to evaluate how qgroup slows down the process if 
there are too many snapshots.


Thanks,
Qu


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: system hangs due to qgroups

2016-12-05 Thread Marc Joliet
On Monday 05 December 2016 11:16:35 Marc Joliet wrote:
[...]
> https://dl.dropboxusercontent.com/u/5328255/arthur_root_4.7.3_sanitized.imag
> e.xz
> https://dl.dropboxusercontent.com/u/5328255/arthur_root_4.8.5_sanitized.ima
> ge.xz

BTW, since my problem appears to have been known, does anybody still care 
about these?

-- 
Marc Joliet
--
"People who think they know everything really annoy those of us who know we
don't" - Bjarne Stroustrup


signature.asc
Description: This is a digitally signed message part.


Re: bio linked list corruption.

2016-12-05 Thread Vegard Nossum
On 5 December 2016 at 21:35, Linus Torvalds
 wrote:
> Note for Ingo and Peter: this patch has not been tested at all. But
> Vegard did test an earlier patch of mine that just verified that yes,
> the issue really was that wait queue entries remained on the wait
> queue head just as we were about to return and free it.

The second patch has been running for 1h+ without any problems of any
kind. I should typically have seen 2 crashes by now. I'll let it run
overnight to be sure.


Vegard
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: bio linked list corruption.

2016-12-05 Thread Linus Torvalds
Adding the scheduler people to the participants list, and re-attaching
the patch, because while this patch is internal to the VM code, the
issue itself is not.

There might well be other cases where somebody goes "wake_up_all()
will wake everybody up, so I can put the wait queue head on the stack,
and then after I've woken people up I can return".

Ingo/PeterZ: the reason that does *not* work is that "wake_up_all()"
does make sure that everybody is woken up, but the usual autoremove
wake function only removes the wakeup entry if the process was woken
up by that particular wakeup. If something else had woken it up, the
entry remains on the list, and the waker in this case returned with
the wait head still containing entries.

Which is deadly when the wait queue head is on the stack.

So I'm wondering if we should make that "synchronous_wake_function()"
available generally, and maybe introduce a DEFINE_WAIT_SYNC() helper
that uses it.

Of course, I'm really hoping that this shmem.c use is the _only_ such
case.  But I doubt it.

Comments?

Note for Ingo and Peter: this patch has not been tested at all. But
Vegard did test an earlier patch of mine that just verified that yes,
the issue really was that wait queue entries remained on the wait
queue head just as we were about to return and free it.

   Linus


On Mon, Dec 5, 2016 at 12:10 PM, Linus Torvalds
 wrote:
>
> Anyway, can you try this patch instead? It should actually cause the
> wake_up_all() to always remove all entries, and thus the WARN_ON()
> should no longer happen (and I removed the "list_del()" hackery).
>
>Linus
diff --git a/mm/shmem.c b/mm/shmem.c
index 166ebf5d2bce..17beb44e9f4f 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1848,6 +1848,19 @@ alloc_nohuge:page = 
shmem_alloc_and_acct_page(gfp, info, sbinfo,
return error;
 }
 
+/*
+ * This is like autoremove_wake_function, but it removes the wait queue
+ * entry unconditionally - even if something else had already woken the
+ * target.
+ */
+static int synchronous_wake_function(wait_queue_t *wait, unsigned mode, int 
sync, void *key)
+{
+   int ret = default_wake_function(wait, mode, sync, key);
+   list_del_init(&wait->task_list);
+   return ret;
+}
+
+
 static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
struct inode *inode = file_inode(vma->vm_file);
@@ -1883,7 +1896,7 @@ static int shmem_fault(struct vm_area_struct *vma, struct 
vm_fault *vmf)
vmf->pgoff >= shmem_falloc->start &&
vmf->pgoff < shmem_falloc->next) {
wait_queue_head_t *shmem_falloc_waitq;
-   DEFINE_WAIT(shmem_fault_wait);
+   DEFINE_WAIT_FUNC(shmem_fault_wait, 
synchronous_wake_function);
 
ret = VM_FAULT_NOPAGE;
if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) &&
@@ -2665,6 +2678,7 @@ static long shmem_fallocate(struct file *file, int mode, 
loff_t offset,
spin_lock(&inode->i_lock);
inode->i_private = NULL;
wake_up_all(&shmem_falloc_waitq);
+   WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.task_list));
spin_unlock(&inode->i_lock);
error = 0;
goto out;


Re: bio linked list corruption.

2016-12-05 Thread Linus Torvalds
On Mon, Dec 5, 2016 at 11:11 AM, Vegard Nossum  wrote:
>
> [ cut here ]
> WARNING: CPU: 22 PID: 14012 at mm/shmem.c:2668 shmem_fallocate+0x9a7/0xac0

Ok, good. So that's confirmed as the cause of this problem.

And the call chain that I wanted is obviously completely
uninteresting, because it's call cghain on the other side (the page
fault side) that would show the nested wake queue behavior. I was just
being stupid about it.

I wonder if we have any other places where we just blithely assume
that "wake_up_all()" will actually empty the whole wait queue. It's
_usually_ true, but as noted, nested waiting does happen.

Anyway, can you try this patch instead? It should actually cause the
wake_up_all() to always remove all entries, and thus the WARN_ON()
should no longer happen (and I removed the "list_del()" hackery).

   Linus
diff --git a/mm/shmem.c b/mm/shmem.c
index 166ebf5d2bce..17beb44e9f4f 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1848,6 +1848,19 @@ alloc_nohuge:page = 
shmem_alloc_and_acct_page(gfp, info, sbinfo,
return error;
 }
 
+/*
+ * This is like autoremove_wake_function, but it removes the wait queue
+ * entry unconditionally - even if something else had already woken the
+ * target.
+ */
+static int synchronous_wake_function(wait_queue_t *wait, unsigned mode, int 
sync, void *key)
+{
+   int ret = default_wake_function(wait, mode, sync, key);
+   list_del_init(&wait->task_list);
+   return ret;
+}
+
+
 static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
struct inode *inode = file_inode(vma->vm_file);
@@ -1883,7 +1896,7 @@ static int shmem_fault(struct vm_area_struct *vma, struct 
vm_fault *vmf)
vmf->pgoff >= shmem_falloc->start &&
vmf->pgoff < shmem_falloc->next) {
wait_queue_head_t *shmem_falloc_waitq;
-   DEFINE_WAIT(shmem_fault_wait);
+   DEFINE_WAIT_FUNC(shmem_fault_wait, 
synchronous_wake_function);
 
ret = VM_FAULT_NOPAGE;
if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) &&
@@ -2665,6 +2678,7 @@ static long shmem_fallocate(struct file *file, int mode, 
loff_t offset,
spin_lock(&inode->i_lock);
inode->i_private = NULL;
wake_up_all(&shmem_falloc_waitq);
+   WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.task_list));
spin_unlock(&inode->i_lock);
error = 0;
goto out;


Re: bio linked list corruption.

2016-12-05 Thread Vegard Nossum
On 5 December 2016 at 20:11, Vegard Nossum  wrote:
> On 5 December 2016 at 18:55, Linus Torvalds
>  wrote:
>> On Mon, Dec 5, 2016 at 9:09 AM, Vegard Nossum  
>> wrote:
>> Since you apparently can recreate this fairly easily, how about trying
>> this stupid patch?
>>
>> NOTE! This is entirely untested. I may have screwed this up entirely.
>> You get the idea, though - just remove the wait queue head from the
>> list - the list entries stay around, but nothing points to the stack
>> entry (that we're going to free) any more.
>>
>> And add the warning to see if this actually ever triggers (and because
>> I'd like to see the callchain when it does, to see if it's another
>> waitqueue somewhere or what..)
>
> [ cut here ]
> WARNING: CPU: 22 PID: 14012 at mm/shmem.c:2668 shmem_fallocate+0x9a7/0xac0
> Kernel panic - not syncing: panic_on_warn set ...

So I noticed that panic_on_warn just after sending the email and I've
been waiting for it to trigger again.

The warning has triggered twice more without panic_on_warn set and I
haven't seen any crash yet.


Vegard
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: bio linked list corruption.

2016-12-05 Thread Vegard Nossum
On 5 December 2016 at 18:55, Linus Torvalds
 wrote:
> On Mon, Dec 5, 2016 at 9:09 AM, Vegard Nossum  wrote:
>>
>> The warning shows that it made it past the list_empty_careful() check
>> in finish_wait() but then bugs out on the &wait->task_list
>> dereference.
>>
>> Anything stick out?
>
> I hate that shmem waitqueue garbage. It's really subtle.
>
> I think the problem is that "wake_up_all()" in shmem_fallocate()
> doesn't necessarily wake up everything. It wakes up TASK_NORMAL -
> which does include TASK_UNINTERRUPTIBLE, but doesn't actually mean
> "everything on the list".
>
> I think that what happens is that the waiters somehow move from
> TASK_UNINTERRUPTIBLE to TASK_RUNNING early, and this means that
> wake_up_all() will ignore them, leave them on the list, and now that
> list on stack is no longer empty at the end.
>
> And the way *THAT* can happen is that the task is on some *other*
> waitqueue as well, and that other waitqueue wakes it up. That's not
> impossible, you can certainly have people on wait-queues that still
> take faults.
>
> Or somebody just uses a directed wake_up_process() or something.
>
> Since you apparently can recreate this fairly easily, how about trying
> this stupid patch?
>
> NOTE! This is entirely untested. I may have screwed this up entirely.
> You get the idea, though - just remove the wait queue head from the
> list - the list entries stay around, but nothing points to the stack
> entry (that we're going to free) any more.
>
> And add the warning to see if this actually ever triggers (and because
> I'd like to see the callchain when it does, to see if it's another
> waitqueue somewhere or what..)

[ cut here ]
WARNING: CPU: 22 PID: 14012 at mm/shmem.c:2668 shmem_fallocate+0x9a7/0xac0
Kernel panic - not syncing: panic_on_warn set ...

CPU: 22 PID: 14012 Comm: trinity-c73 Not tainted 4.9.0-rc7+ #220
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
Ubuntu-1.8.2-1ubuntu1 04/01/2014
8801e32af970 81fb08c1 83e74b60 8801e32afa48
83ed7600 847103e0 8801e32afa38 81515244
41b58ab3 844e21da 81515061 8151591e
Call Trace:
[] dump_stack+0x83/0xb2
[] panic+0x1e3/0x3ad
[] __warn+0x1bf/0x1e0
[] warn_slowpath_null+0x2c/0x40
[] shmem_fallocate+0x9a7/0xac0
[] vfs_fallocate+0x350/0x620
[] SyS_madvise+0x432/0x1290
[] do_syscall_64+0x1af/0x4d0
[] entry_SYSCALL64_slow_path+0x25/0x25
[ cut here ]

Attached a full log.


Vegard


0.txt.gz
Description: GNU Zip compressed data


Re: [PATCH v2] btrfs-progs: utils: negative numbers are more plausible than sizes over 8 EiB

2016-12-05 Thread Omar Sandoval
On Sat, Dec 03, 2016 at 03:39:54PM -0500, Zygo Blaxell wrote:
> I got tired of seeing "16.00EiB" whenever btrfs-progs encounters a
> negative size value, e.g. during resize:
> 
> Unallocated:
>/dev/mapper/datamd18   16.00EiB
> 
> This version is much more useful:
> 
> Unallocated:
>/dev/mapper/datamd18  -26.29GiB

Just checked and GCC 6.2.1 doesn't even enable -Wsign-conversion for
-Wextra, so this is probably the way to go.

Reviewed-by: Omar Sandoval 

> Signed-off-by: Zygo Blaxell 
> 
> ---
> v2: change the function prototype so it's easier to see that the
> mangling implied by the name "pretty" includes "reinterpretation
> of the u64 value as a signed quantity."
> ---
>  utils.c | 12 ++--
>  utils.h |  4 ++--
>  2 files changed, 8 insertions(+), 8 deletions(-)
> 
> diff --git a/utils.c b/utils.c
> index 69b580a..07e8443 100644
> --- a/utils.c
> +++ b/utils.c
> @@ -2575,7 +2575,7 @@ out:
>   * Note: this function uses a static per-thread buffer. Do not call this
>   * function more than 10 times within one argument list!
>   */
> -const char *pretty_size_mode(u64 size, unsigned mode)
> +const char *pretty_size_mode(s64 size, unsigned mode)
>  {
>   static __thread int ps_index = 0;
>   static __thread char ps_array[10][32];
> @@ -2594,20 +2594,20 @@ static const char* unit_suffix_binary[] =
>  static const char* unit_suffix_decimal[] =
>   { "B", "kB", "MB", "GB", "TB", "PB", "EB"};
>  
> -int pretty_size_snprintf(u64 size, char *str, size_t str_size, unsigned 
> unit_mode)
> +int pretty_size_snprintf(s64 size, char *str, size_t str_size, unsigned 
> unit_mode)
>  {
>   int num_divs;
>   float fraction;
> - u64 base = 0;
> + s64 base = 0;
>   int mult = 0;
>   const char** suffix = NULL;
> - u64 last_size;
> + s64 last_size;
>  
>   if (str_size == 0)
>   return 0;
>  
>   if ((unit_mode & ~UNITS_MODE_MASK) == UNITS_RAW) {
> - snprintf(str, str_size, "%llu", size);
> + snprintf(str, str_size, "%lld", size);
>   return 0;
>   }
>  
> @@ -2642,7 +2642,7 @@ int pretty_size_snprintf(u64 size, char *str, size_t 
> str_size, unsigned unit_mod
>  num_divs = 0;
>  break;
>   default:
> - while (size >= mult) {
> + while ((size < 0 ? -size : size) >= mult) {
>   last_size = size;
>   size /= mult;
>   num_divs++;
> diff --git a/utils.h b/utils.h
> index 366ca29..525bde9 100644
> --- a/utils.h
> +++ b/utils.h
> @@ -174,9 +174,9 @@ int check_mounted_where(int fd, const char *file, char 
> *where, int size,
>  int btrfs_device_already_in_root(struct btrfs_root *root, int fd,
>int super_offset);
>  
> -int pretty_size_snprintf(u64 size, char *str, size_t str_bytes, unsigned 
> unit_mode);
> +int pretty_size_snprintf(s64 size, char *str, size_t str_bytes, unsigned 
> unit_mode);
>  #define pretty_size(size)pretty_size_mode(size, UNITS_DEFAULT)
> -const char *pretty_size_mode(u64 size, unsigned mode);
> +const char *pretty_size_mode(s64 size, unsigned mode);
>  
>  u64 parse_size(char *s);
>  u64 parse_qgroupid(const char *p);
> -- 
> 2.1.4
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RESEND][PATCH v2] btrfs-progs: add dev stats returncode option

2016-12-05 Thread Austin S. Hemmelgarn
Currently, `btrfs device stats` returns non-zero only when there was an
error getting the counter values.  This is fine for when it gets run by a
user directly, but is a serious pain when trying to use it in a script or
for monitoring since you need to parse the (not at all machine friendly)
output to check the counter values.

This patch adds an option ('-s') which causes `btrfs device stats`
to set bit 6 in the return code if any of the counters are non-zero.
This greatly simplifies checking from a script or monitoring software if
any errors have been recorded.  In the event that this switch is passed
and an error occurs reading the stats, the return code will have bit
0 set (so if there are errors reading counters, and the counters which
were read were non-zero, the return value will be 65).

Signed-off-by: Austin S. Hemmelgarn 
---
Changes since v1:
 * Switched to using bit 6 instead of bit 7 so we don't stomp on Bash's
   manipulation of return codes.  Thanks to Mike Fleetwood for reminding
   me about this.

Apparently this didn't make it to the ML last time, so trying again.
Sorry if you got this twice David.

 Documentation/btrfs-device.asciidoc |  8 +++-
 cmds-device.c   | 39 ++---
 2 files changed, 39 insertions(+), 8 deletions(-)

diff --git a/Documentation/btrfs-device.asciidoc 
b/Documentation/btrfs-device.asciidoc
index 239c99b..d398b6d 100644
--- a/Documentation/btrfs-device.asciidoc
+++ b/Documentation/btrfs-device.asciidoc
@@ -98,7 +98,7 @@ remain as such. Reloading the kernel module will drop this 
information. There's
 an alternative way of mounting multiple-device filesystem without the need for
 prior scanning. See the mount option 'device'.
 
-*stats* [-z] |::
+*stats* [-zs] |::
 Read and print the device IO error statistics for all devices of the given
 filesystem identified by  or for a single . See section *DEVICE
 STATS* for more information.
@@ -108,6 +108,9 @@ STATS* for more information.
 -z
 Print the stats and reset the values to zero afterwards.
 
+-s
+Set bit 6 of the return-code if any error statistics are non-zero.
+
 *usage* [options]  [...]::
 Show detailed information about internal allocations in devices.
 +
@@ -231,6 +234,9 @@ EXIT STATUS
 *btrfs device* returns a zero exit status if it succeeds. Non zero is
 returned in case of failure.
 
+If the '-s' option is used, *btrfs device stats* will add 64 to the
+exit status if any of the error counters is non-zero.
+
 AVAILABILITY
 
 *btrfs* is part of btrfs-progs.
diff --git a/cmds-device.c b/cmds-device.c
index fa0830f..392e37c 100644
--- a/cmds-device.c
+++ b/cmds-device.c
@@ -376,6 +376,7 @@ static const char * const cmd_device_stats_usage[] = {
"Show current device IO stats.",
"",
"-z show current stats and reset values to zero",
+   "-s return non-zero if any stat counter is not 
zero",
NULL
 };
 
@@ -389,14 +390,18 @@ static int cmd_device_stats(int argc, char **argv)
int i;
int c;
int err = 0;
+   int status = 0;
__u64 flags = 0;
DIR *dirstream = NULL;
 
-   while ((c = getopt(argc, argv, "z")) != -1) {
+   while ((c = getopt(argc, argv, "zs")) != -1) {
switch (c) {
case 'z':
flags = BTRFS_DEV_STATS_RESET;
break;
+   case 's':
+   status = 1;
+   break;
case '?':
default:
usage(cmd_device_stats_usage);
@@ -440,7 +445,7 @@ static int cmd_device_stats(int argc, char **argv)
if (ioctl(fdmnt, BTRFS_IOC_GET_DEV_STATS, &args) < 0) {
error("DEV_STATS ioctl failed on %s: %s",
  path, strerror(errno));
-   err = 1;
+   err |= 1;
} else {
char *canonical_path;
 
@@ -457,31 +462,51 @@ static int cmd_device_stats(int argc, char **argv)
 "devid:%llu", args.devid);
}
 
-   if (args.nr_items >= BTRFS_DEV_STAT_WRITE_ERRS + 1)
+   if (args.nr_items >= BTRFS_DEV_STAT_WRITE_ERRS + 1) {
printf("[%s].write_io_errs   %llu\n",
   canonical_path,
   (unsigned long long) args.values[
BTRFS_DEV_STAT_WRITE_ERRS]);
-   if (args.nr_items >= BTRFS_DEV_STAT_READ_ERRS + 1)
+   if ((status == 1) && 
(args.values[BTRFS_DEV_STAT_WRITE_ERRS] > 0)) {
+   err |= 64;
+   }
+   }
+   if (args.nr_items >= 

Re: bio linked list corruption.

2016-12-05 Thread Vegard Nossum
On 5 December 2016 at 19:11, Andy Lutomirski  wrote:
> On Sun, Dec 4, 2016 at 3:04 PM, Vegard Nossum  wrote:
>> On 23 November 2016 at 20:58, Dave Jones  wrote:
>>> On Wed, Nov 23, 2016 at 02:34:19PM -0500, Dave Jones wrote:
>>>
>>>  > [  317.689216] BUG: Bad page state in process kworker/u8:8  pfn:4d8fd4
>>>  > trace from just before this happened. Does this shed any light ?
>>>  >
>>>  > https://codemonkey.org.uk/junk/trace.txt
>>>
>>> crap, I just noticed the timestamps in the trace come from quite a bit
>>> later.  I'll tweak the code to do the taint checking/ftrace stop after
>>> every syscall, that should narrow the window some more.
>>
>> FWIW I hit this as well:
>>
>> BUG: unable to handle kernel paging request at 81ff08b7
>
> We really ought to improve this message.  If nothing else, it should
> say whether it was a read, a write, or an instruction fetch.
>
>> IP: [] __lock_acquire.isra.32+0xda/0x1a30
>> PGD 461e067 PUD 461f063
>> PMD 1e001e1
>
> Too lazy to manually decode this right now, but I don't think it matters.
>
>> Oops: 0003 [#1] PREEMPT SMP KASAN
>
> Note this is SMP, but that just means CONFIG_SMP=y.  Vegard, how many
> CPUs does your kernel think you have?

My first crash was running on a 1-CPU guest (not intentionally, but
because of a badly configured qemu). I'm honestly surprised it
triggered at all with 1 CPU, but I guess it shows that it's not a true
concurrency issue at least!

>
>> RIP: 0010:[]  []
>> __lock_acquire.isra.32+0xda/0x1a30
>> RSP: 0018:8801bab8f730  EFLAGS: 00010082
>> RAX: 81ff071f RBX:  RCX: 
>
> RAX points to kernel text.

Yes, it's somewhere in the middle of iov_iter_init() -- other crashes
also had put_prev_entity(), a null pointer, and some garbage values I
couldn't identify.

>
>> Code: 89 4d b8 44 89 45 c0 89 4d c8 4c 89 55 d0 e8 ee c3 ff ff 48 85
>> c0 4c 8b 55 d0 8b 4d c8 44 8b 45 c0 4c 8b 4d b8 0f 84 c6 01 00 00 <3e>
>> ff 80 98 01 00 00 49 8d be 48 07 00 00 48 ba 00 00 00 00 00
>
>   2b:3e ff 80 98 01 00 00 incl   %ds:*0x198(%rax)<--
> trapping instruction
>
> That's very strange.  I think this is:
>
> atomic_inc((atomic_t *)&class->ops);
>
> but my kernel contains:
>
> 3cb4:   f0 ff 80 98 01 00 00lock incl 0x198(%rax)
>
> So your kernel has been smp-alternatived.  That 3e comes from
> alternatives_smp_unlock.  If you're running on SMP with UP
> alternatives, things will break.

Yes, indeed. It was running on 1 CPU by mistake and still triggered the bug.

The crashes started really pouring in once I got my qemu fixed. Just
to reassure you, here's another crash which shows it's using the
correct instruction on an actual multicore guest:

BUG: unable to handle kernel paging request at 0003030001de
IP: [] __lock_acquire.isra.32+0xda/0x1a30
PGD 183fd2067 PUD 0

Oops: 0002 [#1] PREEMPT SMP KASAN
Dumping ftrace buffer:
  (ftrace buffer empty)
CPU: 23 PID: 9584 Comm: trinity-c104 Not tainted 4.9.0-rc7+ #219
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
Ubuntu-1.8.2-1ubuntu1 04/01/2014
task: 880189f68000 task.stack: 88017fe5
RIP: 0010:[]  []
__lock_acquire.isra.32+0xda/0x1a30
RSP: 0018:88017fe575e0  EFLAGS: 00010002
RAX: 00030346 RBX:  RCX: 
[...]
Code: 89 4d b8 44 89 45 c0 89 4d c8 4c 89 55 d0 e8 ee c3 ff ff 48 85
c0 4c 8b 55 d0 8b 4d c8 44 8b 45 c0 4c 8b 4d b8 0f 84 c6 01 00 00 <f0>
ff 80 98 01 00 00 49 8d be 48 07 00 00 48 ba 00 00 00 00 00
RIP  [] __lock_acquire.isra.32+0xda/0x1a30
RSP 
CR2: 0003030001de
---[ end trace 2846425104eb6141 ]---
Kernel panic - not syncing: Fatal exception
[ cut here ]

  2b:*  f0 ff 80 98 01 00 00lock incl 0x198(%rax)   <--
trapping instruction

> What's your kernel command line?  Can we have your entire kernel log from 
> boot?

Just in case you still want this, I've attached the boot log for the
"true SMP" guest above.


Vegard


5.txt.gz
Description: GNU Zip compressed data


Re: bio linked list corruption.

2016-12-05 Thread Linus Torvalds
On Mon, Dec 5, 2016 at 10:11 AM, Andy Lutomirski  wrote:
>
> So your kernel has been smp-alternatived.  That 3e comes from
> alternatives_smp_unlock.  If you're running on SMP with UP
> alternatives, things will break.

I'm assuming he's just running in a VM with a single CPU.

The problem that I pointed out with assuming wake_up_all() actually
removes all wait queue entries does not depend on SMP. The race is
much more fundamental and long-lived.

   Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: bio linked list corruption.

2016-12-05 Thread Andy Lutomirski
On Sun, Dec 4, 2016 at 3:04 PM, Vegard Nossum  wrote:
> On 23 November 2016 at 20:58, Dave Jones  wrote:
>> On Wed, Nov 23, 2016 at 02:34:19PM -0500, Dave Jones wrote:
>>
>>  > [  317.689216] BUG: Bad page state in process kworker/u8:8  pfn:4d8fd4
>>  > trace from just before this happened. Does this shed any light ?
>>  >
>>  > https://codemonkey.org.uk/junk/trace.txt
>>
>> crap, I just noticed the timestamps in the trace come from quite a bit
>> later.  I'll tweak the code to do the taint checking/ftrace stop after
>> every syscall, that should narrow the window some more.
>
> FWIW I hit this as well:
>
> BUG: unable to handle kernel paging request at 81ff08b7

We really ought to improve this message.  If nothing else, it should
say whether it was a read, a write, or an instruction fetch.

> IP: [] __lock_acquire.isra.32+0xda/0x1a30
> PGD 461e067 PUD 461f063
> PMD 1e001e1

Too lazy to manually decode this right now, but I don't think it matters.

> Oops: 0003 [#1] PREEMPT SMP KASAN

Note this is SMP, but that just means CONFIG_SMP=y.  Vegard, how many
CPUs does your kernel think you have?

> RIP: 0010:[]  []
> __lock_acquire.isra.32+0xda/0x1a30
> RSP: 0018:8801bab8f730  EFLAGS: 00010082
> RAX: 81ff071f RBX:  RCX: 

RAX points to kernel text.

> Code: 89 4d b8 44 89 45 c0 89 4d c8 4c 89 55 d0 e8 ee c3 ff ff 48 85
> c0 4c 8b 55 d0 8b 4d c8 44 8b 45 c0 4c 8b 4d b8 0f 84 c6 01 00 00 <3e>
> ff 80 98 01 00 00 49 8d be 48 07 00 00 48 ba 00 00 00 00 00

  2b:3e ff 80 98 01 00 00 incl   %ds:*0x198(%rax)<--
trapping instruction

That's very strange.  I think this is:

atomic_inc((atomic_t *)&class->ops);

but my kernel contains:

3cb4:   f0 ff 80 98 01 00 00lock incl 0x198(%rax)

So your kernel has been smp-alternatived.  That 3e comes from
alternatives_smp_unlock.  If you're running on SMP with UP
alternatives, things will break.

What's your kernel command line?  Can we have your entire kernel log from boot?

Adding Borislav, since he's the guru for this code.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: bio linked list corruption.

2016-12-05 Thread Linus Torvalds
On Mon, Dec 5, 2016 at 9:09 AM, Vegard Nossum  wrote:
>
> The warning shows that it made it past the list_empty_careful() check
> in finish_wait() but then bugs out on the &wait->task_list
> dereference.
>
> Anything stick out?

I hate that shmem waitqueue garbage. It's really subtle.

I think the problem is that "wake_up_all()" in shmem_fallocate()
doesn't necessarily wake up everything. It wakes up TASK_NORMAL -
which does include TASK_UNINTERRUPTIBLE, but doesn't actually mean
"everything on the list".

I think that what happens is that the waiters somehow move from
TASK_UNINTERRUPTIBLE to TASK_RUNNING early, and this means that
wake_up_all() will ignore them, leave them on the list, and now that
list on stack is no longer empty at the end.

And the way *THAT* can happen is that the task is on some *other*
waitqueue as well, and that other waitqueue wakes it up. That's not
impossible, you can certainly have people on wait-queues that still
take faults.

Or somebody just uses a directed wake_up_process() or something.

Since you apparently can recreate this fairly easily, how about trying
this stupid patch?

NOTE! This is entirely untested. I may have screwed this up entirely.
You get the idea, though - just remove the wait queue head from the
list - the list entries stay around, but nothing points to the stack
entry (that we're going to free) any more.

And add the warning to see if this actually ever triggers (and because
I'd like to see the callchain when it does, to see if it's another
waitqueue somewhere or what..)

  Linus
diff --git a/mm/shmem.c b/mm/shmem.c
index 166ebf5d2bce..a80148b43476 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2665,6 +2665,8 @@ static long shmem_fallocate(struct file *file, int mode, 
loff_t offset,
spin_lock(&inode->i_lock);
inode->i_private = NULL;
wake_up_all(&shmem_falloc_waitq);
+   if (WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.task_list)))
+   list_del(&shmem_falloc_waitq.task_list);
spin_unlock(&inode->i_lock);
error = 0;
goto out;


BTRFS kernel OOPS 4.8.11

2016-12-05 Thread Gerard Saraber
I have a NAS with a mix of 6, 4 and 3 TB drives:

shrapnel zm # btrfs filesystem df /home/exports
Data, RAID1: total=19.59TiB, used=19.51TiB
System, RAID1: total=32.00MiB, used=2.75MiB
Metadata, RAID1: total=76.00GiB, used=74.71GiB
GlobalReserve, single: total=512.00MiB, used=0.00B
shrapnel zm # btrfs filesystem usage /home/exports
Overall:
 Device size:  63.68TiB
 Device allocated: 39.34TiB
 Device unallocated:   24.34TiB
 Device missing:  0.00B
 Used: 39.17TiB
 Free (estimated): 12.25TiB  (min: 12.25TiB)
 Data ratio:   2.00
 Metadata ratio:   2.00
 Global reserve:  512.00MiB  (used: 0.00B)

Data,RAID1: Size:19.59TiB, Used:19.51TiB
/dev/sda3.99TiB
/dev/sdb2.21TiB
/dev/sdc2.21TiB
/dev/sdd4.00TiB
/dev/sde2.21TiB
/dev/sdf3.99TiB
/dev/sdg1.30TiB
/dev/sdh4.00TiB
/dev/sdj1.30TiB
/dev/sdk1.30TiB
/dev/sdl1.30TiB
/dev/sdm2.21TiB
/dev/sdo2.18TiB
/dev/sdp2.21TiB
/dev/sdq2.21TiB
/dev/sds1.30TiB
/dev/sdt1.30TiB

Metadata,RAID1: Size:76.00GiB, Used:74.71GiB
/dev/sda   35.00GiB
/dev/sdb1.00GiB
/dev/sdc3.00GiB
/dev/sdd   32.00GiB
/dev/sde3.00GiB
/dev/sdf   35.00GiB
/dev/sdh   29.00GiB
/dev/sdj2.00GiB
/dev/sdk1.00GiB
/dev/sdl1.00GiB
/dev/sdm2.00GiB
/dev/sdo4.00GiB
/dev/sds3.00GiB
/dev/sdt1.00GiB

System,RAID1: Size:32.00MiB, Used:2.75MiB
/dev/sdd   32.00MiB
/dev/sdf   32.00MiB

Unallocated:
/dev/sda1.43TiB
/dev/sdb1.43TiB
/dev/sdc1.43TiB
/dev/sdd1.43TiB
/dev/sde1.43TiB
/dev/sdf1.43TiB
/dev/sdg1.43TiB
/dev/sdh1.43TiB
/dev/sdj1.43TiB
/dev/sdk1.43TiB
/dev/sdl1.43TiB
/dev/sdm1.43TiB
/dev/sdo1.46TiB
/dev/sdp1.43TiB
/dev/sdq1.43TiB
/dev/sds1.43TiB
/dev/sdt1.43TiB

One of them keeps throwing errors, during this command, the oops happened:
# btrfs device delete /dev/sdt /home/exports


Dec 05 08:33:30 [kernel] [259785.367744] BTRFS info (device sdt): csum
failed ino 122743909 extent 1473222864896 csum 879250177 wanted 3941849660
mirror 0
Dec 05 08:33:30 [kernel] [259785.387033] [ cut here
]
Dec 05 08:33:30 [kernel] [259785.387049] kernel BUG at
fs/btrfs/extent_io.c:2041!
Dec 05 08:33:30 [kernel] [259785.387062] invalid opcode:  [#1] SMP
Dec 05 08:33:30 [kernel] [259785.387072] Modules linked in: btrfs
zlib_deflate megaraid_sas
Dec 05 08:33:30 [kernel] [259785.387096] CPU: 2 PID: 14355 Comm:
kworker/u8:11 Tainted: GW   4.8.11 #1
Dec 05 08:33:30 [kernel] [259785.387112] Hardware name: Supermicro
X7DB8/X7DB8, BIOS 6.00 06/23/2006
Dec 05 08:33:30 [kernel] [259785.387161] Workqueue: btrfs-endio
btrfs_endio_helper [btrfs]
Dec 05 08:33:30 [kernel] [259785.387177] task: 88081b8e8b00 task.stack:
88046b49
Dec 05 08:33:30 [kernel] [259785.387189] RIP: 0010:[]
  [] repair_io_failure+0x221/0x250 [btrfs]
Dec 05 08:33:30 [kernel] [259785.387223] RSP: 0018:88046b493c30
  EFLAGS: 00010202
Dec 05 08:33:30 [kernel] [259785.387235] RAX:  RBX:
6b4e0140 RCX: 
Dec 05 08:33:30 [kernel] [259785.387250] RDX:  RSI:
 RDI: 88071dddc840
Dec 05 08:33:30 [kernel] [259785.387264] RBP: 88046b493c90 R08:
015701409000 R09: 88071dddc840
Dec 05 08:33:30 [kernel] [259785.387280] R10: 01ac R11:
6b4e37c0 R12: 88046b4e30a8
Dec 05 08:33:30 [kernel] [259785.387295] R13:  R14:
1df9496b9000 R15: 8801471b6000
Dec 05 08:33:30 [kernel] [259785.387560] FS:  ()
GS:88083fd0() knlGS:
Dec 05 08:33:30 [kernel] [259785.387825] CS:  0010 DS:  ES:  CR0:
80050033
Dec 05 08:33:30 [kernel] [259785.387962] CR2: 0226f000 CR3:
000255358000 CR4: 06e0
Dec 05 08:33:30 [kernel] [259785.388002] Stack:
Dec 05 08:33:30 [kernel] [259785.388002]  0283e000 880703f4a370
1000 ea001d05ff40
Dec 05 08:33:30 [kernel] [259785.388002]  1000 7000
88071dddc840 880703f4a180
Dec 05 08:33:30 [kernel] [259785.388002]  0283e000 8801471b6000
880703f4a370 880703f4a1e8
Dec 05 08:33:30 [kernel] [259785.388002] Call Trace:
Dec 05 08:33:30 [kernel] [259785.388002]  []
clean_io_failure+0x136/0x150 [btrfs]
Dec 05 08:33:30 [kernel] [259785.388002]  []
end_bio_extent_readpage+0x2be/0x510 [btrfs]
Dec 05 08:33:30 [kernel] 

Re: bio linked list corruption.

2016-12-05 Thread Dave Jones
On Mon, Dec 05, 2016 at 06:09:29PM +0100, Vegard Nossum wrote:
 > On 5 December 2016 at 12:10, Vegard Nossum  wrote:
 > > On 5 December 2016 at 00:04, Vegard Nossum  wrote:
 > >> FWIW I hit this as well:
 > >>
 > >> BUG: unable to handle kernel paging request at 81ff08b7
 > >> IP: [] __lock_acquire.isra.32+0xda/0x1a30
 > >> CPU: 0 PID: 21744 Comm: trinity-c56 Tainted: GB   4.9.0-rc7+ 
 > >> #217
 > > [...]
 > >
 > >> I think you can rule out btrfs in any case, probably block layer as
 > >> well, since it looks like this comes from shmem.
 > >
 > > I should rather say that the VM runs on a 9p root filesystem and it
 > > doesn't use/mount any block devices or disk-based filesystems.
 > >
 > > I have all the trinity logs for the crash if that's useful. I tried a
 > > couple of runs with just the (at the time) in-progress syscalls but it
 > > didn't turn up anything interesting. Otherwise it seems like a lot of
 > > data to go through by hand.
 > 
 > I've hit this another 7 times in the past ~3 hours.
 > 
 > Three times the address being dereferenced has pointed to
 > iov_iter_init+0xaf (even across a kernel rebuild), three times it has
 > pointed to put_prev_entity+0x55, once to 0x80008, and twice to
 > 0x292. The fact that it would hit even one of those more than once
 > across runs is pretty suspicious to me, although the ones that point
 > to iov_iter_init and put_prev_entity point to "random" instructions in
 > the sense that they are neither entry points nor return addresses.
 > 
 > shmem_fault() was always on the stack, but it came from different
 > syscalls: add_key(), newuname(), pipe2(), newstat(), fstat(),
 > clock_settime(), mount(), etc.
> [ cut here ]
 > The warning shows that it made it past the list_empty_careful() check
 > in finish_wait() but then bugs out on the &wait->task_list
 > dereference.

I just pushed out the ftrace changes I made to Trinity that might help
you gather more clues.

Right now it's hardcoded to dump a trace to /boot/trace.txt when it
detects the kernel has become tainted.

Before a trinity run, I run this as root..

#!/bin/sh

cd /sys/kernel/debug/tracing/

echo 1 > buffer_size_kb
echo function >> current_tracer

for i in $(cat /home/davej/blacklist-symbols)
do
  echo $i >> set_ftrace_notrace
done

echo 1 >> tracing_on


blacklist-symbols is the more noisy stuff that pollutes traces.
Right now I use these: https://paste.fedoraproject.org/499794/14809582/
You may need to add some more.

(I'll get around to making all this scripting go away, and have trinity
just set this stuff up itself eventually)

Oh, and if you're not running as root, you might need a diff like below
so that trinity can stop the trace when it detects tainting.

Dave

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8696ce6bf2f6..2d6c97e871e0 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7217,7 +7217,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry 
*d_tracer)
trace_create_file("trace_clock", 0644, d_tracer, tr,
  _clock_fops);
 
-   trace_create_file("tracing_on", 0644, d_tracer,
+   trace_create_file("tracing_on", 0666, d_tracer,
  tr, _simple_fops);
 
create_trace_options_dir(tr);
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: bio linked list corruption.

2016-12-05 Thread Vegard Nossum
On 5 December 2016 at 12:10, Vegard Nossum  wrote:
> On 5 December 2016 at 00:04, Vegard Nossum  wrote:
>> FWIW I hit this as well:
>>
>> BUG: unable to handle kernel paging request at 81ff08b7
>> IP: [] __lock_acquire.isra.32+0xda/0x1a30
>> CPU: 0 PID: 21744 Comm: trinity-c56 Tainted: GB   4.9.0-rc7+ #217
> [...]
>
>> I think you can rule out btrfs in any case, probably block layer as
>> well, since it looks like this comes from shmem.
>
> I should rather say that the VM runs on a 9p root filesystem and it
> doesn't use/mount any block devices or disk-based filesystems.
>
> I have all the trinity logs for the crash if that's useful. I tried a
> couple of runs with just the (at the time) in-progress syscalls but it
> didn't turn up anything interesting. Otherwise it seems like a lot of
> data to go through by hand.

I've hit this another 7 times in the past ~3 hours.

Three times the address being dereferenced has pointed to
iov_iter_init+0xaf (even across a kernel rebuild), three times it has
pointed to put_prev_entity+0x55, once to 0x80008, and twice to
0x292. The fact that it would hit even one of those more than once
across runs is pretty suspicious to me, although the ones that point
to iov_iter_init and put_prev_entity point to "random" instructions in
the sense that they are neither entry points nor return addresses.

shmem_fault() was always on the stack, but it came from different
syscalls: add_key(), newuname(), pipe2(), newstat(), fstat(),
clock_settime(), mount(), etc.

I also got this warning which is related:

[ cut here ]
WARNING: CPU: 9 PID: 25045 at lib/list_debug.c:59 __list_del_entry+0x14f/0x1d0
list_del corruption. prev->next should be 88014bdc79e8, but was
88014bfbfc60
Kernel panic - not syncing: panic_on_warn set ...

CPU: 9 PID: 25045 Comm: trinity-c22 Not tainted 4.9.0-rc7+ #219
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
Ubuntu-1.8.2-1ubuntu1 04/01/2014
88014bdc7700 81fb0861 83e74b60 88014bdc77d8
84006c00 847103e0 88014bdc77c8 81515244
41b58ab3 844e21c2 81515061 0054
Call Trace:
[] dump_stack+0x83/0xb2
[] panic+0x1e3/0x3ad
[] ? percpu_up_read_preempt_enable.constprop.45+0xcb/0xcb
[] ? __list_del_entry+0x14f/0x1d0
[] __warn+0x1bf/0x1e0
[] ? __lock_acquire.isra.32+0xc2/0x1a30
[] warn_slowpath_fmt+0xac/0xd0
[] ? __warn+0x1e0/0x1e0
[] ? finish_wait+0xb0/0x180
[] __list_del_entry+0x14f/0x1d0
[] ? finish_wait+0xb0/0x180
[] finish_wait+0xbb/0x180
[] shmem_fault+0x4c7/0x6b0
[] ? shmem_getpage_gfp+0x673/0x1c90
[] ? shmem_getpage_gfp+0x1c90/0x1c90
[] ? wake_atomic_t_function+0x210/0x210
[] __do_fault+0x206/0x410
[] ? do_page_mkwrite+0x320/0x320
[] ? handle_mm_fault+0x1cc/0x2a60
[] handle_mm_fault+0x10f7/0x2a60
[] ? handle_mm_fault+0x132/0x2a60
[] ? thread_group_cputime+0x49f/0x6e0
[] ? __pmd_alloc+0x370/0x370
[] ? thread_group_cputime+0x4bc/0x6e0
[] ? thread_group_cputime_adjusted+0x6d/0xe0
[] ? __do_page_fault+0x220/0x9f0
[] ? find_vma+0x30/0x150
[] __do_page_fault+0x452/0x9f0
[] trace_do_page_fault+0x1e5/0x3a0
[] do_async_page_fault+0x27/0xa0
[] async_page_fault+0x28/0x30
[] ? copy_user_generic_string+0x2c/0x40
[] ? SyS_times+0x93/0x110
[] ? do_sys_times+0x2b0/0x2b0
[] ? do_sys_times+0x2b0/0x2b0
[] do_syscall_64+0x1af/0x4d0
[] entry_SYSCALL64_slow_path+0x25/0x25
[ cut here ]

The warning shows that it made it past the list_empty_careful() check
in finish_wait() but then bugs out on the &wait->task_list
dereference.

Anything stick out?


Vegard
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 10/18] btrfs: root->fs_info cleanup, btrfs_calc_{trans,trunc}_metadata_size

2016-12-05 Thread Jeff Mahoney
On 12/5/16 10:29 AM, David Sterba wrote:
> On Fri, Dec 02, 2016 at 12:07:30AM -0500, je...@suse.com wrote:
>> -static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
>> +static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_fs_info 
>> *fs_info,
>>   unsigned num_items)
>>  {
>> -return root->fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items;
>> +return fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items;
> 
> Is there a missing patch that moves 'nodesize' to fs_info? The patch has
> a minor conflict in the original line where it's just 'root->nodesize',
> but there are many compilation failures due to lack of fs_info::nodesize.

Yeah, it looks like the list dropped it.  It shows up in the thread
posted to me.

I've pushed the series as the for-4.10/misc-4.10 branch in
my repo at git://git.kernel.org/pub/scm/linux/kernel/git/jeffm/linux-btrfs.git

It also contains the fix Omar suggested in his review.

-Jeff

-- 
Jeff Mahoney
SUSE Labs



signature.asc
Description: OpenPGP digital signature


BTRFS kernel OOPS 4.8.11

2016-12-05 Thread Gerard Saraber
I have a NAS with a mix of 6, 4 and 3 TB drives:

shrapnel zm # btrfs filesystem df /home/exports
Data, RAID1: total=19.59TiB, used=19.51TiB
System, RAID1: total=32.00MiB, used=2.75MiB
Metadata, RAID1: total=76.00GiB, used=74.71GiB
GlobalReserve, single: total=512.00MiB, used=0.00B
shrapnel zm # btrfs filesystem usage /home/exports
Overall:
 Device size:  63.68TiB
 Device allocated: 39.34TiB
 Device unallocated:   24.34TiB
 Device missing:  0.00B
 Used: 39.17TiB
 Free (estimated): 12.25TiB  (min: 12.25TiB)
 Data ratio:   2.00
 Metadata ratio:   2.00
 Global reserve:  512.00MiB  (used: 0.00B)

Data,RAID1: Size:19.59TiB, Used:19.51TiB
/dev/sda3.99TiB
/dev/sdb2.21TiB
/dev/sdc2.21TiB
/dev/sdd4.00TiB
/dev/sde2.21TiB
/dev/sdf3.99TiB
/dev/sdg1.30TiB
/dev/sdh4.00TiB
/dev/sdj1.30TiB
/dev/sdk1.30TiB
/dev/sdl1.30TiB
/dev/sdm2.21TiB
/dev/sdo2.18TiB
/dev/sdp2.21TiB
/dev/sdq2.21TiB
/dev/sds1.30TiB
/dev/sdt1.30TiB

Metadata,RAID1: Size:76.00GiB, Used:74.71GiB
/dev/sda   35.00GiB
/dev/sdb1.00GiB
/dev/sdc3.00GiB
/dev/sdd   32.00GiB
/dev/sde3.00GiB
/dev/sdf   35.00GiB
/dev/sdh   29.00GiB
/dev/sdj2.00GiB
/dev/sdk1.00GiB
/dev/sdl1.00GiB
/dev/sdm2.00GiB
/dev/sdo4.00GiB
/dev/sds3.00GiB
/dev/sdt1.00GiB

System,RAID1: Size:32.00MiB, Used:2.75MiB
/dev/sdd   32.00MiB
/dev/sdf   32.00MiB

Unallocated:
/dev/sda1.43TiB
/dev/sdb1.43TiB
/dev/sdc1.43TiB
/dev/sdd1.43TiB
/dev/sde1.43TiB
/dev/sdf1.43TiB
/dev/sdg1.43TiB
/dev/sdh1.43TiB
/dev/sdj1.43TiB
/dev/sdk1.43TiB
/dev/sdl1.43TiB
/dev/sdm1.43TiB
/dev/sdo1.46TiB
/dev/sdp1.43TiB
/dev/sdq1.43TiB
/dev/sds1.43TiB
/dev/sdt1.43TiB

One of them keeps throwing errors, during this command, the oops happened:
# btrfs device delete /dev/sdt /home/exports


Dec 05 08:33:30 [kernel] [259785.367744] BTRFS info (device sdt): csum
failed ino 122743909 extent 1473222864896 csum 879250177 wanted 3941849660
mirror 0
Dec 05 08:33:30 [kernel] [259785.387033] [ cut here
]
Dec 05 08:33:30 [kernel] [259785.387049] kernel BUG at
fs/btrfs/extent_io.c:2041!
Dec 05 08:33:30 [kernel] [259785.387062] invalid opcode:  [#1] SMP
Dec 05 08:33:30 [kernel] [259785.387072] Modules linked in: btrfs
zlib_deflate megaraid_sas
Dec 05 08:33:30 [kernel] [259785.387096] CPU: 2 PID: 14355 Comm:
kworker/u8:11 Tainted: GW   4.8.11 #1
Dec 05 08:33:30 [kernel] [259785.387112] Hardware name: Supermicro
X7DB8/X7DB8, BIOS 6.00 06/23/2006
Dec 05 08:33:30 [kernel] [259785.387161] Workqueue: btrfs-endio
btrfs_endio_helper [btrfs]
Dec 05 08:33:30 [kernel] [259785.387177] task: 88081b8e8b00 task.stack:
88046b49
Dec 05 08:33:30 [kernel] [259785.387189] RIP: 0010:[]
  [] repair_io_failure+0x221/0x250 [btrfs]
Dec 05 08:33:30 [kernel] [259785.387223] RSP: 0018:88046b493c30
  EFLAGS: 00010202
Dec 05 08:33:30 [kernel] [259785.387235] RAX:  RBX:
6b4e0140 RCX: 
Dec 05 08:33:30 [kernel] [259785.387250] RDX:  RSI:
 RDI: 88071dddc840
Dec 05 08:33:30 [kernel] [259785.387264] RBP: 88046b493c90 R08:
015701409000 R09: 88071dddc840
Dec 05 08:33:30 [kernel] [259785.387280] R10: 01ac R11:
6b4e37c0 R12: 88046b4e30a8
Dec 05 08:33:30 [kernel] [259785.387295] R13:  R14:
1df9496b9000 R15: 8801471b6000
Dec 05 08:33:30 [kernel] [259785.387560] FS:  ()
GS:88083fd0() knlGS:
Dec 05 08:33:30 [kernel] [259785.387825] CS:  0010 DS:  ES:  CR0:
80050033
Dec 05 08:33:30 [kernel] [259785.387962] CR2: 0226f000 CR3:
000255358000 CR4: 06e0
Dec 05 08:33:30 [kernel] [259785.388002] Stack:
Dec 05 08:33:30 [kernel] [259785.388002]  0283e000 880703f4a370
1000 ea001d05ff40
Dec 05 08:33:30 [kernel] [259785.388002]  1000 7000
88071dddc840 880703f4a180
Dec 05 08:33:30 [kernel] [259785.388002]  0283e000 8801471b6000
880703f4a370 880703f4a1e8
Dec 05 08:33:30 [kernel] [259785.388002] Call Trace:
Dec 05 08:33:30 [kernel] [259785.388002]  []
clean_io_failure+0x136/0x150 [btrfs]
Dec 05 08:33:30 [kernel] [259785.388002]  []
end_bio_extent_readpage+0x2be/0x510 [btrfs]
Dec 05 08:33:30 [kernel] 

Re: crc32c_le performance hit

2016-12-05 Thread Chris Mason

On 12/04/2016 04:28 PM, Chris Murphy wrote:

4.8.11-300.fc25.x86_64

I'm currently doing a btrfs send/receive and I'm seeing a rather large
hit for crc32c, bigger than aes-ni (the volume is on dm crypt), using
perf top.

  14.03%  btrfs[.] __crc32c_le
  10.50%  [kernel] [k] _aesni_enc4



This is surprising, although send/recv does do a lot of small crc runs. 
What is the overall CPU usage?  Maybe pin btrfs to a single CPU and use 
mpstat to see how hot that one CPU is.  If we're 14% of a CPU running at 
100%, that's a big deal.  If we're 14% of a CPU running at 5%, we can safely 
ignore it.




Complete output is here for 1 month:
https://urldefense.proofpoint.com/v2/url?u=https-3A__paste.fedoraproject.org_498914_=DgIBaQ=5VD0RTtNlTh3ycd41b3MUw=9QPtTAxcitoznaWRKKHoEQ=YXgqrDMWJP2u2reRh04tnYz4CrO5SPUhSHd1cF2OcR0=axoqoaw0ZkYXdDxlTODLOfzqEK7uIJHqMsBjpkyIw5o=

I don't remember crc32's taking this much CPU before, so it seems like
a regression but offhand I don't know when it started.

[chris@f25s ~]$ dmesg | grep crc32
[4.226700] Btrfs loaded, crc32c=crc32c-intel


At least we know you're using the intel accelerated one.  Every time 
someone posts this dmesg output to the list, I owe Jeff another beer.


-chris
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: BTRFS kernel OOPS 4.8.11

2016-12-05 Thread Borislav Petkov
+ linux-btrfs

On Mon, Dec 05, 2016 at 09:30:52AM -0600, Gerard Saraber wrote:
> I have a NAS with a mix of 6, 4 and 3 TB drives:
> 
> shrapnel zm # btrfs filesystem df /home/exports
> Data, RAID1: total=19.59TiB, used=19.51TiB
> System, RAID1: total=32.00MiB, used=2.75MiB
> Metadata, RAID1: total=76.00GiB, used=74.71GiB
> GlobalReserve, single: total=512.00MiB, used=0.00B
> shrapnel zm # btrfs filesystem usage /home/exports
> Overall:
>  Device size:  63.68TiB
>  Device allocated: 39.34TiB
>  Device unallocated:   24.34TiB
>  Device missing:  0.00B
>  Used: 39.17TiB
>  Free (estimated): 12.25TiB  (min: 12.25TiB)
>  Data ratio:   2.00
>  Metadata ratio:   2.00
>  Global reserve:  512.00MiB  (used: 0.00B)
> 
> Data,RAID1: Size:19.59TiB, Used:19.51TiB
> /dev/sda3.99TiB
> /dev/sdb2.21TiB
> /dev/sdc2.21TiB
> /dev/sdd4.00TiB
> /dev/sde2.21TiB
> /dev/sdf3.99TiB
> /dev/sdg1.30TiB
> /dev/sdh4.00TiB
> /dev/sdj1.30TiB
> /dev/sdk1.30TiB
> /dev/sdl1.30TiB
> /dev/sdm2.21TiB
> /dev/sdo2.18TiB
> /dev/sdp2.21TiB
> /dev/sdq2.21TiB
> /dev/sds1.30TiB
> /dev/sdt1.30TiB
> 
> Metadata,RAID1: Size:76.00GiB, Used:74.71GiB
> /dev/sda   35.00GiB
> /dev/sdb1.00GiB
> /dev/sdc3.00GiB
> /dev/sdd   32.00GiB
> /dev/sde3.00GiB
> /dev/sdf   35.00GiB
> /dev/sdh   29.00GiB
> /dev/sdj2.00GiB
> /dev/sdk1.00GiB
> /dev/sdl1.00GiB
> /dev/sdm2.00GiB
> /dev/sdo4.00GiB
> /dev/sds3.00GiB
> /dev/sdt1.00GiB
> 
> System,RAID1: Size:32.00MiB, Used:2.75MiB
> /dev/sdd   32.00MiB
> /dev/sdf   32.00MiB
> 
> Unallocated:
> /dev/sda1.43TiB
> /dev/sdb1.43TiB
> /dev/sdc1.43TiB
> /dev/sdd1.43TiB
> /dev/sde1.43TiB
> /dev/sdf1.43TiB
> /dev/sdg1.43TiB
> /dev/sdh1.43TiB
> /dev/sdj1.43TiB
> /dev/sdk1.43TiB
> /dev/sdl1.43TiB
> /dev/sdm1.43TiB
> /dev/sdo1.46TiB
> /dev/sdp1.43TiB
> /dev/sdq1.43TiB
> /dev/sds1.43TiB
> /dev/sdt1.43TiB
> 
> One of them keeps throwing errors, during this command, the oops happened:
> # btrfs device delete /dev/sdt /home/exports
> 
> 
> Dec 05 08:33:30 [kernel] [259785.367744] BTRFS info (device sdt): csum
> failed ino 122743909 extent 1473222864896 csum 879250177 wanted 3941849660
> mirror 0
> Dec 05 08:33:30 [kernel] [259785.387033] [ cut here
> ]
> Dec 05 08:33:30 [kernel] [259785.387049] kernel BUG at
> fs/btrfs/extent_io.c:2041!
> Dec 05 08:33:30 [kernel] [259785.387062] invalid opcode:  [#1] SMP
> Dec 05 08:33:30 [kernel] [259785.387072] Modules linked in: btrfs
> zlib_deflate megaraid_sas
> Dec 05 08:33:30 [kernel] [259785.387096] CPU: 2 PID: 14355 Comm:
> kworker/u8:11 Tainted: GW   4.8.11 #1
> Dec 05 08:33:30 [kernel] [259785.387112] Hardware name: Supermicro
> X7DB8/X7DB8, BIOS 6.00 06/23/2006
> Dec 05 08:33:30 [kernel] [259785.387161] Workqueue: btrfs-endio
> btrfs_endio_helper [btrfs]
> Dec 05 08:33:30 [kernel] [259785.387177] task: 88081b8e8b00 task.stack:
> 88046b49
> Dec 05 08:33:30 [kernel] [259785.387189] RIP: 0010:[]
>   [] repair_io_failure+0x221/0x250 [btrfs]
> Dec 05 08:33:30 [kernel] [259785.387223] RSP: 0018:88046b493c30
>   EFLAGS: 00010202
> Dec 05 08:33:30 [kernel] [259785.387235] RAX:  RBX:
> 6b4e0140 RCX: 
> Dec 05 08:33:30 [kernel] [259785.387250] RDX:  RSI:
>  RDI: 88071dddc840
> Dec 05 08:33:30 [kernel] [259785.387264] RBP: 88046b493c90 R08:
> 015701409000 R09: 88071dddc840
> Dec 05 08:33:30 [kernel] [259785.387280] R10: 01ac R11:
> 6b4e37c0 R12: 88046b4e30a8
> Dec 05 08:33:30 [kernel] [259785.387295] R13:  R14:
> 1df9496b9000 R15: 8801471b6000
> Dec 05 08:33:30 [kernel] [259785.387560] FS:  ()
> GS:88083fd0() knlGS:
> Dec 05 08:33:30 [kernel] [259785.387825] CS:  0010 DS:  ES:  CR0:
> 80050033
> Dec 05 08:33:30 [kernel] [259785.387962] CR2: 0226f000 CR3:
> 000255358000 CR4: 06e0
> Dec 05 08:33:30 [kernel] [259785.388002] Stack:
> Dec 05 08:33:30 [kernel] [259785.388002]  0283e000 880703f4a370
> 1000 ea001d05ff40
> Dec 05 08:33:30 [kernel] [259785.388002]  1000 7000
> 88071dddc840 880703f4a180
> Dec 05 08:33:30 [kernel] [259785.388002]  

Re: [PATCH 10/18] btrfs: root->fs_info cleanup, btrfs_calc_{trans,trunc}_metadata_size

2016-12-05 Thread David Sterba
On Fri, Dec 02, 2016 at 12:07:30AM -0500, je...@suse.com wrote:
> -static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
> +static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_fs_info 
> *fs_info,
>unsigned num_items)
>  {
> - return root->fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items;
> + return fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items;

Is there a missing patch that moves 'nodesize' to fs_info? The patch has
a minor conflict in the original line where it's just 'root->nodesize',
but there are many compilation failures due to lack of fs_info::nodesize.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[SOLVED] Re: system hangs due to qgroups

2016-12-05 Thread Marc Joliet
On Monday 05 December 2016 12:01:28 Marc Joliet wrote:
> > This seems to be a NULL pointer bug in qgroup relocation fix.
> >
> > 
> >
> > The latest fix (not merged yet) should address it.
> >
> > 
> >
> > You could try the for-next-20161125 branch from David to fix it:
> > https://github.com/kdave/btrfs-devel/tree/for-next-20161125
> 
> OK, I'll try that, thanks!  I just have to wait for it to finish cloning...
> 
[...]
> > And for your recovery, I'd suggest to install an Archlinux into a USB
> > HDD or USB stick, and compile David's branch and install it into the USB
> > HDD.
> >
> > 
> >
> > Then use the USB storage as rescue tool to mount the fs, which should do
> > RW mount with or without skip_balance mount option.
> > So you could disable quota then.
> 
> OK, I'll try that, thanks!

Excellent, thank you, that worked!  My laptop is working normally again.  I'll 
keep an eye on it, but so far two balance operations ran normally (that is, 
they completed within a few minutes and without hanging the system).

(Specifically, since I didn't find out how to get a different kernel onto the 
Arch USB stick, I simply installed the kernel on my desktop, then did 
everything from an initramfs emergency shell, then moved the SSD back into the 
laptop.)

Thanks, everyone!
-- 
Marc Joliet
--
"People who think they know everything really annoy those of us who know we
don't" - Bjarne Stroustrup


signature.asc
Description: This is a digitally signed message part.


Re: system hangs due to qgroups

2016-12-05 Thread Marc Joliet
On Monday 05 December 2016 12:01:28 Marc Joliet wrote:
> > You could try the for-next-20161125 branch from David to fix it:
> > https://github.com/kdave/btrfs-devel/tree/for-next-20161125
> 
> OK, I'll try that, thanks!  I just have to wait for it to finish cloning...

FWIW, I get this warning:

  CC  fs/btrfs/inode.o
fs/btrfs/inode.c: In Funktion »run_delalloc_range«:
fs/btrfs/inode.c:1219:9: Warnung: »cur_end« könnte in dieser Funktion 
uninitialisiert verwendet werden [-Wmaybe-uninitialized]
   start = cur_end + 1;
 ^
fs/btrfs/inode.c:1172:6: Anmerkung: »cur_end« wurde hier deklariert

Should I be worried about that?  At a cursory glance, it looks like a false 
alarm, but I just want to be sure (and even so, false alarms are annoying).

Greetings
-- 
Marc Joliet
--
"People who think they know everything really annoy those of us who know we
don't" - Bjarne Stroustrup


signature.asc
Description: This is a digitally signed message part.


Re: Metadata balance fails ENOSPC

2016-12-05 Thread Duncan
Stefan Priebe - Profihost AG posted on Mon, 05 Dec 2016 12:12:12 +0100 as
excerpted:

> isn't there a way to move free space to unallocated space again?

Yes, btrfs balance, but...

-- 
Duncan - List replies preferred.   No HTML msgs.
"Every nonfree program has a lord, a master --
and if you use the program, he is your master."  Richard Stallman

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/2] btrfs-progs: Correct value printed by assertions/BUG_ON/WARN_ON

2016-12-05 Thread Goldwyn Rodrigues
Hi Qu,

Yes, the assert for ifdef BTRFS_DISABLE_BACKTRACE is not correct. The
condition should not have a not(!).

Thanks for reporting.

On 12/05/2016 01:10 AM, Qu Wenruo wrote:
> Hi, Goldwyn and David,
> 
> This patch seems to cause btrfs test case 023 to fail.
> 
> Bisect leads me to this patch.
> 
> 
> $ ./btrfs check ~/quota_balance_loop_backref.raw.restored
> Checking filesystem on /home/adam/quota_balance_loop_backref.raw.restored
> UUID: c33c5ce3-3ad9-4320-9201-c337c04e0051
> checking extents
> btrfs: cmds-check.c:12284: build_roots_info_cache: Assertion `!(ret ==
> 0)' failed.
> Aborted (core dumped)
> 
> 
> And gdb backref:
> #0  0x76fd204f in raise () from /usr/lib/libc.so.6
> #1  0x76fd347a in abort () from /usr/lib/libc.so.6
> #2  0x76fcaea7 in __assert_fail_base () from /usr/lib/libc.so.6
> #3  0x76fcaf52 in __assert_fail () from /usr/lib/libc.so.6
> #4  0x00440426 in build_roots_info_cache (info=0x6f43c0) at
> cmds-check.c:12284
> #5  0x00440945 in repair_root_items (info=0x6f43c0) at
> cmds-check.c:12412
> #6  0x004418c3 in cmd_check (argc=2, argv=0x7fffe100) at
> cmds-check.c:12892
> #7  0x0040a74c in main (argc=2, argv=0x7fffe100) at btrfs.c:301
> 
> 
> For frame 4:
> (gdb) frame 4
> #4  0x00440426 in build_roots_info_cache (info=0x6f43c0) at
> cmds-check.c:12284
> 12284ASSERT(ret == 0);
> (gdb) list
> 12279rii->cache_extent.start = root_id;
> 12280rii->cache_extent.size = 1;
> 12281rii->level = (u8)-1;
> 12282entry = >cache_extent;
> 12283ret = insert_cache_extent(roots_info_cache, entry);
> 12284ASSERT(ret == 0);
> 12285} else {
> 12286rii = container_of(entry, struct root_item_info,
> 12287   cache_extent);
> 12288}
> (gdb) print ret
> $1 = 0
> 
> For me, ASSERT(ret == 0) seems quite safe and common here.
> Doesn't the patch changed the ASSERT() behavior?
> 
> Thanks,
> Qu
> 
> At 11/30/2016 12:24 AM, Goldwyn Rodrigues wrote:
>> From: Goldwyn Rodrigues 
>>
>> The values passed to BUG_ON/WARN_ON are negated(!) and printed, which
>> results in printing the value zero for each bug/warning. For example:
>> volumes.c:988: btrfs_alloc_chunk: Assertion `ret` failed, value 0
>>
>> This is not useful. Instead changed to print the value of the parameter
>> passed to BUG_ON()/WARN_ON(). The value needed to be changed to long
>> to accommodate pointers being passed.
>>
>> Also, consolidated assert() and BUG() into ifndef.
>>
>> Signed-off-by: Goldwyn Rodrigues 
>> ---
>>  kerncompat.h | 35 +++
>>  1 file changed, 15 insertions(+), 20 deletions(-)
>>
>> diff --git a/kerncompat.h b/kerncompat.h
>> index ed9a042..9bd25bd 100644
>> --- a/kerncompat.h
>> +++ b/kerncompat.h
>> @@ -88,39 +88,36 @@ static inline void print_trace(void)
>>  }
>>
>>  static inline void assert_trace(const char *assertion, const char
>> *filename,
>> -  const char *func, unsigned line, int val)
>> +  const char *func, unsigned line, long val)
>>  {
>> -if (val)
>> +if (!val)
>>  return;
>>  if (assertion)
>> -fprintf(stderr, "%s:%d: %s: Assertion `%s` failed, value %d\n",
>> +fprintf(stderr, "%s:%d: %s: Assertion `%s` failed, value %ld\n",
>>  filename, line, func, assertion, val);
>>  else
>> -fprintf(stderr, "%s:%d: %s: Assertion failed, value %d.\n",
>> +fprintf(stderr, "%s:%d: %s: Assertion failed, value %ld.\n",
>>  filename, line, func, val);
>>  print_trace();
>>  abort();
>>  exit(1);
>>  }
>>
>> -#define BUG() assert_trace(NULL, __FILE__, __func__, __LINE__, 0)
>> -#else
>> -#define BUG() assert(0)
>>  #endif
>>
>>  static inline void warning_trace(const char *assertion, const char
>> *filename,
>> -  const char *func, unsigned line, int val,
>> +  const char *func, unsigned line, long val,
>>int trace)
>>  {
>> -if (val)
>> +if (!val)
>>  return;
>>  if (assertion)
>>  fprintf(stderr,
>> -"%s:%d: %s: Warning: assertion `%s` failed, value %d\n",
>> +"%s:%d: %s: Warning: assertion `%s` failed, value %ld\n",
>>  filename, line, func, assertion, val);
>>  else
>>  fprintf(stderr,
>> -"%s:%d: %s: Warning: assertion failed, value %d.\n",
>> +"%s:%d: %s: Warning: assertion failed, value %ld.\n",
>>  filename, line, func, val);
>>  #ifndef BTRFS_DISABLE_BACKTRACE
>>  if (trace)
>> @@ -299,17 +296,15 @@ static inline long IS_ERR(const void *ptr)
>>  #define vfree(x) free(x)
>>
>>  #ifndef BTRFS_DISABLE_BACKTRACE
>> -#define BUG_ON(c) assert_trace(#c, __FILE__, __func__, __LINE__, !(c))
>> -#define WARN_ON(c) 

Re: Metadata balance fails ENOSPC

2016-12-05 Thread Stefan Priebe - Profihost AG
isn't there a way to move free space to unallocated space again?


Am 03.12.2016 um 05:43 schrieb Andrei Borzenkov:
> 01.12.2016 18:48, Chris Murphy пишет:
>> On Thu, Dec 1, 2016 at 7:10 AM, Stefan Priebe - Profihost AG
>>  wrote:
>>>
>>> Am 01.12.2016 um 14:51 schrieb Hans van Kranenburg:
 On 12/01/2016 09:12 AM, Andrei Borzenkov wrote:
> On Thu, Dec 1, 2016 at 10:49 AM, Stefan Priebe - Profihost AG
>  wrote:
> ...
>>
>> Custom 4.4 kernel with patches up to 4.10. But i already tried 4.9-rc7
>> which does the same.
>>
>>
 # btrfs filesystem show /ssddisk/
 Label: none  uuid: a69d2e90-c2ca-4589-9876-234446868adc
 Total devices 1 FS bytes used 305.67GiB
 devid1 size 500.00GiB used 500.00GiB path /dev/vdb1

 # btrfs filesystem usage /ssddisk/
 Overall:
 Device size: 500.00GiB
 Device allocated:500.00GiB
 Device unallocated:1.05MiB
>>>
>>> Drive is actually fully allocated so if Btrfs needs to create a new
>>> chunk right now, it can't. However,
>>
>> Yes but there's lot of free space:
>> Free (estimated):193.46GiB  (min: 193.46GiB)
>>
>> How does this match?
>>
>>
>>> All three chunk types have quite a bit of unused space in them, so
>>> it's unclear why there's a no space left error.
>>>
>
> I remember discussion that balance always tries to pre-allocate one
> chunk in advance, and I believe there was patch to correct it but I am
> not sure whether it was merged.

 http://www.spinics.net/lists/linux-btrfs/msg56772.html
>>>
>>> Thanks - still don't understand why that one is not upstream or why it
>>> was reverted. Looks absolutely reasonable to me.
>>
>> It is upstream and hasn't been reverted.
>>
>> https://git.kernel.org/cgit/linux/kernel/git/stable/linux-stable.git/tree/fs/btrfs/volumes.c?id=refs/tags/v4.8.11
>> line 3650
>>
>> I would try Duncan's idea of using just one filter and seeing what happens:
>>
>> 'btrfs balance start -dusage=1 '
>>
> 
> Actually I just hit exactly the same symptoms on my VM where device was
> fully allocated and metadata balance failed, but data balance succeeded
> to free up space which allowed metadata balance to run too. This is
> under 4.8.10.
> 
> So it appears that balance logic between data and metadata is somehow
> different.
> 
> As this VM gets in 100% allocated condition fairly often I'd try to get
> better understanding next time.
> 
> 
>>
>> With enospc debug it says:
>> [39193.425682] BTRFS warning (device vdb1): no space to allocate a new
>> chunk for block group 839941881856
>> [39193.426033] BTRFS info (device vdb1): 1 enospc errors during balance
>>
>> It might be nice if this stated what kind of chunk it's trying to allocate.
>>
>>
>>
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] Btrfs: fix lockdep warning about log_mutex

2016-12-05 Thread Filipe Manana
On Thu, Dec 1, 2016 at 9:45 PM, Liu Bo  wrote:
> While checking INODE_REF/INODE_EXTREF for a corner case, we may acquire a
> different inode's log_mutex with holding the current inode's log_mutex, and
> lockdep has complained about this with a possible deadlock warning.
>
> Fix this by using mutex_lock_nested() when processing the other inode's
> log_mutex.
>
> Signed-off-by: Liu Bo 
Reviewed-by: Filipe Manana 

Thanks, it works and it's simple.

> ---
>  fs/btrfs/tree-log.c | 12 +---
>  1 file changed, 9 insertions(+), 3 deletions(-)
>
> diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
> index 3d33c4e..e961451 100644
> --- a/fs/btrfs/tree-log.c
> +++ b/fs/btrfs/tree-log.c
> @@ -37,6 +37,7 @@
>   */
>  #define LOG_INODE_ALL 0
>  #define LOG_INODE_EXISTS 1
> +#define LOG_OTHER_INODE 2
>
>  /*
>   * directory trouble cases
> @@ -4624,7 +4625,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle 
> *trans,
> if (S_ISDIR(inode->i_mode) ||
> (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
>_I(inode)->runtime_flags) &&
> -inode_only == LOG_INODE_EXISTS))
> +inode_only >= LOG_INODE_EXISTS))
> max_key.type = BTRFS_XATTR_ITEM_KEY;
> else
> max_key.type = (u8)-1;
> @@ -4648,7 +4649,12 @@ static int btrfs_log_inode(struct btrfs_trans_handle 
> *trans,
> return ret;
> }
>
> -   mutex_lock(_I(inode)->log_mutex);
> +   if (inode_only == LOG_OTHER_INODE) {
> +   inode_only = LOG_INODE_EXISTS;
> +   mutex_lock_nested(_I(inode)->log_mutex, 1);
> +   } else {
> +   mutex_lock(_I(inode)->log_mutex);
> +   }
>
> /*
>  * a brute force approach to making sure we get the most uptodate
> @@ -4800,7 +4806,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle 
> *trans,
>  * unpin it.
>  */
> err = btrfs_log_inode(trans, root, 
> other_inode,
> - LOG_INODE_EXISTS,
> + LOG_OTHER_INODE,
>   0, LLONG_MAX, ctx);
> iput(other_inode);
> if (err)
> --
> 2.5.5
>



-- 
Filipe David Manana,

"People will forget what you said,
 people will forget what you did,
 but people will never forget how you made them feel."
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: bio linked list corruption.

2016-12-05 Thread Vegard Nossum
On 5 December 2016 at 00:04, Vegard Nossum  wrote:
> FWIW I hit this as well:
>
> BUG: unable to handle kernel paging request at 81ff08b7
> IP: [] __lock_acquire.isra.32+0xda/0x1a30
> CPU: 0 PID: 21744 Comm: trinity-c56 Tainted: GB   4.9.0-rc7+ #217
[...]

> I think you can rule out btrfs in any case, probably block layer as
> well, since it looks like this comes from shmem.

I should rather say that the VM runs on a 9p root filesystem and it
doesn't use/mount any block devices or disk-based filesystems.

I have all the trinity logs for the crash if that's useful. I tried a
couple of runs with just the (at the time) in-progress syscalls but it
didn't turn up anything interesting. Otherwise it seems like a lot of
data to go through by hand.

The crashing child seems to have just started, though, if that's relevant:

[child56:21744] [0]
open(filename="/sys/block/loop2/power/runtime_active_time",
flags=0x777b01, mode=666) = -1 (Not a directory)
[child56:21744] [1] [32BIT] sched_getattr(pid=1, param=0x7f37ec26c000,
size=3415) = -1 (Invalid argument)
[child56:21744] [2] [32BIT]
access(filename="/proc/2/task/2/net/stat/arp_cache", mode=2000) = -1
(Invalid argument)
[child56:21744] [3] getegid() = 0xfffe
[child56:21744] [4]
swapoff(path="/proc/721/task/721/net/dev_snmp6/tunl0") = -1 (Operation
not permitted)
[child56:21744] [5] timerfd_create(clockid=0x0, flags=0x0) = 439
[child56:21744] [6] pkey_mprotect(start=0x7f37ee656000, len=0,
prot=0x108, key=0x600) = 0
[child56:21744] [7] msync(start=0x7f37ee657000, len=0, flags=0x6) = 0
[child56:21744] [8] flock(fd=168, cmd=0xffc191f30b0c) = -1
(Invalid argument)
[child56:21744] [9] add_key(_type=0x437a15,
_description=0x7f37ec06c000, _payload=0x7f37ec06c000, plen=0,
ringid=0xfff8)

The other logfiles end thusly:

==> trinity-child0.log <==
[child0:21593] [311] faccessat(dfd=246,
filename="/proc/983/task/983/net/protocols", mode=2000) = -1 (Invalid
argument)
[child0:21593] [312] renameat(olddfd=246,
oldname="/proc/13/task/13/attr/sockcreate", newdfd=377,
newname="/proc/16/task/16/net/stat/rt_cache") = -1 (Permission denied)
[child0:21593] [313] [32BIT] readv(fd=289, vec=0x2e1a3d0, vlen=215) = 0

==> trinity-child100.log <==
[child100:21536] [439] setgid(gid=0x2a000200) = -1 (Operation not permitted)
[child100:21536] [440] waitid(which=175, upid=21587, infop=0x4,
options=3542, ru=0x7f37ec76c000) = -1 (Invalid argument)
[child100:21536] [441]
getxattr(pathname="/proc/980/task/980/net/ptype", name=0x7f37ee466000,
value=0x7f37ec26c000, size=49) = -1 (Operation not supported)

==> trinity-child101.log <==
[child101:21537] [55] getcwd(buf=0x7f37ee466000, size=4096) = 39
[child101:21537] [56] [32BIT] munlock(addr=0x7f37ee658000, len=0) = 0
[child101:21537] [57] semctl(semid=0xbd851e2b40e7df,
semnum=0x1b1b1b1b1b, cmd=0x20, arg=0xcacacacaca) = -1 (Invalid
argument)

==> trinity-child102.log <==
[child102:21542] [11] readahead(fd=353, offset=2, count=249) = -1
(Invalid argument)
[child102:21542] [12] add_key(_type=0x43793f,
_description=0x7f37ec46c000, _payload=0x7f37ee658000, plen=32,
ringid=0xfffa) = -1 (Invalid argument)
[child102:21542] [13] time(tloc=0x7f37ee466000) = 0x584474e0

==> trinity-child103.log <==
[child103:21543] [45] dup(fildes=183) = 512
[child103:21543] [46] rt_sigpending(set=0x7f37ec86c000, sigsetsize=32)
= -1 (Invalid argument)
[child103:21543] [47] newstat(filename="/proc/587/task/587/gid_map",
statbuf=0x7f37ee466000) = 0

==> trinity-child104.log <==
[child104:21546] [49] getdents(fd=162, dirent=0x0, count=127) = -1
(Not a directory)
[child104:21546] [50] [32BIT] clock_adjtime(which_clock=0, utx=0x4) =
-1 (Bad address)
[child104:21546] [51] setsid() = 0x542a

==> trinity-child105.log <==
[child105:21547] [523] epoll_wait(epfd=244, events=0x8, maxevents=246,
timeout=-1) = -1 (Invalid argument)
[child105:21547] [524] dup2(oldfd=244, newfd=244) = 244
[child105:21547] [525] acct(name=0x7f37ec26c000) = -1 (Operation not permitted)

==> trinity-child106.log <==
[child106:19910] [136] getegid() = 0xfffe
[child106:19910] [137] munmap(addr=0x7f37ee65a000, len=4096) = 0
[child106:19910] [138] clock_nanosleep(which_clock=0x1, flags=0x1,
rqtp=0x7f37ec06c000, rmtp=0x7f37ee466000)
==> trinity-child107.log <==
[child107:21224] [994] copy_file_range(fd_in=373, off_in=0x2400e210,
fd_out=373, off_out=8, len=8, flags=0x0) = -1 (Bad file descriptor)
[child107:21224] [995] kcmp(pid1=1, pid2=21453, type=0x5,
idx1=0x787878787878, idx2=0xff6060606060) = -1 (Operation not
permitted)
[child107:21224] [996] [32BIT] readv(fd=365, vec=0x2e27e10, vlen=36) = 0

==> trinity-child108.log <==
[child108:21226] [759] recvfrom(fd=219, ubuf=0x7f37ec26c000, size=8,
flags=0x0, addr=0x2e1ed80, addr_len=110) = -1 (Bad file descriptor)
[child108:21226] [760] shmat(shmid=-4097, shmaddr=0x7f37ee465000,
shmflg=-195) = -1 (Invalid argument)
[child108:21226] [761] [32BIT] seccomp(op=0x0, flags=0x0, 

Re: system hangs due to qgroups

2016-12-05 Thread Marc Joliet
On Monday 05 December 2016 10:00:13 Marc Joliet wrote:
> OK, I'll post the URLs once the images are uploaded.  (I had Dropbox public 
> URLs right before my desktop crashed -- see below -- but now dropbox-cli
> doesn't want to create them.)

Alright, here you go:

https://dl.dropboxusercontent.com/u/5328255/arthur_root_4.7.3_sanitized.image.xz
https://dl.dropboxusercontent.com/u/5328255/arthur_root_4.8.5_sanitized.image.xz

(FYI, "dropbox-cli puburl" appears to have broken recently, so I had to use 
the Dropbox web interface to get these URLs.)

Greetings
-- 
Marc Joliet
--
"People who think they know everything really annoy those of us who know we
don't" - Bjarne Stroustrup


signature.asc
Description: This is a digitally signed message part.


[PATCH 2/4] btrfs-progs: check: Fix lowmem mode stack overflow caused by fsck/023

2016-12-05 Thread Qu Wenruo
Lowmem mode fsck will overflow its stack since it will do infinite
backref check for tree reloc root.
We should not check backref if it's pointing to itself for tree reloc
root.

Signed-off-by: Qu Wenruo 
---
 cmds-check.c | 16 ++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/cmds-check.c b/cmds-check.c
index 30847a0..ef90d87 100644
--- a/cmds-check.c
+++ b/cmds-check.c
@@ -,10 +,15 @@ static int check_tree_block_ref(struct btrfs_root *root,
u32 nodesize = root->nodesize;
u32 item_size;
u64 offset;
+   int tree_reloc_root = 0;
int found_ref = 0;
int err = 0;
int ret;
 
+   if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
+   btrfs_header_bytenr(root->node) == bytenr)
+   tree_reloc_root = 1;
+
btrfs_init_path(&path);
key.objectid = bytenr;
if (btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
@@ -10090,9 +10095,16 @@ static int check_tree_block_ref(struct btrfs_root 
*root,
(offset == root->objectid || offset == owner)) {
found_ref = 1;
} else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
+   /*
+* Backref of tree reloc root points to itself, no need
+* to check backref any more.
+*/
+   if (tree_reloc_root)
+   found_ref = 1;
+   else
/* Check if the backref points to valid referencer */
-   found_ref = !check_tree_block_ref(root, NULL, offset,
- level + 1, owner);
+   found_ref = !check_tree_block_ref(root, NULL,
+   offset, level + 1, owner);
}
 
if (found_ref)
-- 
2.10.2



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/4] btrfs-progs: check: Fix lowmem false alert on tree reloc tree

2016-12-05 Thread Qu Wenruo
Lowmem mode will report false alert if the fs has tree reloc tree like:
ERROR: shared extent[30011392 4096] lost its parent (parent: 30011392,
level: 1)

The problem is check_shared_block_backref() can't handle tree reloc
tree's self-pointing backref.

It still tries to read out the tree block and then search it for the
referencer.

The correct method is to check whether it is a tree reloc root.
In that case, we should check that the ROOT_ITEM of the tree reloc tree
can be found in the root tree.

Signed-off-by: Qu Wenruo 
---
 cmds-check.c | 35 +++
 1 file changed, 35 insertions(+)

diff --git a/cmds-check.c b/cmds-check.c
index ef90d87..d0e1977 100644
--- a/cmds-check.c
+++ b/cmds-check.c
@@ -10465,6 +10465,34 @@ out:
 }
 
 /*
+ * Check if tree block @eb is tree reloc root.
+ * Return 0 if it's not or any problem happens
+ * Return 1 if it's a tree reloc root
+ */
+static int is_tree_reloc_root(struct btrfs_fs_info *fs_info,
+struct extent_buffer *eb)
+{
+   struct btrfs_root *tree_reloc_root;
+   struct btrfs_key key;
+   u64 bytenr = btrfs_header_bytenr(eb);
+   u64 owner = btrfs_header_owner(eb);
+   int ret = 0;
+
+   key.objectid = BTRFS_TREE_RELOC_OBJECTID;
+   key.offset = owner;
+   key.type = BTRFS_ROOT_ITEM_KEY;
+
+   tree_reloc_root = btrfs_read_fs_root_no_cache(fs_info, &key);
+   if (IS_ERR(tree_reloc_root))
+   return 0;
+
+   if (bytenr == btrfs_header_bytenr(tree_reloc_root->node))
+   ret = 1;
+   btrfs_free_fs_root(tree_reloc_root);
+   return ret;
+}
+
+/*
  * Check referencer for shared block backref
  * If level == -1, this function will resolve the level.
  */
@@ -10486,6 +10514,13 @@ static int check_shared_block_backref(struct 
btrfs_fs_info *fs_info,
if (level < 0)
goto out;
 
+   /* It's possible it's a tree reloc root */
+   if (parent == bytenr) {
+   if (is_tree_reloc_root(fs_info, eb))
+   found_parent = 1;
+   goto out;
+   }
+
if (level + 1 != btrfs_header_level(eb))
goto out;
 
-- 
2.10.2



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/4] btrfs-progs: check: Fix assert when using lowmem on fs with tree reloc tree

2016-12-05 Thread Qu Wenruo
When using lowmem mode, btrfs check will trigger an ASSERT by calling
btrfs_read_fs_root() on a tree reloc tree.

Fix it by checking the objectid and calling btrfs_read_fs_root_no_cache()
for a tree reloc tree.

Signed-off-by: Qu Wenruo 
---
 cmds-check.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/cmds-check.c b/cmds-check.c
index 30eabb2..30847a0 100644
--- a/cmds-check.c
+++ b/cmds-check.c
@@ -11411,7 +11411,11 @@ static int check_chunks_and_extents_v2(struct 
btrfs_root *root)
goto next;
key.offset = (u64)-1;
 
-   cur_root = btrfs_read_fs_root(root->fs_info, &key);
+   if (key.objectid == BTRFS_TREE_RELOC_OBJECTID)
+   cur_root = btrfs_read_fs_root_no_cache(root->fs_info,
+   &key);
+   else
+   cur_root = btrfs_read_fs_root(root->fs_info, &key);
if (IS_ERR(cur_root) || !cur_root) {
error("failed to read tree: %lld", key.objectid);
goto next;
@@ -11420,6 +11424,8 @@ static int check_chunks_and_extents_v2(struct 
btrfs_root *root)
ret = traverse_tree_block(cur_root, cur_root->node);
err |= ret;
 
+   if (key.objectid == BTRFS_TREE_RELOC_OBJECTID)
+   btrfs_free_fs_root(cur_root);
 next:
ret = btrfs_next_item(root1, &path);
if (ret)
-- 
2.10.2



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 4/4] btrfs-progs: check: Fix false alert on generation mismatch for tree reloc tree

2016-12-05 Thread Qu Wenruo
For fs with tree reloc tree(under balancing), lowmem mode will report
false alert like:
ERROR: extent[62914560 4096] backref generation mismatch, wanted: <=9,
have: 13

This is because lowmem mode adds a stricter check, to ensure that the
generation in the fs tree won't be smaller than that in the extent tree.

In fact, this assumption does not hold for a tree reloc tree, so remove
the check.

Signed-off-by: Qu Wenruo 
---
 cmds-check.c | 12 
 1 file changed, 12 deletions(-)

diff --git a/cmds-check.c b/cmds-check.c
index d0e1977..c5f6f70 100644
--- a/cmds-check.c
+++ b/cmds-check.c
@@ -10155,12 +10155,10 @@ static int check_extent_data_item(struct btrfs_root 
*root,
struct btrfs_extent_inline_ref *iref;
struct btrfs_extent_data_ref *dref;
u64 owner;
-   u64 file_extent_gen;
u64 disk_bytenr;
u64 disk_num_bytes;
u64 extent_num_bytes;
u64 extent_flags;
-   u64 extent_gen;
u32 item_size;
unsigned long end;
unsigned long ptr;
@@ -10172,7 +10170,6 @@ static int check_extent_data_item(struct btrfs_root 
*root,
 
btrfs_item_key_to_cpu(eb, &fi_key, slot);
fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
-   file_extent_gen = btrfs_file_extent_generation(eb, fi);
 
/* Nothing to check for hole and inline data extents */
if (btrfs_file_extent_type(eb, fi) == BTRFS_FILE_EXTENT_INLINE ||
@@ -10221,7 +10218,6 @@ static int check_extent_data_item(struct btrfs_root 
*root,
ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
 
extent_flags = btrfs_extent_flags(leaf, ei);
-   extent_gen = btrfs_extent_generation(leaf, ei);
 
if (!(extent_flags & BTRFS_EXTENT_FLAG_DATA)) {
error(
@@ -10231,14 +10227,6 @@ static int check_extent_data_item(struct btrfs_root 
*root,
err |= BACKREF_MISMATCH;
}
 
-   if (file_extent_gen < extent_gen) {
-   error(
-"extent[%llu %llu] backref generation mismatch, wanted: <=%llu, have: %llu",
-   disk_bytenr, disk_num_bytes, file_extent_gen,
-   extent_gen);
-   err |= BACKREF_MISMATCH;
-   }
-
/* Check data backref inside that extent item */
item_size = btrfs_item_size_nr(leaf, path.slots[0]);
iref = (struct btrfs_extent_inline_ref *)(ei + 1);
-- 
2.10.2



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/4] Lowmem fsck false alert fixes

2016-12-05 Thread Qu Wenruo
Btrfs-progs test case 023 will cause an assert and a lot of false alerts
in lowmem mode.

The problems have several causes, ranging from bad handling of the tree
reloc root (calling btrfs_read_fs_root on a tree reloc tree) to overly
strict checks.

Fix the lowmem mode bugs.

There is another bug which affects both original mode and lowmem mode,
it seems to be caused by this commit:
commit 00e769d04c2c83029d6c71fbded133597d93ad55
Author: Goldwyn Rodrigues 
Date:   Tue Nov 29 10:24:52 2016 -0600

btrfs-progs: Correct value printed by assertions/BUG_ON/WARN_ON

Informed Goldwyn to fix it.
So the fix for the common assert is not included in this patchset.

Qu Wenruo (4):
  btrfs-progs: check: Fix assert when using lowmem on fs with tree reloc
tree
  btrfs-progs: check: Fix lowmem mode stack overflow caused by fsck/023
  btrfs-progs: check: Fix lowmem false alert on tree reloc tree
  btrfs-progs: check: Fix false alert on generation mismatch for tree
reloc tree

 cmds-check.c | 71 +++-
 1 file changed, 56 insertions(+), 15 deletions(-)

-- 
2.10.2



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: system hangs due to qgroups

2016-12-05 Thread Marc Joliet
On Sunday 04 December 2016 11:52:40 Chris Murphy wrote:
> On Sun, Dec 4, 2016 at 9:02 AM, Marc Joliet  wrote:
> > Also, now the file system fails with the BUG I mentioned, see here:
> > 
> > [Sun Dec  4 12:27:07 2016] BUG: unable to handle kernel paging request at
> > fe10
> > [Sun Dec  4 12:27:07 2016] IP: []
> > qgroup_fix_relocated_data_extents+0x1f/0x2a0
> > [Sun Dec  4 12:27:07 2016] PGD 1c07067 PUD 1c09067 PMD 0
> > [Sun Dec  4 12:27:07 2016] Oops:  [#1] PREEMPT SMP
> > [Sun Dec  4 12:27:07 2016] Modules linked in: crc32c_intel serio_raw
> > [Sun Dec  4 12:27:07 2016] CPU: 0 PID: 370 Comm: mount Not tainted 4.8.11-
> > gentoo #1
> > [Sun Dec  4 12:27:07 2016] Hardware name: FUJITSU LIFEBOOK A530/FJNBB06,
> > BIOS Version 1.19   08/15/2011
> > [Sun Dec  4 12:27:07 2016] task: 8801b1d9 task.stack:
> > 8801b1268000 [Sun Dec  4 12:27:07 2016] RIP:
> > 0010:[]
> > [] qgroup_fix_relocated_data_extents+0x1f/0x2a0
> > [Sun Dec  4 12:27:07 2016] RSP: 0018:8801b126bcd8  EFLAGS: 00010246
> > [Sun Dec  4 12:27:07 2016] RAX:  RBX: 8801b10b3150
> > RCX:
> > 
> > [Sun Dec  4 12:27:07 2016] RDX: 8801b20f24f0 RSI: 8801b2790800
> > RDI:
> > 8801b20f2460
> > [Sun Dec  4 12:27:07 2016] RBP: 8801b10bc000 R08: 00020340
> > R09:
> > 8801b20f2460
> > [Sun Dec  4 12:27:07 2016] R10: 8801b48b7300 R11: ea0005dd0ac0
> > R12:
> > 8801b126bd70
> > [Sun Dec  4 12:27:07 2016] R13:  R14: 8801b2790800
> > R15:
> > b20f2460
> > [Sun Dec  4 12:27:07 2016] FS:  7f97a7846780()
> > GS:8801bbc0() knlGS:
> > [Sun Dec  4 12:27:07 2016] CS:  0010 DS:  ES:  CR0:
> > 80050033 [Sun Dec  4 12:27:07 2016] CR2: fe10 CR3:
> > 0001b12ae000 CR4: 06f0
> > [Sun Dec  4 12:27:07 2016] Stack:
> > [Sun Dec  4 12:27:07 2016]  0801 0801
> > 8801b20f2460 8801b4aaa000
> > [Sun Dec  4 12:27:07 2016]  0801 8801b20f2460
> > 812c23ed 8801b1d9
> > [Sun Dec  4 12:27:07 2016]   00ff8801b126bd18
> > 8801b10b3150 8801b4aa9800
> > [Sun Dec  4 12:27:07 2016] Call Trace:
> > [Sun Dec  4 12:27:07 2016]  [] ?
> > start_transaction+0x8d/0x4e0
> > [Sun Dec  4 12:27:07 2016]  [] ?
> > btrfs_recover_relocation+0x3b3/0x440
> > [Sun Dec  4 12:27:07 2016]  [] ?
> > btrfs_remount+0x3ca/0x560 [Sun Dec  4 12:27:07 2016] 
> > [] ? shrink_dcache_sb+0x54/0x70 [Sun Dec  4 12:27:07
> > 2016]  [] ? do_remount_sb+0x63/0x1d0 [Sun Dec  4
> > 12:27:07 2016]  [] ? do_mount+0x6f3/0xbe0 [Sun Dec  4
> > 12:27:07 2016]  [] ?
> > copy_mount_options+0xbf/0x170
> > [Sun Dec  4 12:27:07 2016]  [] ? SyS_mount+0x61/0xa0
> > [Sun Dec  4 12:27:07 2016]  [] ?
> > entry_SYSCALL_64_fastpath+0x13/0x8f
> > [Sun Dec  4 12:27:07 2016] Code: 66 90 66 2e 0f 1f 84 00 00 00 00 00 41 57
> > 41 56 41 55 41 54 55 53 48 83 ec 50 48 8b 46 08 4c 8b 6e 10 48 8b a8 f0
> > 01 00 00 31 c0 <4d> 8b a5 10 fe ff ff f6 85 80 0c 00 00 01 74 09 80 be b0
> > 05 00 [Sun Dec  4 12:27:07 2016] RIP  []
> > qgroup_fix_relocated_data_extents+0x1f/0x2a0
> > [Sun Dec  4 12:27:07 2016]  RSP 
> > [Sun Dec  4 12:27:07 2016] CR2: fe10
> > [Sun Dec  4 12:27:07 2016] ---[ end trace bd51bbcfd10492f7 ]---
> 
> I can't parse this. Maybe someone else can. Do you get the same thing,
> or a different thing, if you do a normal mount rather than a remount?

The call trace is of course a bit different, but in both cases the RIP line is 
almost identical (if that even matters?).  Compare the line from my first 
message:

"RIP [] qgroup_fix_relocated_data_extents+0x1f/0x2a8"

with the newest line:

"RIP [] qgroup_fix_relocated_data_extents+0x1f/0x2a0"

But I just remembered, I have one from trying to mount the top-level subvolume 
on my desktop:

[So Dez  4 18:45:19 2016] BUG: unable to handle kernel paging request at 
fe10
[So Dez  4 18:45:19 2016] IP: [] 
qgroup_fix_relocated_data_extents+0x33/0x2e0
[So Dez  4 18:45:19 2016] PGD 1a07067 PUD 1a09067 PMD 0 
[So Dez  4 18:45:19 2016] Oops:  [#1] PREEMPT SMP
[So Dez  4 18:45:19 2016] Modules linked in: joydev dummy iptable_filter 
ip_tables x_tables hid_logitech_hidpp hid_logitech_dj snd_hda_codec_hdmi 
snd_hda_codec_analog snd_hda_codec_generic uvcvideo videobuf2_vmalloc 
videobuf2_memops videobuf2_v4l2 videobuf2_core videodev snd_usb_audio 
snd_hwdep snd_usbmidi_lib radeon i2c_algo_bit drm_kms_helper cfbfillrect 
syscopyarea cfbimgblt sysfillrect sysimgblt fb_sys_fops cfbcopyarea kvm_amd 
kvm ttm irqbypass evdev drm k8temp backlight snd_ice1724 snd_ak4113 snd_pt2258 
snd_hda_intel snd_i2c snd_ak4114 snd_hda_codec snd_ac97_codec snd_hda_core 
ac97_bus snd_ice17xx_ak4xxx snd_ak4xxx_adda snd_rawmidi snd_seq_device snd_pcm 
forcedeth snd_timer snd rtc_cmos asus_atk0110 i2c_nforce2 i2c_core sg sr_mod 
cdrom xhci_pci ata_generic ohci_pci xhci_hcd pata_amd pata_acpi ohci_hcd