[PATCH] btrfs: tree-checker: Replace root parameter with fs_info

2018-01-24 Thread Qu Wenruo
When inspecting the error message with real corruption, the "root=%llu"
always shows "1" (root tree), instead of correct owner.

The problem is that we are getting @root from page->mapping->host, which
points to the same btree inode, so we will always get the same root.

This makes the root owner output meaningless, and makes it harder to port
tree-checker to btrfs-progs.

So get rid of the false and meaningless @root parameter and replace it
with @fs_info.
To get the owner, we can only rely on btrfs_header_owner() now.

Signed-off-by: Qu Wenruo 
---
 fs/btrfs/disk-io.c  |   6 +-
 fs/btrfs/tree-checker.c | 149 +---
 fs/btrfs/tree-checker.h |   7 ++-
 3 files changed, 84 insertions(+), 78 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index ed095202942f..b564038b858f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -601,12 +601,12 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
 * that we don't try and read the other copies of this block, just
 * return -EIO.
 */
-   if (found_level == 0 && btrfs_check_leaf_full(root, eb)) {
+   if (found_level == 0 && btrfs_check_leaf_full(fs_info, eb)) {
set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
ret = -EIO;
}
 
-   if (found_level > 0 && btrfs_check_node(root, eb))
+   if (found_level > 0 && btrfs_check_node(fs_info, eb))
ret = -EIO;
 
if (!ret)
@@ -3849,7 +3849,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
 * So here we should only check item pointers, not item data.
 */
if (btrfs_header_level(buf) == 0 &&
-   btrfs_check_leaf_relaxed(root, buf)) {
+   btrfs_check_leaf_relaxed(fs_info, buf)) {
btrfs_print_leaf(buf);
ASSERT(0);
}
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index c3c8d48f6618..4ea44fa29a36 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -53,7 +53,7 @@
  * Allows callers to customize the output.
  */
 __printf(4, 5)
-static void generic_err(const struct btrfs_root *root,
+static void generic_err(const struct btrfs_fs_info *fs_info,
const struct extent_buffer *eb, int slot,
const char *fmt, ...)
 {
@@ -65,10 +65,10 @@ static void generic_err(const struct btrfs_root *root,
vaf.fmt = fmt;
vaf.va = &args;
 
-   btrfs_crit(root->fs_info,
+   btrfs_crit(fs_info,
"corrupt %s: root=%llu block=%llu slot=%d, %pV",
btrfs_header_level(eb) == 0 ? "leaf" : "node",
-   root->objectid, btrfs_header_bytenr(eb), slot, &vaf);
+   btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot, &vaf);
va_end(args);
 }
 
@@ -77,7 +77,7 @@ static void generic_err(const struct btrfs_root *root,
  * offset has its own meaning.
  */
 __printf(4, 5)
-static void file_extent_err(const struct btrfs_root *root,
+static void file_extent_err(const struct btrfs_fs_info *fs_info,
const struct extent_buffer *eb, int slot,
const char *fmt, ...)
 {
@@ -91,10 +91,11 @@ static void file_extent_err(const struct btrfs_root *root,
vaf.fmt = fmt;
vaf.va = &args;
 
-   btrfs_crit(root->fs_info,
+   btrfs_crit(fs_info,
"corrupt %s: root=%llu block=%llu slot=%d ino=%llu file_offset=%llu, %pV",
-   btrfs_header_level(eb) == 0 ? "leaf" : "node", root->objectid,
-   btrfs_header_bytenr(eb), slot, key.objectid, key.offset, &vaf);
+   btrfs_header_level(eb) == 0 ? "leaf" : "node",
+   btrfs_header_owner(eb), btrfs_header_bytenr(eb), slot,
+   key.objectid, key.offset, &vaf);
va_end(args);
 }
 
@@ -102,26 +103,26 @@ static void file_extent_err(const struct btrfs_root *root,
  * Return 0 if the btrfs_file_extent_##name is aligned to @alignment
  * Else return 1
  */
-#define CHECK_FE_ALIGNED(root, leaf, slot, fi, name, alignment)   \
+#define CHECK_FE_ALIGNED(fs_info, leaf, slot, fi, name, alignment)   \
 ({   \
if (!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)), (alignment))) \
-   file_extent_err((root), (leaf), (slot),   \
+   file_extent_err((fs_info), (leaf), (slot),\
"invalid %s for file extent, have %llu, should be aligned to %u", \
(#name), btrfs_file_extent_##name((leaf), (fi)),  \
(alignment)); \
(!IS_ALIGNED(btrfs_file_extent_##name((leaf), (fi)), (alignment)));   \
 })
 
-static int check_extent_data_item(struct btrfs_root *root,
+static int check_extent_data_item(struct btrfs_fs_info *fs_info,
   

Re: [PATCH] bytrfs-progs: Print error on invalid extent item format during check

2018-01-24 Thread Anand Jain

nitpick:

 typo in $subject.

Thanks, Anand
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: btrfs check: backref lost, mismatch with its hash -- can't repair

2018-01-24 Thread Qu Wenruo


On 2018年01月25日 04:41, ^m'e wrote:
> And here it is:
> 
> 
> # ./btrfs-debug-tree.static -t 1385 /dev/sdb3 | grep -C 20 30039322 |

In fact, that's only dump tree 1385.
Now we also need the dump of tree 257.

Thanks,
Qu

> tee /mnt/custom/rescue/btrfs-recovery/btrfs-debug.30039322.2.log
> location key (30037910 INODE_ITEM 0) type DIR
> transid 136248 data_len 0 name_len 3
> name: bam
> item 52 key (30037720 DIR_ITEM 508462201) itemoff 14104 itemsize 40
> location key (30039832 INODE_ITEM 0) type DIR
> transid 136248 data_len 0 name_len 10
> name: suse-build
> item 53 key (30037720 DIR_ITEM 541125215) itemoff 14070 itemsize 34
> location key (30038354 INODE_ITEM 0) type DIR
> transid 136248 data_len 0 name_len 4
> name: cram
> item 54 key (30037720 DIR_ITEM 543235706) itemoff 14035 itemsize 35
> location key (30039133 INODE_ITEM 0) type DIR
> transid 136248 data_len 0 name_len 5
> name: lsuio
> item 55 key (30037720 DIR_ITEM 586823170) itemoff 14000 itemsize 35
> location key (30038846 INODE_ITEM 0) type DIR
> transid 136248 data_len 0 name_len 5
> name: geany
> item 56 key (30037720 DIR_ITEM 603413733) itemoff 13938 itemsize 62
> location key (30039322 INODE_ITEM 0) type DIR
> transid 136248 data_len 0 name_len 32
> name: obs-service-download_src_package
> item 57 key (30037720 DIR_ITEM 623694194) itemoff 13903 itemsize 35
> location key (30038092 INODE_ITEM 0) type DIR
> transid 136248 data_len 0 name_len 5
> name: byacc
> item 58 key (30037720 DIR_ITEM 637448305) itemoff 13868 itemsize 35
> location key (43374420 INODE_ITEM 0) type DIR
> transid 200308 data_len 0 name_len 5
> name: vpuml
> item 59 key (30037720 DIR_ITEM 660989717) itemoff 13828 itemsize 40
> location key (30038283 INODE_ITEM 0) type DIR
> transid 136248 data_len 0 name_len 10
> name: comparator
> item 60 key (30037720 DIR_ITEM 666000672) itemoff 13782 itemsize 46
> location key (30039257 INODE_ITEM 0) type DIR
> transid 136248 data_len 0 name_len 16
> name: molecule-plugins
> item 61 key (30037720 DIR_ITEM 679217690) itemoff 13749 itemsize 33
> location key (36281336 INODE_ITEM 0) type DIR
> --
> location key (30039292 INODE_ITEM 0) type DIR
> transid 136248 data_len 0 name_len 15
> name: nvidia-cuda-sdk
> item 73 key (30037720 DIR_INDEX 238) itemoff 13448 itemsize 49
> location key (30039299 INODE_ITEM 0) type DIR
> transid 136248 data_len 0 name_len 19
> name: nvidia-cuda-toolkit
> item 74 key (30037720 DIR_INDEX 239) itemoff 13411 itemsize 37
> location key (30039309 INODE_ITEM 0) type DIR
> transid 136248 data_len 0 name_len 7
> name: objconv
> item 75 key (30037720 DIR_INDEX 240) itemoff 13361 itemsize 50
> location key (30039314 INODE_ITEM 0) type DIR
> transid 136248 data_len 0 name_len 20
> name: obs-service-cpanspec
> item 76 key (30037720 DIR_INDEX 241) itemoff 13305 itemsize 56
> location key (30039318 INODE_ITEM 0) type DIR
> transid 136248 data_len 0 name_len 26
> name: obs-service-download_files
> item 77 key (30037720 DIR_INDEX 242) itemoff 13243 itemsize 62
> location key (30039322 INODE_ITEM 0) type DIR
> transid 136248 data_len 0 name_len 32
> name: obs-service-download_src_package
> item 78 key (30037720 DIR_INDEX 243) itemoff 13189 itemsize 54
> location key (30039326 INODE_ITEM 0) type DIR
> transid 136248 data_len 0 name_len 24
> name: obs-service-download_url
> item 79 key (30037720 DIR_INDEX 244) itemoff 13135 itemsize 54
> location key (30039330 INODE_ITEM 0) type DIR
> transid 136248 data_len 0 name_len 24
> name: obs-service-extract_file
> item 80 key (30037720 DIR_INDEX 245) itemoff 13077 itemsize 58
> location key (30039334 INODE_ITEM 0) type DIR
> transid 136248 data_len 0 name_len 28
> name: obs-service-format_spec_file
> item 81 key (30037720 DIR_INDEX 246) itemoff 13007 itemsize 70
> location key (30039338 INODE_ITEM 0) type DIR
> transid 136248 data_len 0 name_len 40
> name: obs-service-generator_driver_update_disk
> item 82 key (30037720 DIR_INDEX 247) itemoff 12953 itemsize 54
> location key (30039342 INODE_ITEM 0) type DIR
> --
> mtime 1504685599.188061317 (2017-09-06 08:13:19)
> otime 1504685599.188061317 (2017-09-06 08:13:19)
> item 73 key (30039320 INODE_REF 30039318) itemoff 5278 itemsize 22
> index 3 namelen 12 name: metadata.xml
> item 74 key (30039320 EXTENT_DATA 0) itemoff 4809 itemsize 469
> generation 136248 type 0 (inline)
> inline ex

Re: btrfs check: backref lost, mismatch with its hash -- can't repair

2018-01-24 Thread Qu Wenruo


On 2018年01月25日 03:00, ^m'e wrote:
> The complete check:
> 
> Checking filesystem on /dev/sdb3
> UUID: de1723e2-150c-4448-bb36-be14d7d96093
> ERROR: extent[64368619520, 524288] referencer count mismatch (root:
> 257, owner: 7804556, offset: 212992) wanted: 1, have: 0
> ERROR: data extent[1862352896 425984] backref lost
> ERROR: data extent[1886453760 479232] backref lost
> ERROR: data extent[1902219264 524288] backref lost
> ERROR: data extent[1817378816 151552] backref lost
> ERROR: data extent[1799688192 57344] backref lost
> ERROR: data extent[1830277120 258048] backref lost
> ERROR: data extent[2558107648 1368064] backref lost

Well, a little surprised to know that --repair doesn't repair it.
I'll fix them after all other problems are fixed.

> ERROR: errors found in extent allocation tree or chunk allocation
> cache and super generation don't match, space cache will be invalidated
> ERROR: root 257 DIR_ITEM[30039322 4007295565] couldn't find relative
> INODE_ITEM[0] namelen 0 filename  filetype 0
> ERROR: root 257 DIR_ITEM[30039322 4007295565] data_len shouldn't be 32907

OK, another root, other than the root I'm repairing.

So the corruption seems to happen in multiple snapshots.

Anyway, I could fix it using the same code, just changing the root objectid.

At least, the fix in the offending DIR_ITEM is working.
Just a few more roots need to be fixed.

> ERROR: root 257 DIR_ITEM[30039322 4007295565] couldn't find relative
> INODE_ITEM[0] namelen 3 filename  filetype 0
> ERROR: root 258 EXTENT_DATA[1119172 1966080] prealloc shouldn't have datasum
> ERROR: root 1327 EXTENT_DATA[60710 18219008] prealloc shouldn't have datasum
> ERROR: root 1331 EXTENT_DATA[60710 18219008] prealloc shouldn't have datasum
> ERROR: root 1333 EXTENT_DATA[60710 18219008] prealloc shouldn't have datasum
> ERROR: root 1362 EXTENT_DATA[1119172 1966080] prealloc shouldn't have datasum
> ERROR: root 1366 EXTENT_DATA[1119172 1966080] prealloc shouldn't have datasum
> ERROR: root 1382 EXTENT_DATA[1119172 1966080] prealloc shouldn't have datasum

Not a big problem, so handle it later.

> ERROR: root 1385 DIR_ITEM[30039322 2438219243] couldn't find relative
> INODE_ITEM[0] namelen 12 filename metadata.xml filetype 1
> ERROR: root 1385 DIR INODE [30039322] size(152) not equal to 136
> ERROR: root 1385 INODE REF[30039324 30039322] and DIR_ITEM[30039322
> 2438219243] mismatch namelen 12 filename metadata.xml filetype 1

And I forgot this one.
But this is pretty simple, I'll focus on previous problems and handle it
later.

Thanks,
Qu

> found 104348008448 bytes used err is -5
> total csum bytes: 99534904
> total tree bytes: 3204612096
> total fs tree bytes: 2984034304
> total extent tree bytes: 87818240
> btree space waste bytes: 778261104
> file data blocks allocated: 242964852736
>  referenced 198454632448
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 



signature.asc
Description: OpenPGP digital signature


Re: bad key ordering - repairable?

2018-01-24 Thread Chris Murphy
On Wed, Jan 24, 2018 at 5:30 AM, Austin S. Hemmelgarn
 wrote:

>> APFS is really vague on this front, it may be checksumming metadata,
>> it's not checksumming data and with no option to. Apple proposes their
>> branded storage devices do not return bogus data. OK so then why
>> checksum the metadata?
>
> Even aside from the fact that it might be checksumming data, Apple's storage
> engineers are still smoking something pretty damn strong if they think that
> they can claim their storage devices _never_ return bogus data.  Either
> they're running some kind of checksumming _and_ replication below the block
> layer in the storage device itself (which actually might explain the insane
> cost of at least one piece of their hardware), or they think they've come up
> with some fail-safe way to detect corruption and return errors reliably, and
> in either case things can still fail.  I smell a potential future lawsuit in
> the works.


I read somewhere the hardware (or more correctly their flash firmware)
supposedly uses 128 bytes of checksum per 4KB data. That's a lot, I
wonder if it's actually some kind of parity. But regardless, this kind
of in-hardware checksumming won't account for things like misdirected
or torn writes or literally any sort of corruption happening prior to
the flash firmware computing those checksums.

On flash storage, maybe they're just concerned about bit rot or even
the most superficial bit flips, and having just enough information to
detect and correct for 1 or 2 flips per 4KB, not totally dissimilar to
ECC memory. But that they don't use ECC memory leaves them open to
corruption in the storage stack happening outside the literal storage
device.


> Actually, I forgot about the (newer) metadata checksumming feature in ext4,
> and was just basing my statement on behavior the last time I used it for
> anything serious.  Having just checked mkfs.ext4, it appears that the
> metadata in the SB that tells the kernel what to do when it runs into an
> error for the FS still defaults to continuing on as if nothing happens, even
> if you enable metadata checksumming (which still seems to be disabled by
> default).  Whether or not that actually is honored by modern kernels, I
> don't know, but I've seen no evidence to suggest that it isn't.


Depending on the corruption, Btrfs continues as well. If I corrupt a
deadend leaf that contains file metadata (like names or security
contexts), I just get some complaints of corruption. The file system
remains rw mounted though. I don't know the metric by which metadata
can be damaged and Btrfs says "whoooaa!!" and puts on the brakes by
going read only. XFS certainly has its limits and goes read only when
it detects certain metadata corruption via checksum fail. I'd guess
ext4 will do the same thing, otherwise what's the point if it's going
to knowingly eat itself alive?


-- 
Chris Murphy
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: bad key ordering - repairable?

2018-01-24 Thread Duncan
Claes Fransson posted on Wed, 24 Jan 2018 20:44:33 +0100 as excerpted:

> So, I have now some results from the PassMark Memtest86! I let the
> default automatic tests run for about 19 hours and 16 passes. It
> reported zero "Errors", but 4 lines of "[Note] RAM may be vulnerable to
> high frequency row hammer bit flips". If I understand it correctly,
> it means that some errors were detected when the RAM was tested at
> higher rates than guaranteed accurate by the vendors.

From Wikipedia:

Row hammer (also written as rowhammer) is an unintended side effect in 
dynamic random-access memory (DRAM) that causes memory cells to leak 
their charges and interact electrically between themselves, possibly 
altering the contents of nearby memory rows that were not addressed in 
the original memory access. This circumvention of the isolation between 
DRAM memory cells results from the high cell density in modern DRAM, and 
can be triggered by specially crafted memory access patterns that rapidly 
activate the same memory rows numerous times.[1][2][3]

The row hammer effect has been used in some privilege escalation computer 
security exploits.

https://en.wikipedia.org/wiki/Row_hammer

So it has nothing to do with (generic) testing the RAM at higher rates 
than guaranteed by the vendors, but rather, with deliberate rapid 
repeated access (at normal clock rates) of the same cell rows in order 
to trigger a bitflip in nearby memory cells that could not normally be 
accessed due to process separation and insufficient privileges.

IOW, it's unlikely to be accidentally tripped, and thus is exceedingly 
unlikely to be relevant here, unless you're being hacked, of course.


That said, and entirely unrelated to rowhammer, I know one of the 
problems of memory test false-negatives from experience.

In my case, I was even running ECC RAM.  But the memory I had purchased 
(back in the day when memory was far more expensive and sub-GB memory was 
the norm) was cheap, and as it happened, marked as stable at slightly 
higher clock rates than it actually was.  But I couldn't afford more (or 
I'd have procured less dodgy RAM in the first place) and had little 
recourse but to live with it for awhile.  A year or so later there was a 
BIOS update that added better memory clocking control, and I was able to 
declock the RAM slightly from its rating (IIRC to PC-3000 level, it was 
PC3200 rated, this was DDR1 era), after which it was /entirely/ stable, 
even after reducing some of the wait-state settings somewhat to try to 
claw back some of what I lost due to the underclocking.

I run gentoo, and nearly all of my problems occurred when I was doing 
updates, building packages at 100% CPU with multiple cores accessing the 
same RAM.  FWIW, the most frequent /detected/ problem was bunzip checksum 
errors as it decompressed and verified the data in memory (before writing 
out)... that would move or go away if I tried again.  Occasionally I'd 
get machine-check errors (MCEs), but not frequently, and the ECC RAM 
subsystem /never/ reported errors.

But the memory tests gave that memory an all-clear.

The problem with the memory tests in this case is that they tend to work 
on an otherwise unloaded system, and test the retention of the memory 
cells, /not/ so much the speed and reliability at which they are accessed 
under fully loaded system stress -- and how could they when memory speed 
is normally set by the BIOS and not something the memory tester has 
access to?

But my memory problems weren't with the memory cells themselves -- they 
retained their data just fine and indeed it was ECC RAM so would have 
triggered ECC errors if they didn't -- but with the precision timing of 
memory IO -- it wasn't quite up to the specs it claimed to support and 
would occasionally produce in-transit errors (the ECC would have detected 
and possibly corrected errors in storage), and the memory testers simply 
didn't test that like a fully loaded system doing unpacks of sources and 
builds from them did.

As mentioned, once I got a BIOS update that let me declock the RAM a bit, 
everything was fine, and it remained fine when I did upgrade the RAM some 
years later, after prices had fallen, as well.

(The system was first-gen AMD Opteron, on a server-grade Tyan board, that 
I ran from purchase in late 2003 for over eight years, maxing out the 
pair of CPUs to dual-core Opteron 290s and the RAM to 8 gigs, over time, 
until the board finally died in 2012 due to burst capacitors.  Which 
reminds me, I'm still running the replacement, a Gigabyte with an fx6100 
overclocked a bit to 3.9 GHz and 16 gig RAM, and it's now nearing six 
years old, so I suppose I better start planning for the next upgrade...  
I've spent that six years upgrading to big-screen TVs as monitors, with a 
65inch/165cm 4K as my primary now and a 48inch/122cm as a secondary to 
put youtube or whatever on fullscreen, and to now my second generation of 
ssds, a pair of 1 TB samsung evos, b

Re: Superblock update: Is there really any benefits of updating synchronously?

2018-01-24 Thread Hans van Kranenburg
On 01/24/2018 07:54 PM, waxhead wrote:
> Hans van Kranenburg wrote:
>> On 01/23/2018 08:51 PM, waxhead wrote:
>>> Nikolay Borisov wrote:
>>>> On 23.01.2018 16:20, Hans van Kranenburg wrote:
>>
>> [...]
>>
>
> We also had a discussion about the "backup roots" that are stored
> besides the superblock, and that they are "better than nothing" to help
> maybe recover something from a borken fs, but never ever guarantee you
> will get a working filesystem back.
>
> The same holds for superblocks from a previous generation. As soon as
> the transaction for generation X successfully hits the disk, all space
> that was occupied in generation X-1 but no longer in X is available to
> be overwritten immediately.
>
>>> Ok so this means that superblocks with an older generation are utterly
>>> useless and will lead to corruption (effectively making my argument
>>> above useless as that would in fact assist corruption then).
>>
>> Mostly, yes.
>>
>>> Does this means that if disk space was allocated in X-1 and is freed in
>>> X it will unallocated if you roll back to X-1 e.g. writing to
>>> unallocated storage.
>>
>> Can you reword that? I can't follow that sentence.
> Sure why not. I'll give it a go:
> 
> Does this mean that if...
> * Superblock generation N-1 has range 1234-2345 allocated and used.
> 
> and
> 
> * Superblock generation N-0 (the current) has range 1234-2345 free 
> because someone deleted a file or something

Ok, so I assume that with current you mean the one on disk now.

> Then
> 
> There is no point in rolling back to generation N-1 because that refers to 
> what is now essentially free "memory" which may or may not have been 
> written over by generation N-0.

If space that was used in N-1 turned into free space during N-0, then
N-0 will never have reused that space already, since if writing out N-0
had crashed halfway, so the superblock as seen when mounting is still
N-1, then you need to be able to fully use N-1.

It can be used immediately by N+1 however after the N-0 superblock is
safe on disk.

> And therefore N-1 which still thinks 
> range 1234-2345 is allocated may point to the wrong data.

So, at least for disk space used by metadata blocks:

1234-2345 - N-1 - in use
1234-2345 - N-0 - not in use, but can't be overwritten yet
1234-2345 - N+1 - can start writing whatever it wants in that disk
location any time

> I hope that was easier to follow - if not don't hold back on the 
> expletives! :)
> 
>>
>>> I was under the impression that a superblock was like a "snapshot" of
>>> the entire filesystem and that rollbacks via pre-gen superblocks was
>>> possible. Am I mistaken?
>>
>> Yes. The first fundamental thing in Btrfs is COW which makes sure that
>> everything referenced from transaction X, from the superblock all the
>> way down to metadata trees and actual data space is never overwritten by
>> changes done in transaction X+1.
>>
> Perhaps a tad off topic, but assuming the (hopefully) better explanation 
> above clears things up a bit. What happens if a block is freed?! in X+1 
> --- which must mean that it can be overwritten in transaction X+1 (which 
> I assume means a new superblock generation). After all without freeing 
> and overwriting data there is no way to re-use space.

Freed in X you mean? Or not? But you write "freed?! in X+1".

For actual data disk space, it's the same pattern as above (so space
freed up during a transaction can only be reused in the next one), but
implemented a bit differently.

For metadata trees which do not have reference counting, (e.g. the
extent tree), there's the pinned extent (metadata block disk locations)
list I mentioned already.

For data, we have the filesystem (subvolume) trees which reference all
files and the data extents that they use data from, and via the links to
the extent tree they keep all locations where actual data is on disk as
occupied.

Now comes the different part. Because the filesystem trees already
implement the extra reference counting functionality, this is being used
to prevent freed up data space from already being overwritten in the
same transaction.

How does this work? Well, that's the rest of the wiki section I linked
below. :-D So you're asking exactly the right next question here I guess.

When making changes to a subvolume tree (normal file create, write
content, rename, delete, etc.), btrfs is secretly just cloning the tree
into a new subvolume with the same subvolume ID. Wait, what? Whoa! So if
you're changing subvolume 1234, there's an item (1234 ROOT_ITEM N-0) on
disk in tree 1, and in memory it starts working on (1234 ROOT_ITEM N+1).
As an end user, you never see this happening when you look at btrfs sub
list etc, it's hidden from you.

"When the transaction commits, a new root pointer is inserted in the
root tree for each new subvolume root." [...] "At this time the root
tree has two pointers for each subvolume changed during the transaction.
One item points to 

Re: btrfs check: backref lost, mismatch with its hash -- can't repair

2018-01-24 Thread ^m'e
And here it is:


# ./btrfs-debug-tree.static -t 1385 /dev/sdb3 | grep -C 20 30039322 |
tee /mnt/custom/rescue/btrfs-recovery/btrfs-debug.30039322.2.log
location key (30037910 INODE_ITEM 0) type DIR
transid 136248 data_len 0 name_len 3
name: bam
item 52 key (30037720 DIR_ITEM 508462201) itemoff 14104 itemsize 40
location key (30039832 INODE_ITEM 0) type DIR
transid 136248 data_len 0 name_len 10
name: suse-build
item 53 key (30037720 DIR_ITEM 541125215) itemoff 14070 itemsize 34
location key (30038354 INODE_ITEM 0) type DIR
transid 136248 data_len 0 name_len 4
name: cram
item 54 key (30037720 DIR_ITEM 543235706) itemoff 14035 itemsize 35
location key (30039133 INODE_ITEM 0) type DIR
transid 136248 data_len 0 name_len 5
name: lsuio
item 55 key (30037720 DIR_ITEM 586823170) itemoff 14000 itemsize 35
location key (30038846 INODE_ITEM 0) type DIR
transid 136248 data_len 0 name_len 5
name: geany
item 56 key (30037720 DIR_ITEM 603413733) itemoff 13938 itemsize 62
location key (30039322 INODE_ITEM 0) type DIR
transid 136248 data_len 0 name_len 32
name: obs-service-download_src_package
item 57 key (30037720 DIR_ITEM 623694194) itemoff 13903 itemsize 35
location key (30038092 INODE_ITEM 0) type DIR
transid 136248 data_len 0 name_len 5
name: byacc
item 58 key (30037720 DIR_ITEM 637448305) itemoff 13868 itemsize 35
location key (43374420 INODE_ITEM 0) type DIR
transid 200308 data_len 0 name_len 5
name: vpuml
item 59 key (30037720 DIR_ITEM 660989717) itemoff 13828 itemsize 40
location key (30038283 INODE_ITEM 0) type DIR
transid 136248 data_len 0 name_len 10
name: comparator
item 60 key (30037720 DIR_ITEM 666000672) itemoff 13782 itemsize 46
location key (30039257 INODE_ITEM 0) type DIR
transid 136248 data_len 0 name_len 16
name: molecule-plugins
item 61 key (30037720 DIR_ITEM 679217690) itemoff 13749 itemsize 33
location key (36281336 INODE_ITEM 0) type DIR
--
location key (30039292 INODE_ITEM 0) type DIR
transid 136248 data_len 0 name_len 15
name: nvidia-cuda-sdk
item 73 key (30037720 DIR_INDEX 238) itemoff 13448 itemsize 49
location key (30039299 INODE_ITEM 0) type DIR
transid 136248 data_len 0 name_len 19
name: nvidia-cuda-toolkit
item 74 key (30037720 DIR_INDEX 239) itemoff 13411 itemsize 37
location key (30039309 INODE_ITEM 0) type DIR
transid 136248 data_len 0 name_len 7
name: objconv
item 75 key (30037720 DIR_INDEX 240) itemoff 13361 itemsize 50
location key (30039314 INODE_ITEM 0) type DIR
transid 136248 data_len 0 name_len 20
name: obs-service-cpanspec
item 76 key (30037720 DIR_INDEX 241) itemoff 13305 itemsize 56
location key (30039318 INODE_ITEM 0) type DIR
transid 136248 data_len 0 name_len 26
name: obs-service-download_files
item 77 key (30037720 DIR_INDEX 242) itemoff 13243 itemsize 62
location key (30039322 INODE_ITEM 0) type DIR
transid 136248 data_len 0 name_len 32
name: obs-service-download_src_package
item 78 key (30037720 DIR_INDEX 243) itemoff 13189 itemsize 54
location key (30039326 INODE_ITEM 0) type DIR
transid 136248 data_len 0 name_len 24
name: obs-service-download_url
item 79 key (30037720 DIR_INDEX 244) itemoff 13135 itemsize 54
location key (30039330 INODE_ITEM 0) type DIR
transid 136248 data_len 0 name_len 24
name: obs-service-extract_file
item 80 key (30037720 DIR_INDEX 245) itemoff 13077 itemsize 58
location key (30039334 INODE_ITEM 0) type DIR
transid 136248 data_len 0 name_len 28
name: obs-service-format_spec_file
item 81 key (30037720 DIR_INDEX 246) itemoff 13007 itemsize 70
location key (30039338 INODE_ITEM 0) type DIR
transid 136248 data_len 0 name_len 40
name: obs-service-generator_driver_update_disk
item 82 key (30037720 DIR_INDEX 247) itemoff 12953 itemsize 54
location key (30039342 INODE_ITEM 0) type DIR
--
mtime 1504685599.188061317 (2017-09-06 08:13:19)
otime 1504685599.188061317 (2017-09-06 08:13:19)
item 73 key (30039320 INODE_REF 30039318) itemoff 5278 itemsize 22
index 3 namelen 12 name: metadata.xml
item 74 key (30039320 EXTENT_DATA 0) itemoff 4809 itemsize 469
generation 136248 type 0 (inline)
inline extent data size 448 ram_bytes 448 compression 0 (none)
item 75 key (30039321 INODE_ITEM 0) itemoff 4649 itemsize 160
generation 136248 transid 202216 size 213 nbytes 213
block group 0 mode 100644 links 1 uid 250 gid 250 rdev 0
sequence 0 flags 0xb(none)
atime 1504685599.188061317 (2

Re: bad key ordering - repairable?

2018-01-24 Thread Claes Fransson
On Jan 24, 2018 01:31, "Chris Murphy"  wrote:

On Tue, Jan 23, 2018 at 11:13 AM, Claes Fransson
 wrote:

> I haven't noticed before that there are actually RAM modules from
> different vendors in the laptop. One 8GB by Samsung, and one 4GB by
> Kingston!

If they have the correct tolerances, I don't think it's a problem.
Some memory controllers use a kind of interleaving if the module sizes
are the same, so worst case you might be leaving a bit of a
performance improvement on the table by the fact they aren't the same
size.

If the memory testing doesn't pan out, you could go down a bit of a
rabbit hole and run each module in production for twice the length of
time you figure you should see a corruption appear.


So, I have now some results from the PassMark Memtest86! I let the
default automatic tests run for about 19 hours and 16 passes. It
reported zero "Errors", but 4 lines of "[Note] RAM may be vulnerable
to high frequency row hammer bit flips". If I understand it correctly,
it means that some errors were detected when the RAM was tested at
higher rates than guaranteed accurate by the vendors. I am not sure
what that may indicate regarding the performance of the RAM for my
Btrfs filesystem. I "only" got irreparable corruptions maybe once
every couple of months or half a year.

I also forgot that I have been trying using Zswap the last couple of
months with OpenSUSE on the Btrfs-filesystem (and also Fedora on the
Ext4-partition). Maybe that is a source for the last corruption (I am
pretty sure I was not using Zswap during previous corruptions, of
which I think at least one was reporting "transid verify failed" or
similar.) Sometimes, but not when the filesystem went readonly, the
computer has been freezing almost completely (mouse pointer moving
only extremely slowly) when running out of RAM the last months. I have
sometimes waited many hours for the operating system to swap out not
so important memory to the swap-partition, but end up having to force
a reboot. I suspect that it might be Zswap not working optimally,
maybe it also affects Btrfs? I have used pretty low swappiness values,
1 or 10.

I might try using only one of the RAM modules in the future if nothing
else works. I usually use most of my available 12 GB RAM though (and
often even more :) ) when using my laptop.



> I also found that there indeed was a new firmware version for my
> SSD-disk, so I have now updated its firmware to the newest version.
> Unfortunately I couldn't find any information of what possible issues
> it was supposed to fix. The laptop has already the latest BIOS version
> provided by ASUS for the model.

I don't know enough about the bad key ordering error and its cause. If
that corruption can happen only in memory then the SSD firmware update
may change nothing. If there's some possibility the corruption can be
the result of SSD firmware bugs, then it might make sense to use DUP
metadata in the short term, even on an SSD. Any memory corruption
would affect both copies. Any SSD induced corruption *might* affect
both copies, depending on whether the SSD deduplicates or colocates
the two copies of metadata...but I'd like to think that there's at
least a pretty decent chance one of the copies would be good in which
case you'd get Btrfs self-healing for metadata only.

Thanks, I might try metadata DUP in the future.

Anyway, it's a tedious search.

As for Btrfs getting better at handling these kinds of cases. Yeah
it's a valid question. What we know about other file systems is they
can become unrepairable because they don't detect corruption soon
enough. Whereas Btrfs has detected a problem early on yet it's still
damaged enough now that effectively you can no longer mount it rw.
From a data integrity point of view, at least you can ro mount and get
your data off the volume with a normal file copy operation, not
something that's certain with other file systems.

If you were to try another file system, I'd look at XFS, tools and
kernels in the past couple of years support metadata checksumming with
the V5 format.


Yes, XFS should also have deduplication as an experimental feature.
Don't know how stable it is yet, I might try it. In the future it is
also supposed to get a snapshot feature.

Thanks for all your tips and thoughts.

Claes




--
Chris Murphy
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: btrfs check: backref lost, mismatch with its hash -- can't repair

2018-01-24 Thread ^m'e
The complete check:

Checking filesystem on /dev/sdb3
UUID: de1723e2-150c-4448-bb36-be14d7d96093
ERROR: extent[64368619520, 524288] referencer count mismatch (root:
257, owner: 7804556, offset: 212992) wanted: 1, have: 0
ERROR: data extent[1862352896 425984] backref lost
ERROR: data extent[1886453760 479232] backref lost
ERROR: data extent[1902219264 524288] backref lost
ERROR: data extent[1817378816 151552] backref lost
ERROR: data extent[1799688192 57344] backref lost
ERROR: data extent[1830277120 258048] backref lost
ERROR: data extent[2558107648 1368064] backref lost
ERROR: errors found in extent allocation tree or chunk allocation
cache and super generation don't match, space cache will be invalidated
ERROR: root 257 DIR_ITEM[30039322 4007295565] couldn't find relative
INODE_ITEM[0] namelen 0 filename  filetype 0
ERROR: root 257 DIR_ITEM[30039322 4007295565] data_len shouldn't be 32907
ERROR: root 257 DIR_ITEM[30039322 4007295565] couldn't find relative
INODE_ITEM[0] namelen 3 filename  filetype 0
ERROR: root 258 EXTENT_DATA[1119172 1966080] prealloc shouldn't have datasum
ERROR: root 1327 EXTENT_DATA[60710 18219008] prealloc shouldn't have datasum
ERROR: root 1331 EXTENT_DATA[60710 18219008] prealloc shouldn't have datasum
ERROR: root 1333 EXTENT_DATA[60710 18219008] prealloc shouldn't have datasum
ERROR: root 1362 EXTENT_DATA[1119172 1966080] prealloc shouldn't have datasum
ERROR: root 1366 EXTENT_DATA[1119172 1966080] prealloc shouldn't have datasum
ERROR: root 1382 EXTENT_DATA[1119172 1966080] prealloc shouldn't have datasum
ERROR: root 1385 DIR_ITEM[30039322 2438219243] couldn't find relative
INODE_ITEM[0] namelen 12 filename metadata.xml filetype 1
ERROR: root 1385 DIR INODE [30039322] size(152) not equal to 136
ERROR: root 1385 INODE REF[30039324 30039322] and DIR_ITEM[30039322
2438219243] mismatch namelen 12 filename metadata.xml filetype 1
found 104348008448 bytes used err is -5
total csum bytes: 99534904
total tree bytes: 3204612096
total fs tree bytes: 2984034304
total extent tree bytes: 87818240
btree space waste bytes: 778261104
file data blocks allocated: 242964852736
 referenced 198454632448
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Superblock update: Is there really any benefits of updating synchronously?

2018-01-24 Thread waxhead

Hans van Kranenburg wrote:

On 01/23/2018 08:51 PM, waxhead wrote:

Nikolay Borisov wrote:

On 23.01.2018 16:20, Hans van Kranenburg wrote:


[...]



We also had a discussion about the "backup roots" that are stored
besides the superblock, and that they are "better than nothing" to help
maybe recover something from a borken fs, but never ever guarantee you
will get a working filesystem back.

The same holds for superblocks from a previous generation. As soon as
the transaction for generation X successfully hits the disk, all space
that was occupied in generation X-1 but no longer in X is available to
be overwritten immediately.


Ok so this means that superblocks with an older generation are utterly
useless and will lead to corruption (effectively making my argument
above useless as that would in fact assist corruption then).


Mostly, yes.


Does this means that if disk space was allocated in X-1 and is freed in
X it will unallocated if you roll back to X-1 e.g. writing to
unallocated storage.


Can you reword that? I can't follow that sentence.

Sure why not. I'll give it a go:

Does this mean that if...
* Superblock generation N-1 have range 1234-2345 allocated and used.

and

* Superblock generation N-0 (the current) have range 1234-2345 free 
because someone deleted a file or something


Then

There is no point in rolling back to generation N-1 because that refers to 
what is now essentially free "memory" which may or may not have been 
written over by generation N-0. And therefore N-1 which still thinks 
range 1234-2345 is allocated may point to the wrong data.


I hope that was easier to follow - if not don't hold back on the 
expletives! :)





I was under the impression that a superblock was like a "snapshot" of
the entire filesystem and that rollbacks via pre-gen superblocks was
possible. Am I mistaken?


Yes. The first fundamental thing in Btrfs is COW which makes sure that
everything referenced from transaction X, from the superblock all the
way down to metadata trees and actual data space is never overwritten by
changes done in transaction X+1.

Perhaps a tad off topic, but assuming the (hopefully) better explanation 
above clears things up a bit. What happens if a block is freed?! in X+1 
--- which must mean that it can be overwritten in transaction X+1 (which 
I assume means a new superblock generation). After all without freeing 
and overwriting data there is no way to re-use space.



For metadata trees that are NOT filesystem trees a.k.a. subvolumes, the
way this is done is actually quite simple. If a block is cowed, the old
location is added to a 'pinned extents' list (in memory), which is used
as a blacklist for choosing space to put new writes in. After a
transaction is completed on disk, that list with pinned extents is
emptied and all that space is available for immediate reuse. This way we
make sure that if the transaction that is ongoing is aborted, the
previous one (latest one that is completely on disk) is always still
there. If the computer crashes and the in memory list is lost, no big
deal, we just continue from the latest completed transaction again after
a reboot. (ignoring extra log things for simplicity)

So, the only situation in which you can fully use an X-1 superblock is
when none of that previously pinned space has actually been overwritten
yet afterwards.

And if any of the space was overwritten already, you can go play around
with using an older superblock and your filesystem mounts and everything
might look fine, until you hit that distant corner and BOOM!
Got it , this takes care of my questions above, but I'll leave them in 
just for completeness sake.

Thanks for the good explanation.



 >8  Extra!! Moar!!  >8 

But, doing so does not give you snapshot functionality yet! It's more
like a poor man's snapshot that can only prevent messing up the
current version.

Snapshot functionality is implemented only for filesystem trees
(subvolumes) by adding reference counting (which does end up on disk) to
the metadata blocks, and then COW trees as a whole.

If you make a snapshot of a filesystem tree, the snapshot gets a whole
new tree ID! It's not a previous version of the same subvolume you're
looking at, it's a clone!

This is a big difference. The extent tree is always tree 2. The chunk
tree is always tree 3. But your subvolume snapshot gets a new tree number.

Technically, it would maybe be possible to implement reference counting
and snapshots to all of the metadata trees, but it would probably mean
that the whole filesystem would get stuck in rewriting itself all day
instead of doing any useful work. The current extent tree already has
such amount of rumination problems that the added work of keeping track
of reference counts would make it completely unusable.

In the wiki, it's here:
https://btrfs.wiki.kernel.org/index.php/Btrfs_design#Copy_on_Write_Logging

Actually, I just paraphrased the first two of those six alineas... The
subvolume

Re: Periodic frame losses when recording to btrfs volume with OBS

2018-01-24 Thread Chris Mason
On 01/22/2018 04:17 PM, Sebastian Ochmann wrote:
> Hello,
> 
> I attached to the ffmpeg-mux process for a little while and pasted the 
> result here:
> 
> https://urldefense.proofpoint.com/v2/url?u=https-3A__pastebin.com_XHaMLX8z&d=DwIDaQ&c=5VD0RTtNlTh3ycd41b3MUw&r=9QPtTAxcitoznaWRKKHoEQ&m=IkofqwZ_S5C0_qAXjt4EQae-mVE09Ir8zmSbuGqXaCs&s=1nw7xUkEoQF7MgYOlZ8iAA9U0UsRQObH1Z4VLqx8IF8&e=
>  
> 
> 
> Can you help me with interpreting this result? If you'd like me to run 
> strace with specific options, please let me know. This is a level of 
> debugging I'm not dealing with on a daily basis. :)
> 

Going to guess it's these sequences:

lseek(3, 1302012898, SEEK_SET)  = 1302012898
write(3, 
"\37C\266u\1\377\377\377\377\377\377\377\277\204|\271\347J\347\203\3@\0\243CY\202\0\0\0!\21"...,
 262144) = 262144
write(3, 
"\310\22\323g7J#h\351\0\323\270\f\206\5\207(.\232\246\27\371/\376\341\0\0\200\th\3\37"...,
 262144) = 262144
write(3, 
"\225*\245<8N\32\263\237k\331]\313\215\301\366$\7\216\0349\302AS\201\302\307T\361\365\3375"...,
 262144) = 262144
write(3, 
"\272e\37\255\250\24n\235\341E\272Me\36'\345W\353\2337K.n\367\264\\\370\307\341_\206|"...,
 262144) = 262144
write(3, ""..., 53271) = 53271
lseek(3, 1302012902, SEEK_SET)  = 1302012902
write(3, "\1\0\0\0\0\20\320\v", 8)  = 8
lseek(3, 1303114745, SEEK_SET)  = 1303114745

It's seeking, writing, then jumping back and updating what had been written 
before.

That's going to hit the stable page writing code in btrfs that I had mentioned 
earlier.

At Facebook, we've been experimenting with fixes for this that are limited to 
O_APPEND 
slowly growing log files.  Let me think harder...

-chris
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs: Use schedule_timeout_interruptible

2018-01-24 Thread Josef Bacik
On Tue, Jan 23, 2018 at 02:46:53PM +0200, Nikolay Borisov wrote:
> Instead of manually fiddling with the state of the task
> (RUNNING->INTERRUPTIBLE->RUNNING) again just use 
> schedule_timeout_interruptible
> which adjusts the task state as needed. No functional changes.
> 
> Signed-off-by: Nikolay Borisov 

Reviewed-by: Josef Bacik 

Thanks,

Josef
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: btrfs check: backref lost, mismatch with its hash -- can't repair

2018-01-24 Thread ^m'e
My bad, forgot to check out the correct branch. Recloned, compiled and
fixed. Then rechecking:

---
# btrfs check --mode=lowmem /dev/sdb3
Checking filesystem on /dev/sdb3
UUID: de1723e2-150c-4448-bb36-be14d7d96093
checking extents
ERROR: extent[64368619520, 524288] referencer count mismatch (root:
257, owner: 7804556, offset: 212992) wanted: 1, have: 0
ERROR: data extent[1862352896 425984] backref lost
ERROR: data extent[1886453760 479232] backref lost
ERROR: data extent[1902219264 524288] backref lost
ERROR: data extent[1817378816 151552] backref lost
ERROR: data extent[1799688192 57344] backref lost
ERROR: data extent[1830277120 258048] backref lost
ERROR: data extent[2558107648 1368064] backref lost
ERROR: errors found in extent allocation tree or chunk allocation
checking free space cache
cache and super generation don't match, space cache will be invalidated
checking fs roots
ERROR: root 257 DIR_ITEM[30039322 4007295565] couldn't find relative
INODE_ITEM[0] namelen 0 filename  filetype 0
ERROR: root 257 DIR_ITEM[30039322 4007295565] data_len shouldn't be 32907
ERROR: root 257 DIR_ITEM[30039322 4007295565] couldn't find relative
INODE_ITEM[0] namelen 3 filename  filetype 0
ERROR: root 258 EXTENT_DATA[1119172 1966080] prealloc shouldn't have datasum
ERROR: root 1327 EXTENT_DATA[60710 18219008] prealloc shouldn't have datasum
ERROR: root 1331 EXTENT_DATA[60710 18219008] prealloc shouldn't have datasum
ERROR: root 1333 EXTENT_DATA[60710 18219008] prealloc shouldn't have datasum
ERROR: root 1362 EXTENT_DATA[1119172 1966080] prealloc shouldn't have datasum
ERROR: root 1366 EXTENT_DATA[1119172 1966080] prealloc shouldn't have datasum
...
---


Trying again to repair:

---
# btrfs check -p --repair /dev/sdb3
enabling repair mode
Checking filesystem on /dev/sdb3
UUID: de1723e2-150c-4448-bb36-be14d7d96093
ref mismatch on [1799688192 57344] extent item 0, found 1
Backref 1799688192 root 1385 owner 47301992 offset 0 num_refs 0 not
found in extent tree
Incorrect local backref count on 1799688192 root 1385 owner 47301992
offset 0 found 1 wanted 0 back 0x93a27f8
backpointer mismatch on [1799688192 57344]
repair deleting extent record: key 1799688192 168 57344
adding new data backref on 1799688192 root 1385 owner 47301992 offset 0 found 1
Repaired extent references for 1799688192
ref mismatch on [1817378816 151552] extent item 0, found 1
Backref 1817378816 root 1385 owner 47301982 offset 0 num_refs 0 not
found in extent tree
Incorrect local backref count on 1817378816 root 1385 owner 47301982
offset 0 found 1 wanted 0 back 0x94643f0
backpointer mismatch on [1817378816 151552]
repair deleting extent record: key 1817378816 168 151552
adding new data backref on 1817378816 root 1385 owner 47301982 offset 0 found 1
Repaired extent references for 1817378816
ref mismatch on [1830277120 258048] extent item 0, found 1
Backref 1830277120 root 1385 owner 47302002 offset 0 num_refs 0 not
found in extent tree
Incorrect local backref count on 1830277120 root 1385 owner 47302002
offset 0 found 1 wanted 0 back 0xc7bd1b8
backpointer mismatch on [1830277120 258048]
repair deleting extent record: key 1830277120 168 258048
adding new data backref on 1830277120 root 1385 owner 47302002 offset 0 found 1
Repaired extent references for 1830277120
ref mismatch on [1862352896 425984] extent item 0, found 1
Backref 1862352896 root 1385 owner 47301952 offset 0 num_refs 0 not
found in extent tree
Incorrect local backref count on 1862352896 root 1385 owner 47301952
offset 0 found 1 wanted 0 back 0xef22790
backpointer mismatch on [1862352896 425984]
repair deleting extent record: key 1862352896 168 425984
adding new data backref on 1862352896 root 1385 owner 47301952 offset 0 found 1
Repaired extent references for 1862352896
ref mismatch on [1886453760 479232] extent item 0, found 1
Backref 1886453760 root 1385 owner 47301962 offset 0 num_refs 0 not
found in extent tree
Incorrect local backref count on 1886453760 root 1385 owner 47301962
offset 0 found 1 wanted 0 back 0x93a1058
backpointer mismatch on [1886453760 479232]
repair deleting extent record: key 1886453760 168 479232
adding new data backref on 1886453760 root 1385 owner 47301962 offset 0 found 1
Repaired extent references for 1886453760
ref mismatch on [1902219264 524288] extent item 0, found 1
Backref 1902219264 root 1385 owner 47301972 offset 0 num_refs 0 not
found in extent tree
Incorrect local backref count on 1902219264 root 1385 owner 47301972
offset 0 found 1 wanted 0 back 0x93883a8
backpointer mismatch on [1902219264 524288]
repair deleting extent record: key 1902219264 168 524288
adding new data backref on 1902219264 root 1385 owner 47301972 offset 0 found 1
Repaired extent references for 1902219264
ref mismatch on [2558107648 1368064] extent item 0, found 1
Backref 2558107648 root 1385 owner 47302009 offset 0 num

Re: btrfs check: backref lost, mismatch with its hash -- can't repair

2018-01-24 Thread Qu Wenruo


On 2018年01月24日 19:57, ^m'e wrote:
> Thanks Qu!
> 
> I did it (had to add 'progs_extra' to the 'static' make target...),
> but it looks like there's something missing:

Did you check out the branch called "dirty_fix"?

Thanks,
Qu
> 
> -
> # ./btrfs-corrupt-block.static -X /dev/sdb3
> ./btrfs-corrupt-block.static: invalid option -- 'X'
> usage: btrfs-corrupt-block [options] device
> -l Logical extent to be corrupted
> -c Copy of the extent to be corrupted (usually 1 or 2, default: 0)
> -b Number of bytes to be corrupted
> -e Extent to be corrupted
> -E The whole extent tree to be corrupted
> -u Given chunk item to be corrupted
> -U The whole chunk tree to be corrupted
> -i The inode item to corrupt (must also specify the field to corrupt)
> -x The file extent item to corrupt (must also specify -i for the
> inode and -f for the field to corrupt)
> -m The metadata block to corrupt (must also specify -f for the
> field to corrupt)
> -K The key to corrupt in the format ,, (must also
> specify -f for the field)
> -f The field in the item to corrupt
> -I An item to corrupt (must also specify the field to corrupt and
> a root+key for the item)
> -D Corrupt a dir item, must specify key and field
> -d Delete this item (must specify -K)
> -
> 
> I cloned at --depth=1, if that matters... Didn't dare to play around
> with the lowercase 'x' option... O_o
> 
> 
> On Wed, Jan 24, 2018 at 10:14 AM, Qu Wenruo  wrote:
>> Here is the super dirty tricky fix (and less deadly now).
>>
>> https://github.com/adam900710/btrfs-progs/tree/dirty_fix
>>
>> Please compile the branch and run:
>>
>> # ./btrfs-corrupt-block -X <device>
>>
>> Where <device> must be unmounted, the original btrfs-corrupt-block tool
>> doesn't have mount check, and I'm too lazy to add such check.
>>
>> The hack will remove the offending DIR_ITEM completely, and unlink the
>> old "Manifest" file, and repair the link for newer "Manifest" file.
>>
>> And it shouldn't write anything to disk if any operation failed, so it's
>> less deadly.
>>
>> Wish you good luck.
>>
>> Thanks,
>> Qu
>>
>> On 2018年01月24日 17:18, Foo Bar wrote:
>>> Qu Wenruo wrote on 2018-01-24 09:49:
 Sorry for the late reply, I was off yesterday.

>>>
>>> No problem :-)
>>>
>>> Booted normally today, system up, but see this (I forgot to stop the 
>>> snapshot
>>> cron task...)
>>>
>>>   [  115.127961] BTRFS error (device sdb3): Send: inconsistent snapshot, 
>>> found
>>> deleted reference for inode 30039323 without updated inode item, send root 
>>> is
>>> 1399, parent root is 1385
>>>
>>> So inode 30039323 looks definitely the bad one. Let's get rid of it and keep
>>> the newest dups, if any, thanks!
>>>
>>> Cheers,
>>>
>>>   Marco
>>>
 On 2018年01月22日 23:04, ^m'e wrote:
> Thanks for the quick reply, Qu!
>
> I forgot to say that I see weird characters in the btrfs check repair
> in lines "ERROR: DIR_ITEM... name ..." output. Although that can be
> due to corruption, I seem to remember that a previous version of
> btrfs-progs I used didn't show that...
> I also see:
>
>[19428.934684] init_special_inode: bogus i_mode (700) for inode
> sdb3:18446744073709551361
>
> BTW, no sensible names in the debug output, and as far as I can see,
> it might be all stuff in '[rootfs]/usr/portage': if that's the case,
> corrupted inodes can be safely removed, as the portage package tree
> can be easily rebuilt. Here you are:
>
> -->8-
> # cat btrfs-debug.30039322.log[snip]

 This is where the dir starts.

> item 78 key (30039322 INODE_ITEM 0) itemoff 4203 itemsize 160
> generation 136248 transid 229515 size 152 nbytes 0
> block group 0 mode 40755 links 1 uid 250 gid 250 rdev 0
> sequence 0 flags 0xf(none)
> atime 1504685599.188061317 (2017-09-06 08:13:19)
> ctime 1516557882.551679697 (2018-01-21 18:04:42)
> mtime 1516557882.551679697 (2018-01-21 18:04:42)
> otime 1504685599.188061317 (2017-09-06 08:13:19)
> item 79 key (30039322 INODE_REF 30037720) itemoff 4161 itemsize 42
> index 242 namelen 32 name: obs-service-download_src_package
> item 80 key (30039322 DIR_ITEM 1076301169) itemoff 4083 itemsize 78
> location key (30039325 INODE_ITEM 0) type FILE
> transid 136248 data_len 0 name_len 48
> name: obs-service-download_src_package-20130318.ebuild
> item 81 key (30039322 DIR_ITEM 2438219243) itemoff 4041 itemsize 42
> location key (0 UNKNOWN.0 0) type FILE
> transid 136192 data_len 0 name_len 12
> name: metadata.xml
> item 82 key (30039322 DIR_ITEM 4007295565) itemoff 3927 itemsize 114

Re: bad key ordering - repairable?

2018-01-24 Thread Austin S. Hemmelgarn

On 2018-01-23 19:44, Chris Murphy wrote:

On Tue, Jan 23, 2018 at 5:51 AM, Austin S. Hemmelgarn
 wrote:


This is extremely important to understand.  BTRFS and ZFS are essentially
the only filesystems available on Linux that actually validate things enough
to notice this reliably (ReFS on Windows probably does, and I think whatever
Apple is calling their new FS does too).


ReFS always checksums metadata, optionally can checksum data.
Good to know, I've not actually dealt with ReFS myself yet (we're mostly 
a Linux shop where I work, and the two Windows servers we do have aren't 
using ReFS simply because it wasn't beyond the technology preview level 
when we installed them and we don't want to screw anything up).


APFS is really vague on this front, it may be checksumming metadata,
it's not checksumming data and with no option to. Apple proposes their
branded storage devices do not return bogus data. OK so then why
checksum the metadata?
Even aside from the fact that it might be checksumming data, Apple's 
storage engineers are still smoking something pretty damn strong if they 
think that they can claim their storage devices _never_ return bogus 
data.  Either they're running some kind of checksumming _and_ 
replication below the block layer in the storage device itself (which 
actually might explain the insane cost of at least one piece of their 
hardware), or they think they've come up with some fail-safe way to 
detect corruption and return errors reliably, and in either case things 
can still fail.  I smell a potential future lawsuit in the works...



Even if ext4 did notice it, it
would just mark the filesystem for a check and then keep going without doing
anything else about it (seriously, the default behavior for internal errors
on ext4 is to just continue like nothing happened and mark the FS for fsck).


I haven't used ext4 with metadata checksumming enabled, and have no
idea how it behaves when it starts encountering checksum errors during
normal use. For sure XFS will complain a lot and will go read only
when it gets confused. I'd expect any file system going to the trouble
of checksumming would have to have some means of bailing out, rather
than just continuing on.
Actually, I forgot about the (newer) metadata checksumming feature in 
ext4, and was just basing my statement on behavior the last time I used 
it for anything serious.  Having just checked mkfs.ext4, it appears that 
the metadata in the SB that tells the kernel what to do when it runs 
into an error for the FS still defaults to continuing on as if nothing 
happens, even if you enable metadata checksumming (which still seems to 
be disabled by default).  Whether or not that actually is honored by 
modern kernels, I don't know, but I've seen no evidence to suggest that 
it isn't.


Btrfs (and maybe ZFS) COW everything except supers. So ostensibly a
future feature might let them continue on with a kind of
integrated/single volume variation on seed/sprout device. I'd like to
see something like this just for undoable and testable offline
repairs, rather than offline repair only being predicated on
overwriting metadata.
Agreed.

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: btrfs check: backref lost, mismatch with its hash -- can't repair

2018-01-24 Thread ^m'e
Thanks Qu!

I did it (had to add 'progs_extra' to the 'static' make target...),
but it looks like there's something missing:

-
# ./btrfs-corrupt-block.static -X /dev/sdb3
./btrfs-corrupt-block.static: invalid option -- 'X'
usage: btrfs-corrupt-block [options] device
-l Logical extent to be corrupted
-c Copy of the extent to be corrupted (usually 1 or 2, default: 0)
-b Number of bytes to be corrupted
-e Extent to be corrupted
-E The whole extent tree to be corrupted
-u Given chunk item to be corrupted
-U The whole chunk tree to be corrupted
-i The inode item to corrupt (must also specify the field to corrupt)
-x The file extent item to corrupt (must also specify -i for the
inode and -f for the field to corrupt)
-m The metadata block to corrupt (must also specify -f for the
field to corrupt)
-K The key to corrupt in the format ,, (must also
specify -f for the field)
-f The field in the item to corrupt
-I An item to corrupt (must also specify the field to corrupt and
a root+key for the item)
-D Corrupt a dir item, must specify key and field
-d Delete this item (must specify -K)
-

I cloned at --depth=1, if that matters... Didn't dare to play around
with the lowercase 'x' option... O_o


On Wed, Jan 24, 2018 at 10:14 AM, Qu Wenruo  wrote:
> Here is the super dirty tricky fix (and less deadly now).
>
> https://github.com/adam900710/btrfs-progs/tree/dirty_fix
>
> Please compile the branch and run:
>
> # ./btrfs-corrupt-block -X <device>
>
> Where <device> must be unmounted, the original btrfs-corrupt-block tool
> doesn't have mount check, and I'm too lazy to add such check.
>
> The hack will remove the offending DIR_ITEM completely, and unlink the
> old "Manifest" file, and repair the link for newer "Manifest" file.
>
> And it shouldn't write anything to disk if any operation failed, so it's
> less deadly.
>
> Wish you good luck.
>
> Thanks,
> Qu
>
> On 2018年01月24日 17:18, Foo Bar wrote:
>> Qu Wenruo wrote on 2018-01-24 09:49:
>>> Sorry for the late reply, I was off yesterday.
>>>
>>
>> No problem :-)
>>
>> Booted normally today, system up, but see this (I forgot to stop the snapshot
>> cron task...)
>>
>>   [  115.127961] BTRFS error (device sdb3): Send: inconsistent snapshot, 
>> found
>> deleted reference for inode 30039323 without updated inode item, send root is
>> 1399, parent root is 1385
>>
>> So inode 30039323 looks definitely the bad one. Let's get rid of it and keep
>> the newest dups, if any, thanks!
>>
>> Cheers,
>>
>>   Marco
>>
>>> On 2018年01月22日 23:04, ^m'e wrote:
 Thanks for the quick reply, Qu!

 I forgot to say that I see weird characters in the btrfs check repair
 in lines "ERROR: DIR_ITEM... name ..." output. Although that can be
 due to corruption, I seem to remember that a previous version of
 btrfs-progs I used didn't show that...
 I also see:

[19428.934684] init_special_inode: bogus i_mode (700) for inode
 sdb3:18446744073709551361

 BTW, no sensible names in the debug output, and as far as I can see,
 it might be all stuff in '[rootfs]/usr/portage': if that's the case,
 corrupted inodes can be safely removed, as the portage package tree
 can be easily rebuilt. Here you are:

 -->8-
 # cat btrfs-debug.30039322.log[snip]
>>>
>>> This is where the dir starts.
>>>
 item 78 key (30039322 INODE_ITEM 0) itemoff 4203 itemsize 160
 generation 136248 transid 229515 size 152 nbytes 0
 block group 0 mode 40755 links 1 uid 250 gid 250 rdev 0
 sequence 0 flags 0xf(none)
 atime 1504685599.188061317 (2017-09-06 08:13:19)
 ctime 1516557882.551679697 (2018-01-21 18:04:42)
 mtime 1516557882.551679697 (2018-01-21 18:04:42)
 otime 1504685599.188061317 (2017-09-06 08:13:19)
 item 79 key (30039322 INODE_REF 30037720) itemoff 4161 itemsize 42
 index 242 namelen 32 name: obs-service-download_src_package
 item 80 key (30039322 DIR_ITEM 1076301169) itemoff 4083 itemsize 78
 location key (30039325 INODE_ITEM 0) type FILE
 transid 136248 data_len 0 name_len 48
 name: obs-service-download_src_package-20130318.ebuild
 item 81 key (30039322 DIR_ITEM 2438219243) itemoff 4041 itemsize 42
 location key (0 UNKNOWN.0 0) type FILE
 transid 136192 data_len 0 name_len 12
 name: metadata.xml
 item 82 key (30039322 DIR_ITEM 4007295565) itemoff 3927 itemsize 114
 location key (0 UNKNOWN.0 0) type DIR_ITEM.0
 transid 0 data_len 0 name_len 0
 name:
 location key (0 UNKNOWN.125 72057594038112709) type DIR_ITEM.0
 transid 0 data_len 32907 name_len 3
 name:
>

Re: btrfs check: backref lost, mismatch with its hash -- can't repair

2018-01-24 Thread Qu Wenruo
Here is the super dirty tricky fix (and less deadly now).

https://github.com/adam900710/btrfs-progs/tree/dirty_fix

Please compile the branch and run:

# ./btrfs-corrupt-block -X <device>

Where <device> must be unmounted, the original btrfs-corrupt-block tool
doesn't have mount check, and I'm too lazy to add such check.

The hack will remove the offending DIR_ITEM completely, and unlink the
old "Manifest" file, and repair the link for newer "Manifest" file.

And it shouldn't write anything to disk if any operation failed, so it's
less deadly.

Wish you good luck.

Thanks,
Qu

On 2018年01月24日 17:18, Foo Bar wrote:
> Qu Wenruo wrote on 2018-01-24 09:49:
>> Sorry for the late reply, I was off yesterday.
>>
> 
> No problem :-)
> 
> Booted normally today, system up, but see this (I forgot to stop the snapshot
> cron task...)
> 
>   [  115.127961] BTRFS error (device sdb3): Send: inconsistent snapshot, found
> deleted reference for inode 30039323 without updated inode item, send root is
> 1399, parent root is 1385
> 
> So inode 30039323 looks definitely the bad one. Let's get rid of it and keep
> the newest dups, if any, thanks!
> 
> Cheers,
> 
>   Marco
> 
>> On 2018年01月22日 23:04, ^m'e wrote:
>>> Thanks for the quick reply, Qu!
>>>
>>> I forgot to say that I see weird characters in the btrfs check repair
>>> in lines "ERROR: DIR_ITEM... name ..." output. Although that can be
>>> due to corruption, I seem to remember that a previous version of
>>> btrfs-progs I used didn't show that...
>>> I also see:
>>>
>>>[19428.934684] init_special_inode: bogus i_mode (700) for inode
>>> sdb3:18446744073709551361
>>>
>>> BTW, no sensible names in the debug output, and as far as I can see,
>>> it might be all stuff in '[rootfs]/usr/portage': if that's the case,
>>> corrupted inodes can be safely removed, as the portage package tree
>>> can be easily rebuilt. Here you are:
>>>
>>> -->8-
>>> # cat btrfs-debug.30039322.log[snip]
>>
>> This is where the dir starts.
>>
>>> item 78 key (30039322 INODE_ITEM 0) itemoff 4203 itemsize 160
>>> generation 136248 transid 229515 size 152 nbytes 0
>>> block group 0 mode 40755 links 1 uid 250 gid 250 rdev 0
>>> sequence 0 flags 0xf(none)
>>> atime 1504685599.188061317 (2017-09-06 08:13:19)
>>> ctime 1516557882.551679697 (2018-01-21 18:04:42)
>>> mtime 1516557882.551679697 (2018-01-21 18:04:42)
>>> otime 1504685599.188061317 (2017-09-06 08:13:19)
>>> item 79 key (30039322 INODE_REF 30037720) itemoff 4161 itemsize 42
>>> index 242 namelen 32 name: obs-service-download_src_package
>>> item 80 key (30039322 DIR_ITEM 1076301169) itemoff 4083 itemsize 78
>>> location key (30039325 INODE_ITEM 0) type FILE
>>> transid 136248 data_len 0 name_len 48
>>> name: obs-service-download_src_package-20130318.ebuild
>>> item 81 key (30039322 DIR_ITEM 2438219243) itemoff 4041 itemsize 42
>>> location key (0 UNKNOWN.0 0) type FILE
>>> transid 136192 data_len 0 name_len 12
>>> name: metadata.xml
>>> item 82 key (30039322 DIR_ITEM 4007295565) itemoff 3927 itemsize 114
>>> location key (0 UNKNOWN.0 0) type DIR_ITEM.0
>>> transid 0 data_len 0 name_len 0
>>> name:
>>> location key (0 UNKNOWN.125 72057594038112709) type DIR_ITEM.0
>>> transid 0 data_len 32907 name_len 3
>>> name:
>>> data
>>
>> The whole item is corrupted.
>> Seems to be a half-written item get flushed to disk.
>>
>> I assume this is the DIR_ITEM for *two* Manifest, but that's just
>> insane, as we're going to have 2 files with the same name "Manifest"
>>
>>> item 83 key (30039322 DIR_INDEX 2) itemoff 3889 itemsize 38
>>> location key (30039323 INODE_ITEM 0) type FILE
>>> transid 3377699720527872 data_len 0 name_len 8
>>
>> The transid seems corrupted too.
>>
>> Maybe I need to delete this item too?
>>
>>> item 64 key (47302013 INODE_REF 30039322) itemoff 11278 itemsize 18
>>> index 5 namelen 8 name: Manifest
>>
>> Now we do have 2 "Manifest".
>>
>> Which one do you prefer to delete?
>>
>> The latter one, inode 47302013 seems newer, while previous one, inode
>> 30039323 is pretty old.
>>
>> Despite that, I didn't see big problem in the dump.
>>
>> I'll just craft the dirty fix to delete one inode and the incorrect dir
>> index/item.
>>
>> Thanks,
>> Qu
>>
> 



signature.asc
Description: OpenPGP digital signature


Re: [PATCH 1/2] btrfs-progs: mkfs: Fix minimal device size calculation for new temporary chunk layout

2018-01-24 Thread Nikolay Borisov


On 24.01.2018 04:38, Qu Wenruo wrote:
> Commit 0b2161becf8e ("btrfs-progs: mkfs: Prevent temporary system chunk
> to use space in reserved 1M range") changed the hard-coded temporary
> chunk layout to avoid the first 1M.
> 
> However this also affects btrfs_min_dev_size() which still assume
> temporary chunks starts at device offset 0.
> 
> This patch will fix it.
> 
> Fixes: 0b2161becf8e ("btrfs-progs: mkfs: Prevent temporary system chunk
> to use space in reserved 1M range")
> Signed-off-by: Qu Wenruo 

Reviewed-by: Nikolay Borisov 

> ---
>  mkfs/common.c | 5 -
>  1 file changed, 4 insertions(+), 1 deletion(-)
> 
> diff --git a/mkfs/common.c b/mkfs/common.c
> index 5c5e9c3b9e01..8e85942ef4eb 100644
> --- a/mkfs/common.c
> +++ b/mkfs/common.c
> @@ -483,6 +483,8 @@ u64 btrfs_min_dev_size(u32 nodesize, int mixed, u64 
> meta_profile,
>  
>   /*
>* Minimal size calculation is complex due to several factors:
> +  * 0) Reserved 1M range.
> +  *
>* 1) Temporary chunk reuse
>*If specified chunk profile is SINGLE, we can reuse
>*temporary chunks, no need to allocate new chunks.
> @@ -501,7 +503,8 @@ u64 btrfs_min_dev_size(u32 nodesize, int mixed, u64 
> meta_profile,
>* The latter two are all 8M, accroding to @calc_size of
>* btrfs_alloc_chunk().
>*/
> - reserved += BTRFS_MKFS_SYSTEM_GROUP_SIZE + SZ_8M * 2;
> + reserved += BTRFS_BLOCK_RESERVED_1M_FOR_SUPER +
> + BTRFS_MKFS_SYSTEM_GROUP_SIZE + SZ_8M * 2;
>  
>   /*
>* For real chunks, we need to select different sizes:
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2] btrfs-progs: Remove unnecessary parameter for btrfs_add_block_group

2018-01-24 Thread Nikolay Borisov


On 24.01.2018 04:30, Qu Wenruo wrote:
> @chunk_objectid of btrfs_make_block_group() function is always fixed to
> BTRFS_FIRST_FREE_OBJECTID, so there is no need to pass it as parameter
> explicitly.
> 
> Signed-off-by: Qu Wenruo 

Reviewed-by: Nikolay Borisov 
> ---
> v2:
>   Rebase to devel branch, as I introduced some conflict.
> ---
>  cmds-check.c   |  5 ++---
>  convert/main.c |  4 +---
>  ctree.h|  5 ++---
>  extent-tree.c  | 14 +++---
>  mkfs/main.c|  7 +--
>  5 files changed, 13 insertions(+), 22 deletions(-)
> 
> diff --git a/cmds-check.c b/cmds-check.c
> index 5fc0ea9d4f4d..99fbafc5538c 100644
> --- a/cmds-check.c
> +++ b/cmds-check.c
> @@ -13088,7 +13088,7 @@ static int repair_chunk_item(struct 
> btrfs_trans_handle *trans,
>  
>   if (err & REFERENCER_MISSING) {
>   ret = btrfs_make_block_group(trans, chunk_root->fs_info, 0,
> -  type, chunk_key.objectid, chunk_key.offset, length);
> +  type, chunk_key.offset, length);
>   if (ret) {
>   error("fail to add block group item[%llu %llu]",
> chunk_key.offset, length);
> @@ -13680,8 +13680,7 @@ static int reset_block_groups(struct btrfs_fs_info 
> *fs_info)
>  
>   chunk = btrfs_item_ptr(leaf, path.slots[0], struct btrfs_chunk);
>   btrfs_add_block_group(fs_info, 0,
> -   btrfs_chunk_type(leaf, chunk),
> -   key.objectid, key.offset,
> +   btrfs_chunk_type(leaf, chunk), key.offset,
> btrfs_chunk_length(leaf, chunk));
>   set_extent_dirty(&fs_info->free_space_cache, key.offset,
>key.offset + btrfs_chunk_length(leaf, chunk));
> diff --git a/convert/main.c b/convert/main.c
> index 89f9261172ca..3cad37e94462 100644
> --- a/convert/main.c
> +++ b/convert/main.c
> @@ -916,9 +916,7 @@ static int make_convert_data_block_groups(struct 
> btrfs_trans_handle *trans,
>   if (ret < 0)
>   break;
>   ret = btrfs_make_block_group(trans, fs_info, 0,
> - BTRFS_BLOCK_GROUP_DATA,
> - BTRFS_FIRST_CHUNK_TREE_OBJECTID,
> - cur, len);
> + BTRFS_BLOCK_GROUP_DATA, cur, len);
>   if (ret < 0)
>   break;
>   cur += len;
> diff --git a/ctree.h b/ctree.h
> index 7db0cd94ff07..a43c41f56924 100644
> --- a/ctree.h
> +++ b/ctree.h
> @@ -2529,11 +2529,10 @@ int btrfs_free_block_groups(struct btrfs_fs_info 
> *info);
>  int btrfs_read_block_groups(struct btrfs_root *root);
>  struct btrfs_block_group_cache *
>  btrfs_add_block_group(struct btrfs_fs_info *fs_info, u64 bytes_used, u64 
> type,
> -   u64 chunk_objectid, u64 chunk_offset, u64 size);
> +   u64 chunk_offset, u64 size);
>  int btrfs_make_block_group(struct btrfs_trans_handle *trans,
>  struct btrfs_fs_info *fs_info, u64 bytes_used,
> -u64 type, u64 chunk_objectid, u64 chunk_offset,
> -u64 size);
> +u64 type, u64 chunk_offset, u64 size);
>  int btrfs_make_block_groups(struct btrfs_trans_handle *trans,
>   struct btrfs_fs_info *fs_info);
>  int btrfs_update_block_group(struct btrfs_root *root, u64 bytenr, u64 num,
> diff --git a/extent-tree.c b/extent-tree.c
> index edf659d5e4b8..94adb333a358 100644
> --- a/extent-tree.c
> +++ b/extent-tree.c
> @@ -1916,7 +1916,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle 
> *trans,
>   BUG_ON(ret);
>  
>   ret = btrfs_make_block_group(trans, fs_info, 0, space_info->flags,
> -  BTRFS_FIRST_CHUNK_TREE_OBJECTID, start, num_bytes);
> +  start, num_bytes);
>   BUG_ON(ret);
>   return 0;
>  }
> @@ -3310,7 +3310,7 @@ error:
>  
>  struct btrfs_block_group_cache *
>  btrfs_add_block_group(struct btrfs_fs_info *fs_info, u64 bytes_used, u64 
> type,
> -   u64 chunk_objectid, u64 chunk_offset, u64 size)
> +   u64 chunk_offset, u64 size)
>  {
>   int ret;
>   int bit = 0;
> @@ -3326,7 +3326,8 @@ btrfs_add_block_group(struct btrfs_fs_info *fs_info, 
> u64 bytes_used, u64 type,
>  
>   cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
>   btrfs_set_block_group_used(&cache->item, bytes_used);
> - btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
> + btrfs_set_block_group_chunk_objectid(&cache->item,
> +  BTRFS_FIRST_CHUNK_TREE_OBJECTID);
>   cache->flags = type;
>   btrfs_set_block_group_flags(&cache->item, type);
>  
> @@ -3351,15 +3352,14 

Re: btrfs check: backref lost, mismatch with its hash -- can't repair

2018-01-24 Thread Foo Bar
Qu Wenruo wrote on 2018-01-24 09:49:
> Sorry for the late reply, I was off yesterday.
>

No problem :-)

Booted normally today, system up, but see this (I forgot to stop the snapshot
cron task...)

  [  115.127961] BTRFS error (device sdb3): Send: inconsistent snapshot, found
deleted reference for inode 30039323 without updated inode item, send root is
1399, parent root is 1385

So inode 30039323 looks definitely the bad one. Let's get rid of it and keep
the newest dups, if any, thanks!

Cheers,

  Marco

> On 2018年01月22日 23:04, ^m'e wrote:
>> Thanks for the quick reply, Qu!
>>
>> I forgot to say that I see weird characters in the btrfs check repair
>> in lines "ERROR: DIR_ITEM... name ..." output. Although that can be
>> due to corruption, I seem to remember that a previous version of
>> btrfs-progs I used didn't show that...
>> I also see:
>>
>>[19428.934684] init_special_inode: bogus i_mode (700) for inode
>> sdb3:18446744073709551361
>>
>> BTW, no sensible names in the debug output, and as far as I can see,
>> it might be all stuff in '[rootfs]/usr/portage': if that's the case,
>> corrupted inodes can be safely removed, as the portage package tree
>> can be easily rebuild. Here you are:
>>
>> -->8-
>> # cat btrfs-debug.30039322.log[snip]
> 
> This where the dir starts.
> 
>> item 78 key (30039322 INODE_ITEM 0) itemoff 4203 itemsize 160
>> generation 136248 transid 229515 size 152 nbytes 0
>> block group 0 mode 40755 links 1 uid 250 gid 250 rdev 0
>> sequence 0 flags 0xf(none)
>> atime 1504685599.188061317 (2017-09-06 08:13:19)
>> ctime 1516557882.551679697 (2018-01-21 18:04:42)
>> mtime 1516557882.551679697 (2018-01-21 18:04:42)
>> otime 1504685599.188061317 (2017-09-06 08:13:19)
>> item 79 key (30039322 INODE_REF 30037720) itemoff 4161 itemsize 42
>> index 242 namelen 32 name: obs-service-download_src_package
>> item 80 key (30039322 DIR_ITEM 1076301169) itemoff 4083 itemsize 78
>> location key (30039325 INODE_ITEM 0) type FILE
>> transid 136248 data_len 0 name_len 48
>> name: obs-service-download_src_package-20130318.ebuild
>> item 81 key (30039322 DIR_ITEM 2438219243) itemoff 4041 itemsize 42
>> location key (0 UNKNOWN.0 0) type FILE
>> transid 136192 data_len 0 name_len 12
>> name: metadata.xml
>> item 82 key (30039322 DIR_ITEM 4007295565) itemoff 3927 itemsize 114
>> location key (0 UNKNOWN.0 0) type DIR_ITEM.0
>> transid 0 data_len 0 name_len 0
>> name:
>> location key (0 UNKNOWN.125 72057594038112709) type DIR_ITEM.0
>> transid 0 data_len 32907 name_len 3
>> name:
>> data
> 
> The whole item is corrupted.
> Seems to be a half-written item get flushed to disk.
> 
> I assume this is the DIR_ITEM for *two* Manifest, but that's just
> insane, as we're going to have 2 files with the same name "Manifest"
> 
>> item 83 key (30039322 DIR_INDEX 2) itemoff 3889 itemsize 38
>> location key (30039323 INODE_ITEM 0) type FILE
>> transid 3377699720527872 data_len 0 name_len 8
> 
> The transid seems corrupted too.
> 
> Maybe I need to delete this item too?
> 
>> item 64 key (47302013 INODE_REF 30039322) itemoff 11278 itemsize 18
>> index 5 namelen 8 name: Manifest
> 
> Now we do have 2 "Manifest".
> 
> Which one do you prefer to delete?
> 
> The latter one, inode 47302013 seems newer, while previous one, inode
> 30039323 is pretty old.
> 
> Despite that, I didn't see big problem in the dump.
> 
> I'll just craft the dirty fix to delete one inode and the incorrect dir
> index/item.
> 
> Thanks,
> Qu
> 

-- 
  /\/\
 /  \  _)/ 
 \/\/ / \_/ __ \
 |  Y Y  \  ___/
 |__|_|  /\___  >
   \/ \/
VoIP-CH .. <0225085...@sip.netvoip.ch>
VoIP-IT . <0683394...@voip.eutelia.it>
VoIP-2 ... 
VoIP-3  
PSTN CH . +41 22 508 57 43
PSTN IT . +39 06 8339 4229
fax IT .. +39 06 9838 1481



signature.asc
Description: OpenPGP digital signature


Re: Can't mount (even in ro) after power outage - corrupt leaf, open_ctree failed

2018-01-24 Thread Zatkovský Dušan
Ok, so I ended with btrfs restore, seems that all (or most important) 
files were restored.


Now looking for another reliable filesystem which will not unrecoverably 
die on power outage.


msk


Dňa 22. 1. 2018 o 10:14 Zatkovský Dušan napísal(a):

Hi.

Badblocks finished on both disks with no errors. The only messages 
from kernel
during night are 6x perf: interrupt took too long (2511 > 2500), 
lowering kernel.perf_event_max_sample_rate to 79500


root@nas:~# smartctl -l scterc /dev/sda
smartctl 6.6 2016-05-31 r4324 [x86_64-linux-4.9.0-4-amd64] (local build)
Copyright (C) 2002-16, Bruce Allen, Christian Franke, 
www.smartmontools.org


SCT Error Recovery Control:
   Read: 70 (7.0 seconds)
  Write: 70 (7.0 seconds)

root@nas:~# smartctl -l scterc /dev/sdb
smartctl 6.6 2016-05-31 r4324 [x86_64-linux-4.9.0-4-amd64] (local build)
Copyright (C) 2002-16, Bruce Allen, Christian Franke, 
www.smartmontools.org


SCT Error Recovery Control:
   Read: 70 (7.0 seconds)
  Write: 70 (7.0 seconds)

root@nas:~# btrfs-debug-tree -t chunk /dev/sda4 | grep 'METADATA\|SYSTEM'
incorrect offsets 13686 13622
    type METADATA|RAID1 num_stripes 2
    type METADATA|RAID1 num_stripes 2
    type SYSTEM|RAID1 num_stripes 2
    type METADATA|RAID1 num_stripes 2
    type METADATA|RAID1 num_stripes 2

root@nas:~# btrfs-debug-tree -t chunk /dev/sdb4 | grep 'METADATA\|SYSTEM'
incorrect offsets 13686 13622
    type METADATA|RAID1 num_stripes 2
    type METADATA|RAID1 num_stripes 2
    type SYSTEM|RAID1 num_stripes 2
    type METADATA|RAID1 num_stripes 2
    type METADATA|RAID1 num_stripes 2

(still used "old" version of btrfs tools, working remotely now, I will 
boot something newer when I will get access to that NAS at EOD)


Thank you
msk


Dňa 22. 1. 2018 o 0:24 Chris Murphy napísal(a):
On Sun, Jan 21, 2018 at 4:13 PM, Chris Murphy 
 wrote:

On Sun, Jan 21, 2018 at 3:31 PM, msk conf  wrote:

Hello,

thank you for the reply.


What do you get for btrfs fi df /array


Can't do that because filesystem is not mountable. I will get stats 
for '/'
filesystem instead (because '/array' is an empty directory - 
mountpoint on /

Try
$ sudo btrfs-debug-tree -t chunk /dev/mapper/first | grep 
'METADATA\|SYSTEM'


You need to adapt that /dev/ node for your case, I just copy pasted
that from my setup. Anyway, that will look at the chunk tree and show
the profile for these chunk types.






--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: btrfs check: backref lost, mismatch with its hash -- can't repair

2018-01-24 Thread Qu Wenruo
Sorry for the late reply, I was off yesterday.

On 2018年01月22日 23:04, ^m'e wrote:
> Thanks for the quick reply, Qu!
> 
> I forgot to say that I see weird characters in the btrfs check repair
> in lines "ERROR: DIR_ITEM... name ..." output. Although that can be
> due to corruption, I seem to remember that a previous version of
> btrfs-progs I used didn't show that...
> I also see:
> 
>[19428.934684] init_special_inode: bogus i_mode (700) for inode
> sdb3:18446744073709551361
> 
> BTW, no sensible names in the debug output, and as far as I can see,
> it might be all stuff in '[rootfs]/usr/portage': if that's the case,
> corrupted inodes can be safely removed, as the portage package tree
> can be easily rebuild. Here you are:
> 
> -->8-
> # cat btrfs-debug.30039322.log[snip]

This is where the dir starts.

> item 78 key (30039322 INODE_ITEM 0) itemoff 4203 itemsize 160
> generation 136248 transid 229515 size 152 nbytes 0
> block group 0 mode 40755 links 1 uid 250 gid 250 rdev 0
> sequence 0 flags 0xf(none)
> atime 1504685599.188061317 (2017-09-06 08:13:19)
> ctime 1516557882.551679697 (2018-01-21 18:04:42)
> mtime 1516557882.551679697 (2018-01-21 18:04:42)
> otime 1504685599.188061317 (2017-09-06 08:13:19)
> item 79 key (30039322 INODE_REF 30037720) itemoff 4161 itemsize 42
> index 242 namelen 32 name: obs-service-download_src_package
> item 80 key (30039322 DIR_ITEM 1076301169) itemoff 4083 itemsize 78
> location key (30039325 INODE_ITEM 0) type FILE
> transid 136248 data_len 0 name_len 48
> name: obs-service-download_src_package-20130318.ebuild
> item 81 key (30039322 DIR_ITEM 2438219243) itemoff 4041 itemsize 42
> location key (0 UNKNOWN.0 0) type FILE
> transid 136192 data_len 0 name_len 12
> name: metadata.xml
> item 82 key (30039322 DIR_ITEM 4007295565) itemoff 3927 itemsize 114
> location key (0 UNKNOWN.0 0) type DIR_ITEM.0
> transid 0 data_len 0 name_len 0
> name:
> location key (0 UNKNOWN.125 72057594038112709) type DIR_ITEM.0
> transid 0 data_len 32907 name_len 3
> name:
> data

The whole item is corrupted.
It seems to be a half-written item that got flushed to disk.

I assume this is the DIR_ITEM for *two* Manifest, but that's just
insane, as we're going to have 2 files with the same name "Manifest"

> item 83 key (30039322 DIR_INDEX 2) itemoff 3889 itemsize 38
> location key (30039323 INODE_ITEM 0) type FILE
> transid 3377699720527872 data_len 0 name_len 8

The transid seems corrupted too.

Maybe I need to delete this item too?

> name: Manifest
> item 84 key (30039322 DIR_INDEX 3) itemoff 3847 itemsize 42
> location key (30039324 INODE_ITEM 0) type FILE
> transid 136248 data_len 0 name_len 12
> name: metadata.xml
> item 85 key (30039322 DIR_INDEX 4) itemoff 3769 itemsize 78
> location key (30039325 INODE_ITEM 0) type FILE
> transid 136248 data_len 0 name_len 48
> name: obs-service-download_src_package-20130318.ebuild
> item 86 key (30039322 DIR_INDEX 5) itemoff 3731 itemsize 38
> location key (47302013 INODE_ITEM 0) type FILE
> transid 229515 data_len 0 name_len 8
> name: Manifest
> item 87 key (30039323 INODE_ITEM 0) itemoff 3571 itemsize 160
> generation 136248 transid 202216 size 782 nbytes 782
> block group 0 mode 100644 links 1 uid 250 gid 250 rdev 0
> sequence 0 flags 0xb(none)
> atime 1504685599.188061317 (2017-09-06 08:13:19)
> ctime 1512818139.540278499 (2017-12-09 11:15:39)
> mtime 1504685599.188061317 (2017-09-06 08:13:19)
> otime 1504685599.188061317 (2017-09-06 08:13:19)
> item 88 key (30039323 INODE_REF 30039322) itemoff 3553 itemsize 18
> index 2 namelen 8 name: Manifest
> item 89 key (30039323 EXTENT_DATA 0) itemoff 2750 itemsize 803
> generation 136248 type 0 (inline)
> inline extent data size 782 ram_bytes 782 compression 0 (none)
> item 90 key (30039324 INODE_ITEM 0) itemoff 2590 itemsize 160
> generation 136248 transid 202216 size 448 nbytes 448
> block group 0 mode 100644 links 1 uid 250 gid 250 rdev 0
> sequence 0 flags 0xb(none)
> atime 1504685599.188061317 (2017-09-06 08:13:19)
> ctime 1512818139.540278499 (2017-12-09 11:15:39)
> mtime 1504685599.188061317 (2017-09-06 08:13:19)
> otime 1504685599.188061317 (2017-09-06 08:13:19)
> item 91 key (30039324 INODE_REF 30039322) itemoff 2568 itemsize 22
> index 3 namelen 12 name: metadata.xml
> leaf 36701356032 items 44 free space 8701 generation 202216 owner 257
> leaf 36701356032 flags 0x1(WRITTEN) backref revision 1
> fs uuid de1723e2-150c-4448-bb36-be14d7d96093
> chunk uuid 443a227c-4f87-419a-b89a-3f5714eea403
>

Re: [PATCH v6 05/99] xarray: Add definition of struct xarray

2018-01-24 Thread Paul Bolle
Matthew,

Just a minor question.

On Wed, 2018-01-17 at 12:20 -0800, Matthew Wilcox wrote:
> This is a direct replacement for struct radix_tree_root.  Some of the
> struct members have changed name; convert those, and use a #define so
> that radix_tree users continue to work without change.
> 
> Signed-off-by: Matthew Wilcox 

> --- a/include/linux/xarray.h
> +++ b/include/linux/xarray.h
> @@ -10,6 +10,8 @@
>   */
>  
>  #include 
> +#include 
> +#include 

The top Makefile includes linux/kconfig.h globally. (See the odd USERINCLUDE
variable, which is actually part of the LINUXINCLUDE variable, but split off
to make things confusing.)

Why do you need to include linux/kconfig.h here?

>  #include 
>  #include 

Thanks,


Paul Bolle
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html