Re: [PATCH] Btrfs: make sure logged extents complete in the current transaction
On Tue, Nov 18, 2014 at 05:19:41PM -0500, Josef Bacik wrote: Liu Bo pointed out that my previous fix would lose the generation update in the scenario I described. It is actually much worse than that, we could lose the entire extent if we lose power right after the transaction commits. Consider the following write extent 0-4k log extent in log tree commit transaction power fail happens here ordered extent completes We would lose the 0-4k extent because it hasn't updated the actual fs tree, and the transaction commit will reset the log so it isn't replayed. If we lose power before the transaction commit we are safe, otherwise we are not. Fix this by keeping track of all extents we logged in this transaction. Then when we go to commit the transaction make sure we wait for all of those ordered extents to complete before proceeding. This will make sure that if we lose power after the transaction commit we still have our data. This also fixes the problem of the improperly updated extent generation. Thanks, This looks saner. 
Reviewed-by: Liu Bo bo.li@oracle.com thanks, -liubo cc: sta...@vger.kernel.org Signed-off-by: Josef Bacik jba...@fb.com --- fs/btrfs/ordered-data.c | 6 -- fs/btrfs/ordered-data.h | 6 +- fs/btrfs/transaction.c | 33 + fs/btrfs/transaction.h | 2 ++ fs/btrfs/tree-log.c | 6 +++--- 5 files changed, 47 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index ac734ec..7c2dd7a 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -220,6 +220,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, INIT_LIST_HEAD(entry-work_list); init_completion(entry-completion); INIT_LIST_HEAD(entry-log_list); + INIT_LIST_HEAD(entry-trans_list); trace_btrfs_ordered_extent_add(inode, entry); @@ -472,7 +473,8 @@ void btrfs_submit_logged_extents(struct list_head *logged_list, spin_unlock_irq(log-log_extents_lock[index]); } -void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid) +void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans, +struct btrfs_root *log, u64 transid) { struct btrfs_ordered_extent *ordered; int index = transid % 2; @@ -497,7 +499,7 @@ void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid) wait_event(ordered-wait, test_bit(BTRFS_ORDERED_IO_DONE, ordered-flags)); - btrfs_put_ordered_extent(ordered); + list_add_tail(ordered-trans_list, trans-ordered); spin_lock_irq(log-log_extents_lock[index]); } spin_unlock_irq(log-log_extents_lock[index]); diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index d81a274..171a841 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -121,6 +121,9 @@ struct btrfs_ordered_extent { /* If we need to wait on this to be done */ struct list_head log_list; + /* If the transaction needs to wait on this ordered extent */ + struct list_head trans_list; + /* used to wait for the BTRFS_ORDERED_COMPLETE bit */ wait_queue_head_t wait; @@ -197,7 +200,8 @@ void btrfs_get_logged_extents(struct inode *inode, void 
btrfs_put_logged_extents(struct list_head *logged_list); void btrfs_submit_logged_extents(struct list_head *logged_list, struct btrfs_root *log); -void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid); +void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans, +struct btrfs_root *log, u64 transid); void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid); int __init ordered_data_init(void); void ordered_data_exit(void); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index dcaae36..63c6d05 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -220,6 +220,7 @@ loop: INIT_LIST_HEAD(cur_trans-pending_snapshots); INIT_LIST_HEAD(cur_trans-pending_chunks); INIT_LIST_HEAD(cur_trans-switch_commits); + INIT_LIST_HEAD(cur_trans-pending_ordered); list_add_tail(cur_trans-list, fs_info-trans_list); extent_io_tree_init(cur_trans-dirty_pages, fs_info-btree_inode-i_mapping); @@ -488,6 +489,7 @@ again: h-sync = false; INIT_LIST_HEAD(h-qgroup_ref_list); INIT_LIST_HEAD(h-new_bgs); + INIT_LIST_HEAD(h-ordered); smp_mb(); if (cur_trans-state = TRANS_STATE_BLOCKED @@ -719,6 +721,12 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (!list_empty(trans-new_bgs)) btrfs_create_pending_block_groups(trans, root); + if (!list_empty(trans-ordered)) { +
[PATCH] btrfs-progs: use system attr instead of attr library
We use the attr version provided by system in other places already, now we can remove dependency on the separate attr library. Signed-off-by: David Sterba dste...@suse.cz --- props.c | 8 +++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/props.c b/props.c index 9fd612f97026..c7c67529fd79 100644 --- a/props.c +++ b/props.c @@ -17,7 +17,7 @@ #include sys/stat.h #include sys/ioctl.h #include sys/types.h -#include attr/xattr.h +#include sys/xattr.h #include fcntl.h #include unistd.h @@ -29,6 +29,12 @@ #define XATTR_BTRFS_PREFIX btrfs. #define XATTR_BTRFS_PREFIX_LEN (sizeof(XATTR_BTRFS_PREFIX) - 1) +/* + * Defined as synonyms in attr/xattr.h + */ +#ifndef ENOATTR +#define ENOATTR ENODATA +#endif static int prop_read_only(enum prop_object_type type, const char *object, -- 2.1.3 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Btrfs: do not move em to modified list when unpinning
On 11/18/2014 10:45 PM, Dave Chinner wrote: On Fri, Nov 14, 2014 at 04:16:30PM -0500, Josef Bacik wrote: We use the modified list to keep track of which extents have been modified so we know which ones are candidates for logging at fsync() time. Newly modified extents are added to the list at modification time, around the same time the ordered extent is created. We do this so that we don't have to wait for ordered extents to complete before we know what we need to log. The problem is when something like this happens log extent 0-4k on inode 1 copy csum for 0-4k from ordered extent into log sync log commit transaction log some other extent on inode 1 ordered extent for 0-4k completes and adds itself onto modified list again log changed extents see ordered extent for 0-4k has already been logged at this point we assume the csum has been copied sync log crash On replay we will see the extent 0-4k in the log, drop the original 0-4k extent which is the same one that we are replaying which also drops the csum, and then we won't find the csum in the log for that bytenr. This of course causes us to have errors about not having csums for certain ranges of our inode. So remove the modified list manipulation in unpin_extent_cache, any modified extents should have been added well before now, and we don't want them re-logged. This fixes my test that I could reliably reproduce this problem with. Thanks, Is it possible to turn this unspecified test into another generic fsync xfstest? It depends on a new dm target I'm working on to better test power fail scenarios, once I have that merged I have a few xfstests I'll be submitting in this area. Would you actually mind taking a quick look at it to make sure it seems sane? https://git.kernel.org/cgit/linux/kernel/git/josef/btrfs-next.git/log/?h=dm-powerfail The 'split' option is what is meant for ext* and xfs (I haven't tested that part yet), which will just return the old data in the case of unflushed data/metadata. 
Anything you'd like to see added or changed? Thanks, Josef -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: scrub implies failing drive - smartctl blissfully unaware
-BEGIN PGP SIGNED MESSAGE- Hash: SHA1 On 11/18/2014 9:40 PM, Chris Murphy wrote: It’s well known on linux-raid@ that consumer drives have well over 30 second deep recoveries when they lack SCT command support. The WDC and Seagate “green” drives are over 2 minutes apparently. This isn’t easy to test because it requires a sector with enough error that it requires the ECC to do something, and yet not so much error that it gives up in less than 30 seconds. So you have to track down a drive model spec document (one of those 100 pagers). This makes sense, sorta, because the manufacturer use case is typically single drive only, and most proscribe raid5/6 with such products. So it’s a “recover data at all costs” behavior because it’s assumed to be the only (immediately) available copy. It doesn't make sense to me. If it can't recover the data after one or two hundred retries in one or two seconds, it can keep trying until the cows come home and it just isn't ever going to work. I don’t see how that’s possible because anything other than the drive explicitly producing a read error (which includes the affected LBA’s), it’s ambiguous what the actual problem is as far as the kernel is concerned. It has no way of knowing which of possibly dozens of ata commands queued up in the drive have actually hung up the drive. It has no idea why the drive is hung up as well. IIRC, this is true when the drive returns failure as well. The whole bio is marked as failed, and the page cache layer then begins retrying with progressively smaller requests to see if it can get *some* data out. No I think 30 is pretty sane for servers using SATA drives because if the bus is reset all pending commands in the queue get obliterated which is worse than just waiting up to 30 seconds. With SAS drives maybe less time makes sense. But in either case you still need configurable SCT ERC, or it needs to be a sane fixed default like 70 deciseconds. 
Who cares if multiple commands in the queue are obliterated if they can all be retried on the other mirror? Better to fall back to the other mirror NOW instead of waiting 30 seconds ( or longer! ). Sure, you might end up recovering more than you really had to, but that won't hurt anything. -BEGIN PGP SIGNATURE- Version: GnuPG v2.0.17 (MingW32) iQEcBAEBAgAGBQJUbLMyAAoJEI5FoCIzSKrwSM8IAJO2cwhHyxK4LFjINEbNT+ij fT4EpyzOCs704zhOTgssgSQ8ym85PRQ8VyAIrz338m+lHqKbktZtRt7vWaealmOp 6eleIDJ/I7kggnlhkqg1V8Nctap8qBeRE34K/PaGtTrkRzBYnYxbGdDDz+rXaDi6 CSEMLJBo3I69Oj9qSOV4O18ntV/S3eln0sQ8+w2btbc3xGkG3X2FwVIJokb6IAmu ngHUeDGXUgkEOvzw3aGDheLueGDPe+V3YlsjSbw2rH75svzXqFCUO8Jcg4NfxT0q Nl03eoTEGlyf8x2geMWfhoKFatJ7sCMy48K0ZFAAX1k8j0ssjNaEC+q6pwrA/xU= =Gehg -END PGP SIGNATURE- -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: BTRFS messes up snapshot LV with origin
-BEGIN PGP SIGNED MESSAGE- Hash: SHA1 On 11/18/2014 9:54 PM, Chris Murphy wrote: Why is it silly? Btrfs on a thin volume has practical use case aside from just being thinly provisioned, its snapshots are block device based, not merely that of an fs tree. Umm... because one of the big selling points of btrfs is that it is in a much better position to make snapshots being aware of the fs tree rather than doing it in the block layer. So it is kind of silly in the first place to be using lvm snapshots under btrfs, but it is is doubly silly to use lvm for snapshots, and btrfs for the mirroring rather than lvm. Pick one layer and use it for both functions. Even if that is lvm, then it should also be handling the mirroring. -BEGIN PGP SIGNATURE- Version: GnuPG v2.0.17 (MingW32) iQEcBAEBAgAGBQJUbLUxAAoJEI5FoCIzSKrwh0oH/3TZ2oo8u2BjHYO3b0x8800/ LFkmGFWrZFSnAvtWuN5B1WlhMXku4dxLRXz14fJKFp3fNmnYRNVvw3tu9btvsBsC sZdwLaKwKPHTK8RS+QCI2pZPX+cGB+F7/z9PCHrzIzzCKk/4SvnJ76e2nnZFpY1m Md3f1BCHEVUPMMXbqv6Ry6v7PDs/8bx8WITYyAL9uh3tjh0dXQsjbZJn5u4XDitS /CoE8eX4rf1vc7qHI4K56TtArCcXQxAHcC56fXmcmS03bVhAkkJ5Z+/uwi6+TkJe 55rMFCd7UFy9pwKha3Q2flJHtDYG6ns7Njyff6BSL9Yzq7tHh4wLk1H3XxaOCP8= =ktv/ -END PGP SIGNATURE- -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] fstests: mark replace tests in btrfs/group
A couple tests exercise replace but were not marked as such in the group file. Signed-off-by: Eric Sandeen sand...@redhat.com --- diff --git a/tests/btrfs/group b/tests/btrfs/group index 9adf862..1f23979 100644 --- a/tests/btrfs/group +++ b/tests/btrfs/group @@ -13,7 +13,7 @@ 008 auto quick 009 auto quick 010 auto quick -011 auto +011 auto replace 012 auto 013 auto quick 014 auto @@ -22,7 +22,7 @@ 017 auto quick 018 auto quick 019 auto quick -020 auto quick +020 auto quick replace 021 auto quick 022 auto 023 auto -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Fix lockups from btrfs_clear_path_blocking
The fair reader/writer locks mean that btrfs_clear_path_blocking needs to strictly follow lock ordering rules even when we already have blocking locks on a given path. Before we can clear a blocking lock on the path, we need to make sure all of the locks have been converted to blocking. This will remove lock inversions against anyone spinning in write_lock() against the buffers we're trying to get read locks on. These inversions didn't exist before the fair read/writer locks, but now we need to be more careful. We papered over this deadlock in the past by changing btrfs_try_read_lock() to be a true trylock against both the spinlock and the blocking lock. This was slower, and not sufficient to fix all the deadlocks. This patch adds a btrfs_tree_read_lock_atomic(), which basically means get the spinlock but trylock on the blocking lock. Signed-off-by: Chris Mason c...@fb.com Reported-by: Patrick Schmid sch...@phys.ethz.ch cc: sta...@vger.kernel.org #v3.15+ diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 19bc616..150822e 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -80,13 +80,6 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p, { int i; -#ifdef CONFIG_DEBUG_LOCK_ALLOC - /* lockdep really cares that we take all of these spinlocks -* in the right order. If any of the locks in the path are not -* currently blocking, it is going to complain. So, make really -* really sure by forcing the path to blocking before we clear -* the path blocking. 
-*/ if (held) { btrfs_set_lock_blocking_rw(held, held_rw); if (held_rw == BTRFS_WRITE_LOCK) @@ -95,7 +88,6 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p, held_rw = BTRFS_READ_LOCK_BLOCKING; } btrfs_set_path_blocking(p); -#endif for (i = BTRFS_MAX_LEVEL - 1; i = 0; i--) { if (p-nodes[i] p-locks[i]) { @@ -107,10 +99,8 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p, } } -#ifdef CONFIG_DEBUG_LOCK_ALLOC if (held) btrfs_clear_lock_blocking_rw(held, held_rw); -#endif } /* this also releases the path */ @@ -2893,7 +2883,7 @@ cow_done: } p-locks[level] = BTRFS_WRITE_LOCK; } else { - err = btrfs_try_tree_read_lock(b); + err = btrfs_tree_read_lock_atomic(b); if (!err) { btrfs_set_path_blocking(p); btrfs_tree_read_lock(b); @@ -3025,7 +3015,7 @@ again: } level = btrfs_header_level(b); - err = btrfs_try_tree_read_lock(b); + err = btrfs_tree_read_lock_atomic(b); if (!err) { btrfs_set_path_blocking(p); btrfs_tree_read_lock(b); diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 5665d21..f8229ef 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -128,6 +128,26 @@ again: } /* + * take a spinning read lock. 
+ * returns 1 if we get the read lock and 0 if we don't + * this won't wait for blocking writers + */ +int btrfs_tree_read_lock_atomic(struct extent_buffer *eb) +{ + if (atomic_read(eb-blocking_writers)) + return 0; + + read_lock(eb-lock); + if (atomic_read(eb-blocking_writers)) { + read_unlock(eb-lock); + return 0; + } + atomic_inc(eb-read_locks); + atomic_inc(eb-spinning_readers); + return 1; +} + +/* * returns 1 if we get the read lock and 0 if we don't * this won't wait for blocking writers */ @@ -158,9 +178,7 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb) atomic_read(eb-blocking_readers)) return 0; - if (!write_trylock(eb-lock)) - return 0; - + write_lock(eb-lock); if (atomic_read(eb-blocking_writers) || atomic_read(eb-blocking_readers)) { write_unlock(eb-lock); diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index b81e0e9..c44a9d5 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -35,6 +35,8 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw); void btrfs_assert_tree_locked(struct extent_buffer *eb); int btrfs_try_tree_read_lock(struct extent_buffer *eb); int btrfs_try_tree_write_lock(struct extent_buffer *eb); +int btrfs_tree_read_lock_atomic(struct extent_buffer *eb); + static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw) { -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to
Re: scrub implies failing drive - smartctl blissfully unaware
-BEGIN PGP SIGNED MESSAGE- Hash: SHA1 On 11/18/2014 9:46 PM, Duncan wrote: I'm not sure about normal operation, but certainly, many drives take longer than 30 seconds to stabilize after power-on, and I routinely see resets during this time. As far as I have seen, typical drive spin up time is on the order of 3-7 seconds. Hell, I remember my pair of first generation seagate cheetah 15,000 rpm drives seemed to take *forever* to spin up and that still was maybe only 15 seconds. If a drive takes longer than 30 seconds, then there is something wrong with it. I figure there is a reason why spin up time is tracked by SMART so it seems like long spin up time is a sign of a sick drive. This doesn't happen on single-hardware-device block devices and filesystems because in that case it's either up or down, if the device doesn't come up in time the resume simply fails entirely, instead of coming up with one or more devices there, but others missing as they didn't stabilize in time, as is unfortunately all too common in the multi- device scenario. No, the resume doesn't fail entirely. The drive is reset, and the IO request is retried, and by then it should succeed. I've seen this with both spinning rust and with SSDs, with mdraid and btrfs, with multiple mobos and device controllers, and with resume both from suspend to ram (if the machine powers down the storage devices in that case, as most modern ones do) and hibernate to permanent storage device, over several years worth of kernel series, so it's a reasonably widespread phenomena, at least among consumer-level SATA devices. (My experience doesn't extend to enterprise-raid-level devices or proper SCSI, etc, so I simply don't know, there.) If you are restoring from hibernation, then the drives are already spun up before the kernel is loaded. While two minutes is getting a bit long, I think it's still within normal range, and some devices definitely take over a minute enough of the time to be both noticeable and irritating. 
It certainly is not normal for a drive to take that long to spin up. IIRC, the 30 second timeout comes from the ATA specs which state that it can take up to 30 seconds for a drive to spin up. That said, I SHOULD say I'd be far *MORE* irritated if the device simply pretended it was stable and started reading/writing data before it really had stabilized, particularly with SSDs where that sort of behavior has been observed and is known to put some devices at risk of complete scrambling of either media or firmware, beyond recovery at times. That of course is the risk of going the other direction, and I'd a WHOLE lot rather have devices play it safe for another 30 seconds or so after they / think/ they're stable and be SURE, than pretend to be just fine when voltages have NOT stabilized yet and thus end up scrambling things irrecoverably. I've never had that happen here tho I've never stress- tested for it, only done normal operation, but I've seen testing reports where the testers DID make it happen surprisingly easily, to a surprising number of their test devices. Power supply voltage is stable within milliseconds. What takes HDDs time to start up is mechanically bringing the spinning rust up to speed. On SSDs, I think you are confusing testing done on power *cycling* ( i.e. yanking the power cord in the middle of a write ) with startup. So, umm... I suspect the 2-minute default is 2 minutes due to power-up stabilizing issues, where two minutes is a reasonable compromise between failing the boot most of the time if the timeout is too low, and taking excessively long for very little further gain. The default is 30 seconds, not 2 minutes. sure whether it's even possible, without some specific hardware feature available to tell the kernel that it has in fact NOT been in power-saving mode for say 5-10 minutes, hopefully long enough that voltage readings really /are/ fully stabilized and a shorter timeout is possible. 
Again, there is no several minute period where voltage stabilizes and the drive takes longer to access. This is a complete red herring. -BEGIN PGP SIGNATURE- Version: GnuPG v2.0.17 (MingW32) iQEcBAEBAgAGBQJUbMBPAAoJEI5FoCIzSKrwcV0H/20pv7O5+CDf2cRg5G5vt7PR 4J1NuVIBsboKwjwCj8qdxHQJHihvLYkTQKANqaqHv0+wx0u2DaQdPU/LRnqN71xA jP7b9lx9X6rPnAnZUDBbxzAc8HLeutgQ8YD/WB0sE5IXlI1/XFGW4tXIZ4iYmtN9 GUdL+zcdtEiYE993xiGSMXF4UBrN8d/5buBRsUsPVivAZes6OHbf9bd72c1IXBuS ADZ7cH7XGmLL3OXA+hm7d99429HFZYAgI7DjrLWp6Tb9ja5Gvhy+AVvrbU5ZWMwu XUnNsLsBBhEGuZs5xpkotZgaQlmJpw4BFY4BKwC6PL+7ex7ud3hGCGeI6VDmI0U= =DLHU -END PGP SIGNATURE- -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
btrfs send and an existing backup
Hi there! I'm new on btrfs, and I like it :) But I have a question. I have an existing backup on an external HDD. This was ext4 before I converted it to btrfs. And I installed my Debian fresh on btrfs with some subvolumes. (f.e. home, var, multimedia/Video multimedia/Audio...) On my backup there are no subvolumes. Now I wrote a script to take local snapshots on my laptop's HDD and mirror these snapshots with btrfs send/receive to the external HDD. And I don't know how to make the initial snapshot on the external HDD. I want to use the existing data there, so I don't have to transmit the whole bunch of data to the external drive, which exists there already... What happens, if I make the same structure on the external drive with creating subvolumes and »cp --reflink«, give these subvolumes the correct names, and fire a »btrfs send«? Or is the best (ONLY???) way, to make an initial snapshot on the external drive and delete the old backup there? greetings jakob -- http://xundeenergie.at http://verkehrsloesungen.wordpress.com/ http://cogitationum.wordpress.com/ -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Btrfs: make sure logged extents complete in the current transaction V2
Liu Bo pointed out that my previous fix would lose the generation update in the scenario I described. It is actually much worse than that, we could lose the entire extent if we lose power right after the transaction commits. Consider the following write extent 0-4k log extent in log tree commit transaction power fail happens here ordered extent completes We would lose the 0-4k extent because it hasn't updated the actual fs tree, and the transaction commit will reset the log so it isn't replayed. If we lose power before the transaction commit we are safe, otherwise we are not. Fix this by keeping track of all extents we logged in this transaction. Then when we go to commit the transaction make sure we wait for all of those ordered extents to complete before proceeding. This will make sure that if we lose power after the transaction commit we still have our data. This also fixes the problem of the improperly updated extent generation. Thanks, cc: sta...@vger.kernel.org Signed-off-by: Josef Bacik jba...@fb.com --- V1-V2: Don't add previously logged ordered extents into the logged list, this keeps us from moving ordered extents off of the global transaction list once it's already been added there. 
fs/btrfs/ordered-data.c | 9 +++-- fs/btrfs/ordered-data.h | 8 +++- fs/btrfs/transaction.c | 33 + fs/btrfs/transaction.h | 2 ++ fs/btrfs/tree-log.c | 6 +++--- 5 files changed, 52 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index ac734ec..269e21d 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -220,6 +220,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, INIT_LIST_HEAD(entry-work_list); init_completion(entry-completion); INIT_LIST_HEAD(entry-log_list); + INIT_LIST_HEAD(entry-trans_list); trace_btrfs_ordered_extent_add(inode, entry); @@ -443,6 +444,8 @@ void btrfs_get_logged_extents(struct inode *inode, ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node); if (!list_empty(ordered-log_list)) continue; + if (test_bit(BTRFS_ORDERED_LOGGED, ordered-flags)) + continue; list_add_tail(ordered-log_list, logged_list); atomic_inc(ordered-refs); } @@ -472,7 +475,8 @@ void btrfs_submit_logged_extents(struct list_head *logged_list, spin_unlock_irq(log-log_extents_lock[index]); } -void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid) +void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *log, u64 transid) { struct btrfs_ordered_extent *ordered; int index = transid % 2; @@ -497,7 +501,8 @@ void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid) wait_event(ordered-wait, test_bit(BTRFS_ORDERED_IO_DONE, ordered-flags)); - btrfs_put_ordered_extent(ordered); + if (!test_and_set_bit(BTRFS_ORDERED_LOGGED, ordered-flags)) + list_add_tail(ordered-trans_list, trans-ordered); spin_lock_irq(log-log_extents_lock[index]); } spin_unlock_irq(log-log_extents_lock[index]); diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index d81a274..0124bff 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -71,6 +71,8 @@ struct btrfs_ordered_sum { ordered extent */ #define BTRFS_ORDERED_TRUNCATED 9 /* Set when we 
have to truncate an extent */ +#define BTRFS_ORDERED_LOGGED 10 /* Set when we've waited on this ordered extent +* in the logging code. */ struct btrfs_ordered_extent { /* logical offset in the file */ u64 file_offset; @@ -121,6 +123,9 @@ struct btrfs_ordered_extent { /* If we need to wait on this to be done */ struct list_head log_list; + /* If the transaction needs to wait on this ordered extent */ + struct list_head trans_list; + /* used to wait for the BTRFS_ORDERED_COMPLETE bit */ wait_queue_head_t wait; @@ -197,7 +202,8 @@ void btrfs_get_logged_extents(struct inode *inode, void btrfs_put_logged_extents(struct list_head *logged_list); void btrfs_submit_logged_extents(struct list_head *logged_list, struct btrfs_root *log); -void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid); +void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *log, u64 transid); void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid); int __init ordered_data_init(void); void ordered_data_exit(void); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index
Re: Btrfs on a failing drive
-BEGIN PGP SIGNED MESSAGE- Hash: SHA1 Again, please stop taking this conversation private; keep the mailing list on the Cc. On 11/19/2014 11:37 AM, Fennec Fox wrote: well ive used spinrite and its found a few sectors and they never move so obviously the drives firmware isnt dealing with bad blocks on the drive anyways ive got a new drive on order but what can i do to prevent the drive from killing any more data? The drive will only remap bad blocks when you try to write to them, so if you haven't written to them then it is no surprise that they aren't going anywhere. If the drive is actually returning bad data rather than failing the read outright, then the only thing you can do is to have btrfs duplicate all data so if the checksum on one copy is bad it can try the other. -BEGIN PGP SIGNATURE- Version: GnuPG v2.0.17 (MingW32) iQEcBAEBAgAGBQJUbN8VAAoJEI5FoCIzSKrwGjkIAKxXbBcMaItyBe08yC/bipUH 2crWLj5MKej1sn1HEo1WqgJM1hCEZuHCBa8I6ZIECcZmzs4rvKhzU4WWIQ7J/tMN 8OYUzdsWboxbKHY5hrNEVsi8QcUTbz7HT3doaaYDhI7qERu1Ib/4FH+m5yFYEIu8 tx5+N2PzyXctDlNnjY/pcFg+I2+QyA5Rb9X+fLpvVoZCEW7TTMhejfKSQpMEfzHW JsYyKwDpQO6cGIWi19P7pgHc2bsCzShPtFo9UQJh5TtuxjsqP01ju1UfQBX0+Y25 B2LDAjyGE71pY68tBuS7EC9XSB9Iks5yEJotmwYTv3/L7bgDeAGPrj5cFOKG9Tc= =8JoK -END PGP SIGNATURE- -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: BTRFS messes up snapshot LV with origin
On Wed, Nov 19, 2014 at 8:20 AM, Phillip Susi ps...@ubuntu.com wrote: -BEGIN PGP SIGNED MESSAGE- Hash: SHA1 On 11/18/2014 9:54 PM, Chris Murphy wrote: Why is it silly? Btrfs on a thin volume has practical use case aside from just being thinly provisioned, its snapshots are block device based, not merely that of an fs tree. Umm... because one of the big selling points of btrfs is that it is in a much better position to make snapshots being aware of the fs tree rather than doing it in the block layer. This is why we have fsfreeze before taking block level snapshots. And I point out that consistent snapshots with Btrfs have posed challenges too, there's a recent fstest snapshoting after file write + truncate for this reason. A block layer snapshot will snapshot the entire file system, not just one tree. We don't have a way in Btrfs to snapshot the entire volume. Considering how things still aren't exactly stable yet, in particular with many snapshots, it's not unreasonable to want to freeze then snapshot the entire volume before doing some possibly risky testing or usage where even a Btrfs snapshot doesn't protect your entire volume should things go wrong. So it is kind of silly in the first place to be using lvm snapshots under btrfs, but it is is doubly silly to use lvm for snapshots, and btrfs for the mirroring rather than lvm. Pick one layer and use it for both functions. Even if that is lvm, then it should also be handling the mirroring. Thin volumes are more efficient. And the user creating them doesn't have to mess around with locating physical devices or possibly partitioning them. Plus in enterprise environments with lots of storage and many different kinds of use cases, even knowledable users aren't always granted full access to the physical storage anyway. They get a VG to play with, or now they can have a thin pool and only consume on storage what is actually used, and not what they've reserved. 
You can mkfs a 4TG virtual size volume, while it only uses 1MB of physical extents on storage. And all of that is orthogonal to using XFS or Btrfs which again comes down to use case. And whether I'd have LVM mirror or Btrfs mirror is again a question of use case, maybe I'm OK with LVM mirroring and I just get the rare corrupt file warning and that's OK. In another use case, corruption isn't OK, I need higher availability of known good data therefore I need Btrfs doing the mirroring. So I find your argument thus far uncompelling. Chris Murphy -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: BTRFS messes up snapshot LV with origin
-BEGIN PGP SIGNED MESSAGE- Hash: SHA1 On 11/19/2014 1:33 PM, Chris Murphy wrote: Thin volumes are more efficient. And the user creating them doesn't have to mess around with locating physical devices or possibly partitioning them. Plus in enterprise environments with lots of storage and many different kinds of use cases, even knowledable users aren't always granted full access to the physical storage anyway. They get a VG to play with, or now they can have a thin pool and only consume on storage what is actually used, and not what they've reserved. You can mkfs a 4TG virtual size volume, while it only uses 1MB of physical extents on storage. And all of that is orthogonal to using XFS or Btrfs which again comes down to use case. And whether I'd have LVM mirror or Btrfs mirror is again a question of use case, maybe I'm OK with LVM mirroring and I just get the rare corrupt file warning and that's OK. In another use case, corruption isn't OK, I need higher availability of known good data therefore I need Btrfs doing the mirroring. Correct me if I'm wrong, but this kind of setup is basically where you have a provider running an lvm thin pool volume on their hardware, and exposing it to the customer's vm as a virtual disk. In that case, then the provider can do their snapshots and it won't cause this problem since the snapshots aren't visible to the vm. Also in these cases the provider is normally already providing data protection by having the vg on a raid6 or raid60 or something, so having the client vm mirror the data in btrfs is a bit redundant. 
-BEGIN PGP SIGNATURE- Version: GnuPG v2.0.17 (MingW32) iQEcBAEBAgAGBQJUbO4nAAoJEI5FoCIzSKrwl/QIAJ7arJ0ZXVc16pBRjE2F66uV GAOhatdx8pLhGey6by+gV8Ltvx4bK3BG40dkvQIM9RN9UFC5vofQ4FnzIn1nfXZB qyyITE2mF+lE3RNCb8ZKxwG58rfa9NOModPCeNVFWkS6+fyyhGY23sliWbVO6b15 w6BD5xu/Pp7Fhgkx81AL07XpusR9c8pKZd8ZHw4nozFHw20+13XuL+2g8axpZS+O Xd9W5GRlC+0k9jQ0q9xGi1jh6QpjMSWVj54MNS5jRubsY65TtmFPkdvgaMGD4U5k bADSEUMfij9NRMw8VwA4ik/JEi1IbukD4u1geKeZTowMGXReel2RimeA/PhFYcc= =tmDI -END PGP SIGNATURE- -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: scrub implies failing drive - smartctl blissfully unaware
On 11/19/2014 08:07 AM, Phillip Susi wrote: On 11/18/2014 9:46 PM, Duncan wrote: I'm not sure about normal operation, but certainly, many drives take longer than 30 seconds to stabilize after power-on, and I routinely see resets during this time. As far as I have seen, typical drive spin up time is on the order of 3-7 seconds. Hell, I remember my pair of first generation seagate cheetah 15,000 rpm drives seemed to take *forever* to spin up and that still was maybe only 15 seconds. If a drive takes longer than 30 seconds, then there is something wrong with it. I figure there is a reason why spin up time is tracked by SMART so it seems like long spin up time is a sign of a sick drive. I was recently re-factoring Underdog (http://underdog.sourceforge.net) startup scripts to separate out the various startup domains (e.g. lvm, luks, mdadm) in the prototype init. So I notice you (Duncan) use the word stabilize, as do a small number of drivers in the linux kernel. This word has very little to do with disks per se. Between SCSI probing LUNs (where the controller tries every theoretical address and gives a potential device ample time to reply), and usb-storage having a simple timer delay set for each volume it sees, there is a lot of waiting in the name of safety going on in the linux kernel at device initialization. When I added the messages scanning /dev/sd?? to the startup sequence as I iterate through the disks and partitions present I discovered that the first time I called blkid (e.g. right between /dev/sda and /dev/sda1) I'd get a huge hit of many human seconds (I didn't time it, but I'd say eight or so) just for having a 2Tb My Book WD 3.0 disk enclosure attached as /dev/sdc. This enclosure having spun up in the previous boot cycle and this only being a soft reboot was immaterial. In this case usb-storage is going to take its time and do its deal regardless of the state of the physical drive itself. 
So there are _lots_ of places where you are going to get delays and very few of them involve the disk itself going from power-off to ready. You said it yourself with respect to SSDs. It's cheaper, and less error prone, and less likely to generate customer returns if the generic controller chips just send init, wait a fixed delay, then request a status compared to trying to are-you-there-yet poll each device like a nagging child. And you are going to see that at every level. And you are going to see it multiply with _sparsely_ provisioned buses where the cycle is going to be retried for absent LUNs (one disk on a Wide SCSI bus and a controller set to probe all LUNs is particularly egregious) One of the reasons that the whole industry has started favoring point-to-point (SATA, SAS) or physical intercessor chaining point-to-point (eSATA) buses is to remove a lot of those wait-and-see delays. That said, you should not see a drive (or target enclosure, or controller) reset during spin up. In a SCSI setting this is almost always a cabling, termination, or addressing issue. In IDE its jumper mismatch (master vs slave vs cable-select). Less often its a partitioning issue (trying to access sectors beyond the end of the drive). Another strong actor is selecting the wrong storage controller chipset driver. In that case you may be faling back from high-end device you think it is, through intermediate chip-set, and back to ACPI or BIOS emulation Another common cause is having a dedicated hardware RAID controller (dell likes to put LSI MegaRaid controllers in their boxes for example), many mother boards have hardware RAID support available through the bios, etc, leaving that feature active, then the adding a drive and _not_ initializing that drive with the RAID controller disk setup. 
In this case the controller is going to repeatedly probe the drive for its proprietary controller signature blocks (and reset the drive after each attempt) and then finally fall back to raw block pass-through. This can take a long time (thirty seconds to a minute). But seriously, if you are seeing reset anywhere in any storage chain during a normal power-on cycle then you've got a problem with geometry or configuration. -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: scrub implies failing drive - smartctl blissfully unaware
-BEGIN PGP SIGNED MESSAGE- Hash: SHA1 On 11/19/2014 4:05 PM, Robert White wrote: It's cheaper, and less error prone, and less likely to generate customer returns if the generic controller chips just send init, wait a fixed delay, then request a status compared to trying to are-you-there-yet poll each device like a nagging child. And you are going to see that at every level. And you are going to see it multiply with _sparsely_ provisioned buses where the cycle is going to be retried for absent LUNs (one disk on a Wide SCSI bus and a controller set to probe all LUNs is particularly egregious) No, they do not wait a fixed time, then proceed. They do in fact issue the command, then poll or wait for an interrupt to know when it is done, then time out and give up if that doesn't happen within a reasonable amount of time. One of the reasons that the whole industry has started favoring point-to-point (SATA, SAS) or physical intercessor chaining point-to-point (eSATA) buses is to remove a lot of those wait-and-see delays. Nope... even with the ancient PIO mode PATA interface, you polled a ready bit in the status register to see if it was done yet. If you always waited 30 seconds for every command your system wouldn't boot up until next year. Another strong actor is selecting the wrong storage controller chipset driver. In that case you may be faling back from high-end device you think it is, through intermediate chip-set, and back to ACPI or BIOS emulation There is no such thing as ACPI or BIOS emulation. AHCI SATA controllers do usually have an old IDE emulation mode instead of AHCI mode, but this isn't going to cause ridiculously long delays. Another common cause is having a dedicated hardware RAID controller (dell likes to put LSI MegaRaid controllers in their boxes for example), many mother boards have hardware RAID support available through the bios, etc, leaving that feature active, then the adding a drive and That would be fake raid, not hardware raid. 
_not_ initializing that drive with the RAID controller disk setup. In this case the controller is going to repeatedly probe the drive for its proprietary controller signature blocks (and reset the drive after each attempt) and then finally fall back to raw block pass-through. This can take a long time (thirty seconds to a minute). No, no, and no. If it reads the drive and does not find its metadata, it falls back to pass through. The actual read takes only milliseconds, though it may have to wait a few seconds for the drive to spin up. There is no reason it would keep retrying after a successful read. The way you end up with 30-60 second startup time with a raid is if you have several drives and staggered spinup mode enabled, then each drive is started one at a time instead of all at once so their cumulative startup time can add up fairly high. -BEGIN PGP SIGNATURE- Version: GnuPG v2.0.17 (MingW32) iQEcBAEBAgAGBQJUbQ/qAAoJEI5FoCIzSKrwuhwH/R/+EVTpNlw36naJ8mxqMagt /xafq+1kGhwNjLTPV68CI4Wt24WSGOLqpq5FPWlTMxuN0VSnX/wqBeSbz4w2Vl3F VNic+4RqhmzS3EnLXNzkHyF2Z+hQEEldOlheAobkQb4hv/7jVxBri42nMdHQUq5w em181txT8zkltmV+dm8aYcro8Z4ewntQtyGaO6U/nCfxt9Odr2rfytyeuSyJi9uY +dKlGSb5klIFwCOOSoRqEz2+KOFHF7td9RrcfIRcPRgjKROH0YilQ8T53lTMoNL1 aUMsbyUy+edEBN1a4o/FqK3dEvBSu1nnRGRpSgm2fFGKhyi/z9gmJ1ZXTdYZRXE= =/O7+ -END PGP SIGNATURE- -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: scrub implies failing drive - smartctl blissfully unaware
Shame you already know everything? On 11/19/2014 01:47 PM, Phillip Susi wrote: On 11/19/2014 4:05 PM, Robert White wrote: One of the reasons that the whole industry has started favoring point-to-point (SATA, SAS) or physical intercessor chaining point-to-point (eSATA) buses is to remove a lot of those wait-and-see delays. Nope... even with the ancient PIO mode PATA interface, you polled a ready bit in the status register to see if it was done yet. If you always waited 30 seconds for every command your system wouldn't boot up until next year. The controller, the thing that sets the ready bit and sends the interrupt is distinct from the driver, the thing that polls the ready bit when the interrupt is sent. At the bus level there are fixed delays and retries. Try putting two drives on a pin-select IDE bus and strapping them both as _slave_ (or indeed master) sometime and watch the shower of fixed delay retries. Another strong actor is selecting the wrong storage controller chipset driver. In that case you may be faling back from high-end device you think it is, through intermediate chip-set, and back to ACPI or BIOS emulation There is no such thing as ACPI or BIOS emulation. That's odd... my bios reads from storage to boot the device and it does so using the ACPI storage methods. ACPI 4.0 Specification Section 9.8 even disagrees with you at some length. Let's just do the titles shall we: 9.8 ATA Controller Devices 9.8.1 Objects for both ATA and SATA Controllers. 9.8.2 IDE Controller Device 9.8.3 Serial ATA (SATA) controller Device Oh, and _lookie_ _here_ in Linux Kernel Menuconfig at Device Drivers - * Serial ATA and Parallel ATA drivers (libata) - * ACPI firmware driver for PATA CONFIG_PATA_ACPI: This option enables an ACPI method driver which drives motherboard PATA controller interfaces through the ACPI firmware in the BIOS. This driver can sometimes handle otherwise unsupported hardware. You are a storage _genius_ for knowing that all that stuff doesn't exist... 
the rest of us must simply muddle along in our delusion... AHCI SATA controllers do usually have an old IDE emulation mode instead of AHCI mode, but this isn't going to cause ridiculously long delays. Do tell us more... I didn't say the driver would cause long delays, I said that the time it takes to error out other improperly supported drivers and fall back to this one could induce long delays and resets. I think I am done with your expertise in the question of all things storage related. Not to be rude... but I'm physically ill and maybe I shouldn't be posting right now... 8-) -- Rob. -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: scrub implies failing drive - smartctl blissfully unaware
P.S. On 11/19/2014 01:47 PM, Phillip Susi wrote: Another common cause is having a dedicated hardware RAID controller (dell likes to put LSI MegaRaid controllers in their boxes for example), many mother boards have hardware RAID support available through the bios, etc, leaving that feature active, then the adding a drive and That would be fake raid, not hardware raid. The LSI MegaRaid controller people would _love_ to hear more about your insight into how their battery-backed multi-drive RAID controller is fake. You should go work for them. Try the contact us link at the bottom of this page. I'm sure they are waiting for your insight with baited breath! http://www.lsi.com/products/raid-controllers/pages/megaraid-sas-9260-8i.aspx _not_ initializing that drive with the RAID controller disk setup. In this case the controller is going to repeatedly probe the drive for its proprietary controller signature blocks (and reset the drive after each attempt) and then finally fall back to raw block pass-through. This can take a long time (thirty seconds to a minute). No, no, and no. If it reads the drive and does not find its metadata, it falls back to pass through. The actual read takes only milliseconds, though it may have to wait a few seconds for the drive to spin up. There is no reason it would keep retrying after a successful read. Odd, my MegaRaid controller takes about fifteen seconds by-the-clock to initialize and to the integrity check on my single initialized drive. It's amazing that with a fail and retry it would be _faster_... It's like you know _everything_... -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: scrub implies failing drive - smartctl blissfully unaware
Phillip Susi posted on Wed, 19 Nov 2014 11:07:43 -0500 as excerpted: -BEGIN PGP SIGNED MESSAGE- Hash: SHA1 On 11/18/2014 9:46 PM, Duncan wrote: I'm not sure about normal operation, but certainly, many drives take longer than 30 seconds to stabilize after power-on, and I routinely see resets during this time. As far as I have seen, typical drive spin up time is on the order of 3-7 seconds. Hell, I remember my pair of first generation seagate cheetah 15,000 rpm drives seemed to take *forever* to spin up and that still was maybe only 15 seconds. If a drive takes longer than 30 seconds, then there is something wrong with it. I figure there is a reason why spin up time is tracked by SMART so it seems like long spin up time is a sign of a sick drive. It's not physical spinup, but electronic device-ready. It happens on SSDs too and they don't have anything to spinup. But, for instance on my old seagate 300-gigs that I used to have in 4-way mdraid, when I tried to resume from hibernate the drives would be spunup and talking to the kernel, but for some seconds to a couple minutes or so after spinup, they'd sometimes return something like (example) Seagrte3x0 instead of Seagate300. Of course that wasn't the exact string, I think it was the model number or perhaps the serial number or something, but looking at dmsg I could see the ATA layer up for each of the four devices, the connection establish and seem to be returning good data, then the mdraid layer would try to assemble and would kick out a drive or two due to the device string mismatch compared to what was there before the hibernate. With the string mismatch, from its perspective the device had disappeared and been replaced with something else. But if I held it at the grub prompt for a couple minutes and /then/ let it go, or part of the time on its own, all four drives would match and it'd work fine. 
For just short hibernates (as when testing hibernate/ resume), it'd come back just fine; as it would nearly all the time out to two hours or so. Beyond that, out to 10 or 12 hours, the longer it sat the more likely it would be to fail, if it didn't hold it at the grub prompt for a few minutes to let it stabilize. And now I seen similar behavior resuming from suspend (the old hardware wouldn't resume from suspend to ram, only hibernate, the new hardware resumes from suspend to ram just fine, but I had trouble getting it to resume from hibernate back when I first setup and tried it; I've not tried hibernate since and didn't even setup swap to hibernate to when I got the SSDs so I've not tried it for a couple years) on SSDs with btrfs raid. Btrfs isn't as informative as was mdraid on why it kicks a device, but dmesg says both devices are up, while btrfs is suddenly spitting errors on one device. A reboot later and both devices are back in the btrfs and I can do a scrub to resync, which generally finds and fixes errors on the btrfs that were writable (/home and /var/log), but of course not on the btrfs mounted as root, since it's read-only by default. Same pattern. Immediate suspend and resume is fine. Out to about 6 hours it tends to be fine as well. But at 8-10 hours in suspend, btrfs starts spitting errors often enough that I generally quit trying to suspend at all, I simply shut down now. (With SSDs and systemd, shutdown and restart is fast enough, and the delay from having to refill cache low enough, that the time difference between suspend and full shutdown is hardly worth troubling with anyway, certainly not when there's a risk to data due to failure to properly resume.) But it worked fine when I had only a single device to bring back up. Nothing to be slower than another device to respond and thus to be kicked out as dead. 
I finally realized what was happening after I read a study paper mentioning capacitor charge time and solid-state stability time, and how a lot of cheap devices say they're ready before the electronics have actually properly stabilized. On SSDs, this is a MUCH worse issue than it is on spinning rust, because the logical layout isn't practically forced to serial like it is on spinning rust, and the firmware can get so jumbled it pretty much scrambles the device. And it's not just the normal storage either. In the study, many devices corrupted their own firmware as well! Now that was definitely a worst-case study in that they were deliberately yanking and/or fast-switching the power, not just doing time-on waits, but still, a surprisingly high proportion of SSDs not only scrambled the storage, but scrambled their firmware as well. (On those devices the firmware may well have been on the same media as the storage, with the firmware simply read in first in a hardware bootstrap mode, and the firmware programmed to avoid that area in normal operation thus making it as easily corrupted as the the normal storage.) The paper specifically
Re: scrub implies failing drive - smartctl blissfully unaware
On Wed, Nov 19, 2014 at 8:11 AM, Phillip Susi ps...@ubuntu.com wrote: -BEGIN PGP SIGNED MESSAGE- Hash: SHA1 On 11/18/2014 9:40 PM, Chris Murphy wrote: It’s well known on linux-raid@ that consumer drives have well over 30 second deep recoveries when they lack SCT command support. The WDC and Seagate “green” drives are over 2 minutes apparently. This isn’t easy to test because it requires a sector with enough error that it requires the ECC to do something, and yet not so much error that it gives up in less than 30 seconds. So you have to track down a drive model spec document (one of those 100 pagers). This makes sense, sorta, because the manufacturer use case is typically single drive only, and most proscribe raid5/6 with such products. So it’s a “recover data at all costs” behavior because it’s assumed to be the only (immediately) available copy. It doesn't make sense to me. If it can't recover the data after one or two hundred retries in one or two seconds, it can keep trying until the cows come home and it just isn't ever going to work. I'm not a hard drive engineer, so I can't argue either point. But consumer drives clearly do behave this way. On Linux, the kernel's default 30 second command timer eventually results in what look like link errors rather than drive read errors. And instead of the problems being fixed with the normal md and btrfs recovery mechanisms, the errors simply get worse and eventually there's data loss. Exhibits A, B, C, D - the linux-raid list is full to the brim of such reports and their solution. I don’t see how that’s possible because anything other than the drive explicitly producing a read error (which includes the affected LBA’s), it’s ambiguous what the actual problem is as far as the kernel is concerned. It has no way of knowing which of possibly dozens of ata commands queued up in the drive have actually hung up the drive. It has no idea why the drive is hung up as well. IIRC, this is true when the drive returns failure as well. 
The whole bio is marked as failed, and the page cache layer then begins retrying with progressively smaller requests to see if it can get *some* data out. Well that's very coarse. It's not at a sector level, so as long as the drive continues to try to read from a particular LBA, but fails to either succeed reading or give up and report a read error, within 30 seconds, then you just get a bunch of wonky system behavior. Conversely what I've observed on Windows in such a case, is it tolerates these deep recoveries on consumer drives. So they just get really slow but the drive does seem to eventually recover (until it doesn't). But yeah 2 minutes is a long time. So then the user gets annoyed and reinstalls their system. Since that means writing to the affected drive, the firmware logic causes bad sectors to be dereferenced when the write error is persistent. Problem solved, faster system. No I think 30 is pretty sane for servers using SATA drives because if the bus is reset all pending commands in the queue get obliterated which is worse than just waiting up to 30 seconds. With SAS drives maybe less time makes sense. But in either case you still need configurable SCT ERC, or it needs to be a sane fixed default like 70 deciseconds. Who cares if multiple commands in the queue are obliterated if they can all be retried on the other mirror? Because now you have a member drive that's inconsistent. At least in the md raid case, a certain number of read failures causes the drive to be ejected from the array. Anytime there's a write failure, it's ejected from the array too. What you want is for the drive to give up sooner with an explicit read error, so md can help fix the problem by writing good data to the affected LBA. That doesn't happen when there are a bunch of link resets happening. Better to fall back to the other mirror NOW instead of waiting 30 seconds ( or longer! ). Sure, you might end up recovering more than you really had to, but that won't hurt anything. 
Again, if your drive SCT ERC is configurable, and set to something sane like 70 deciseconds, that read failure happens at MOST 7 seconds after the read attempt. And md is notified of *exactly* what sectors are affected, it immediately goes to mirror data, or rebuilds it from parity, and then writes the correct data to the previously reported bad sectors. And that will fix the problem. So really, if you're going to play the multiple device game, you need drive error timing to be shorter than the kernel's. Chris Murphy -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: scrub implies failing drive - smartctl blissfully unaware
Robert White posted on Wed, 19 Nov 2014 13:05:13 -0800 as excerpted: One of the reasons that the whole industry has started favoring point-to-point (SATA, SAS) or physical intercessor chaining point-to-point (eSATA) buses is to remove a lot of those wait-and-see delays. That said, you should not see a drive (or target enclosure, or controller) reset during spin up. In a SCSI setting this is almost always a cabling, termination, or addressing issue. In IDE its jumper mismatch (master vs slave vs cable-select). Less often its a partitioning issue (trying to access sectors beyond the end of the drive). Another strong actor is selecting the wrong storage controller chipset driver. In that case you may be faling back from high-end device you think it is, through intermediate chip-set, and back to ACPI or BIOS emulation FWIW I run a custom-built monolithic kernel, with only the specific drivers (SATA/AHCI in this case) builtin. There's no drivers for anything else it could fallback to. Once in awhile I do see it try at say 6-gig speeds, then eventually fall back to 3 and ultimately 1.5, but that /is/ indicative of other issues when I see it. And like I said, there's no other drivers to fall back to, so obviously I never see it doing that. Another common cause is having a dedicated hardware RAID controller (dell likes to put LSI MegaRaid controllers in their boxes for example), many mother boards have hardware RAID support available through the bios, etc, leaving that feature active, then the adding a drive and _not_ initializing that drive with the RAID controller disk setup. In this case the controller is going to repeatedly probe the drive for its proprietary controller signature blocks (and reset the drive after each attempt) and then finally fall back to raw block pass-through. This can take a long time (thirty seconds to a minute). Everything's set JBOD here. I don't trust those proprietary firmware raid things. Besides, that kills portability. 
JBOD SATA and AHCI are sufficiently standardized that should the hardware die, I can switch out to something else and not have to worry about rebuilding the custom kernel with the new drivers. Some proprietary firmware raid, requiring dmraid at the software kernel level to support, when I can just as easily use full software mdraid on standardized JBOD, no thanks! And be sure, that's one of the first things I check when I setup a new box, any so-called hardware raid that's actually firmware/software raid, disabled, JBOD mode, enabled. But seriously, if you are seeing reset anywhere in any storage chain during a normal power-on cycle then you've got a problem with geometry or configuration. IIRC I don't get it routinely. But I've seen it a few times, attributing it as I said to the 30-second SATA level timeout not being long enough. Most often, however, it's at resume, not original startup, which is understandable as state at resume doesn't match state at suspend/ hibernate. The irritating thing, as previously discussed, is when one device takes long enough to come back that mdraid or btrfs drops it out, generally forcing the reboot I was trying to avoid with the suspend/ hibernate in the first place, along with a re-add and resync (for mdraid) or a scrub (for btrfs raid). -- Duncan - List replies preferred. No HTML msgs. Every nonfree program has a lord, a master -- and if you use the program, he is your master. Richard Stallman -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: scrub implies failing drive - smartctl blissfully unaware
On 11/19/2014 04:25 PM, Duncan wrote: Most often, however, it's at resume, not original startup, which is understandable as state at resume doesn't match state at suspend/ hibernate. The irritating thing, as previously discussed, is when one device takes long enough to come back that mdraid or btrfs drops it out, generally forcing the reboot I was trying to avoid with the suspend/ hibernate in the first place, along with a re-add and resync (for mdraid) or a scrub (for btrfs raid). If you want a practical solution you might want to look at http://underdog.sourceforge.net (my project, shameless plug). The actual user context return isn't in there but I use the project to build initramfs images into all my kernels. [DISCLAIMER: The cryptsetup and LUKS stuff is rock solid but the mdadm incremental build stuff is very rough and so largely untested] You could easily add a drive preheat code block (spin up and status check all drives with pause and repeat function) as a preamble function that could/would safely take place before any glance is made towards the resume stage. extemporaneous example:: --- snip --- cat 'EOT' /opt/underdog/utility/preheat.mod #!/bin/bash # ROOT_COMMANDS+=( commands your preheat needs ) UNDERDOG+=( init.d/preheat ) EOT cat 'EOT' /opt/underdog/prototype/init.d/preheat #!/bin/bash function __preamble_preheat() { whatever logic you need return 0 } __preamble_funcs+=( [preheat]=__preamble_preheat ) EOT --- snip --- install underdog, paste the above into a shell once. edit /opt/underdog/prototype/init.d/preamble to put whatever logic in you need. Follow the instructions in /opt/underdog/README.txt for making the initramfs image or, as I do, build the initramfs into the kernel image. The preamble will be run in the resultant /init script before the swap partitions are submitted for attempted resume. 
(The system does support complexity like resuming from a swap partition inside an LVM/LV built over a LUKS encrypted media expanse, or just a plain laptop with one plain partitioned disk, with zero changes to the necessary default config.) -- Rob. -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] btrfs: remove empty fs_devices to prevent memory runout
There is a global list @fs_uuids to keep a @fs_devices object for each created btrfs. But when a btrfs becomes empty (all devices belonging to it are gone), its @fs_devices remains in the @fs_uuids list until module exit. If we keep running mkfs.btrfs on the same device again and again, all the empty @fs_devices produced are sure to eat up our memory. So this case had better be prevented. I think that each time we setup btrfs on that device, we should check whether we are stealing some device from another btrfs seen before. To facilitate the search procedure, we could insert all @btrfs_device in an rb_root, one @btrfs_device per each physical device, with @bdev->bd_dev as key. Each time device stealing happens, we should replace the corresponding @btrfs_device in the rb_root with an up-to-date version. If the stolen device is the last device in its @fs_devices, then we have an empty btrfs to be deleted. Actually there are 3 ways to steal devices and lead to an empty btrfs: 1. mkfs, with -f option 2. device add, with -f option 3. device replace, with -f option We should act under these cases. Moreover, if there are seed devices, then it is assured that the devices in cloned @fs_devices are not treated as valid devices. 
Signed-off-by: Gui Hecheng guihc.f...@cn.fujitsu.com --- fs/btrfs/super.c | 1 + fs/btrfs/volumes.c | 181 - fs/btrfs/volumes.h | 6 ++ 3 files changed, 172 insertions(+), 16 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 54bd91e..ee09a56 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2154,6 +2154,7 @@ static void __exit exit_btrfs_fs(void) btrfs_end_io_wq_exit(); unregister_filesystem(btrfs_fs_type); btrfs_exit_sysfs(); + btrfs_cleanup_valid_dev_root(); btrfs_cleanup_fs_uuids(); btrfs_exit_compress(); btrfs_hash_exit(); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 0192051..ba86b1b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -27,6 +27,7 @@ #include linux/kthread.h #include linux/raid/pq.h #include linux/semaphore.h +#include linux/rbtree.h #include asm/div64.h #include ctree.h #include extent_map.h @@ -52,6 +53,120 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); DEFINE_MUTEX(uuid_mutex); static LIST_HEAD(fs_uuids); +static struct rb_root valid_dev_root = RB_ROOT; + +static struct btrfs_device *insert_valid_device(struct btrfs_device *new_dev) +{ + struct rb_node **p; + struct rb_node *parent; + struct rb_node *new; + struct btrfs_device *old_dev; + + WARN_ON(!mutex_is_locked(uuid_mutex)); + + parent = NULL; + new = new_dev-rb_node; + + p = valid_dev_root.rb_node; + while (*p) { + parent = *p; + old_dev = rb_entry(parent, struct btrfs_device, rb_node); + + if (new_dev-devnum old_dev-devnum) + p = parent-rb_left; + else if (new_dev-devnum old_dev-devnum) + p = parent-rb_right; + else { + rb_replace_node(parent, new, valid_dev_root); + RB_CLEAR_NODE(parent); + + goto out; + } + } + + old_dev = NULL; + rb_link_node(new, parent, p); + rb_insert_color(new, valid_dev_root); + +out: + return old_dev; +} + +static void free_fs_devices(struct btrfs_fs_devices *fs_devices) +{ + struct btrfs_device *device; + WARN_ON(fs_devices-opened); + while (!list_empty(fs_devices-devices)) { + device = 
list_entry(fs_devices-devices.next, + struct btrfs_device, dev_list); + list_del(device-dev_list); + rcu_string_free(device-name); + kfree(device); + } + kfree(fs_devices); +} + +static void remove_empty_fs_if_need(struct btrfs_fs_devices *old_fs) +{ + struct btrfs_fs_devices *seed_fs; + + if (!list_empty(old_fs-devices)) + return; + + list_del(old_fs-list); + + /* free the seed clones */ + seed_fs = old_fs-seed; + free_fs_devices(old_fs); + while (seed_fs) { + old_fs = seed_fs; + seed_fs = seed_fs-seed; + free_fs_devices(old_fs); + } + +} + +static void replace_invalid_device(struct btrfs_device *new_dev) +{ + struct btrfs_device *invalid_dev; + struct btrfs_fs_devices *old_fs; + + WARN_ON(!mutex_is_locked(uuid_mutex)); + + invalid_dev = insert_valid_device(new_dev); + if (!invalid_dev) + return; + + old_fs = invalid_dev-fs_devices; + mutex_lock(old_fs-device_list_mutex); + list_del(invalid_dev-dev_list); + rcu_string_free(invalid_dev-name); + kfree(invalid_dev); + mutex_unlock(old_fs-device_list_mutex); + +