Re: [PATCH] Btrfs: make sure logged extents complete in the current transaction

2014-11-19 Thread Liu Bo
On Tue, Nov 18, 2014 at 05:19:41PM -0500, Josef Bacik wrote:
 Liu Bo pointed out that my previous fix would lose the generation update in the
 scenario I described.  It is actually much worse than that, we could lose the
 entire extent if we lose power right after the transaction commits.  Consider
 the following
 
 write extent 0-4k
 log extent in log tree
 commit transaction
 	power fail happens here
 ordered extent completes
 
 We would lose the 0-4k extent because it hasn't updated the actual fs tree, and
 the transaction commit will reset the log so it isn't replayed.  If we lose
 power before the transaction commit we are safe, otherwise we are not.
 
 Fix this by keeping track of all extents we logged in this transaction.  Then
 when we go to commit the transaction make sure we wait for all of those ordered
 extents to complete before proceeding.  This will make sure that if we lose
 power after the transaction commit we still have our data.  This also fixes the
 problem of the improperly updated extent generation.  Thanks,

This looks saner.

Reviewed-by: Liu Bo bo.li@oracle.com

thanks,
-liubo

 
 cc: sta...@vger.kernel.org
 Signed-off-by: Josef Bacik jba...@fb.com
 ---
  fs/btrfs/ordered-data.c |  6 --
  fs/btrfs/ordered-data.h |  6 +-
  fs/btrfs/transaction.c  | 33 +
  fs/btrfs/transaction.h  |  2 ++
  fs/btrfs/tree-log.c |  6 +++---
  5 files changed, 47 insertions(+), 6 deletions(-)
 
 diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
 index ac734ec..7c2dd7a 100644
 --- a/fs/btrfs/ordered-data.c
 +++ b/fs/btrfs/ordered-data.c
 @@ -220,6 +220,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
  	INIT_LIST_HEAD(&entry->work_list);
  	init_completion(&entry->completion);
  	INIT_LIST_HEAD(&entry->log_list);
 +	INIT_LIST_HEAD(&entry->trans_list);
  
  	trace_btrfs_ordered_extent_add(inode, entry);
  
 @@ -472,7 +473,8 @@ void btrfs_submit_logged_extents(struct list_head *logged_list,
  	spin_unlock_irq(&log->log_extents_lock[index]);
  }
  
 -void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid)
 +void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
 +			       struct btrfs_root *log, u64 transid)
  {
  	struct btrfs_ordered_extent *ordered;
  	int index = transid % 2;
 @@ -497,7 +499,7 @@ void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid)
  		wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE,
  						   &ordered->flags));
  
 -		btrfs_put_ordered_extent(ordered);
 +		list_add_tail(&ordered->trans_list, &trans->ordered);
  		spin_lock_irq(&log->log_extents_lock[index]);
  	}
  	spin_unlock_irq(&log->log_extents_lock[index]);
 diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
 index d81a274..171a841 100644
 --- a/fs/btrfs/ordered-data.h
 +++ b/fs/btrfs/ordered-data.h
 @@ -121,6 +121,9 @@ struct btrfs_ordered_extent {
  	/* If we need to wait on this to be done */
  	struct list_head log_list;
  
 +	/* If the transaction needs to wait on this ordered extent */
 +	struct list_head trans_list;
 +
  	/* used to wait for the BTRFS_ORDERED_COMPLETE bit */
  	wait_queue_head_t wait;
  
 @@ -197,7 +200,8 @@ void btrfs_get_logged_extents(struct inode *inode,
  void btrfs_put_logged_extents(struct list_head *logged_list);
  void btrfs_submit_logged_extents(struct list_head *logged_list,
  				 struct btrfs_root *log);
 -void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid);
 +void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
 +			       struct btrfs_root *log, u64 transid);
  void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
  int __init ordered_data_init(void);
  void ordered_data_exit(void);
 diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
 index dcaae36..63c6d05 100644
 --- a/fs/btrfs/transaction.c
 +++ b/fs/btrfs/transaction.c
 @@ -220,6 +220,7 @@ loop:
  	INIT_LIST_HEAD(&cur_trans->pending_snapshots);
  	INIT_LIST_HEAD(&cur_trans->pending_chunks);
  	INIT_LIST_HEAD(&cur_trans->switch_commits);
 +	INIT_LIST_HEAD(&cur_trans->pending_ordered);
  	list_add_tail(&cur_trans->list, &fs_info->trans_list);
  	extent_io_tree_init(&cur_trans->dirty_pages,
  			    fs_info->btree_inode->i_mapping);
 @@ -488,6 +489,7 @@ again:
  	h->sync = false;
  	INIT_LIST_HEAD(&h->qgroup_ref_list);
  	INIT_LIST_HEAD(&h->new_bgs);
 +	INIT_LIST_HEAD(&h->ordered);
  
  	smp_mb();
  	if (cur_trans->state >= TRANS_STATE_BLOCKED &&
 @@ -719,6 +721,12 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
  	if (!list_empty(&trans->new_bgs))
  		btrfs_create_pending_block_groups(trans, root);
  
 +	if (!list_empty(&trans->ordered)) {

[PATCH] btrfs-progs: use system attr instead of attr library

2014-11-19 Thread David Sterba
We use the attr version provided by the system in other places already,
so now we can remove the dependency on the separate attr library.

Signed-off-by: David Sterba dste...@suse.cz
---
 props.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/props.c b/props.c
index 9fd612f97026..c7c67529fd79 100644
--- a/props.c
+++ b/props.c
@@ -17,7 +17,7 @@
 #include <sys/stat.h>
 #include <sys/ioctl.h>
 #include <sys/types.h>
-#include <attr/xattr.h>
+#include <sys/xattr.h>
 #include <fcntl.h>
 #include <unistd.h>
 
@@ -29,6 +29,12 @@
 #define XATTR_BTRFS_PREFIX "btrfs."
 #define XATTR_BTRFS_PREFIX_LEN (sizeof(XATTR_BTRFS_PREFIX) - 1)
 
+/*
+ * Defined as synonyms in attr/xattr.h
+ */
+#ifndef ENOATTR
+#define ENOATTR ENODATA
+#endif
 
 static int prop_read_only(enum prop_object_type type,
  const char *object,
-- 
2.1.3
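
For context, a minimal caller illustrates why the ENOATTR fallback is
needed once the include switches to <sys/xattr.h>, which does not define
ENOATTR.  The xattr name below is purely illustrative, not necessarily
what props.c actually queries:

#include <sys/types.h>
#include <sys/xattr.h>
#include <errno.h>
#include <stdio.h>

#ifndef ENOATTR
#define ENOATTR ENODATA		/* same fallback the patch adds */
#endif

int main(void)
{
	char buf[64];
	/* hypothetical lookup of a btrfs property exposed as an xattr */
	ssize_t ret = getxattr("/mnt/subvol", "btrfs.read_only",
			       buf, sizeof(buf));

	if (ret < 0 && errno == ENOATTR)
		printf("property not set\n");
	else if (ret >= 0)
		printf("value: %.*s\n", (int)ret, buf);
	return 0;
}

Without the #ifndef, code that compares errno against ENOATTR would no
longer build against glibc's <sys/xattr.h>.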



Re: [PATCH] Btrfs: do not move em to modified list when unpinning

2014-11-19 Thread Josef Bacik

On 11/18/2014 10:45 PM, Dave Chinner wrote:

On Fri, Nov 14, 2014 at 04:16:30PM -0500, Josef Bacik wrote:

We use the modified list to keep track of which extents have been modified so we
know which ones are candidates for logging at fsync() time.  Newly modified
extents are added to the list at modification time, around the same time the
ordered extent is created.  We do this so that we don't have to wait for ordered
extents to complete before we know what we need to log.  The problem is when
something like this happens

log extent 0-4k on inode 1
copy csum for 0-4k from ordered extent into log
sync log
commit transaction
log some other extent on inode 1
ordered extent for 0-4k completes and adds itself onto modified list again
log changed extents
see ordered extent for 0-4k has already been logged
at this point we assume the csum has been copied
sync log
crash

On replay we will see the extent 0-4k in the log, drop the original 0-4k extent
which is the same one that we are replaying which also drops the csum, and then
we won't find the csum in the log for that bytenr.  This of course causes us to
have errors about not having csums for certain ranges of our inode.  So remove
the modified list manipulation in unpin_extent_cache, any modified extents
should have been added well before now, and we don't want them re-logged.  This
fixes my test that I could reliably reproduce this problem with.  Thanks,


Is it possible to turn this unspecified test into another
generic fsync xfstest?



It depends on a new dm target I'm working on to better test power fail 
scenarios; once I have that merged I have a few xfstests I'll be 
submitting in this area.  Would you actually mind taking a quick look at 
it to make sure it seems sane?


https://git.kernel.org/cgit/linux/kernel/git/josef/btrfs-next.git/log/?h=dm-powerfail

The 'split' option is what is meant for ext* and xfs (I haven't tested 
that part yet), which will just return the old data in the case of 
unflushed data/metadata.  Anything you'd like to see added or changed? 
Thanks,


Josef


Re: scrub implies failing drive - smartctl blissfully unaware

2014-11-19 Thread Phillip Susi

On 11/18/2014 9:40 PM, Chris Murphy wrote:
 It’s well known on linux-raid@ that consumer drives have well over
 30 second deep recoveries when they lack SCT command support. The
 WDC and Seagate “green” drives are over 2 minutes apparently. This
 isn’t easy to test because it requires a sector with enough error
 that it requires the ECC to do something, and yet not so much error
 that it gives up in less than 30 seconds. So you have to track down
 a drive model spec document (one of those 100 pagers).
 
 This makes sense, sorta, because the manufacturer use case is 
 typically single drive only, and most proscribe raid5/6 with such 
 products. So it’s a “recover data at all costs” behavior because
 it’s assumed to be the only (immediately) available copy.

It doesn't make sense to me.  If it can't recover the data after one
or two hundred retries in one or two seconds, it can keep trying until
the cows come home and it just isn't ever going to work.

 I don’t see how that’s possible because anything other than the
 drive explicitly producing  a read error (which includes the
 affected LBA’s), it’s ambiguous what the actual problem is as far
 as the kernel is concerned. It has no way of knowing which of
 possibly dozens of ata commands queued up in the drive have
 actually hung up the drive. It has no idea why the drive is hung up
 as well.

IIRC, this is true when the drive returns failure as well.  The whole
bio is marked as failed, and the page cache layer then begins retrying
with progressively smaller requests to see if it can get *some* data out.

 No I think 30 is pretty sane for servers using SATA drives because
 if the bus is reset all pending commands in the queue get
 obliterated which is worse than just waiting up to 30 seconds. With
 SAS drives maybe less time makes sense. But in either case you
 still need configurable SCT ERC, or it needs to be a sane fixed
 default like 70 deciseconds.

Who cares if multiple commands in the queue are obliterated if they
can all be retried on the other mirror?  Better to fall back to the
other mirror NOW instead of waiting 30 seconds ( or longer! ).  Sure,
you might end up recovering more than you really had to, but that
won't hurt anything.



Re: BTRFS messes up snapshot LV with origin

2014-11-19 Thread Phillip Susi

On 11/18/2014 9:54 PM, Chris Murphy wrote:
 Why is it silly? Btrfs on a thin volume has practical use case
 aside from just being thinly provisioned, its snapshots are block
 device based, not merely that of an fs tree.

Umm... because one of the big selling points of btrfs is that it is in
a much better position to make snapshots being aware of the fs tree
rather than doing it in the block layer.

So it is kind of silly in the first place to be using lvm snapshots
under btrfs, but it is doubly silly to use lvm for snapshots, and
btrfs for the mirroring rather than lvm.  Pick one layer and use it
for both functions.  Even if that is lvm, then it should also be
handling the mirroring.



[PATCH] fstests: mark replace tests in btrfs/group

2014-11-19 Thread Eric Sandeen
A couple tests exercise replace but were not marked as such
in the group file.

Signed-off-by: Eric Sandeen sand...@redhat.com
---

diff --git a/tests/btrfs/group b/tests/btrfs/group
index 9adf862..1f23979 100644
--- a/tests/btrfs/group
+++ b/tests/btrfs/group
@@ -13,7 +13,7 @@
 008 auto quick
 009 auto quick
 010 auto quick
-011 auto
+011 auto replace
 012 auto
 013 auto quick
 014 auto
@@ -22,7 +22,7 @@
 017 auto quick
 018 auto quick
 019 auto quick
-020 auto quick
+020 auto quick replace
 021 auto quick
 022 auto
 023 auto



[PATCH] Fix lockups from btrfs_clear_path_blocking

2014-11-19 Thread Chris Mason
The fair reader/writer locks mean that btrfs_clear_path_blocking needs
to strictly follow lock ordering rules even when we already have
blocking locks on a given path.

Before we can clear a blocking lock on the path, we need to make sure
all of the locks have been converted to blocking.  This will remove lock
inversions against anyone spinning in write_lock() against the buffers
we're trying to get read locks on.  These inversions didn't exist before
the fair read/writer locks, but now we need to be more careful.

We papered over this deadlock in the past by changing
btrfs_try_read_lock() to be a true trylock against both the spinlock and
the blocking lock.  This was slower, and not sufficient to fix all the
deadlocks.  This patch adds a btrfs_tree_read_lock_atomic(), which
basically means get the spinlock but trylock on the blocking lock.

Signed-off-by: Chris Mason c...@fb.com
Reported-by: Patrick Schmid sch...@phys.ethz.ch
cc: sta...@vger.kernel.org #v3.15+

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 19bc616..150822e 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -80,13 +80,6 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
 {
int i;
 
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-   /* lockdep really cares that we take all of these spinlocks
-* in the right order.  If any of the locks in the path are not
-* currently blocking, it is going to complain.  So, make really
-* really sure by forcing the path to blocking before we clear
-* the path blocking.
-*/
if (held) {
btrfs_set_lock_blocking_rw(held, held_rw);
if (held_rw == BTRFS_WRITE_LOCK)
@@ -95,7 +88,6 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
held_rw = BTRFS_READ_LOCK_BLOCKING;
}
btrfs_set_path_blocking(p);
-#endif
 
 	for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
 		if (p->nodes[i] && p->locks[i]) {
@@ -107,10 +99,8 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
 		}
 	}
 
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
if (held)
btrfs_clear_lock_blocking_rw(held, held_rw);
-#endif
 }
 
 /* this also releases the path */
@@ -2893,7 +2883,7 @@ cow_done:
}
 		p->locks[level] = BTRFS_WRITE_LOCK;
} else {
-   err = btrfs_try_tree_read_lock(b);
+   err = btrfs_tree_read_lock_atomic(b);
if (!err) {
btrfs_set_path_blocking(p);
btrfs_tree_read_lock(b);
@@ -3025,7 +3015,7 @@ again:
}
 
level = btrfs_header_level(b);
-   err = btrfs_try_tree_read_lock(b);
+   err = btrfs_tree_read_lock_atomic(b);
if (!err) {
btrfs_set_path_blocking(p);
btrfs_tree_read_lock(b);
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 5665d21..f8229ef 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -128,6 +128,26 @@ again:
 }
 
 /*
+ * take a spinning read lock.
+ * returns 1 if we get the read lock and 0 if we don't
+ * this won't wait for blocking writers
+ */
+int btrfs_tree_read_lock_atomic(struct extent_buffer *eb)
+{
+	if (atomic_read(&eb->blocking_writers))
+		return 0;
+
+	read_lock(&eb->lock);
+	if (atomic_read(&eb->blocking_writers)) {
+		read_unlock(&eb->lock);
+		return 0;
+	}
+	atomic_inc(&eb->read_locks);
+	atomic_inc(&eb->spinning_readers);
+	return 1;
+}
+
+/*
  * returns 1 if we get the read lock and 0 if we don't
  * this won't wait for blocking writers
  */
@@ -158,9 +178,7 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb)
 	    atomic_read(&eb->blocking_readers))
 		return 0;
 
-	if (!write_trylock(&eb->lock))
-		return 0;
-
+	write_lock(&eb->lock);
 	if (atomic_read(&eb->blocking_writers) ||
 	    atomic_read(&eb->blocking_readers)) {
 		write_unlock(&eb->lock);
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index b81e0e9..c44a9d5 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -35,6 +35,8 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw);
 void btrfs_assert_tree_locked(struct extent_buffer *eb);
 int btrfs_try_tree_read_lock(struct extent_buffer *eb);
 int btrfs_try_tree_write_lock(struct extent_buffer *eb);
+int btrfs_tree_read_lock_atomic(struct extent_buffer *eb);
+
 
 static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw)
 {

Re: scrub implies failing drive - smartctl blissfully unaware

2014-11-19 Thread Phillip Susi

On 11/18/2014 9:46 PM, Duncan wrote:
 I'm not sure about normal operation, but certainly, many drives
 take longer than 30 seconds to stabilize after power-on, and I
 routinely see resets during this time.

As far as I have seen, typical drive spin up time is on the order of
3-7 seconds.  Hell, I remember my pair of first generation seagate
cheetah 15,000 rpm drives seemed to take *forever* to spin up and that
still was maybe only 15 seconds.  If a drive takes longer than 30
seconds, then there is something wrong with it.  I figure there is a
reason why spin up time is tracked by SMART so it seems like long spin
up time is a sign of a sick drive.

 This doesn't happen on single-hardware-device block devices and 
 filesystems because in that case it's either up or down, if the
 device doesn't come up in time the resume simply fails entirely,
 instead of coming up with one or more devices there, but others
 missing as they didn't stabilize in time, as is unfortunately all
 too common in the multi- device scenario.

No, the resume doesn't fail entirely.  The drive is reset, and the
IO request is retried, and by then it should succeed.

 I've seen this with both spinning rust and with SSDs, with mdraid
 and btrfs, with multiple mobos and device controllers, and with
 resume both from suspend to ram (if the machine powers down the
 storage devices in that case, as most modern ones do) and hibernate
 to permanent storage device, over several years worth of kernel
 series, so it's a reasonably widespread phenomena, at least among
 consumer-level SATA devices.  (My experience doesn't extend to
 enterprise-raid-level devices or proper SCSI, etc, so I simply
 don't know, there.)

If you are restoring from hibernation, then the drives are already
spun up before the kernel is loaded.

 While two minutes is getting a bit long, I think it's still within
 normal range, and some devices definitely take over a minute enough
 of the time to be both noticeable and irritating.

It certainly is not normal for a drive to take that long to spin up.
IIRC, the 30 second timeout comes from the ATA specs which state that
it can take up to 30 seconds for a drive to spin up.

 That said, I SHOULD say I'd be far *MORE* irritated if the device
 simply pretended it was stable and started reading/writing data
 before it really had stabilized, particularly with SSDs where that
 sort of behavior has been observed and is known to put some devices
 at risk of complete scrambling of either media or firmware, beyond
 recovery at times.  That of course is the risk of going the other
 direction, and I'd a WHOLE lot rather have devices play it safe for
 another 30 seconds or so after they / think/ they're stable and be
 SURE, than pretend to be just fine when voltages have NOT
 stabilized yet and thus end up scrambling things irrecoverably.
 I've never had that happen here tho I've never stress- tested for
 it, only done normal operation, but I've seen testing reports where
 the testers DID make it happen surprisingly easily, to a surprising
  number of their test devices.

Power supply voltage is stable within milliseconds.  What takes HDDs
time to start up is mechanically bringing the spinning rust up to
speed.  On SSDs, I think you are confusing testing done on power
*cycling* ( i.e. yanking the power cord in the middle of a write )
with startup.

 So, umm... I suspect the 2-minute default is 2 minutes due to
 power-up stabilizing issues, where two minutes is a reasonable
 compromise between failing the boot most of the time if the timeout
 is too low, and taking excessively long for very little further
 gain.

The default is 30 seconds, not 2 minutes.

 sure whether it's even possible, without some specific hardware
 feature available to tell the kernel that it has in fact NOT been
 in power-saving mode for say 5-10 minutes, hopefully long enough
 that voltage readings really /are/ fully stabilized and a shorter
 timeout is possible.

Again, there is no several minute period where voltage stabilizes and
the drive takes longer to access.  This is a complete red herring.




btrfs send and an existing backup

2014-11-19 Thread Jakob Schürz

Hi there!

I'm new to btrfs, and I like it :)

But I have a question. I have an existing backup on an external HDD. It 
was ext4 before I converted it to btrfs.
And I installed my Debian fresh on btrfs with some subvolumes (e.g. home, 
var, multimedia/Video, multimedia/Audio...).


On my backup there are no subvolumes.

Now I wrote a script to take local snapshots on my laptop's HDD and mirror 
these snapshots with btrfs send/receive to the external HDD.


And I don't know how to make the initial snapshot on the external 
HDD. I want to use the existing data there, so I don't have to transmit 
the whole bunch of data to the external drive, since it already exists 
there...


What happens if I create the same structure on the external drive by 
creating subvolumes and »cp --reflink«, give these subvolumes the correct 
names, and fire off a »btrfs send«?


Or is the best (ONLY???) way to make an initial snapshot on the 
external drive and delete the old backup there?


greetings
jakob

--
http://xundeenergie.at
http://verkehrsloesungen.wordpress.com/
http://cogitationum.wordpress.com/



[PATCH] Btrfs: make sure logged extents complete in the current transaction V2

2014-11-19 Thread Josef Bacik
Liu Bo pointed out that my previous fix would lose the generation update in the
scenario I described.  It is actually much worse than that, we could lose the
entire extent if we lose power right after the transaction commits.  Consider
the following

write extent 0-4k
log extent in log tree
commit transaction
 power fail happens here
ordered extent completes

We would lose the 0-4k extent because it hasn't updated the actual fs tree, and
the transaction commit will reset the log so it isn't replayed.  If we lose
power before the transaction commit we are safe, otherwise we are not.

Fix this by keeping track of all extents we logged in this transaction.  Then
when we go to commit the transaction make sure we wait for all of those ordered
extents to complete before proceeding.  This will make sure that if we lose
power after the transaction commit we still have our data.  This also fixes the
problem of the improperly updated extent generation.  Thanks,

cc: sta...@vger.kernel.org
Signed-off-by: Josef Bacik jba...@fb.com
---
V1-V2: Don't add previously logged ordered extents into the logged list, this
keeps us from moving ordered extents off of the global transaction list once
it's already been added there.

 fs/btrfs/ordered-data.c |  9 +++--
 fs/btrfs/ordered-data.h |  8 +++-
 fs/btrfs/transaction.c  | 33 +
 fs/btrfs/transaction.h  |  2 ++
 fs/btrfs/tree-log.c |  6 +++---
 5 files changed, 52 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index ac734ec..269e21d 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -220,6 +220,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 	INIT_LIST_HEAD(&entry->work_list);
 	init_completion(&entry->completion);
 	INIT_LIST_HEAD(&entry->log_list);
+	INIT_LIST_HEAD(&entry->trans_list);
 
 	trace_btrfs_ordered_extent_add(inode, entry);
 
@@ -443,6 +444,8 @@ void btrfs_get_logged_extents(struct inode *inode,
 		ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
 		if (!list_empty(&ordered->log_list))
 			continue;
+		if (test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
+			continue;
 		list_add_tail(&ordered->log_list, logged_list);
 		atomic_inc(&ordered->refs);
 	}
@@ -472,7 +475,8 @@ void btrfs_submit_logged_extents(struct list_head *logged_list,
 	spin_unlock_irq(&log->log_extents_lock[index]);
 }
 
-void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid)
+void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *log, u64 transid)
 {
 	struct btrfs_ordered_extent *ordered;
 	int index = transid % 2;
@@ -497,7 +501,8 @@ void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid)
 		wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE,
 						   &ordered->flags));
 
-		btrfs_put_ordered_extent(ordered);
+		if (!test_and_set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags))
+			list_add_tail(&ordered->trans_list, &trans->ordered);
 		spin_lock_irq(&log->log_extents_lock[index]);
 	}
 	spin_unlock_irq(&log->log_extents_lock[index]);
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index d81a274..0124bff 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -71,6 +71,8 @@ struct btrfs_ordered_sum {
 				       ordered extent */
 #define BTRFS_ORDERED_TRUNCATED 9 /* Set when we have to truncate an extent */
 
+#define BTRFS_ORDERED_LOGGED 10 /* Set when we've waited on this ordered extent
+				 * in the logging code. */
 struct btrfs_ordered_extent {
 	/* logical offset in the file */
 	u64 file_offset;
@@ -121,6 +123,9 @@ struct btrfs_ordered_extent {
 	/* If we need to wait on this to be done */
 	struct list_head log_list;
 
+	/* If the transaction needs to wait on this ordered extent */
+	struct list_head trans_list;
+
 	/* used to wait for the BTRFS_ORDERED_COMPLETE bit */
 	wait_queue_head_t wait;
 
@@ -197,7 +202,8 @@ void btrfs_get_logged_extents(struct inode *inode,
 void btrfs_put_logged_extents(struct list_head *logged_list);
 void btrfs_submit_logged_extents(struct list_head *logged_list,
 				 struct btrfs_root *log);
-void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid);
+void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *log, u64 transid);
 void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
 int __init ordered_data_init(void);
 void ordered_data_exit(void);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 
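
The transaction.c diff is cut off here.  Going by the description above --
splice each handle's ordered list onto the committing transaction and wait
for every entry to reach BTRFS_ORDERED_COMPLETE before the commit proceeds
-- the commit-side wait would look roughly like the sketch below.  The
helper name and locking details are assumptions for illustration, not the
literal hunk:

static inline void
wait_pending_ordered(struct btrfs_transaction *cur_trans,
		     struct btrfs_fs_info *fs_info)
{
	struct btrfs_ordered_extent *ordered;

	spin_lock(&fs_info->trans_lock);
	while (!list_empty(&cur_trans->pending_ordered)) {
		ordered = list_first_entry(&cur_trans->pending_ordered,
					   struct btrfs_ordered_extent,
					   trans_list);
		list_del_init(&ordered->trans_list);
		spin_unlock(&fs_info->trans_lock);

		/* block the commit until this ordered extent has
		 * updated the fs tree */
		wait_event(ordered->wait,
			   test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags));
		btrfs_put_ordered_extent(ordered);

		spin_lock(&fs_info->trans_lock);
	}
	spin_unlock(&fs_info->trans_lock);
}

This is also where the reference kept by btrfs_wait_logged_extents() (the
list_add_tail() replacing btrfs_put_ordered_extent() above) would finally
be dropped.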

Re: Btrfs on a failing drive

2014-11-19 Thread Phillip Susi

Again, please stop taking this conversation private; keep the mailing
list on the Cc.

On 11/19/2014 11:37 AM, Fennec Fox wrote:
 well ive used spinrite and its found a few sectors   and they
 never move   so obviously the drives firmware isnt dealing with bad
 blocks on the drive   anyways ive got a new drive on order  but
 what can i do to prevent the drive from killing any more data?

The drive will only remap bad blocks when you try to write to them, so
if you haven't written to them then it is no surprise that they aren't
going anywhere.

If the drive is actually returning bad data rather than failing the
read outright, then the only thing you can do is to have btrfs
duplicate all data so if the checksum on one copy is bad it can try
the other.
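
To force the firmware to deal with a known-bad sector you have to rewrite
it; dd or hdparm is the usual tool.  Purely as an illustration, a direct
rewrite of one sector looks roughly like this -- the device path, LBA and
sector size are placeholders, and this destroys whatever was stored there:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *dev = "/dev/sdX";	/* placeholder device */
	const off_t lba = 123456;	/* placeholder bad sector */
	const size_t sector = 512;	/* placeholder logical sector size */
	void *buf;
	int fd;

	if (posix_memalign(&buf, sector, sector))
		return 1;
	memset(buf, 0, sector);

	fd = open(dev, O_WRONLY | O_DIRECT);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* a successful write of a pending sector lets the firmware
	 * reallocate it */
	if (pwrite(fd, buf, sector, lba * sector) != (ssize_t)sector)
		perror("pwrite");
	close(fd);
	free(buf);
	return 0;
}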



Re: BTRFS messes up snapshot LV with origin

2014-11-19 Thread Chris Murphy
On Wed, Nov 19, 2014 at 8:20 AM, Phillip Susi ps...@ubuntu.com wrote:

 On 11/18/2014 9:54 PM, Chris Murphy wrote:
 Why is it silly? Btrfs on a thin volume has practical use case
 aside from just being thinly provisioned, its snapshots are block
 device based, not merely that of an fs tree.

 Umm... because one of the big selling points of btrfs is that it is in
 a much better position to make snapshots being aware of the fs tree
 rather than doing it in the block layer.

This is why we have fsfreeze before taking block level snapshots. And
I point out that consistent snapshots with Btrfs have posed challenges
too; there's a recent fstest snapshotting after file write + truncate
for this reason.

A block layer snapshot will snapshot the entire file system, not just
one tree. We don't have a way in Btrfs to snapshot the entire volume.
Considering how things still aren't exactly stable yet, in particular
with many snapshots, it's not unreasonable to want to freeze then
snapshot the entire volume before doing some possibly risky testing or
usage where even a Btrfs snapshot doesn't protect your entire volume
should things go wrong.



 So it is kind of silly in the first place to be using lvm snapshots
 under btrfs, but it is is doubly silly to use lvm for snapshots, and
 btrfs for the mirroring rather than lvm.  Pick one layer and use it
 for both functions.  Even if that is lvm, then it should also be
 handling the mirroring.


Thin volumes are more efficient. And the user creating them doesn't
have to mess around with locating physical devices or possibly
partitioning them. Plus in enterprise environments with lots of
storage and many different kinds of use cases, even knowledgeable users
aren't always granted full access to the physical storage anyway. They
get a VG to play with, or now they can have a thin pool and only
consume on storage what is actually used, and not what they've
reserved. You can mkfs a 4TB virtual size volume, while it only uses
1MB of physical extents on storage. And all of that is orthogonal to
using XFS or Btrfs which again comes down to use case. And whether I'd
have LVM mirror or Btrfs mirror is again a question of use case, maybe
I'm OK with LVM mirroring and I just get the rare corrupt file warning
and that's OK. In another use case, corruption isn't OK, I need higher
availability of known good data therefore I need Btrfs doing the
mirroring.

So I find your argument thus far uncompelling.


Chris Murphy


Re: BTRFS messes up snapshot LV with origin

2014-11-19 Thread Phillip Susi

On 11/19/2014 1:33 PM, Chris Murphy wrote:
 Thin volumes are more efficient. And the user creating them doesn't
 have to mess around with locating physical devices or possibly
 partitioning them. Plus in enterprise environments with lots of
 storage and many different kinds of use cases, even knowledgeable
 users aren't always granted full access to the physical storage
 anyway. They get a VG to play with, or now they can have a thin
 pool and only consume on storage what is actually used, and not
 what they've reserved. You can mkfs a 4TB virtual size volume, 
 while it only uses 1MB of physical extents on storage. And all of 
 that is orthogonal to using XFS or Btrfs which again comes down to 
 use case. And whether I'd have LVM mirror or Btrfs mirror is again 
 a question of use case, maybe I'm OK with LVM mirroring and I just 
 get the rare corrupt file warning and that's OK. In another use 
 case, corruption isn't OK, I need higher availability of known
 good data therefore I need Btrfs doing the mirroring.

Correct me if I'm wrong, but this kind of setup is basically where you
have a provider running an lvm thin pool volume on their hardware, and
exposing it to the customer's vm as a virtual disk.  In that case,
then the provider can do their snapshots and it won't cause this
problem since the snapshots aren't visible to the vm.  Also in these
cases the provider is normally already providing data protection by
having the vg on a raid6 or raid60 or something, so having the client
vm mirror the data in btrfs is a bit redundant.






Re: scrub implies failing drive - smartctl blissfully unaware

2014-11-19 Thread Robert White

On 11/19/2014 08:07 AM, Phillip Susi wrote:

On 11/18/2014 9:46 PM, Duncan wrote:

I'm not sure about normal operation, but certainly, many drives
take longer than 30 seconds to stabilize after power-on, and I
routinely see resets during this time.


As far as I have seen, typical drive spin up time is on the order of
3-7 seconds.  Hell, I remember my pair of first generation seagate
cheetah 15,000 rpm drives seemed to take *forever* to spin up and that
still was maybe only 15 seconds.  If a drive takes longer than 30
seconds, then there is something wrong with it.  I figure there is a
reason why spin up time is tracked by SMART so it seems like long spin
up time is a sign of a sick drive.


I was recently re-factoring Underdog (http://underdog.sourceforge.net) 
startup scripts to separate out the various startup domains (e.g. lvm, 
luks, mdadm) in the prototype init.


So I notice you (Duncan) use the word stabilize, as do a small number 
of drivers in the linux kernel. This word has very little to do with 
disks per se.


Between SCSI probing LUNs (where the controller tries every theoretical 
address and gives a potential device ample time to reply), and 
usb-storage having a simple timer delay set for each volume it sees, 
there is a lot of waiting in the name of safety going on in the linux 
kernel at device initialization.


When I added the messages scanning /dev/sd?? to the startup sequence 
as I iterate through the disks and partitions present, I discovered that 
the first time I called blkid (e.g. right between /dev/sda and 
/dev/sda1) I'd get a huge hit of many human seconds (I didn't time it, 
but I'd say eight or so) just for having a 2TB My Book WD 3.0 disk 
enclosure attached as /dev/sdc. This enclosure having spun up in the 
previous boot cycle and this only being a soft reboot was immaterial. In this 
case usb-storage is going to take its time and do its deal regardless of 
the state of the physical drive itself.


So there are _lots_ of places where you are going to get delays and very 
few of them involve the disk itself going from power-off to ready.


You said it yourself with respect to SSDs.

It's cheaper, and less error prone, and less likely to generate customer 
returns if the generic controller chips just send init, wait a fixed 
delay, then request a status compared to trying to are-you-there-yet 
poll each device like a nagging child. And you are going to see that at 
every level. And you are going to see it multiply with _sparsely_ 
provisioned buses where the cycle is going to be retried for absent LUNs 
(one disk on a Wide SCSI bus and a controller set to probe all LUNs is 
particularly egregious)


One of the reasons that the whole industry has started favoring 
point-to-point (SATA, SAS) or physical intercessor chaining 
point-to-point (eSATA) buses is to remove a lot of those wait-and-see 
delays.


That said, you should not see a drive (or target enclosure, or 
controller) reset during spin up. In a SCSI setting this is almost 
always a cabling, termination, or addressing issue. In IDE it's a jumper 
mismatch (master vs slave vs cable-select). Less often it's a 
partitioning issue (trying to access sectors beyond the end of the drive).


Another strong actor is selecting the wrong storage controller chipset 
driver. In that case you may be falling back from the high-end device you 
think it is, through an intermediate chip-set, and back to ACPI or BIOS 
emulation.


Another common cause is having a dedicated hardware RAID controller 
(dell likes to put LSI MegaRaid controllers in their boxes for example), 
many mother boards have hardware RAID support available through the 
bios, etc, leaving that feature active, then adding a drive and 
_not_ initializing that drive with the RAID controller disk setup. In 
this case the controller is going to repeatedly probe the drive for its 
proprietary controller signature blocks (and reset the drive after each 
attempt) and then finally fall back to raw block pass-through. This can 
take a long time (thirty seconds to a minute).


But seriously, if you are seeing resets anywhere in any storage chain 
during a normal power-on cycle then you've got a problem with geometry 
or configuration.



Re: scrub implies failing drive - smartctl blissfully unaware

2014-11-19 Thread Phillip Susi

On 11/19/2014 4:05 PM, Robert White wrote:
 It's cheaper, and less error prone, and less likely to generate
 customer returns if the generic controller chips just send init,
 wait a fixed delay, then request a status compared to trying to
 are-you-there-yet poll each device like a nagging child. And you
 are going to see that at every level. And you are going to see it
 multiply with _sparsely_ provisioned buses where the cycle is going
 to be retried for absent LUNs (one disk on a Wide SCSI bus and a
 controller set to probe all LUNs is particularly egregious)

No, they do not wait a fixed time, then proceed.  They do in fact
issue the command, then poll or wait for an interrupt to know when it
is done, then time out and give up if that doesn't happen within a
reasonable amount of time.

 One of the reasons that the whole industry has started favoring 
 point-to-point (SATA, SAS) or physical intercessor chaining 
 point-to-point (eSATA) buses is to remove a lot of those
 wait-and-see delays.

Nope... even with the ancient PIO mode PATA interface, you polled a
ready bit in the status register to see if it was done yet.  If you
always waited 30 seconds for every command your system wouldn't boot
up until next year.
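
For reference, classic PIO-mode polling amounts to spinning on the legacy
status port until BSY clears.  A small sketch using the well-known legacy
port and bit values (illustration only, not code from any driver discussed
here):

#include <stdio.h>
#include <sys/io.h>		/* inb(), ioperm(); x86 only, needs root */

#define ATA_PRIMARY_STATUS	0x1F7
#define ATA_SR_BSY		0x80	/* device busy */
#define ATA_SR_DRDY		0x40	/* device ready */
#define ATA_SR_ERR		0x01	/* error */

static int ata_wait_ready(void)
{
	unsigned char status;

	/* a real driver bounds this loop with a timeout instead of
	 * spinning forever */
	do {
		status = inb(ATA_PRIMARY_STATUS);
	} while (status & ATA_SR_BSY);

	if (status & ATA_SR_ERR)
		return -1;
	return (status & ATA_SR_DRDY) ? 0 : -1;
}

int main(void)
{
	if (ioperm(ATA_PRIMARY_STATUS, 1, 1))	/* grant port access */
		return 1;
	printf("drive %s\n", ata_wait_ready() == 0 ? "ready" : "not ready");
	return 0;
}

The point being that the host notices "done" (or an error) as soon as the
device reports it; the 30 second figure is only the upper bound before the
kernel gives up.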

 Another strong actor is selecting the wrong storage controller
 chipset driver. In that case you may be falling back from high-end
 device you think it is, through intermediate chip-set, and back to
 ACPI or BIOS emulation

There is no such thing as ACPI or BIOS emulation.  AHCI SATA
controllers do usually have an old IDE emulation mode instead of AHCI
mode, but this isn't going to cause ridiculously long delays.

 Another common cause is having a dedicated hardware RAID
 controller (dell likes to put LSI MegaRaid controllers in their
 boxes for example), many mother boards have hardware RAID support
 available through the bios, etc, leaving that feature active, then
 the adding a drive and

That would be fake raid, not hardware raid.

 _not_ initializing that drive with the RAID controller disk setup.
 In this case the controller is going to repeatedly probe the drive
 for its proprietary controller signature blocks (and reset the
 drive after each attempt) and then finally fall back to raw block
 pass-through. This can take a long time (thirty seconds to a
 minute).

No, no, and no.  If it reads the drive and does not find its metadata,
it falls back to pass through.  The actual read takes only
milliseconds, though it may have to wait a few seconds for the drive
to spin up.  There is no reason it would keep retrying after a
successful read.

The way you end up with 30-60 second startup time with a raid is if
you have several drives and staggered spinup mode enabled, then each
drive is started one at a time instead of all at once so their
cumulative startup time can add up fairly high.




Re: scrub implies failing drive - smartctl blissfully unaware

2014-11-19 Thread Robert White

Shame you already know everything?

On 11/19/2014 01:47 PM, Phillip Susi wrote:

On 11/19/2014 4:05 PM, Robert White wrote:






One of the reasons that the whole industry has started favoring
point-to-point (SATA, SAS) or physical intercessor chaining
point-to-point (eSATA) buses is to remove a lot of those
wait-and-see delays.


Nope... even with the ancient PIO mode PATA interface, you polled a
ready bit in the status register to see if it was done yet.  If you
always waited 30 seconds for every command your system wouldn't boot
up until next year.


The controller, the thing that sets the ready bit and sends the 
interrupt is distinct from the driver, the thing that polls the ready 
bit when the interrupt is sent. At the bus level there are fixed delays 
and retries. Try putting two drives on a pin-select IDE bus and 
strapping them both as _slave_ (or indeed master) sometime and watch the 
shower of fixed delay retries.



Another strong actor is selecting the wrong storage controller
chipset driver. In that case you may be faling back from high-end
device you think it is, through intermediate chip-set, and back to
ACPI or BIOS emulation


There is no such thing as ACPI or BIOS emulation.


That's odd... my bios reads from storage to boot the device and it does 
so using the ACPI storage methods.


ACPI 4.0 Specification Section 9.8 even disagrees with you at some length.

Let's just do the titles shall we:

9.8 ATA Controller Devices
9.8.1 Objects for both ATA and SATA Controllers.
9.8.2 IDE Controller Device
9.8.3 Serial ATA (SATA) controller Device

Oh, and _lookie_ _here_ in Linux Kernel Menuconfig at
Device Drivers --->
 * Serial ATA and Parallel ATA drivers (libata) --->
  * ACPI firmware driver for PATA

CONFIG_PATA_ACPI:

This option enables an ACPI method driver which drives motherboard PATA 
controller interfaces through the ACPI firmware in the BIOS. This driver 
can sometimes handle otherwise unsupported hardware.


You are a storage _genius_ for knowing that all that stuff doesn't 
exist... the rest of us must simply muddle along in our delusion...


 AHCI SATA
 controllers do usually have an old IDE emulation mode instead of AHCI
 mode, but this isn't going to cause ridiculously long delays.

Do tell us more... I didn't say the driver would cause long delays, I 
said that the time it takes to error out other improperly supported 
drivers and fall back to this one could induce long delays and resets.


I think I am done with your expertise in the question of all things 
storage related.


Not to be rude... but I'm physically ill and maybe I shouldn't be 
posting right now... 8-)


-- Rob.


Re: scrub implies failing drive - smartctl blissfully unaware

2014-11-19 Thread Robert White

P.S.

On 11/19/2014 01:47 PM, Phillip Susi wrote:

Another common cause is having a dedicated hardware RAID
controller (dell likes to put LSI MegaRaid controllers in their
boxes for example), many mother boards have hardware RAID support
available through the bios, etc, leaving that feature active, then
the adding a drive and


That would be fake raid, not hardware raid.


The LSI MegaRaid controller people would _love_ to hear more about your 
insight into how their battery-backed multi-drive RAID controller is 
fake. You should go work for them. Try the "contact us" link at the 
bottom of this page. I'm sure they are waiting for your insight with 
bated breath!


http://www.lsi.com/products/raid-controllers/pages/megaraid-sas-9260-8i.aspx


_not_ initializing that drive with the RAID controller disk setup.
In this case the controller is going to repeatedly probe the drive
for its proprietary controller signature blocks (and reset the
drive after each attempt) and then finally fall back to raw block
pass-through. This can take a long time (thirty seconds to a
minute).


No, no, and no.  If it reads the drive and does not find its metadata,
it falls back to pass through.  The actual read takes only
milliseconds, though it may have to wait a few seconds for the drive
to spin up.  There is no reason it would keep retrying after a
successful read.


Odd, my MegaRaid controller takes about fifteen seconds by-the-clock to 
initialize and do the integrity check on my single initialized drive. 
It's amazing that with a fail and retry it would be _faster_...


It's like you know _everything_...




Re: scrub implies failing drive - smartctl blissfully unaware

2014-11-19 Thread Duncan
Phillip Susi posted on Wed, 19 Nov 2014 11:07:43 -0500 as excerpted:

 
 On 11/18/2014 9:46 PM, Duncan wrote:
 I'm not sure about normal operation, but certainly, many drives take
 longer than 30 seconds to stabilize after power-on, and I routinely see
 resets during this time.
 
 As far as I have seen, typical drive spin up time is on the order of 3-7
 seconds.  Hell, I remember my pair of first generation seagate cheetah
 15,000 rpm drives seemed to take *forever* to spin up and that still was
 maybe only 15 seconds.  If a drive takes longer than 30 seconds, then
 there is something wrong with it.  I figure there is a reason why spin
 up time is tracked by SMART so it seems like long spin up time is a sign
 of a sick drive.

It's not physical spinup, but electronic device-ready.  It happens on 
SSDs too and they don't have anything to spin up.

But, for instance on my old seagate 300-gigs that I used to have in 4-way 
mdraid, when I tried to resume from hibernate the drives would be spunup 
and talking to the kernel, but for some seconds to a couple minutes or so 
after spinup, they'd sometimes return something like (example) 
Seagrte3x0 instead of Seagate300.  Of course that wasn't the exact 
string, I think it was the model number or perhaps the serial number or 
something, but looking at dmesg I could see the ATA layer come up for each of 
the four devices, the connection get established and seem to be returning good 
data, then the mdraid layer would try to assemble and would kick out a 
drive or two due to the device string mismatch compared to what was there 
before the hibernate.  With the string mismatch, from its perspective the 
device had disappeared and been replaced with something else.

But if I held it at the grub prompt for a couple minutes and /then/ let 
it go, or part of the time on its own, all four drives would match and 
it'd work fine.  For just short hibernates (as when testing hibernate/
resume), it'd come back just fine; as it would nearly all the time out to 
two hours or so.  Beyond that, out to 10 or 12 hours, the longer it sat 
the more likely it would be to fail, if it didn't hold it at the grub 
prompt for a few minutes to let it stabilize.

And now I've seen similar behavior resuming from suspend (the old hardware 
wouldn't resume from suspend to ram, only hibernate, the new hardware 
resumes from suspend to ram just fine, but I had trouble getting it to 
resume from hibernate back when I first setup and tried it; I've not 
tried hibernate since and didn't even setup swap to hibernate to when I 
got the SSDs so I've not tried it for a couple years) on SSDs with btrfs 
raid.  Btrfs isn't as informative as was mdraid on why it kicks a device, 
but dmesg says both devices are up, while btrfs is suddenly spitting 
errors on one device.  A reboot later and both devices are back in the 
btrfs and I can do a scrub to resync, which generally finds and fixes 
errors on the btrfs that were writable (/home and /var/log), but of 
course not on the btrfs mounted as root, since it's read-only by default.

Same pattern.  Immediate suspend and resume is fine.  Out to about 6 
hours it tends to be fine as well.  But at 8-10 hours in suspend, btrfs 
starts spitting errors often enough that I generally quit trying to 
suspend at all, I simply shut down now.  (With SSDs and systemd, shutdown 
and restart is fast enough, and the delay from having to refill cache low 
enough, that the time difference between suspend and full shutdown is 
hardly worth troubling with anyway, certainly not when there's a risk to 
data due to failure to properly resume.)

But it worked fine when I had only a single device to bring back up.  
Nothing to be slower than another device to respond and thus to be kicked 
out as dead.


I finally realized what was happening after I read a study paper 
mentioning capacitor charge time and solid-state stability time, and how 
a lot of cheap devices say they're ready before the electronics have 
actually properly stabilized.  On SSDs, this is a MUCH worse issue than 
it is on spinning rust, because the logical layout isn't practically 
forced to serial like it is on spinning rust, and the firmware can get so 
jumbled it pretty much scrambles the device.  And it's not just the 
normal storage either.  In the study, many devices corrupted their own 
firmware as well!

Now that was definitely a worst-case study in that they were deliberately 
yanking and/or fast-switching the power, not just doing time-on waits, 
but still, a surprisingly high proportion of SSDs not only scrambled the 
storage, but scrambled their firmware as well.  (On those devices the 
firmware may well have been on the same media as the storage, with the 
firmware simply read in first in a hardware bootstrap mode, and the 
firmware programmed to avoid that area in normal operation thus making it 
as easily corrupted as the the normal storage.)

The paper specifically 

Re: scrub implies failing drive - smartctl blissfully unaware

2014-11-19 Thread Chris Murphy
On Wed, Nov 19, 2014 at 8:11 AM, Phillip Susi ps...@ubuntu.com wrote:

 On 11/18/2014 9:40 PM, Chris Murphy wrote:
 It’s well known on linux-raid@ that consumer drives have well over
 30 second deep recoveries when they lack SCT command support. The
 WDC and Seagate “green” drives are over 2 minutes apparently. This
 isn’t easy to test because it requires a sector with enough error
 that it requires the ECC to do something, and yet not so much error
 that it gives up in less than 30 seconds. So you have to track down
 a drive model spec document (one of those 100 pagers).

 This makes sense, sorta, because the manufacturer use case is
 typically single drive only, and most proscribe raid5/6 with such
 products. So it’s a “recover data at all costs” behavior because
 it’s assumed to be the only (immediately) available copy.

 It doesn't make sense to me.  If it can't recover the data after one
 or two hundred retries in one or two seconds, it can keep trying until
 the cows come home and it just isn't ever going to work.

I'm not a hard drive engineer, so I can't argue either point. But
consumer drives clearly do behave this way. On Linux, the kernel's
default 30 second command timer eventually results in what look like
link errors rather than drive read errors. And instead of the problems
being fixed with the normal md and btrfs recovery mechanisms, the
errors simply get worse and eventually there's data loss. Exhibits A,
B, C, D - the linux-raid list is full to the brim of such reports and
their solution.


 I don’t see how that’s possible because anything other than the
 drive explicitly producing  a read error (which includes the
 affected LBA’s), it’s ambiguous what the actual problem is as far
 as the kernel is concerned. It has no way of knowing which of
 possibly dozens of ata commands queued up in the drive have
 actually hung up the drive. It has no idea why the drive is hung up
 as well.

 IIRC, this is true when the drive returns failure as well.  The whole
 bio is marked as failed, and the page cache layer then begins retrying
 with progressively smaller requests to see if it can get *some* data out.

Well that's very coarse. It's not at a sector level, so as long as the
drive continues to try to read from a particular LBA, but fails to
either succeed reading or give up and report a read error, within 30
seconds, then you just get a bunch of wonky system behavior.

Conversely what I've observed on Windows in such a case, is it
tolerates these deep recoveries on consumer drives. So they just get
really slow but the drive does seem to eventually recover (until it
doesn't). But yeah 2 minutes is a long time. So then the user gets
annoyed and reinstalls their system. Since that means writing to the
affected drive, the firmware logic causes bad sectors to be
dereferenced when the write error is persistent. Problem solved,
faster system.




 No I think 30 is pretty sane for servers using SATA drives because
 if the bus is reset all pending commands in the queue get
 obliterated which is worse than just waiting up to 30 seconds. With
 SAS drives maybe less time makes sense. But in either case you
 still need configurable SCT ERC, or it needs to be a sane fixed
 default like 70 deciseconds.

 Who cares if multiple commands in the queue are obliterated if they
 can all be retried on the other mirror?

Because now you have a member drive that's inconsistent. At least in
the md raid case, a certain number of read failures causes the drive
to be ejected from the array. Anytime there's a write failure, it's
ejected from the array too. What you want is for the drive to give up
sooner with an explicit read error, so md can help fix the problem by
writing good data to the affected LBA. That doesn't happen when there
are a bunch of link resets happening.
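
For what it's worth, you can watch md do exactly that by kicking off a
scrub by hand; a sketch, assuming an array at /dev/md0:

  # read every sector; on a read error md rebuilds the data from the
  # remaining devices (or parity) and writes it back to the bad sector
  echo check > /sys/block/md0/md/sync_action

  # progress and results
  cat /proc/mdstat
  cat /sys/block/md0/md/mismatch_cnt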


 Better to fall back to the
 other mirror NOW instead of waiting 30 seconds ( or longer! ).  Sure,
 you might end up recovering more than you really had to, but that
 won't hurt anything.

Again, if your drive SCT ERC is configurable, and set to something
sane like 70 deciseconds, that read failure happens at MOST 7 seconds
after the read attempt. And md is notified of *exactly* which sectors
are affected; it immediately goes to the mirror, or rebuilds the data
from parity, and then writes the correct data back to the previously
reported bad sectors. And that fixes the problem.

So really, if you're going to play the multiple device game, you need
drive error timing to be shorter than the kernel's.
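
For drives that do support it, the setting is a one-liner with smartctl.
A sketch (sdX is a placeholder; note that on many drives the value does
not survive a power cycle, so it usually goes in a boot script or udev
rule):

  # query the current SCT error recovery control timers
  smartctl -l scterc /dev/sdX

  # set read and write recovery limits to 70 deciseconds (7 seconds)
  smartctl -l scterc,70,70 /dev/sdX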



Chris Murphy


Re: scrub implies failing drive - smartctl blissfully unaware

2014-11-19 Thread Duncan
Robert White posted on Wed, 19 Nov 2014 13:05:13 -0800 as excerpted:

 One of the reasons that the whole industry has started favoring
 point-to-point (SATA, SAS) or physical intercessor chaining
 point-to-point (eSATA) buses is to remove a lot of those wait-and-see
 delays.
 
 That said, you should not see a drive (or target enclosure, or
 controller) reset during spin up. In a SCSI setting this is almost
 always a cabling, termination, or addressing issue. In IDE it's a jumper
 mismatch (master vs slave vs cable-select). Less often it's a
 partitioning issue (trying to access sectors beyond the end of the
 drive).
 
 Another strong factor is selecting the wrong storage controller chipset
 driver. In that case you may be falling back from the high-end device you
 think it is, through an intermediate chip-set, and back to ACPI or BIOS
 emulation.

FWIW I run a custom-built monolithic kernel, with only the specific 
drivers (SATA/AHCI in this case) built in.  There are no drivers for 
anything else it could fall back to.

Once in a while I do see it try at, say, 6-gig speeds, then eventually 
fall back to 3 and ultimately 1.5, but when I see that it /is/ indicative 
of other issues.  And like I said, there are no other drivers to fall 
back to, so obviously I never see it doing that.
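
Concretely, that amounts to a .config along these lines (symbol names
from mainline; the point is simply that no other low-level ATA drivers
exist for the probe to fall back to):

  CONFIG_ATA=y
  CONFIG_SATA_AHCI=y
  # all other SATA/PATA low-level driver options left unset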

 Another common cause is having a dedicated hardware RAID controller
 (Dell likes to put LSI MegaRAID controllers in their boxes, for example;
 many motherboards also have hardware RAID support available through the
 BIOS, etc.), leaving that feature active, and then adding a drive and
 _not_ initializing that drive with the RAID controller's disk setup. In
 this case the controller is going to repeatedly probe the drive for its
 proprietary controller signature blocks (and reset the drive after each
 attempt) and then finally fall back to raw block pass-through. This can
 take a long time (thirty seconds to a minute).

Everything's set JBOD here.  I don't trust those proprietary firmware 
raid things.  Besides, that kills portability.  JBOD SATA and AHCI are 
sufficiently standardized that should the hardware die, I can switch out 
to something else and not have to worry about rebuilding the custom 
kernel with the new drivers.  Some proprietary firmware raid, requiring 
dmraid at the software kernel level to support, when I can just as easily 
use full software mdraid on standardized JBOD, no thanks!

And to be sure, that's one of the first things I check when I set up a new 
box: any so-called hardware raid that's actually firmware/software raid 
gets disabled, and JBOD mode enabled.

 But seriously, if you are seeing reset anywhere in any storage chain
 during a normal power-on cycle then you've got a problem  with geometry
 or configuration.

IIRC I don't get it routinely.  But I've seen it a few times, attributing 
it, as I said, to the 30-second SATA-level timeout not being long enough.

Most often, however, it's at resume, not original startup, which is 
understandable as state at resume doesn't match state at suspend/
hibernate.  The irritating thing, as previously discussed, is when one 
device takes long enough to come back that mdraid or btrfs drops it out, 
generally forcing the reboot I was trying to avoid with the suspend/
hibernate in the first place, along with a re-add and resync (for mdraid) 
or a scrub (for btrfs raid).
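
The recovery itself is at least mechanical; a sketch with placeholder
device and mount-point names:

  # mdraid: put the dropped member back and let it resync
  mdadm /dev/md0 --re-add /dev/sdX1
  cat /proc/mdstat

  # btrfs raid: once all devices are present again, scrub to bring the
  # stale copies back in sync
  btrfs scrub start /mountpoint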

-- 
Duncan - List replies preferred.   No HTML msgs.
Every nonfree program has a lord, a master --
and if you use the program, he is your master.  Richard Stallman



Re: scrub implies failing drive - smartctl blissfully unaware

2014-11-19 Thread Robert White

On 11/19/2014 04:25 PM, Duncan wrote:

Most often, however, it's at resume, not original startup, which is
understandable as state at resume doesn't match state at suspend/
hibernate.  The irritating thing, as previously discussed, is when one
device takes long enough to come back that mdraid or btrfs drops it out,
generally forcing the reboot I was trying to avoid with the suspend/
hibernate in the first place, along with a re-add and resync (for mdraid)
or a scrub (for btrfs raid).


If you want a practical solution you might want to look at 
http://underdog.sourceforge.net (my project, shameless plug). The actual 
user context return isn't in there, but I use the project to build 
initramfs images into all my kernels.


[DISCLAIMER: The cryptsetup and LUKS stuff is rock solid, but the mdadm 
incremental build stuff is very rough and only lightly tested.]


You could easily add a drive preheat code block (spin up and status-check 
all drives, with pause and retry) as a preamble function that would safely 
run before any attempt is made at the resume stage.


extemporaneous example::

--- snip ---
cat <<'EOT' >/opt/underdog/utility/preheat.mod
#!/bin/bash
# ROOT_COMMANDS+=( commands your preheat needs )
UNDERDOG+=( init.d/preheat )
EOT

cat <<'EOT' >/opt/underdog/prototype/init.d/preheat
#!/bin/bash
function __preamble_preheat() {
	# whatever logic you need
	return 0
}
__preamble_funcs+=( [preheat]=__preamble_preheat )
EOT
--- snip ---

Install underdog, paste the above into a shell once, and edit 
/opt/underdog/prototype/init.d/preheat to put in whatever logic you need.
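
Purely as an illustration of the kind of body that could go in there (the
device glob, retry count, and sleep interval are made-up values, not
anything underdog itself provides):

--- snip ---
function __preamble_preheat() {
	# poke each disk with a tiny direct read so it spins up,
	# retrying a few times before moving on to the next one
	local dev try
	for dev in /dev/sd?; do
		for try in 1 2 3; do
			dd if="$dev" of=/dev/null bs=512 count=1 \
				iflag=direct 2>/dev/null && break
			sleep 5
		done
	done
	return 0
}
--- snip ---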


Follow the instructions in /opt/underdog/README.txt for making the 
initramfs image or, as I do, build the initramfs into the kernel image.


The preamble will be run in the resultant /init script before the swap 
partitions are submitted for attempted resume.


(The system does support complexity like resuming from a swap partition 
inside an LVM/LV built over a LUKS encrypted media expanse, or just a 
plain laptop with one plain partitioned disk, with zero changes to the 
necessary default config.)


-- Rob.





[PATCH] btrfs: remove empty fs_devices to prevent memory runout

2014-11-19 Thread Gui Hecheng
There is a global list @fs_uuids that keeps an @fs_devices object
for each created btrfs. But when a btrfs becomes empty
(all devices belonging to it are gone), its @fs_devices remains
in the @fs_uuids list until module exit.
If we keep running mkfs.btrfs on the same device again and again,
the empty @fs_devices objects produced will eventually eat up our
memory, so this case had better be prevented.
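
A hypothetical reproducer, just to illustrate the leak (device name and
loop count are arbitrary and not part of this patch):

  # each mkfs leaves the previous, now-empty @fs_devices behind
  for i in $(seq 1 10000); do
          mkfs.btrfs -f /dev/sdX >/dev/null
  done
  # the kmalloc'd fs_devices/btrfs_device objects show up as slab growth
  grep Slab /proc/meminfo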

I think that each time we set up btrfs on a device, we should
check whether we are stealing a device from another btrfs
seen before. To facilitate the search, we can insert
every @btrfs_device into an rb_root, one @btrfs_device per physical
device, with @bdev->bd_dev as the key. Each time device stealing
happens, we replace the corresponding @btrfs_device in the rb_root
with an up-to-date version.
If the stolen device is the last device in its @fs_devices,
then we have an empty btrfs to be deleted.

Actually there are 3 ways to steal devices and end up with an empty btrfs:
1. mkfs, with -f option
2. device add, with -f option
3. device replace, with -f option
We should handle all of these cases.

Moreover, if there are seed devices, then it is assured that
the devices in the cloned @fs_devices are not treated as valid devices.

Signed-off-by: Gui Hecheng guihc.f...@cn.fujitsu.com
---
 fs/btrfs/super.c   |   1 +
 fs/btrfs/volumes.c | 181 -
 fs/btrfs/volumes.h |   6 ++
 3 files changed, 172 insertions(+), 16 deletions(-)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 54bd91e..ee09a56 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -2154,6 +2154,7 @@ static void __exit exit_btrfs_fs(void)
btrfs_end_io_wq_exit();
 	unregister_filesystem(&btrfs_fs_type);
btrfs_exit_sysfs();
+   btrfs_cleanup_valid_dev_root();
btrfs_cleanup_fs_uuids();
btrfs_exit_compress();
btrfs_hash_exit();
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 0192051..ba86b1b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -27,6 +27,7 @@
 #include <linux/kthread.h>
 #include <linux/raid/pq.h>
 #include <linux/semaphore.h>
+#include <linux/rbtree.h>
 #include <asm/div64.h>
 #include "ctree.h"
 #include "extent_map.h"
@@ -52,6 +53,120 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
 
 DEFINE_MUTEX(uuid_mutex);
 static LIST_HEAD(fs_uuids);
+static struct rb_root valid_dev_root = RB_ROOT;
+
+static struct btrfs_device *insert_valid_device(struct btrfs_device *new_dev)
+{
+	struct rb_node **p;
+	struct rb_node *parent;
+	struct rb_node *new;
+	struct btrfs_device *old_dev;
+
+	WARN_ON(!mutex_is_locked(&uuid_mutex));
+
+	parent = NULL;
+	new = &new_dev->rb_node;
+
+	p = &valid_dev_root.rb_node;
+	while (*p) {
+		parent = *p;
+		old_dev = rb_entry(parent, struct btrfs_device, rb_node);
+
+		if (new_dev->devnum < old_dev->devnum)
+			p = &parent->rb_left;
+		else if (new_dev->devnum > old_dev->devnum)
+			p = &parent->rb_right;
+		else {
+			rb_replace_node(parent, new, &valid_dev_root);
+			RB_CLEAR_NODE(parent);
+
+			goto out;
+		}
+	}
+
+	old_dev = NULL;
+	rb_link_node(new, parent, p);
+	rb_insert_color(new, &valid_dev_root);
+
+out:
+	return old_dev;
+}
+
+static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
+{
+	struct btrfs_device *device;
+	WARN_ON(fs_devices->opened);
+	while (!list_empty(&fs_devices->devices)) {
+		device = list_entry(fs_devices->devices.next,
+				    struct btrfs_device, dev_list);
+		list_del(&device->dev_list);
+		rcu_string_free(device->name);
+		kfree(device);
+	}
+	kfree(fs_devices);
+}
+
+static void remove_empty_fs_if_need(struct btrfs_fs_devices *old_fs)
+{
+	struct btrfs_fs_devices *seed_fs;
+
+	if (!list_empty(&old_fs->devices))
+		return;
+
+	list_del(&old_fs->list);
+
+	/* free the seed clones */
+	seed_fs = old_fs->seed;
+	free_fs_devices(old_fs);
+	while (seed_fs) {
+		old_fs = seed_fs;
+		seed_fs = seed_fs->seed;
+		free_fs_devices(old_fs);
+	}
+
+}
+
+static void replace_invalid_device(struct btrfs_device *new_dev)
+{
+	struct btrfs_device *invalid_dev;
+	struct btrfs_fs_devices *old_fs;
+
+	WARN_ON(!mutex_is_locked(&uuid_mutex));
+
+	invalid_dev = insert_valid_device(new_dev);
+	if (!invalid_dev)
+		return;
+
+	old_fs = invalid_dev->fs_devices;
+	mutex_lock(&old_fs->device_list_mutex);
+	list_del(&invalid_dev->dev_list);
+	rcu_string_free(invalid_dev->name);
+	kfree(invalid_dev);
+	mutex_unlock(&old_fs->device_list_mutex);
+
+