[PATCH] Btrfs: use list instead of rbtree for free_space cluster

2014-12-30 Thread Liu Bo
Our free_space cluster currently only uses rb_next to find a proper
free_space entry by iterating the rbtree; there is no search involved,
so it is more efficient to iterate a list rather than an rbtree.

This is a straightforward change that converts the rbtree to a list.

Signed-off-by: Liu Bo 
---
 fs/btrfs/ctree.h|   3 +-
 fs/btrfs/free-space-cache.c | 187 +++-
 fs/btrfs/free-space-cache.h |   1 +
 3 files changed, 99 insertions(+), 92 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index be47b10..7e539a9 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1209,7 +1209,8 @@ struct btrfs_block_rsv {
 struct btrfs_free_cluster {
spinlock_t lock;
spinlock_t refill_lock;
-   struct rb_root root;
+
+   struct list_head free_space;
 
/* largest extent in this cluster */
u64 max_size;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 448cf6f..ad0d845 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -43,6 +43,26 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl,
 static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
  struct btrfs_free_space *info);
 
+static struct btrfs_free_space *alloc_free_space_cache(gfp_t mask)
+{
+   struct btrfs_free_space *e;
+
+   e = kmem_cache_zalloc(btrfs_free_space_cachep, mask);
+   if (!e)
+   return NULL;
+
+   RB_CLEAR_NODE(&e->offset_index);
+   INIT_LIST_HEAD(&e->cluster_list);
+   return e;
+}
+
+static void reclaim_free_space_cache(struct btrfs_free_space *info)
+{
+   WARN_ON_ONCE(!list_empty(&info->cluster_list));
+   kmem_cache_free(btrfs_free_space_cachep, info);
+}
+
 static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
   struct btrfs_path *path,
   u64 offset)
@@ -630,7 +650,7 @@ again:
unlink_free_space(ctl, prev);
unlink_free_space(ctl, e);
prev->bytes += e->bytes;
-   kmem_cache_free(btrfs_free_space_cachep, e);
+   reclaim_free_space_cache(e);
link_free_space(ctl, prev);
prev = NULL;
spin_unlock(&ctl->tree_lock);
@@ -725,19 +745,18 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
goto free_cache;
 
while (num_entries) {
-   e = kmem_cache_zalloc(btrfs_free_space_cachep,
- GFP_NOFS);
+   e = alloc_free_space_cache(GFP_NOFS);
if (!e)
goto free_cache;
 
ret = io_ctl_read_entry(&io_ctl, e, &type);
if (ret) {
-   kmem_cache_free(btrfs_free_space_cachep, e);
+   reclaim_free_space_cache(e);
goto free_cache;
}
 
if (!e->bytes) {
-   kmem_cache_free(btrfs_free_space_cachep, e);
+   reclaim_free_space_cache(e);
goto free_cache;
}
 
@@ -748,7 +767,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
if (ret) {
btrfs_err(root->fs_info,
"Duplicate entries in free space cache, 
dumping");
-   kmem_cache_free(btrfs_free_space_cachep, e);
+   reclaim_free_space_cache(e);
goto free_cache;
}
} else {
@@ -756,8 +775,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
num_bitmaps--;
e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
if (!e->bitmap) {
-   kmem_cache_free(
-   btrfs_free_space_cachep, e);
+   reclaim_free_space_cache(e);
goto free_cache;
}
spin_lock(&ctl->tree_lock);
@@ -768,7 +786,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
if (ret) {
btrfs_err(root->fs_info,
"Duplicate entries in free space cache, 
dumping");
-   kmem_cache_free(btrfs_free_space_cachep, e);
+   reclaim_free_space_cache(e);
goto free_cache;
}
list_add_tail(&e->list, &bitmaps);
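
(Not part of the patch: a minimal sketch of the two iteration patterns
involved, using a simplified stand-in for the free-space entry type.
Walking a list_head is a plain pointer chase, while rb_next() has to
climb the tree on every step.)

#include <linux/list.h>
#include <linux/rbtree.h>
#include <linux/types.h>

/* Simplified stand-in for struct btrfs_free_space, for illustration only. */
struct entry {
	u64 offset;
	u64 bytes;
	struct rb_node offset_index;	/* old: cluster entries lived in an rbtree */
	struct list_head cluster_list;	/* new: cluster entries live on a list */
};

/* Old pattern: in-order walk with rb_first()/rb_next(). */
static u64 sum_bytes_rbtree(struct rb_root *root)
{
	struct rb_node *node;
	u64 sum = 0;

	for (node = rb_first(root); node; node = rb_next(node))
		sum += rb_entry(node, struct entry, offset_index)->bytes;
	return sum;
}

/* New pattern: plain list traversal, no tree climbing. */
static u64 sum_bytes_list(struct list_head *head)
{
	struct entry *e;
	u64 sum = 0;

	list_for_each_entry(e, head, cluster_list)
		sum += e->bytes;
	return sum;
}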

[PATCH 0/5] Btrfs progs, coverity fixes for 3.18-rc3

2014-12-30 Thread David Sterba
Several issues reported by coverity, minor resource leaks and two bugfixes.

David Sterba (5):
  btrfs-progs: check, fix path leak in error branch
  btrfs-progs: fi show, don't leak canonical path
  btrfs-progs: check, missing parens around compound block in
find_normal_file_extent
  btrfs-progs: fix overflow check in btrfs_insert_inode_ref
  btrfs-progs: fix minor leak of dev_info in btrfs_scan_kernel

 cmds-check.c  | 8 +---
 cmds-filesystem.c | 6 --
 inode-item.c  | 2 +-
 3 files changed, 10 insertions(+), 6 deletions(-)

-- 
2.1.3

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/5] btrfs-progs: check, fix path leak in error branch

2014-12-30 Thread David Sterba
Resolves-coverity-id: 1260250
Signed-off-by: David Sterba 
---
 cmds-check.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/cmds-check.c b/cmds-check.c
index e74b116c0c43..71e4f4f3a13b 100644
--- a/cmds-check.c
+++ b/cmds-check.c
@@ -2839,7 +2839,7 @@ static int repair_btree(struct btrfs_root *root,
ret = PTR_ERR(trans);
fprintf(stderr, "Error starting transaction: %s\n",
strerror(-ret));
-   return ret;
+   goto out_free_path;
}
cache = first_cache_extent(corrupt_blocks);
while (cache) {
@@ -2894,8 +2894,9 @@ static int repair_btree(struct btrfs_root *root,
cache = next_cache_extent(cache);
}
 out:
-   btrfs_free_path(path);
btrfs_commit_transaction(trans, root);
+out_free_path:
+   btrfs_free_path(path);
return ret;
 }
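
(Not part of the patch: the layered-goto cleanup idiom the fix relies on,
as a small standalone sketch with stand-in resources. Each error path
jumps to the label that releases only what has already been acquired, so
nothing leaks on any branch.)

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

static int do_work(void)
{
	int ret = 0;
	char *path, *trans;

	path = malloc(64);		/* stands in for btrfs_alloc_path() */
	if (!path)
		return -ENOMEM;

	trans = malloc(64);		/* stands in for starting a transaction */
	if (!trans) {
		ret = -ENOMEM;
		goto out_free_path;	/* only 'path' exists at this point */
	}

	snprintf(trans, 64, "work using %p", (void *)path);

	free(trans);			/* stands in for committing the transaction */
out_free_path:
	free(path);
	return ret;
}

int main(void)
{
	return do_work() ? 1 : 0;
}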
 
-- 
2.1.3

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 5/5] btrfs-progs: fix minor leak of dev_info in btrfs_scan_kernel

2014-12-30 Thread David Sterba
Resolves-coverity-id: 1127098
Signed-off-by: David Sterba 
---
 cmds-filesystem.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/cmds-filesystem.c b/cmds-filesystem.c
index 1c1d34ae8ca2..a3cf114fb6ac 100644
--- a/cmds-filesystem.c
+++ b/cmds-filesystem.c
@@ -516,8 +516,10 @@ static int btrfs_scan_kernel(void *search)
continue;
ret = get_fs_info(mnt->mnt_dir, &fs_info_arg,
&dev_info_arg);
-   if (ret)
+   if (ret) {
+   kfree(dev_info_arg);
goto out;
+   }
 
if (get_label_mounted(mnt->mnt_dir, label)) {
kfree(dev_info_arg);
-- 
2.1.3

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 4/5] btrfs-progs: fix overflow check in btrfs_insert_inode_ref

2014-12-30 Thread David Sterba
Resolves-coverity-id: 1260247
Signed-off-by: David Sterba 
---
 inode-item.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/inode-item.c b/inode-item.c
index 993f3091e335..522d25a433ac 100644
--- a/inode-item.c
+++ b/inode-item.c
@@ -89,7 +89,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
ptr = (unsigned long)(ref + 1);
ret = 0;
} else if (ret < 0) {
-   if (ret == EOVERFLOW)
+   if (ret == -EOVERFLOW)
ret = -EMLINK;
goto out;
} else {
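
(Context for the one-character fix, not from the patch: kernel-style
helpers report failure as a negative errno value, so the caller has to
compare against -EOVERFLOW rather than EOVERFLOW. A tiny standalone
illustration:)

#include <errno.h>
#include <stdio.h>

/* Kernel-style convention: return 0 on success, a negative errno on failure. */
static int insert_ref(int would_overflow)
{
	return would_overflow ? -EOVERFLOW : 0;
}

int main(void)
{
	int ret = insert_ref(1);

	if (ret == -EOVERFLOW)	/* compare against the negative value */
		ret = -EMLINK;	/* map "item would overflow" to "too many links" */

	printf("ret = %d\n", ret);
	return 0;
}
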
-- 
2.1.3

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/5] btrfs-progs: fi show, don't leak canonical path

2014-12-30 Thread David Sterba
Resolves-coverity-id: 1260252
Signed-off-by: David Sterba 
---
 cmds-filesystem.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmds-filesystem.c b/cmds-filesystem.c
index 80875fffddfe..1c1d34ae8ca2 100644
--- a/cmds-filesystem.c
+++ b/cmds-filesystem.c
@@ -471,7 +471,6 @@ static int print_one_fs(struct btrfs_ioctl_fs_info_args *fs_info,
char *canonical_path;
 
tmp_dev_info = (struct btrfs_ioctl_dev_info_args *)&dev_info[i];
-   canonical_path = canonicalize_path((char *)tmp_dev_info->path);
 
/* Add check for missing devices even mounted */
fd = open((char *)tmp_dev_info->path, O_RDONLY);
@@ -480,6 +479,7 @@ static int print_one_fs(struct btrfs_ioctl_fs_info_args *fs_info,
continue;
}
close(fd);
+   canonical_path = canonicalize_path((char *)tmp_dev_info->path);
printf("\tdevid %4llu size %s used %s path %s\n",
tmp_dev_info->devid,
pretty_size(tmp_dev_info->total_bytes),
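
(Not part of the patch: the underlying pattern, sketched standalone with
hypothetical data, is to defer the allocation until after the early-exit
check so nothing has to be freed on the 'continue' path.)

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	const char *inputs[] = { "", "/dev/sda", "", "/dev/sdb" };

	for (size_t i = 0; i < sizeof(inputs) / sizeof(inputs[0]); i++) {
		if (inputs[i][0] == '\0')
			continue;		/* early exit: nothing allocated yet */

		char *copy = strdup(inputs[i]);	/* allocate only when it will be used */
		if (!copy)
			return 1;
		printf("path %s\n", copy);
		free(copy);			/* freed on the one path that allocated it */
	}
	return 0;
}
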
-- 
2.1.3

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/5] btrfs-progs: check, missing parens around compound block in find_normal_file_extent

2014-12-30 Thread David Sterba
Resolves-coverity-id: 1260248
Signed-off-by: David Sterba 
---
 cmds-check.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cmds-check.c b/cmds-check.c
index 71e4f4f3a13b..d2d218a88589 100644
--- a/cmds-check.c
+++ b/cmds-check.c
@@ -2160,9 +2160,10 @@ static int find_normal_file_extent(struct btrfs_root *root, u64 ino)
fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_file_extent_item);
type = btrfs_file_extent_type(path->nodes[0], fi);
-   if (type != BTRFS_FILE_EXTENT_INLINE)
+   if (type != BTRFS_FILE_EXTENT_INLINE) {
ret = 1;
goto out;
+   }
}
 out:
btrfs_free_path(path);
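
(The pitfall the patch fixes, shown as a small standalone sketch: without
braces only the first statement belongs to the if, so the goto ran
unconditionally.)

#include <stdio.h>

int main(void)
{
	int ret = 0;
	int i;

	/* Buggy shape: the indentation suggests both statements belong to the
	 * if, but without braces only "ret = 1;" does; the goto runs on the
	 * very first iteration regardless of the condition. */
	for (i = 0; i < 5; i++) {
		if (i == 3)
			ret = 1;
			goto out;	/* unconditional despite the indentation */
	}
out:
	printf("ret = %d, stopped at i = %d\n", ret, i);	/* ret = 0, i = 0 */
	return 0;
}
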
-- 
2.1.3

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Btrfs progs release 3.18

2014-12-30 Thread David Sterba
Hi,

let me announce the release of btrfs-progs version 3.18. There are
updates to UI and several enhancements of check/repair. About 100
commits from 14 contributors, thank you all!

Tarballs: https://www.kernel.org/pub/linux/kernel/people/kdave/btrfs-progs/
Git: git://git.kernel.org/pub/scm/linux/kernel/git/kdave/btrfs-progs.git


Major user-visible changes:

* mkfs - skinny-metadata feature is now on by default, first introduced
  in kernel 3.10

New subcommands:

* filesystem usage - give an overview of fs usage in a way that's more
  comprehensible than existing 'fi df'
  * on kernels < 3.18: requires root due to restricted ioctl FS_INFO
  * raid 5/6 numbers may be missing due to restricted ioctl (TREE_SEARCH)
    that's used to calculate the numbers
* device usage - more detailed information about per-device allocations
  * same restrictions as for 'fi usage'

New features, options:

* check
  * option to set a different tree root byte number
  * ability to link lost files to lost+found, caused by a recent
kernel bug
  * repair of severely corrupted fs (use with care):
* recover from corrupted b-tree
* ability to rebuild inode items from other items
* improved rebuilding of extent tree with snapshots
* convert
  * option to show progress
* subvol create - print the commit mode inline, print the global mode
  only if --verbose

Other updates:

* musl-libc build support
* bugfixes for coverity reports
* new test images, testing framework
* documentation
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Fwd: I need to P. are we almost there yet?

2014-12-30 Thread Jose Manuel Perez Bethencourt
I think you are missing crucial info on the layout on disk that BTRFS
implements. While a traditional RAID1 has a rigid layout that has
fixed and easily predictable locations for all data (exactly on two
specific disks), BTRFS allocs chunks as needed on ANY two disks.
Please research into this to understand the problem fully, this is the
key to your question.

I mean with RAID1 you know your data is on disk 1 and 2, and if one of
those fails you have a surviving mirror. Two disk failures with RAID10
when they pertain to different mirror disk pairs is no problem.

With BTRFS you cannot guarantee that a simultaneous two-disk failure
won't affect chunks whose two mirrors sit precisely on those two
disks... even though there is a greater chance that the chunks are
mirrored on other drives. The probability of surviving is greater with
a greater number of disks, but we are talking about worst-case
scenarios and guarantees. There will, eventually, be chunks that are
using those two disks for their mirrors...

Take into account that traditional RAID10 has a higher probability of
surviving, but the worst-case scenario is exactly the same: the
simultaneous failure of both disks in any one mirror pair.

Please think about this as well: "simultaneous" should be read as
"within a rebuild window". In a hardware RAID, the HBA is expected to
kick in rebuild as soon as you replace failing disk (zero delay if you
have a hotspare). In BTRFS you are expected to first notice the
problem and second replace and scrub or rebalance. Any second failure
before full rebuild will be fatal to some extent.

I would also rule out raid5, as you would have a complete failure with
two simultaneous disk failures, be it the traditional or the btrfs
implementation.

You should aim at RAID6 at minimum on hardware implementations, or the
equivalent on btrfs, so as to withstand a two-disk failure. Some guys
are pushing for triple "mirror" but it's expensive in "wasted" disk
space (although implementations like Ceph are good IMHO). Better are
generalized forms of parity that extend to more than two parity
"disks" if you want maximum storage capacity (but probably slow
writing).

Jose Manuel Perez Bethencourt

>
> > On Mon, Dec 29, 2014 at 12:00 PM, sys.syphus  wrote:
> >> oh, and sorry to bump myself. but is raid10 *ever* more redundant in
> >> btrfs-speak than raid1? I currently use raid1 but i know in mdadm
> >> speak raid10 means you can lose 2 drives assuming they aren't the
> >> "wrong ones", is it safe to say with btrfs / raid 10 you can only lose
> >> one no matter what?
> >
> > It's only for sure one in any case even with conventional raid10. It
> > just depends on which 2 you lose whether your data has dodged a
> > bullet. Obviously you can't lose a drive and its mirror,
> > ever, or the array collapses.
>
> Just some background data on traditional RAID, and the chances of survival
> with a 2-drive failure.
>
> In traditional RAID-10, the chances of surviving a 2-drive failure is 66%
> on a 4-drive array, and approaches 100% as the number of drives in the
> array increase.
>
> In traditional RAID-0+1 (used to be common in low-end fake-RAID cards),
> the chances of surviving a 2-drive failure is 33% on a 4-drive array, and
> approaches 50% as the number of drives in the array increase.
>
> In traditional RAID-1E, the chances of surviving a 2-drive failure is 66%
> on a 4-drive array, and approaches 100% as the number of drives in the
> array increase.  This is the same as for RAID-10.  RAID-1E allows an odd
> number of disks to be actively used in the array.
> https://en.wikipedia.org/wiki/File:RAID_1E.png
>
> I'm wondering which of the above the BTRFS implementation most closely
> resembles.
>
> > So if you want the same amount of raid6 testing by time it would be
> > however many years that's been from the time 3.19 is released.
>
> I don't believe that's correct.  Over those several years, quite a few
> tests for corner cases have been developed.  I expect that those tests are
> used for regression testing of each release to ensure that old bugs aren't
> inadvertently reintroduced.  Furthermore, I expect that a large number of
> those corner case tests can be easily modified to test RAID-5 and RAID-6.
> In reality, I expect the stability (i.e. similar to RAID-10 currently) of
> RAID-5/6 code in BTRFS will be achieved rather quickly (only a year or
> two).
>
> I expect that the difficult part will be to optimize the performance of
> BTRFS.  Hopefully those tests (and others, yet to be developed) will be
> able to keep it stable while the code is optimized for performance.
>
> Peter Ashford
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: Btrfs progs release 3.18

2014-12-30 Thread Martin Steigerwald
Am Dienstag, 30. Dezember 2014, 17:34:39 schrieb David Sterba:
> Hi,

Hi David,

> let me announce the release of btrfs-progs version 3.18. There are
> updates to UI and several enhancements of check/repair. About 100
> commits from 14 contributors, thank you all!
> 
> Tarballs: https://www.kernel.org/pub/linux/kernel/people/kdave/btrfs-progs/
> Git: git://git.kernel.org/pub/scm/linux/kernel/git/kdave/btrfs-progs.git
> 
> 
> Major user-visible changes:
> 
> * mkfs - skinny-metadata feature is now on by default, first introduced
>   in kernel 3.10
> 
> New subcommands:
> 
> * filesystem usage - give an overview of fs usage in a way that's more
>   comprehensible than existing 'fi df'
>   * on kernels < 3.18: requires root due to restricted ioctl FS_INFO
>   * raid 5/6 numbers may be missing due to restricted ioctl (TREE_SEARCH)
>     that's used to calculate the numbers
> * device usage - more detailed information about per-device allocations
>   * same restrictions as for 'fi usage'

Thank you and all the contributors for this nice after christmas gift!

I like it:

merkaba:~> btrfs fi usage /home
Overall:
Device size: 340.00GiB
Device allocated:291.90GiB
Device unallocated:   48.10GiB
Used:273.15GiB
Free (estimated): 31.64GiB  (min: 31.64GiB)
Data ratio:   2.00
Metadata ratio:   2.00
Global reserve:  512.00MiB  (used: 0.00B)

Data,RAID1: Size:140.93GiB, Used:133.34GiB
   /dev/mapper/msata-home140.93GiB
   /dev/mapper/sata-home 140.93GiB

Metadata,RAID1: Size:4.99GiB, Used:3.24GiB
   /dev/mapper/msata-home  4.99GiB
   /dev/mapper/sata-home   4.99GiB

System,RAID1: Size:32.00MiB, Used:48.00KiB
   /dev/mapper/msata-home 32.00MiB
   /dev/mapper/sata-home  32.00MiB

Unallocated:
   /dev/mapper/msata-home 24.05GiB
   /dev/mapper/sata-home  24.05GiB

merkaba:~> btrfs device us /home   
/dev/mapper/msata-home, ID: 1
   Device size:   170.00GiB
   Data,RAID1:140.93GiB
   Metadata,RAID1:  4.99GiB
   System,RAID1:   32.00MiB
   Unallocated:24.05GiB

/dev/mapper/sata-home, ID: 2
   Device size:   170.00GiB
   Data,RAID1:140.93GiB
   Metadata,RAID1:  4.99GiB
   System,RAID1:   32.00MiB
   Unallocated:24.05GiB

-- 
Martin 'Helios' Steigerwald - http://www.Lichtvoll.de
GPG: 03B0 0D6C 0040 0710 4AFA  B82F 991B EAAC A599 84C7
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Uncorrectable errors on RAID-1?

2014-12-30 Thread Phillip Susi
On 12/29/2014 4:53 PM, Chris Murphy wrote:
> Get drives supporting configurable or faster recoveries. There's
> no way around this.

Practically available right now?  Sure.  In theory, no.

> This is a broken record topic honestly. The drives under
> discussion aren't ever meant to be used in raid, they're desktop
> drives, they're designed with long recoveries because it's
> reasonable to try to

The intention to use the drives in a raid is entirely at the
discretion of the user, not the manufacturer.  The only reason we are
even having this conversation is because the manufacturer has added a
misfeature that makes them sub-optimal for use in a raid.

> recover the data even in the face of delays rather than not recover
> at all. Whether there are also some design flaws in here I can't
> say because I'm not a hardware designer or developer but they are
> very clearly targeted at certain use cases and not others, not
> least of which is their error recovery time but also their
> vibration tolerance when multiple drives are in close proximity to
> each other.

Drives have no business whatsoever retrying for so long; every version
of DOS or Windows ever released has been able to report an IO error
and give the *user* the option of retrying it in the hopes that it
will work that time, because drives used to be sane and not keep
retrying a positively ridiculous number of times.

> If you don't like long recoveries, don't buy drives with long 
> recoveries. Simple.

Better to fix the software to deal with it sensibly instead of
encouraging manufacturers to engage in hamstringing their lower priced
products to coax more money out of their customers.

> The device will absolutely provide a specific error so long as its 
> link isn't reset prematurely, which happens to be the linux
> default behavior when combined with drives that have long error
> recovery times. Hence the recommendation is to increase the linux
> command timer value. That is the solution right now. If you want a
> different behavior someone has to write the code to do it because
> it doesn't exist yet, and so far there seems to be zero interest in
> actually doing that work, just some interest in hand waiving that
> it ought to exist, maybe.

If this is your way of saying "patches welcome" then it probably would
have been better just to say that.
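
(Side note, not from the thread: the "increase the linux command timer"
workaround referred to above is a per-device sysfs value, in seconds. A
minimal sketch in C, assuming /dev/sda and root privileges:)

#include <stdio.h>

int main(void)
{
	/* Raise the SCSI command timeout for one disk by writing the value,
	 * in seconds, to /sys/block/<dev>/device/timeout. */
	FILE *f = fopen("/sys/block/sda/device/timeout", "w");

	if (!f) {
		perror("/sys/block/sda/device/timeout");
		return 1;
	}
	fprintf(f, "180\n");	/* 180 s comfortably exceeds long drive recoveries */
	return fclose(f) == 0 ? 0 : 1;
}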


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Btrfs progs release 3.18

2014-12-30 Thread Tomasz Chmielewski
* filesystem usage - give an overview of fs usage in a way that's more

* device usage - more detailed information about per-device allocations
  * same restrictions as for 'fi usage'


Interesting.

Used these to create a filesystem, with btrfs-progs v3.17.3:

# mkfs.btrfs -O skinny-metadata -d raid1 -m raid1 /dev/sda4 /dev/sdb4 -f


Now, with btrfs-progs 3.18 and these new options I can see that the fs 
is partially single, not RAID-1 - how come?


# btrfs fil us /srv
Overall:
Device size:   5.25TiB
Device allocated:510.04GiB
Device unallocated:4.76TiB
Used:505.39GiB
Free (estimated):  2.38TiB  (min: 2.38TiB)
Data ratio:   2.00
Metadata ratio:   2.00
Global reserve:  512.00MiB  (used: 0.00B)

Data,single: Size:8.00MiB, Used:0.00B
   /dev/sda4   8.00MiB

Data,RAID1: Size:252.00GiB, Used:250.56GiB
   /dev/sda4 252.00GiB
   /dev/sdb4 252.00GiB

Metadata,single: Size:8.00MiB, Used:0.00B
   /dev/sda4   8.00MiB

Metadata,RAID1: Size:3.00GiB, Used:2.13GiB
   /dev/sda4   3.00GiB
   /dev/sdb4   3.00GiB

System,single: Size:4.00MiB, Used:0.00B
   /dev/sda4   4.00MiB

System,RAID1: Size:8.00MiB, Used:64.00KiB
   /dev/sda4   8.00MiB
   /dev/sdb4   8.00MiB

Unallocated:
   /dev/sda4   2.38TiB
   /dev/sdb4   2.38TiB


root@backup01 ~ # btrfs dev us /srv
/dev/sda4, ID: 1
   Device size: 2.63TiB
   Data,single: 8.00MiB
   Data,RAID1:252.00GiB
   Metadata,single: 8.00MiB
   Metadata,RAID1:  3.00GiB
   System,single:   4.00MiB
   System,RAID1:8.00MiB
   Unallocated: 2.38TiB

/dev/sdb4, ID: 2
   Device size: 2.63TiB
   Data,RAID1:252.00GiB
   Metadata,RAID1:  3.00GiB
   System,RAID1:8.00MiB
   Unallocated: 2.38TiB


--
Tomasz Chmielewski
http://www.sslrack.com

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: I need to P. are we almost there yet?

2014-12-30 Thread Phillip Susi
On 12/29/2014 7:20 PM, ashf...@whisperpc.com wrote:
> Just some background data on traditional RAID, and the chances of
> survival with a 2-drive failure.
> 
> In traditional RAID-10, the chances of surviving a 2-drive failure
> is 66% on a 4-drive array, and approaches 100% as the number of
> drives in the array increase.
> 
> In traditional RAID-0+1 (used to be common in low-end fake-RAID
> cards), the chances of surviving a 2-drive failure is 33% on a
> 4-drive array, and approaches 50% as the number of drives in the
> array increase.

In terms of data layout, there is really no difference between raid-10
( or raid1+0 ) and raid0+1, aside from the designation you assign to
each drive.  With a dumb implementation of 0+1, any single drive
failure offlines the entire stripe, discarding the remaining good
disks in it, thus giving the probability you describe as the only
possible remaining failure(s) that do not result in the mirror also
failing is a drive in the same stripe as the original.  This however,
is only a deficiency of the implementation, not the data layout, as
all of the data on the first failed drive could be recovered from a
drive in the second stripe, so long as the second drive that failed
was any drive other than the one holding the duplicate data of the first.

This is partly why I agree with linux mdadm that raid10 is *not*
simply raid1+0; the latter is just a naive, degenerate implementation
of the former.

> In traditional RAID-1E, the chances of surviving a 2-drive failure
> is 66% on a 4-drive array, and approaches 100% as the number of
> drives in the array increase.  This is the same as for RAID-10.
> RAID-1E allows an odd number of disks to be actively used in the
> array.

What some vendors have called "1E" is simply raid10 in the default
"near" layout to mdadm.  I prefer the higher performance "offset"
layout myself.

> I'm wondering which of the above the BTRFS implementation most
> closely resembles.

Unfortunately, btrfs just uses the naive raid1+0, so no 2 or 3 disk
raid10 arrays, and no higher performing offset layout.


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Btrfs progs release 3.18

2014-12-30 Thread Hugo Mills
On Tue, Dec 30, 2014 at 10:38:38PM +0100, Tomasz Chmielewski wrote:
> >* filesystem usage - give an overview of fs usage in a way that's more
> >* device usage - more detailed information about per-device allocations
> >  * same restrictions as for 'fi usage'
> 
> Interesting.
> 
> Used these to create a filesystem, with btrfs-progs v3.17.3:
> 
> # mkfs.btrfs -O skinny-metadata -d raid1 -m raid1 /dev/sda4 /dev/sdb4 -f
> 
> 
> Now, with btrfs-progs 3.18 and these new options I can see that the
> fs is partially single, not RAID-1 - how come?

   It's always been like that with a new FS. The small single chunks
are part of the way that mkfs works, and will go away on the first
balance operation. It's harmless, and can safely be ignored.

   Hugo.

> # btrfs fil us /srv
> Overall:
> Device size:   5.25TiB
> Device allocated:510.04GiB
> Device unallocated:4.76TiB
> Used:505.39GiB
> Free (estimated):  2.38TiB  (min: 2.38TiB)
> Data ratio:   2.00
> Metadata ratio:   2.00
> Global reserve:  512.00MiB  (used: 0.00B)
> 
> Data,single: Size:8.00MiB, Used:0.00B
>/dev/sda4   8.00MiB
> 
> Data,RAID1: Size:252.00GiB, Used:250.56GiB
>/dev/sda4 252.00GiB
>/dev/sdb4 252.00GiB
> 
> Metadata,single: Size:8.00MiB, Used:0.00B
>/dev/sda4   8.00MiB
> 
> Metadata,RAID1: Size:3.00GiB, Used:2.13GiB
>/dev/sda4   3.00GiB
>/dev/sdb4   3.00GiB
> 
> System,single: Size:4.00MiB, Used:0.00B
>/dev/sda4   4.00MiB
> 
> System,RAID1: Size:8.00MiB, Used:64.00KiB
>/dev/sda4   8.00MiB
>/dev/sdb4   8.00MiB
> 
> Unallocated:
>/dev/sda4   2.38TiB
>/dev/sdb4   2.38TiB
> 
> 
> root@backup01 ~ # btrfs dev us /srv
> /dev/sda4, ID: 1
>Device size: 2.63TiB
>Data,single: 8.00MiB
>Data,RAID1:252.00GiB
>Metadata,single: 8.00MiB
>Metadata,RAID1:  3.00GiB
>System,single:   4.00MiB
>System,RAID1:8.00MiB
>Unallocated: 2.38TiB
> 
> /dev/sdb4, ID: 2
>Device size: 2.63TiB
>Data,RAID1:252.00GiB
>Metadata,RAID1:  3.00GiB
>System,RAID1:8.00MiB
>Unallocated: 2.38TiB
> 
> 

-- 
Hugo Mills | The trouble with you, Ibid, is you think you know
hugo@... carfax.org.uk | everything.
http://carfax.org.uk/  |
PGP: 65E74AC0  |




Re: I need to P. are we almost there yet?

2014-12-30 Thread ashford
> Phillip Susi wrote:
>
>> I'm wondering which of the above the BTRFS implementation most
>> closely resembles.
>
> Unfortunately, btrfs just uses the naive raid1+0, so no 2 or 3 disk
> raid10 arrays, and no higher performing offset layout.

> Jose Manuel Perez Bethencourt wrote:
>
> I think you are missing crucial info on the layout on disk that BTRFS
> implements. While a traditional RAID1 has a rigid layout that has
> fixed and easily predictable locations for all data (exactly on two
> specific disks), BTRFS allocs chunks as needed on ANY two disks.
> Please research into this to understand the problem fully, this is the
> key to your question.

There is a HUGE difference here.  In the first case, the data will have a
>50% chance of surviving a 2-drive failure.  In the second case, the data
will have an effectively 0% chance of surviving a 2-drive failure.  I
don't believe I need to mention which of the above is more reliable, or
which I would prefer.
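
(A rough sketch of where such numbers come from, not from the original
mail: in a traditional n-drive RAID-10, once one drive has failed, a
second independent failure is fatal only if it hits that drive's mirror,
i.e. 1 of the remaining n - 1 drives.)

#include <stdio.h>

/* Chance that a traditional n-drive RAID-10 (n/2 mirror pairs) survives a
 * second, independent drive failure after one drive has already failed. */
static double raid10_second_failure_survival(int n)
{
	return (double)(n - 2) / (double)(n - 1);
}

int main(void)
{
	for (int n = 4; n <= 12; n += 2)
		printf("%2d drives: %.0f%% survival\n",
		       n, 100.0 * raid10_second_failure_survival(n));
	/* 4 drives -> ~67%, approaching 100% as the drive count grows. */
	return 0;
}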

I believe that someone who understands the code in depth (and that may
also be one of the people above) should determine exactly how BTRFS
implements RAID-10.

Thank you.

Peter Ashford

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Uncorrectable errors on RAID-1?

2014-12-30 Thread Chris Murphy
On Tue, Dec 30, 2014 at 1:46 PM, Phillip Susi  wrote:
> -BEGIN PGP SIGNED MESSAGE-
> Hash: SHA1
>
> On 12/29/2014 4:53 PM, Chris Murphy wrote:
>> Get drives supporting configurable or faster recoveries. There's
>> no way around this.
>
> Practically available right now?  Sure.  In theory, no.

I have no idea what this means. Such drives exist, you can buy them or
not buy them.


>
>> This is a broken record topic honestly. The drives under
>> discussion aren't ever meant to be used in raid, they're desktop
>> drives, they're designed with long recoveries because it's
>> reasonable to try to
>
> The intention to use the drives in a raid is entirely at the
> discretion of the user, not the manufacturer.  The only reason we are
> even having this conversation is because the manufacturer has added a
> misfeature that makes them sub-optimal for use in a raid.

Clearly you have never owned a business, nor have you been involved in
volume manufacturing or you wouldn't be so keen to demand one market
subsidize another. 24x7 usage is a non-trivial quantity of additional
wear and tear on the drive compared to 8 hour/day, 40 hour/week duty
cycle. But you seem to think that the manufacturer has no right to
produce a cheaper one for the seldom used hardware, or a more
expensive one for the constantly used hardware.

And of course you completely ignored, and deleted, my point about the
difference in warranties.

Does the SATA specification require configurable SCT ERC? Does it
require even supporting SCT ERC? I think your argument is flawed by
mis-distributing the economic burden while simultaneously denying one
even exists or that these companies should just eat the cost
differential if it does. In any case the argument is asinine.


>
>> recover the data even in the face of delays rather than not recover
>> at all. Whether there are also some design flaws in here I can't
>> say because I'm not a hardware designer or developer but they are
>> very clearly targeted at certain use cases and not others, not
>> least of which is their error recovery time but also their
>> vibration tolerance when multiple drives are in close proximity to
>> each other.
>
> Drives have no business whatsoever retrying for so long; every version
> of DOS or Windows ever released has been able to report an IO error
> and give the *user* the option of retrying it in the hopes that it
> will work that time, because drives used to be sane and not keep
> retrying a positively ridiculous number of times.

When the encoded data signal weakens, the bits effectively become
fuzzy. Each read produces different results. Obviously this is a very
rare condition or there'd be widespread panic. However, it's common
and expected enough that the drive manufacturers are all, with very
little variation, dealing with this problem in a similar way,
which is multiple reads.

Now you could say they're all in collusion with each other to screw
users over, rather than having legitimate reasons for all of these
retries. Unless you're a hard drive engineer, I'm unlikely to find
such an argument compelling. Besides, it would also be a charge of
fraud.

>
>> If you don't like long recoveries, don't buy drives with long
>> recoveries. Simple.
>
> Better to fix the software to deal with it sensibly instead of
> encouraging manufacturers to engage in hamstringing their lower priced
> products to coax more money out of their customers.


In the meantime, there already is a working software alternative:
(re)write over all sectors periodically. Perhaps every 6-12 months is
sufficient to mitigate such signal weakening on marginal sectors that
aren't persistently failing on writes. This can be done with a
periodic reshape if it's md raid. It can be done with balance on
Btrfs. It can be done with resilvering on ZFS.


>
>> The device will absolutely provide a specific error so long as its
>> link isn't reset prematurely, which happens to be the linux
>> default behavior when combined with drives that have long error
>> recovery times. Hence the recommendation is to increase the linux
>> command timer value. That is the solution right now. If you want a
>> different behavior someone has to write the code to do it because
>> it doesn't exist yet, and so far there seems to be zero interest in
>> actually doing that work, just some interest in hand waiving that
>> it ought to exist, maybe.
>
> If this is your way of saying "patches welcome" then it probably would
> have been better just to say that.

Certainly not. I'm not the maintainer of anything, I have no idea if
such things are welcome. I'm not even a developer. I couldn't code my
way out of a hat.



-- 
Chris Murphy
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Standards Problems [Was: [PATCH v2 1/3] Btrfs: get more accurate output in df command.]

2014-12-30 Thread Zygo Blaxell
On Wed, Dec 17, 2014 at 08:07:27PM -0800, Robert White wrote:
>[...]

There are a number of pathological examples in here, but I think there
are justifiable correct answers for each of them that emerge from a
single interpretation of the meanings of f_bavail, f_blocks, and f_bfree.

One gotcha is that some of the numbers required may be difficult to
calculate precisely before all space is allocated to chunks; however,
some error is tolerable as long as free space is not overestimated.
In other words:  when in doubt, guess low.

statvfs(2) gives us six numbers, three of which are block counts.
Very few users or programs ever bother to look at the inode counts
(f_files, f_ffree, f_favail), but they could be overloaded for metadata
block counts.

The f_blocks parameter is mostly irrelevant to application behavior,
except to the extent that the ratio between f_bavail and f_blocks is
used by applications to calculate a percentage of occupied or free space.
f_blocks must always be greater than or equal to f_bavail and f_bfree,
and preferably f_blocks would be scaled to use the same effective unit
size as f_bavail and f_bfree, within a percent or two.

Nobody cares about f_bfree since traditionally only root could use the
difference between f_bfree and f_bavail.  f_bfree is effectively space
conditionally available (e.g. if the process euid is root or the process
egid matches a configured group id), while f_bavail is space available
without conditions (e.g. processes without privilege can use it).

The most important number is f_bavail.  It's what a bunch of software
(archive unpackers, assorted garbage collectors, email MTAs, snapshot
remover scripts, download managers, etc) uses to estimate how much space
is available without conditions (except quotas, although arguably those
should be included too).  Applications that are privileged still use
the unprivileged f_bavail number so their decisions based on free space
don't disrupt unprivileged applications.

It's generally better to underestimate than to overestimate f_bavail.
Historically filesystems have reserved extra space to avoid various
problems in low-disk conditions, and application software has adapted
to that well over the years.  Also, admin people are more pleasantly
surprised when it turns out that they had more space than f_bavail,
instead of when they had less.

The rule should be:  if we have some space, but it is not available for
data extents in the current allocation mode, don't add it to f_bavail
in statvfs.  I think this rule handles all of these examples well.
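
(Illustration, not from the original mail: how a typical unprivileged
application consumes these fields; a minimal sketch using statvfs(3).)

#include <stdio.h>
#include <sys/statvfs.h>

int main(int argc, char **argv)
{
	struct statvfs st;
	const char *path = argc > 1 ? argv[1] : "/";

	if (statvfs(path, &st) != 0) {
		perror("statvfs");
		return 1;
	}

	/* f_bavail: blocks available to unprivileged processes (what most
	 * software acts on); f_bfree: free blocks including any reserved
	 * portion; f_blocks: total data blocks.  All in units of f_frsize. */
	unsigned long long avail = (unsigned long long)st.f_bavail * st.f_frsize;
	unsigned long long total = (unsigned long long)st.f_blocks * st.f_frsize;

	printf("%s: %llu bytes available of %llu (%.1f%%)\n",
	       path, avail, total, total ? 100.0 * avail / total : 0.0);
	return 0;
}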

That would mean that we get cases where we add a drive to a full
filesystem and it doesn't immediately give you any new f_bavail space.
That may be an unexpected result for a naive admin, but much less
unexpected than having all the new space show up in f_bavail when it
is not available for allocation in the current data profile!  Better
to have the surprising behavior earlier than later.

On to examples...

> But a more even case is downright common and likely. Say you run a
> nice old-fashioned MUTT mail-spool. "most" of your files are small
> enough to live in metadata. You start with one drive, and allocate 2
> single-data and 10 metadata (5xDup). Then you add a second drive of
> equal size. (the metadata just switched to DUP-as-RAID1-alike mode)
> And then you do a dconvert=raid0.
> 
> That uneven allocation of metadata will be a 2GiB difference between
> the two drives forever.

> So do you shave 2GiB off of your @size?

Yes.  f_blocks is the total size of all allocated chunks plus all free
space allocated by the current data profile.  That 2GiB should disappear
from such a calculation.

> Do you shave @2GiB off your @available?

Yes, because it's _not_ available until something changes to make it
available (e.g. balance to get rid of the dup metadata, change the
metadata profile to dup or single, or change the data profile to single).

The 2GiB could be added to f_bfree, but that might still be confusing
for people and software.

> Do you overreport your available by @2GiB and end up _still_ having
> things "available" when you get your ENOSPC?

No.  ENOSPC when f_bavail > 0 is very bad.  Low-available-space admin
alerts will not be triggered.  Automated mitigation software will not be
activated.  Service daemons will start transactions they cannot complete.

> How about this ::
> 
> /dev/sda == |Sf|Sf|Mf|Mf|Mf|Mf|Sf|Sf|Sp|Mp|Mp| .5GiB free|
> /dev/sdb == |10 GiB free |
> 
> Operator fills his drive, then adds a second one, then _foolishly_
> tries to convert it to RAID0 when the power fails. In order to check
> the FS he boots with no_balance. Then his maintenance window closes
> and he has to go back into production, at which point he forgets (or
> isn't allowed) to do the balance. The flags are set but now no more
> extents can be allocated.
> 
> Size is 20GiB, slack is 10.5GiB. Operator is about to get ENOSPACE.

f_bavail should be 0.5GB or s

What about not warning on some abort_transaction() cases whose reason is known?

2014-12-30 Thread Qu Wenruo

Hi all,

While surfing the Red Hat BZ, I found a lot (at least 5 in one month) of
user reports of "bugs" in btrfs that are really just the kernel warning
in btrfs_abort_transaction(). Most of them (about 3 or more) are caused
by a disconnected USB device.

So I'm considering not warning in some cases where we know the reason,
like the device-disconnected case above, but still warning in other
cases. This should reduce many unneeded bug reports for the
USB-disconnect case.
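
A hypothetical sketch of the idea (names and placement are illustrative
only, not actual btrfs code): keep aborting the transaction, but skip the
stack-trace warning when the error code points to a cause we already
understand, such as a vanished device:

#include <errno.h>
#include <stdbool.h>

/* Hypothetical helper, for illustration only: decide whether
 * btrfs_abort_transaction() should emit its WARN() for a given error. */
static bool abort_should_warn(int error)
{
	switch (error) {
	case -ENODEV:		/* underlying device is gone (e.g. unplugged USB disk) */
	case -ENOMEDIUM:	/* medium was removed */
		return false;	/* known cause, no stack trace needed */
	default:
		return true;	/* unexpected, keep the loud warning */
	}
}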

Any advice is welcomed.

Thank,
Qu
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] btrfs: cleanup init for list in free-space-cache

2014-12-30 Thread Gui Hecheng
o removed an unnecessary INIT_LIST_HEAD after LIST_HEAD

o merged a declaration & INIT_LIST_HEAD pair into one LIST_HEAD

Signed-off-by: Gui Hecheng 
---
 fs/btrfs/free-space-cache.c | 5 +
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index d6c03f7..8d346d3 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -651,15 +651,13 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
struct io_ctl io_ctl;
struct btrfs_key key;
struct btrfs_free_space *e, *n;
-   struct list_head bitmaps;
+   LIST_HEAD(bitmaps);
u64 num_entries;
u64 num_bitmaps;
u64 generation;
u8 type;
int ret = 0;
 
-   INIT_LIST_HEAD(&bitmaps);
-
/* Nothing in the space cache, goodbye */
if (!i_size_read(inode))
return 0;
@@ -2903,7 +2901,6 @@ int btrfs_find_space_cluster(struct btrfs_root *root,
trace_btrfs_find_cluster(block_group, offset, bytes, empty_size,
 min_bytes);
 
-   INIT_LIST_HEAD(&bitmaps);
ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
  bytes + empty_size,
  cont1_bytes, min_bytes);
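
(For reference, not part of the patch: the equivalence this cleanup relies
on, using the kernel's list macros. LIST_HEAD() declares and initializes
the head in one step, which is why a following INIT_LIST_HEAD() on the
same head is redundant.)

#include <linux/list.h>

static void list_head_forms(void)
{
	struct list_head a;
	LIST_HEAD(b);		/* declare + initialize in one step */

	INIT_LIST_HEAD(&a);	/* the two-step form the patch removes */
	list_splice(&a, &b);	/* use both heads so the sketch is warning-free */
}
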
-- 
1.8.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: I need to P. are we almost there yet?

2014-12-30 Thread Phillip Susi
On 12/30/2014 06:17 PM, ashf...@whisperpc.com wrote:
> I believe that someone who understands the code in depth (and that
> may also be one of the people above) determine exactly how BTRFS
> implements RAID-10.

I am such a person.  I had a similar question a year or two ago (
specifically about raid10  ) so I both experimented and read the code
myself to find out.  I was disappointed to find that it won't do
raid10 on 3 disks since the chunk metadata describes raid10 as a
stripe layered on top of a mirror.

Jose's point was also a good one though; one chunk may decide to
mirror on disks A and B, so it could recover from a failure of A and C,
but a different chunk could choose to mirror on disks A and C, so that
chunk would be lost if A and C fail.  It would probably be nice if the
chunk allocator tried to be more deterministic about that.


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Uncorrectable errors on RAID-1?

2014-12-30 Thread Phillip Susi
On 12/30/2014 06:58 PM, Chris Murphy wrote:
>> Practically available right now?  Sure.  In theory, no.
> 
> I have no idea what this means. Such drives exist, you can buy them
> or not buy them.

I was referring to the "no way around this" part.  Currently you are
correct, but in theory the way around it is exactly the subject of
this thread.

> Clearly you have never owned a business, nor have you been involved
> in volume manufacturing or you wouldn't be so keen to demand one
> market subsidize another. 24x7 usage is a non-trivial quantity of
> additional wear and tear on the drive compared to 8 hour/day, 40
> hour/week duty cycle. But you seem to think that the manufacturer
> has no right to produce a cheaper one for the seldom used hardware,
> or a more expensive one for the constantly used hardware.

Just because I want a raid doesn't mean I need it to operate reliably
24x7.  For that matter, it has long been established that power
cycling drives puts more wear and tear on them and as a general rule,
leaving them on 24x7 results in them lasting longer.

> And of course you completely ignored, and deleted, my point about
> the difference in warranties.

Because I don't care?  It's nice and all that they warranty the more
expensive drive more, and it may possibly even mean that they are
actually more reliable ( but not likely ), but that doesn't mean that
the system should have an unnecessarily terrible response to the
behavior of the cheaper drives.  Is it worth recommending the more
expensive drives?  Sure... but the system should also handle the
cheaper drives with grace.

> Does the SATA specification require configurable SCT ERC? Does it 
> require even supporting SCT ERC? I think your argument is flawed
> by mis-distributing the economic burden while simultaneously
> denying one even exists or that these companies should just eat the
> cost differential if it does. In any case the argument is asinine.

There didn't use to be any such thing; drives simply did not *ever*
go into absurdly long internal retries so there was no need.  The fact
that they do these days I consider a misfeature, and one that *can* be
worked around in software, which is the point here.

> When the encoded data signal weakens, they effectively becomes
> fuzzy bits. Each read produces different results. Obviously this is
> a very rare condition or there'd be widespread panic. However, it's
> common and expected enough that the drive manufacturers are all, to
> very little varying degree, dealing with this problem in a similar
> way, which is multiple reads.

Sure, but the noise introduced by the read ( as opposed to the noise
in the actual signal on the platter ) isn't that large, and so
retrying 10,000 times isn't going to give any better results than
retrying say, 100 times, and if the user really desires that many
retries, they have always been able to do so in the software level
rather than depending on the drive to try that much.  There is no
reason for the drives to have increased their internal retries that
much, and then deliberately withheld the essentially zero-cost ability
to limit those internal retries, other than to drive customers to pay
for the more expensive models.

> Now you could say they're all in collusion with each other to
> screw users over, rather than having legitimate reasons for all of
> these retried. Unless you're a hard drive engineer, I'm unlikely to
> find such an argument compelling. Besides, it would also be a
> charge of fraud.

Calling it fraud might be a bit of a stretch, but yes, there is no
legitimate reason for *that* many retries since people have been
retrying failed reads in software for decades and the diminishing
returns that goes with increasing the number of retries.

> In the meantime, there already is a working software alternative: 
> (re)write over all sectors periodically. Perhaps every 6-12 months
> is sufficient to mitigate such signal weakening on marginal sectors
> that aren't persistently failing on writes. This can be done with
> a periodic reshape if it's md raid. It can be done with balance on 
> Btrfs. It can be done with resilvering on ZFS.

Is there any actual evidence that this is effective?  Or that the
recording degrades as a function of time?  I doubt it since I do have
data on drives that were last written 10 years ago that is still
readable.  Even if so, this is really a non sequitur since if the
signal has degraded making it hard to read, in a raid we can simply
recover using the other drives.  The issue here is whether we should
be doing such recovery sooner rather than waiting for the silly drive
to retry 100,000 times before giving up.



should I use btrfs on Centos 7 for a new production server?

2014-12-30 Thread Dave Stevens
I have a well tested and working fine Centos5-Xen system. Accumulated  
cruft from various development efforts makes it desirable to redo the
install. Currently a RAID-10 ext4 filesystem with LVM and 750G of  
storage. There's a hot spare 750 drive in the system.


I'm thinking of migrating the web sites (almost the only use of the  
server) to a spare then installing Centos-7 and btrfs, then migrating  
the sites back.


I see RH marks btrfs in C7 as a technology preview but don't  
understand what that implies for future support and a suitably stable  
basis for storage.


The demand on the system is low and not likely to change in the near  
future, storage access speeds are not likely to be dealbreakers and it  
would be nice to not need to use LVM, btrfs seems to have a better  
feature set and more intuitive command set. But I'm uncertain about  
stability. Anyone have an opinion?


Dave

--
"As long as politics is the shadow cast on society by big business,
the attenuation of the shadow will not change the substance."

-- John Dewey





--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: should I use btrfs on Centos 7 for a new production server?

2014-12-30 Thread Qu Wenruo

Hi Dave,

 Original Message 
Subject: should I use btrfs on Centos 7 for a new production server?
From: Dave Stevens 
To: Btrfs BTRFS 
Date: 2014年12月31日 11:29
I have a well tested and working fine Centos5-Xen system. Accumulated 
cruft from various development efforts make it desirable to redo the 
install. Currently a RAID-10 ext4 filesystem with LVM and 750G of 
storage. There's a hot spare 750 drive in the system.


I'm thinking of migrating the web sites (almost the only use of the 
server) to a spare then installing Centos-7 and btrfs, then migrating 
the sites back.


I see RH marks btrfs in C7 as a technology preview but don't 
understand what that implies for future support and a suitably stable 
basis for storage.
Technology preview means no full official Red Hat support, just a
preview of the technology.

https://access.redhat.com/support/offerings/techpreview

It may come to full support in a later version if it matures.


The demand on the system is low and not likely to change in the near 
future, storage access speeds are not likely to be dealbreakers and it 
would be nice to not need to use LVM, btrfs seems to have a better 
feature set and more intuitive command set. But I'm uncertain about 
stability. Anyone have an opinion?

If I were the sysadmin, I would still prefer the mature Linux soft RAID/LVM.

Fewer bugs, mature kernel/user-land tools and use cases, and you don't
need to keep updating kernel/btrfs-progs to address known bugs or fix a
corrupted fs (if you stay away from
scrub/replace/balance/almost-full-disk/sudden-power-failure, that
shouldn't happen, though).


But if you want to contribute to btrfs, such a production environment
may expose some problems we didn't find. Although you may spend a lot
of time compiling the latest kernel/btrfs-progs and doing btrfs-image
dumps, not to mention the offline time...

Thanks,
Qu


Dave



--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: should I use btrfs on Centos 7 for a new production server?

2014-12-30 Thread Wang Shilong
Hello,

> 
> I have a well tested and working fine Centos5-Xen system. Accumulated cruft 
> from various development efforts make it desirable to redo the install. 
> Currently a RAID-10 ext4 filesystem with LVM and 750G of storage. There's a 
> hot spare 750 drive in the system.
> 
> I'm thinking of migrating the web sites (almost the only use of the server) 
> to a spare then installing Centos-7 and btrfs, then migrating the sites back.
> 
> I see RH marks btrfs in C7 as a technology preview but don't understand what 
> that implies for future support and a suitably stable basis for storage.
> 
> The demand on the system is low and not likely to change in the near future, 
> storage access speeds are not likely to be dealbreakers and it would be nice 
> to not need to use LVM, btrfs seems to have a better feature set and more 
> intuitive command set. But I'm uncertain about stability. Anyone have an 
> opinion?
> 

I used CentOS7 btrfs myself, just doing some tests... it crashed easily.
I don't know how much effort Red Hat puts into btrfs for the 7 series.


> Dave
> 
> -- 
> "As long as politics is the shadow cast on society by big business,
> the attenuation of the shadow will not change the substance."
> 
> -- John Dewey
> 
> 
> 
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

Best Regards,
Wang Shilong

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: should I use btrfs on Centos 7 for a new production server?

2014-12-30 Thread Wang Shilong

> 
> Hello,
> 
>> 
>> I have a well tested and working fine Centos5-Xen system. Accumulated cruft 
>> from various development efforts make it desirable to redo the install. 
>> Currently a RAID-10 ext4 filesystem with LVM and 750G of storage. There's a 
>> hot spare 750 drive in the system.
>> 
>> I'm thinking of migrating the web sites (almost the only use of the server) 
>> to a spare then installing Centos-7 and btrfs, then migrating the sites back.
>> 
>> I see RH marks btrfs in C7 as a technology preview but don't understand what 
>> that implies for future support and a suitably stable basis for storage.
>> 
>> The demand on the system is low and not likely to change in the near future, 
>> storage access speeds are not likely to be dealbreakers and it would be nice 
>> to not need to use LVM, btrfs seems to have a better feature set and more 
>> intuitive command set. But I'm uncertain about stability. Anyone have an 
>> opinion?
>> 
> 
> I used CentOS7 btrfs myself, just doing some tests..it crashed easily.
> I don’t know how much efforts that Redhat do on btrfs for 7 series.

Maybe SUSE enterprise would be a better choice for btrfs; they offer
better support for btrfs as far as I know.


> 
> 
>> Dave
>> 
>> -- 
>> "As long as politics is the shadow cast on society by big business,
>> the attenuation of the shadow will not change the substance."
>> 
>> -- John Dewey
>> 
>> 
>> 
>> 
>> 
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
>> the body of a message to majord...@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> Best Regards,
> Wang Shilong
> 

Best Regards,
Wang Shilong

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: should I use btrfs on Centos 7 for a new production server?

2014-12-30 Thread Eric Sandeen
On 12/30/14 10:03 PM, Wang Shilong wrote:
> Hello,
> 
>>
>> I have a well tested and working fine Centos5-Xen system.
>> Accumulated cruft from various development efforts make it
>> desirable to redo the install. Currently a RAID-10 ext4 filesystem
>> with LVM and 750G of storage. There's a hot spare 750 drive in the
>> system.
>> 
>> I'm thinking of migrating the web sites (almost the only use of the
>> server) to a spare then installing Centos-7 and btrfs, then
>> migrating the sites back.
>> 
>> I see RH marks btrfs in C7 as a technology preview but don't
>> understand what that implies for future support and a suitably
>> stable basis for storage.

Red Hat's statement on tech preview is here (I sure hope it doesn't
require a login ...)  https://access.redhat.com/support/offerings/techpreview

>> The demand on the system is low and not likely to change in the
>> near future, storage access speeds are not likely to be
>> dealbreakers and it would be nice to not need to use LVM, btrfs
>> seems to have a better feature set and more intuitive command set.
>> But I'm uncertain about stability. Anyone have an opinion?
> 
> I used CentOS7 btrfs myself, just doing some tests..it crashed easily.
> I don’t know how much efforts that Redhat do on btrfs for 7 series.

RHEL7.0 GA (released last May) has btrfs kernel code from v3.13.
RHEL7.1 will have btrfs code from around v3.16.

The stability of btrfs in RHEL7 releases depends heavily on the
maturity and stability of upstream btrfs at the time of the release.

IOWs, if btrfs around v3.13 crashed easily, there is nothing magical
in RHEL7.0 to fix that.  ;)

-Eric

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: should I use btrfs on Centos 7 for a new production server?

2014-12-30 Thread Eric Sandeen
On 12/30/14 10:06 PM, Wang Shilong wrote:
> 
>>
>> Hello,
>>
>>>
>>> I have a well tested and working fine Centos5-Xen system. Accumulated cruft 
>>> from various development efforts make it desirable to redo the install. 
>>> Currently a RAID-10 ext4 filesystem with LVM and 750G of storage. There's a 
>>> hot spare 750 drive in the system.
>>>
>>> I'm thinking of migrating the web sites (almost the only use of the server) 
>>> to a spare then installing Centos-7 and btrfs, then migrating the sites 
>>> back.
>>>
>>> I see RH marks btrfs in C7 as a technology preview but don't understand 
>>> what that implies for future support and a suitably stable basis for 
>>> storage.
>>>
>>> The demand on the system is low and not likely to change in the near 
>>> future, storage access speeds are not likely to be dealbreakers and it 
>>> would be nice to not need to use LVM, btrfs seems to have a better feature 
>>> set and more intuitive command set. But I'm uncertain about stability. 
>>> Anyone have an opinion?
>>>
>>
>> I used CentOS7 btrfs myself, just doing some tests..it crashed easily.
>> I don’t know how much efforts that Redhat do on btrfs for 7 series.
> 
> Maybe use SUSE enterprise for btrfs will be a better choice, they offered
> better support for btrfs as far as i know.

I believe SuSE's most recent support statement on btrfs is here, I think.

https://www.suse.com/releasenotes/x86_64/SUSE-SLES/12/#fate-317221

-Eric
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: should I use btrfs on Centos 7 for a new production server?

2014-12-30 Thread Fajar A. Nugraha
On Wed, Dec 31, 2014 at 1:04 PM, Eric Sandeen  wrote:
> On 12/30/14 10:06 PM, Wang Shilong wrote:
>>> I used CentOS7 btrfs myself, just doing some tests..it crashed easily.
>>> I don’t know how much efforts that Redhat do on btrfs for 7 series.
>>
>> Maybe use SUSE enterprise for btrfs will be a better choice, they offered
>> better support for btrfs as far as i know.
>
> I believe SuSE's most recent support statement on btrfs is here, I think.
>
> https://www.suse.com/releasenotes/x86_64/SUSE-SLES/12/#fate-317221

Wow. SUSE uses btrfs for root by default, but actively prevents users
from using compression (unless specifically overridden using a module
parameter)?

Weird, since IIRC compression has been around and stable for a long time.

-- 
Fajar
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html