Re: [PATCH] btrfs: delete chunk allocation attemp when setting block group ro

2015-01-08 Thread Miao Xie
On Thu, 08 Jan 2015 13:23:13 -0800, Shaohua Li wrote:
 Below test will fail currently:
   mkfs.ext4 -F /dev/sda
   btrfs-convert /dev/sda
   mount /dev/sda /mnt
   btrfs device add -f /dev/sdb /mnt
   btrfs balance start -v -dconvert=raid1 -mconvert=raid1 /mnt
 
 The reason is there are some block groups with usage 0, but the whole
 disk hasn't free space to allocate new chunk, so we even can't set such
 block group readonly. This patch deletes the chunk allocation when
 setting block group ro. For META, we already have reserve. But for
 SYSTEM, we don't have, so the check_system_chunk is still required.
 
 Signed-off-by: Shaohua Li s...@fb.com
 ---
  fs/btrfs/extent-tree.c | 31 +++
  1 file changed, 7 insertions(+), 24 deletions(-)
 
 diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
 index a80b971..430101b6 100644
 --- a/fs/btrfs/extent-tree.c
 +++ b/fs/btrfs/extent-tree.c
 @@ -8493,22 +8493,8 @@ static int set_block_group_ro(struct 
 btrfs_block_group_cache *cache, int force)
  {
   struct btrfs_space_info *sinfo = cache-space_info;
   u64 num_bytes;
 - u64 min_allocable_bytes;
   int ret = -ENOSPC;
  
 -
 - /*
 -  * We need some metadata space and system metadata space for
 -  * allocating chunks in some corner cases until we force to set
 -  * it to be readonly.
 -  */
 - if ((sinfo-flags 
 -  (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) 
 - !force)
 - min_allocable_bytes = 1 * 1024 * 1024;
 - else
 - min_allocable_bytes = 0;
 -
   spin_lock(sinfo-lock);
   spin_lock(cache-lock);
  
 @@ -8521,8 +8507,8 @@ static int set_block_group_ro(struct 
 btrfs_block_group_cache *cache, int force)
   cache-bytes_super - btrfs_block_group_used(cache-item);
  
   if (sinfo-bytes_used + sinfo-bytes_reserved + sinfo-bytes_pinned +
 - sinfo-bytes_may_use + sinfo-bytes_readonly + num_bytes +
 - min_allocable_bytes = sinfo-total_bytes) {
 + sinfo-bytes_may_use + sinfo-bytes_readonly + num_bytes
 + = sinfo-total_bytes) {
   sinfo-bytes_readonly += num_bytes;
   cache-ro = 1;
   list_add_tail(cache-ro_list, sinfo-ro_bgs);
 @@ -8548,14 +8534,6 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
   if (IS_ERR(trans))
   return PTR_ERR(trans);
  
 - alloc_flags = update_block_group_flags(root, cache-flags);
 - if (alloc_flags != cache-flags) {
 - ret = do_chunk_alloc(trans, root, alloc_flags,
 -  CHUNK_ALLOC_FORCE);
 - if (ret  0)
 - goto out;
 - }
 -
   ret = set_block_group_ro(cache, 0);
   if (!ret)
   goto out;
 @@ -8566,6 +8544,11 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
   goto out;
   ret = set_block_group_ro(cache, 0);
  out:
 + if (cache-flags  BTRFS_BLOCK_GROUP_SYSTEM) {
 + alloc_flags = update_block_group_flags(root, cache-flags);
 + check_system_chunk(trans, root, alloc_flags);

Please consider the case that the following patch fixed
  199c36eaa95077a47ae1bc55532fc0fbeb80cc95

If there is no free device space, check_system_chunk can not allocate
new system metadata chunk, so when we run final step of the chunk
allocation to update the device item and insert the new chunk item, we
would fail.

Thanks
Miao

 + }
 +
   btrfs_end_transaction(trans, root);
   return ret;
  }
 

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs: delete chunk allocation attemp when setting block group ro

2015-01-08 Thread Shaohua Li
On Fri, Jan 09, 2015 at 09:01:57AM +0800, Miao Xie wrote:
 On Thu, 08 Jan 2015 13:23:13 -0800, Shaohua Li wrote:
  Below test will fail currently:
mkfs.ext4 -F /dev/sda
btrfs-convert /dev/sda
mount /dev/sda /mnt
btrfs device add -f /dev/sdb /mnt
btrfs balance start -v -dconvert=raid1 -mconvert=raid1 /mnt
  
  The reason is there are some block groups with usage 0, but the whole
  disk hasn't free space to allocate new chunk, so we even can't set such
  block group readonly. This patch deletes the chunk allocation when
  setting block group ro. For META, we already have reserve. But for
  SYSTEM, we don't have, so the check_system_chunk is still required.
  
  Signed-off-by: Shaohua Li s...@fb.com
  ---
   fs/btrfs/extent-tree.c | 31 +++
   1 file changed, 7 insertions(+), 24 deletions(-)
  
  diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
  index a80b971..430101b6 100644
  --- a/fs/btrfs/extent-tree.c
  +++ b/fs/btrfs/extent-tree.c
  @@ -8493,22 +8493,8 @@ static int set_block_group_ro(struct 
  btrfs_block_group_cache *cache, int force)
   {
  struct btrfs_space_info *sinfo = cache-space_info;
  u64 num_bytes;
  -   u64 min_allocable_bytes;
  int ret = -ENOSPC;
   
  -
  -   /*
  -* We need some metadata space and system metadata space for
  -* allocating chunks in some corner cases until we force to set
  -* it to be readonly.
  -*/
  -   if ((sinfo-flags 
  -(BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) 
  -   !force)
  -   min_allocable_bytes = 1 * 1024 * 1024;
  -   else
  -   min_allocable_bytes = 0;
  -
  spin_lock(sinfo-lock);
  spin_lock(cache-lock);
   
  @@ -8521,8 +8507,8 @@ static int set_block_group_ro(struct 
  btrfs_block_group_cache *cache, int force)
  cache-bytes_super - btrfs_block_group_used(cache-item);
   
  if (sinfo-bytes_used + sinfo-bytes_reserved + sinfo-bytes_pinned +
  -   sinfo-bytes_may_use + sinfo-bytes_readonly + num_bytes +
  -   min_allocable_bytes = sinfo-total_bytes) {
  +   sinfo-bytes_may_use + sinfo-bytes_readonly + num_bytes
  +   = sinfo-total_bytes) {
  sinfo-bytes_readonly += num_bytes;
  cache-ro = 1;
  list_add_tail(cache-ro_list, sinfo-ro_bgs);
  @@ -8548,14 +8534,6 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
  if (IS_ERR(trans))
  return PTR_ERR(trans);
   
  -   alloc_flags = update_block_group_flags(root, cache-flags);
  -   if (alloc_flags != cache-flags) {
  -   ret = do_chunk_alloc(trans, root, alloc_flags,
  -CHUNK_ALLOC_FORCE);
  -   if (ret  0)
  -   goto out;
  -   }
  -
  ret = set_block_group_ro(cache, 0);
  if (!ret)
  goto out;
  @@ -8566,6 +8544,11 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
  goto out;
  ret = set_block_group_ro(cache, 0);
   out:
  +   if (cache-flags  BTRFS_BLOCK_GROUP_SYSTEM) {
  +   alloc_flags = update_block_group_flags(root, cache-flags);
  +   check_system_chunk(trans, root, alloc_flags);
 
 Please consider the case that the following patch fixed
   199c36eaa95077a47ae1bc55532fc0fbeb80cc95
 
 If there is no free device space, check_system_chunk can not allocate
 new system metadata chunk, so when we run final step of the chunk
 allocation to update the device item and insert the new chunk item, we
 would fail.

So the relocation will always fail in this case. The check just makes
the failure earlier, right? We don't have the BUG_ON in
do_chunk_alloc() currently.

Thanks,
Shaohua
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] btrfs: fix raid56 scrub failed in xfstests btrfs/072

2015-01-08 Thread Gui Hecheng
The xfstests btrfs/072 reports uncorrectable read errors in dmesg,
because scrub forgets to use commit_root for the parity scrub routine
and scrub attempts to scrub those extent items whose contents are
not fully on disk.

To fix it, we just add the @search_commit_root flag back.

Signed-off-by: Gui Hecheng guihc.f...@cn.fujitsu.com
Signed-off-by: Qu Wenruo quwen...@cn.fujitsu.com
---
 fs/btrfs/scrub.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index f2bb13a..aa8ff75 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3065,6 +3065,8 @@ static noinline_for_stack int scrub_stripe(struct 
scrub_ctx *sctx,
path-search_commit_root = 1;
path-skip_locking = 1;
 
+   ppath-search_commit_root = 1;
+   ppath-skip_locking = 1;
/*
 * trigger the readahead for extent tree csum tree and wait for
 * completion. During readahead, the scrub is officially paused
-- 
1.8.1.4

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs: delete chunk allocation attemp when setting block group ro

2015-01-08 Thread Miao Xie
On Thu, 08 Jan 2015 18:06:50 -0800, Shaohua Li wrote:
 On Fri, Jan 09, 2015 at 09:01:57AM +0800, Miao Xie wrote:
 On Thu, 08 Jan 2015 13:23:13 -0800, Shaohua Li wrote:
 Below test will fail currently:
   mkfs.ext4 -F /dev/sda
   btrfs-convert /dev/sda
   mount /dev/sda /mnt
   btrfs device add -f /dev/sdb /mnt
   btrfs balance start -v -dconvert=raid1 -mconvert=raid1 /mnt

 The reason is there are some block groups with usage 0, but the whole
 disk hasn't free space to allocate new chunk, so we even can't set such
 block group readonly. This patch deletes the chunk allocation when
 setting block group ro. For META, we already have reserve. But for
 SYSTEM, we don't have, so the check_system_chunk is still required.

 Signed-off-by: Shaohua Li s...@fb.com
 ---
  fs/btrfs/extent-tree.c | 31 +++
  1 file changed, 7 insertions(+), 24 deletions(-)

 diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
 index a80b971..430101b6 100644
 --- a/fs/btrfs/extent-tree.c
 +++ b/fs/btrfs/extent-tree.c
 @@ -8493,22 +8493,8 @@ static int set_block_group_ro(struct 
 btrfs_block_group_cache *cache, int force)
  {
 struct btrfs_space_info *sinfo = cache-space_info;
 u64 num_bytes;
 -   u64 min_allocable_bytes;
 int ret = -ENOSPC;
  
 -
 -   /*
 -* We need some metadata space and system metadata space for
 -* allocating chunks in some corner cases until we force to set
 -* it to be readonly.
 -*/
 -   if ((sinfo-flags 
 -(BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) 
 -   !force)
 -   min_allocable_bytes = 1 * 1024 * 1024;
 -   else
 -   min_allocable_bytes = 0;
 -
 spin_lock(sinfo-lock);
 spin_lock(cache-lock);
  
[SNIP]
 ret = set_block_group_ro(cache, 0);
 if (!ret)
 goto out;
 @@ -8566,6 +8544,11 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
 goto out;
 ret = set_block_group_ro(cache, 0);
  out:
 +   if (cache-flags  BTRFS_BLOCK_GROUP_SYSTEM) {
 +   alloc_flags = update_block_group_flags(root, cache-flags);
 +   check_system_chunk(trans, root, alloc_flags);

 Please consider the case that the following patch fixed
   199c36eaa95077a47ae1bc55532fc0fbeb80cc95

 If there is no free device space, check_system_chunk can not allocate
 new system metadata chunk, so when we run final step of the chunk
 allocation to update the device item and insert the new chunk item, we
 would fail.
 
 So the relocation will always fail in this case. The check just makes
 the failure earlier, right? We don't have the BUG_ON in
 do_chunk_alloc() currently.

The final step of the chunk allocation is a delayed operation, so we must make sure
it can be done successfully. Otherwise we would abort the transaction, make the
filesystem read-only and lose the data that was written into the filesystem before
we started the balance, which would make users uncomfortable.

With this patch, we will set the block group read-only successfully the first time we
invoke set_block_group_ro(). But if the block group that will be set to RO is
the only system metadata block group in the filesystem, and there is no device
space to allocate a new one, then we have no space to deal with the pending
final step of chunk allocation, and the problem I described above will happen.

Thanks
Miao
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] btrfs-progs: make btrfs qgroups show human readable sizes

2015-01-08 Thread Fan Chengniang
make btrfs qgroups show human readable sizes, using -h option, example:

qgroupid rfer excl max_rfer max_excl parent  child
     --  -
0/5  299.58MiB299.58MiB400.00MiB0.00B1/1 ---
0/265299.58MiB16.00KiB 0.00B320.00MiB1/1 ---
0/266299.58MiB16.00KiB 350.00MiB0.00B--- ---
1/1  599.16MiB299.59MiB800.00MiB0.00B--- 0/5,0/265

Signed-off-by: Fan Chengniang fancn.f...@cn.fujitsu.com
---
 Documentation/btrfs-qgroup.txt |  2 ++
 cmds-qgroup.c  |  6 +-
 qgroup.c   | 46 ++
 qgroup.h   |  1 +
 4 files changed, 46 insertions(+), 9 deletions(-)

diff --git a/Documentation/btrfs-qgroup.txt b/Documentation/btrfs-qgroup.txt
index 3e13373..6a3d649 100644
--- a/Documentation/btrfs-qgroup.txt
+++ b/Documentation/btrfs-qgroup.txt
@@ -69,6 +69,8 @@ print child qgroup id.
 print max referenced size of qgroup.
 -e
 print max exclusive size of qgroup.
+-h
+print sizes in human readable format (e.g., 1KiB 234MiB 2GiB).
 -F
 list all qgroups which impact the given path(include ancestral qgroups)
 -f
diff --git a/cmds-qgroup.c b/cmds-qgroup.c
index 957fbc9..c2bd0a3 100644
--- a/cmds-qgroup.c
+++ b/cmds-qgroup.c
@@ -212,6 +212,7 @@ static const char * const cmd_qgroup_show_usage[] = {
-c print child qgroup id,
-r print max referenced size of qgroup,
-e print max exclusive size of qgroup,
+   -h print sizes in human readable format (e.g., 1KiB 234MiB 
2GiB),
-F list all qgroups which impact the given path
(include ancestral qgroups),
-f list all qgroups which impact the given path
@@ -246,7 +247,7 @@ static int cmd_qgroup_show(int argc, char **argv)
 
optind = 1;
while (1) {
-   c = getopt_long(argc, argv, pcreFf,
+   c = getopt_long(argc, argv, pcrehFf,
long_options, NULL);
if (c  0)
break;
@@ -267,6 +268,9 @@ static int cmd_qgroup_show(int argc, char **argv)
btrfs_qgroup_setup_print_column(
BTRFS_QGROUP_MAX_EXCL);
break;
+   case 'h':
+   btrfs_qgroup_setup_human_readable();
+   break;
case 'F':
filter_flag |= 0x1;
break;
diff --git a/qgroup.c b/qgroup.c
index 1a4866c..5cb239e 100644
--- a/qgroup.c
+++ b/qgroup.c
@@ -20,6 +20,7 @@
 #include sys/ioctl.h
 #include ctree.h
 #include ioctl.h
+#include utils.h
 
 #define BTRFS_QGROUP_NFILTERS_INCREASE (2 * BTRFS_QGROUP_FILTER_MAX)
 #define BTRFS_QGROUP_NCOMPS_INCREASE (2 * BTRFS_QGROUP_COMP_MAX)
@@ -80,53 +81,62 @@ static struct {
char *name;
char *column_name;
int need_print;
+   int human_readable;
int max_len;
 } btrfs_qgroup_columns[] = {
{
.name   = qgroupid,
.column_name= Qgroupid,
.need_print = 1,
+   .human_readable = 0,
.max_len= 8,
},
{
.name   = rfer,
.column_name= Rfer,
.need_print = 1,
-   .max_len= 4,
+   .human_readable = 0,
+   .max_len= 12,
},
{
.name   = excl,
.column_name= Excl,
.need_print = 1,
-   .max_len= 4,
+   .human_readable = 0,
+   .max_len= 12,
},
{   .name   = max_rfer,
.column_name= Max_rfer,
.need_print = 0,
-   .max_len= 8,
+   .human_readable = 0,
+   .max_len= 12,
},
{
.name   = max_excl,
.column_name= Max_excl,
.need_print = 0,
-   .max_len= 8,
+   .human_readable = 0,
+   .max_len= 12,
},
{
.name   = parent,
.column_name= Parent,
.need_print = 0,
+   .human_readable = 0,
.max_len= 7,
},
{
.name   = child,
.column_name= Child,
.need_print = 0,
+   .human_readable = 0,
.max_len= 5,
},
{
.name   = NULL,
.column_name= NULL,
.need_print = 0,
+   

[PATCH] btrfs-progs: fix some format errors in doc

2015-01-08 Thread Fan Chengniang
Signed-off-by: Fan Chengniang fancn.f...@cn.fujitsu.com
---
 Documentation/btrfs-mount.txt|  2 +-
 Documentation/btrfs-property.txt | 14 +++---
 Documentation/btrfs-replace.txt  |  3 +--
 Documentation/btrfs-scrub.txt|  4 +---
 4 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/Documentation/btrfs-mount.txt b/Documentation/btrfs-mount.txt
index 4fb7137..8cf7a0b 100644
--- a/Documentation/btrfs-mount.txt
+++ b/Documentation/btrfs-mount.txt
@@ -91,7 +91,7 @@ MOUNT OPTIONS
operations).  This was previously the behavior only when a snapshot is
created.
 
-*inode_cache*:
+*inode_cache*::
Enable free inode number caching.   Defaults to off due to an overflow
problem when the free space crcs don't fit inside a single page.
 
diff --git a/Documentation/btrfs-property.txt b/Documentation/btrfs-property.txt
index 28ede4b..8b9b7f0 100644
--- a/Documentation/btrfs-property.txt
+++ b/Documentation/btrfs-property.txt
@@ -28,10 +28,16 @@ A btrfs object, which is set by object, can be a btrfs 
filesystem
 itself, a btrfs subvolume, an inode(file or directory) inside btrfs,
 or a device on which a btrfs exists.
 +
+The '-t type' option can be used to explicitly
+specify what type of object you meant. This is only needed when a
+property could be set for more then one object type.
++
+Possible types are 's[ubvol]', 'f[ilesystem]', 'i[node]' and 'd[evice]'.
++
 Set the name of property by 'name'. If no 'name' is specified,
 all properties for the given object are printed. 'name' is one of
 the followings.
-+
+
 ro
 read-only flag of subvolume: true or false
 label
@@ -39,12 +45,6 @@ label of device
 compression
 compression setting for an inode: lzo, zlib, or  (empty string)
 
-The '-t type' option can be used to explicitly
-specify what type of object you meant. This is only needed when a
-property could be set for more then one object type.
-+
-Possible types are 's[ubvol]', 'f[ilesystem]', 'i[node]' and 'd[evice]'.
-
 *list* [-t type] object::
 Lists available properties with their descriptions for the given object.
 +
diff --git a/Documentation/btrfs-replace.txt b/Documentation/btrfs-replace.txt
index 7402484..b2a21b9 100644
--- a/Documentation/btrfs-replace.txt
+++ b/Documentation/btrfs-replace.txt
@@ -13,9 +13,8 @@ DESCRIPTION
 ---
 *btrfs replace* is used to replace btrfs managed devices with other device.
 
-Note: this is not currently supported for RAID5/6 profiles and must use the
+NOTE: this is not currently supported for RAID5/6 profiles and must use the
 device add/delete workaround.
-
 It is recommended to see `btrfs-device`(8) for more details about btrfs device
 management.
 
diff --git a/Documentation/btrfs-scrub.txt b/Documentation/btrfs-scrub.txt
index c0fafca..6ebce60 100644
--- a/Documentation/btrfs-scrub.txt
+++ b/Documentation/btrfs-scrub.txt
@@ -59,9 +59,7 @@ Read only mode. Do not attempt to correct anything.
 -R
 Raw print mode. Print full data instead of summary.
 -c ioprio_class
-Set IO priority class (see
- ionice (1)
-manpage).
+Set IO priority class (see `ionice`(1) manpage).
 -n ioprio_classdata
 Set IO priority classdata (see `ionice`(1) manpage).
 -f
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: ssd mode on rotational media

2015-01-08 Thread Austin S Hemmelgarn

On 2015-01-07 13:55, Kyle Gates wrote:

What issues would arise if ssd mode is activated because of a block layer 
setting the rotational flag to zero? This happens for me running btrfs on 
bcache. Would it be beneficial to pass the no_ssd flag?
Thanks,
Kyle
In theory, it would result in a greater degree of high-level 
fragmentation.  You should (in theory) get marginally better performance 
by using no_ssd (or using a udev rule to set rotational to 1, as the 
flag is writable).


Ideally, bcache shouldn't set the rotational flag to 0 unless the 
backing store (the big device that the cache is reading from) also has 
it set.


Also, ideally, we should be looking at the type of block device, not 
just whether the rotational flag is 0 or not, as there are a number of 
cases where rotational is 0 but enabling ssd mode will make things perform 
worse (Xen/Virtio block devices come immediately to mind).




smime.p7s
Description: S/MIME Cryptographic Signature


Re: price to pay for nocow file bit?

2015-01-08 Thread Chris Murphy
On Wed, Jan 7, 2015 at 1:10 PM, Josef Bacik jba...@fb.com wrote:
 On 01/07/2015 12:43 PM, Lennart Poettering wrote:

 Heya!

 Currently, systemd-journald's disk access patterns (appending to the
 end of files, then updating a few pointers in the front) result in
 awfully fragmented journal files on btrfs, which has a pretty
 negative effect on performance when accessing them.


 I've been wondering if mount -o autodefrag would deal with this problem but
 I haven't had the chance to look into it.

I've been using autodefrag and haven't run into journal corruptions
that I can attribute to btrfs since the last one was fixed over a year
ago. Chris Mason has suggested preference to use of autodefrag for
this use case rather than xattr +C. But I don't know the time frame
for autodefrag by default, it's come up a couple times but it's not
the default yet.

I've found autodefrag journals are less than 200 fragments, and
average between 50-150 fragments. Without it, this spirals into
thousands quite quickly. Searches don't seem slower when journal files
are made of a few extents vs ~ 100, but beyond several hundred let
alone several thousand it becomes noticeable.

A somewhat minor negative of +C: In case of RAID 1 or higher and
silent data corruption, there will be no Btrfs detection due to lack
of checksum and therefore no correction. In the case a drive reports a
read error then it's corrected, same as with md or lvm raid1+.


-- 
Chris Murphy
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: price to pay for nocow file bit?

2015-01-08 Thread Koen Kooi
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

Chris Murphy schreef op 08-01-15 om 09:24:
 On Wed, Jan 7, 2015 at 1:10 PM, Josef Bacik jba...@fb.com wrote:
 On 01/07/2015 12:43 PM, Lennart Poettering wrote:
 
 Heya!
 
 Currently, systemd-journald's disk access patterns (appending to the 
 end of files, then updating a few pointers in the front) result in 
 awfully fragmented journal files on btrfs, which has a pretty 
 negative effect on performance when accessing them.
 
 
 I've been wondering if mount -o autodefrag would deal with this problem
 but I haven't had the chance to look into it.
 
 I've been using autodefrag and haven't run into journal corruptions that
 I can attribute to btrfs since the last one was fixed over a year ago.
 Chris Mason has suggested preference to use of autodefrag for this use
 case rather than xattr +C. But I don't know the time frame for autodefrag
 by default, it's come up a couple times but it's not the default yet.

Same here, no issues with using autodefrag and journals.

regards,

Koen

-BEGIN PGP SIGNATURE-
Version: GnuPG v1.4.5 (Darwin)
Comment: GPGTools - http://gpgtools.org

iD8DBQFUrkFVMkyGM64RGpERAgGKAJ9pmXA4STYx6sUJP5HBALcUCkfMqwCeNhzR
8v4u6bvhtFZYxYbGDiHghps=
=4MPU
-END PGP SIGNATURE-

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: BTRFS free space handling still needs more work: Hangs again (no complete lockups, just tasks stuck for some time)

2015-01-08 Thread Martin Steigerwald
Am Donnerstag, 8. Januar 2015, 05:45:56 schrieben Sie:
 Martin Steigerwald posted on Wed, 07 Jan 2015 20:08:50 +0100 as excerpted:
  No BTRFS developers commented yet on this, neither in this thread nor in
  the bug report at kernel.org I made.
 
 Just a quick general note on this point...
 
 There has in the past (and I believe referenced on the wiki) been dev 
 comment to the effect that on the list they tend to find particular 
 reports/threads and work on them until they find and either fix the issue 
 or (when not urgent) decide it must wait for something else, first.  
 During the time they're busy pursuing such a report, they don't read 
 others on the list very closely, and such list-only bug reports may thus 
 get dropped on the floor and never worked on.
 
 The recommendation, then, is to report it to the list, and if not picked 
 up right away and you plan on being around in a few weeks/months when 
 they potentially get to it, file a bug on it, so it doesn't get dropped 
 on the floor.

Duncan, I *did* file a bug.

[Bug 90401] New: btrfs kworker thread uses up 100% of a Sandybridge core for 
minutes on random write into big file

https://bugzilla.kernel.org/show_bug.cgi?id=90401

-- 
Martin 'Helios' Steigerwald - http://www.Lichtvoll.de
GPG: 03B0 0D6C 0040 0710 4AFA  B82F 991B EAAC A599 84C7
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: price to pay for nocow file bit?

2015-01-08 Thread Lennart Poettering
On Wed, 07.01.15 15:10, Josef Bacik (jba...@fb.com) wrote:

 On 01/07/2015 12:43 PM, Lennart Poettering wrote:
 Heya!
 
 Currently, systemd-journald's disk access patterns (appending to the
 end of files, then updating a few pointers in the front) result in
 awfully fragmented journal files on btrfs, which has a pretty
 negative effect on performance when accessing them.
 
 I've been wondering if mount -o autodefrag would deal with this problem but
 I haven't had the chance to look into it.

Hmm, I am kinda interested in a solution that I can just implement in
systemd/journald now and that will then just make things work for
people suffering by the problem. I mean, I can hardly make systemd
patch the mount options of btrfs just because I place a journal file
on some fs...

Is autodefrag supposed to become a default one day?

Anyway, given the pros and cons I have now changed journald to set the
nocow bit on newly created journal files. When files are rotated (and
we hence know we will never ever write to them again) journald tries
to unset the bit again, and a defrag ioctl will be invoked right
after. btrfs currently silently ignores that we unset the bit, and
leaves it set, but I figure I should try to unset it anyway, in case
it learns that one day. After all, after rotating the files there's no
reason to treat the files specially anymore...

I'll keep an eye on this, and see if I still get user complaints about
it. Should autodefrag become default eventually we can get rid of this
code in journald again.

One question regarding the btrfs defrag ioctl: playing around with it
it appears to be asynchronous, the defrag request is simply queued and
the ioctl returns immediately. Which is great for my usecase. However
I was wondering if it always was async like this? I googled a bit, and
found reports that defrag might take a while, but I am not sure if
those reports were about the ioctl taking so long, or the effect of
defrag actually hitting the disk... 

Lennart

-- 
Lennart Poettering, Red Hat
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Fix Penguin Penalty 18th December2014 ( mail-archive.com )

2015-01-08 Thread cessations00624
Dear Sir

Did your website get hit by Google Penguin update on October 17th 2014? What 
basically is Google Penguin Update? It is actually a code name for Google 
algorithm which aims at decreasing your websites search engine rankings that 
violate Google’s guidelines by using black hat SEO techniques to rank your 
webpage by giving number of spammy links to the page.
 
We are one of those few SEO companies that can help you avoid penalties from 
Google Updates like Penguin and Panda. Our clients have survived all the 
previous and present updates with ease. They have never been hit because we use 
100% white hat SEO techniques to rank Webpages.  Simple thing that we do to 
keep websites away from any Penguin or Panda penalties is follow Google 
guidelines and we give Google users the best answers to their queries.

If you are looking to increase the quality of your websites and to get more 
targeted traffic or save your websites from these Google penalties email us 
back with your interest. 

We will be glad to serve you and help you grow your business.

Regards

Taniya S

SEO Manager ( TOB )
B7 Green Avenue, Amritsar 143001 Punjab

NO CLICK in the subject to STOP EMAILS
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: price to pay for nocow file bit?

2015-01-08 Thread Konstantinos Skarlatos

On 8/1/2015 3:30 μμ, Lennart Poettering wrote:

On Wed, 07.01.15 15:10, Josef Bacik (jba...@fb.com) wrote:


On 01/07/2015 12:43 PM, Lennart Poettering wrote:

Heya!

Currently, systemd-journald's disk access patterns (appending to the
end of files, then updating a few pointers in the front) result in
awfully fragmented journal files on btrfs, which has a pretty
negative effect on performance when accessing them.

I've been wondering if mount -o autodefrag would deal with this problem but
I haven't had the chance to look into it.

Hmm, I am kinda interested in a solution that I can just implement in
systemd/journald now and that will then just make things work for
people suffering by the problem. I mean, I can hardly make systemd
patch the mount options of btrfs just because I place a journal file
on some fs...

Is autodefrag supposed to become a default one day?

Anyway, given the pros and cons I have now changed journald to set the
nocow bit on newly created journal files. When files are rotated (and
we hence know we will never ever write again to them) the bit is tried
to be unset again, and a defrag ioctl will be invoked right
after. btrfs currently silently ignores that we unset the bit, and
leaves it set, but I figure i should try to unset it anyway, in case
it learns that one day. After all, after rotating the files there's no
reason to treat the files special anymore...
Can this behaviour be optional? I don't mind some fragmentation if I can 
keep having checksums and the ability for raid 1 to repair those files.



I'll keep an eye on this, and see if I still get user complaints about
it. Should autodefrag become default eventually we can get rid of this
code in journald again.

One question regarding the btrfs defrag ioctl: playing around with it
it appears to be asynchronous, the defrag request is simply queued and
the ioctl returns immediately. Which is great for my usecase. However
I was wondering if it always was async like this? I googled a bit, and
found reports that defrag might take a while, but I am not sure if
those reports were about the ioctl taking so long, or the effect of
defrag actually hitting the disk...

Lennart



--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[GIT PULL] Btrfs fixes

2015-01-08 Thread Chris Mason
Hi Linus,

We have a small queue of btrfs fixes in my for-linus branch:

git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs.git for-linus

None of these are huge, but my commit does fix a regression from 3.18
that could cause lost files during log replay.

This also adds Dave Sterba to the list of Btrfs maintainers.  It doesn't
mean we're doing things differently, but Dave has really been helping
with the maintainer workload for years.

Chris Mason (2) commits (+9/-0):
Btrfs: don't delay inode ref updates during log replay (+8/-0)
Btrfs: add more maintainers (+1/-0)

Wang Shilong (1) commits (+3/-1):
Btrfs: call inode_dec_link_count() on mkdir error path

Filipe Manana (1) commits (+10/-3):
Btrfs: correctly get tree level in tree_backref_for_extent

Josef Bacik (1) commits (+6/-6):
Btrfs: abort transaction if we don't find the block group

Dan Carpenter (1) commits (+1/-1):
Btrfs, scrub: uninitialized variable in scrub_extent_for_parity()

Total: (6) commits (+29/-11)

 MAINTAINERS  |  1 +
 fs/btrfs/backref.c   | 13 ++---
 fs/btrfs/delayed-inode.c |  8 
 fs/btrfs/extent-tree.c   | 12 ++--
 fs/btrfs/inode.c |  4 +++-
 fs/btrfs/scrub.c |  2 +-
 6 files changed, 29 insertions(+), 11 deletions(-)
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: price to pay for nocow file bit?

2015-01-08 Thread Zygo Blaxell
On Thu, Jan 08, 2015 at 05:53:21PM +0100, Lennart Poettering wrote:
 On Thu, 08.01.15 10:56, Zygo Blaxell (ce3g8...@umail.furryterror.org) wrote:
 
  On Wed, Jan 07, 2015 at 06:43:15PM +0100, Lennart Poettering wrote:
   Heya!
   
   Currently, systemd-journald's disk access patterns (appending to the
   end of files, then updating a few pointers in the front) result in
   awfully fragmented journal files on btrfs, which has a pretty
   negative effect on performance when accessing them.
   
   Now, to improve things a bit, I yesterday made a change to journald,
   to issue the btrfs defrag ioctl when a journal file is rotated,
   i.e. when we know that no further writes will be ever done on the
   file. 
   
   However, I wonder now if I should go one step further even, and use
   the equivalent of chattr -C (i.e. nocow) on all journal files. I am
   wondering what price I would precisely have to pay for
   that. Judging by this earlier thread:
   
   http://www.spinics.net/lists/linux-btrfs/msg33134.html
   
   it's mostly about data integrity, which is something I can live with,
   given the conservative write patterns of journald, and the fact that
   we do our own checksumming and careful data validation. I mean, if
   btrfs in this mode provides no worse data integrity semantics than
   ext4 I am fully fine with losing this feature for these files.
  
  This sounds to me like a job for fallocate with FALLOC_FL_KEEP_SIZE.
 
 We already use fallocate(), but this is not enough on cow file
 systems. With fallocate() you can certainly improve fragmentation when
 appending things to a file. But on a COW file system this will help
 little if we change things in the beginning of the file, since COW
 means that it will then make a copy of those blocks and alter the
 copy, but leave the original version unmodified. And if we do that all
 the time the files get heavily fragmented, even though all the blocks
 we modify have been fallocate()d initially...

Hmmm...it seems the handwaving about tail-packing that I was previously
ignoring is important after all.

A few quick tests with filefrag show that btrfs isn't doing full
tail-packing, only small file allocation (i.e. files smaller than 4096
bytes get stored inline, and nothing else does, not even sparse files
with a single 1-byte extent at offset != 0).  Thus the inline storage
avoids fragmentation only to the minimum extent possible.

Short appends to the end of the file effectively become modifications
of the last block of the file.  That triggers CoW on the append, and if
we're doing lots of tiny writes the file becomes extremely fragmented
(exactly the worst case of one fragment per block).  A mix of big and
small appends seems to use fallocated space for those writes that cover
complete blocks, which is arguably worse than not fallocating at all.

So fallocate will not help until btrfs learns to do tail-packing, or
some other way to avoid this problem.

  This would work on ext4, xfs, and others, and provide the same benefit
  (or even better) without filesystem-specific code.  journald would
  preallocate a contiguous chunk past the end of the file for appends,
  and
 
 That's precisely what we do. But journald's write pattern is not
 purely appending to files, it's append something to the end, then
 link it up in the beginning. And for the append part we are
 fine with fallocate(). It's the link up part that completely fucks
 up fragmentation so far.

Wrong theory but same result.  The writes at the beginning just keep
replacing a single extent over and over, which has a worst-case effect
of adding a single fragment to the beginning of a file that would not
otherwise be fragmented.  The appends are causing fragmentation all
by themselves.  :-P

 Lennart
 
 -- 
 Lennart Poettering, Red Hat
 --
 To unsubscribe from this list: send the line unsubscribe linux-btrfs in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html
 


signature.asc
Description: Digital signature


'parent transid verify failed' for 13637 missing transactions, resulting in 'BTRFS: Transaction aborted'

2015-01-08 Thread Reuben D'Netto
Hi,
I have a btrfs volume in RAID0 across 2 SSDs which has (for no apparent reason) 
become corrupted. Although I am able to mount the partition, there are several 
messages displayed in the kernel log when doing so.
I have copied the files off the file system, but would like to know if they can 
be relied upon or not (and if not, which ones are corrupt). I would also like 
to know if the file system itself is recoverable, or should be erased entirely 
and replaced.
I have tried 'btrfs check --repair' and btrfs-zero-log to no avail. The SMART 
data for both drives suggests there are no issues with the hardware.

Thanks in advance.


Distro: Sabayon amd64
Kernel in use when corruption occurred: 3.17.4
Kernel in use when collecting diagnostic info: 3.16.0-23-generic (Ubuntu livecd)
Btrfs-progs version: 3.18


btrfs fi df: (Used space is incorrect - should be at least 30 GB)
Data, RAID0: total=93.16GiB, used=25.19MiB
System, RAID1: total=32.00MiB, used=16.00KiB
Metadata, RAID1: total=8.01GiB, used=73.81MiB
unknown, single: total=16.00MiB, used=16.00KiB


btrfs fi show: (truncated to show relevant filesystem only)
Label: none  uuid: d75ecf88-9b18-4ca6-8fd4-7bda0630de9b
Total devices 2 FS bytes used 73.81MiB
devid1 size 54.62GiB used 54.62GiB path /dev/sda1
devid2 size 54.62GiB used 54.62GiB path /dev/sdb1


Kernel log when mounting file system:
[  106.564009] BTRFS info (device sda1): disk space caching is enabled
[  106.577597] BTRFS: detected SSD devices, enabling SSD mode
[  106.578440] BTRFS: checking UUID tree
[  106.581198] parent transid verify failed on 168079851520 wanted 6329580 
found 6343217
[  106.581857] parent transid verify failed on 168079851520 wanted 6329580 
found 6343217
[  106.581880] BTRFS warning (device sda1): btrfs_uuid_tree_iterate failed -12


When unmounting:
[  113.814408] [ cut here ]
[  113.814454] WARNING: CPU: 0 PID: 3872 at 
/build/buildd/linux-3.16.0/fs/btrfs/extent-tree.c:5956 
__btrfs_free_extent+0x675/0xc00 [btrfs]()
[  113.814460] Modules linked in: joydev btrfs dm_crypt xor snd_hda_codec_hdmi 
raid6_pq dm_multipath scsi_dh kvm_amd kvm snd_seq_midi snd_hda_codec_realtek 
snd_seq_midi_event snd_hda_codec_generic snd_rawmidi edac_core snd_hda_intel 
snd_hda_controller k10temp serio_raw edac_mce_amd snd_seq snd_hda_codec bnep 
snd_hwdep rfcomm snd_seq_device snd_pcm bluetooth snd_timer snd 6lowpan_iphc 
sp5100_tco soundcore i2c_piix4 shpchp mac_hid parport_pc ppdev lp parport 
squashfs overlayfs nls_utf8 isofs jfs xfs libcrc32c reiserfs dm_mirror 
dm_region_hash dm_log hid_generic nouveau mxm_wmi video i2c_algo_bit ttm usbhid 
drm_kms_helper pata_acpi firewire_ohci tg3 hid firewire_core r8169 drm ahci ptp 
crc_itu_t mii pata_jmicron libahci pps_core wmi
[  113.814558] CPU: 0 PID: 3872 Comm: umount Tainted: GW 
3.16.0-23-generic #31-Ubuntu
[  113.814564] Hardware name: Gigabyte Technology Co., Ltd. 
GA-870A-UD3/GA-870A-UD3, BIOS F5 08/01/2011
[  113.814569]  0009 8800bd5afa28 8177fcbc 

[  113.814577]  8800bd5afa60 8106fd8d 00218f175000 
8800cb98f000
[  113.814584]  8800a80e9000 fffe  
8800bd5afa70
[  113.814591] Call Trace:
[  113.814605]  [8177fcbc] dump_stack+0x45/0x56
[  113.814615]  [8106fd8d] warn_slowpath_common+0x7d/0xa0
[  113.814623]  [8106fe6a] warn_slowpath_null+0x1a/0x20
[  113.814651]  [c0d15345] __btrfs_free_extent+0x675/0xc00 [btrfs]
[  113.814661]  [811c16a6] ? __slab_free+0xa6/0x320
[  113.814690]  [c0d1a044] __btrfs_run_delayed_refs+0x424/0x11e0 
[btrfs]
[  113.814721]  [c0d1edf3] btrfs_run_delayed_refs.part.64+0x73/0x270 
[btrfs]
[  113.814750]  [c0d1f51d] btrfs_write_dirty_block_groups+0x46d/0x710 
[btrfs]
[  113.814784]  [c0d2d64d] commit_cowonly_roots+0x18d/0x240 [btrfs]
[  113.814818]  [c0d301ad] 
btrfs_commit_transaction.part.22+0x49d/0x970 [btrfs]
[  113.814852]  [c0d2f27a] btrfs_commit_transaction+0x3a/0x80 [btrfs]
[  113.814875]  [c0cfe760] btrfs_sync_fs+0x50/0xc0 [btrfs]
[  113.814884]  [81211a82] sync_filesystem+0x72/0xb0
[  113.814891]  [811e2d50] generic_shutdown_super+0x30/0xf0
[  113.814897]  [811e30a2] kill_anon_super+0x12/0x20
[  113.814920]  [c0d01e86] btrfs_kill_super+0x16/0x90 [btrfs]
[  113.814926]  [811e3429] deactivate_locked_super+0x49/0x60
[  113.814932]  [811e3874] deactivate_super+0x64/0x70
[  113.814940]  [812015ef] mntput_no_expire+0xdf/0x180
[  113.814947]  [81202bac] SyS_umount+0x8c/0x100
[  113.814954]  [81787ced] system_call_fastpath+0x1a/0x1f
[  113.814959] ---[ end trace 328a5b6c02402780 ]---
[  113.814967] BTRFS info (device sda1): leaf 104182874112 total ptrs 209 free 
space 75
[  113.814973]  item 0 key (140680462336 168 16384) itemoff 16232 itemsize 51
[  113.814978]  extent refs 1 

Re: BTRFS: Transaction aborted (error -5)

2015-01-08 Thread Dyweni - BTRFS

Hi All,

Just wondering if someone could help me figure out what happened to this
file system?  As you see below, the kernel reports btrfs errors on both
drives, but was able to correct the errors on the first drive.  The
second drive was not so lucky.  However, running 'badblocks -nsv' (the
non-destructive read-write test) on both drives shows both drives free of
bad blocks.


Both drives are 1TB 7200RPM spinners.  And, as Chris indicated earlier,
it's strange that the file system gets different checksums for the same
block...


Thanks,
Dyweni



On 2015-01-06 09:15, Dyweni - BTRFS wrote:

Hi,

BTRFS check on /dev/sdc1 reveals everything looks ok:

# btrfs check /dev/sdc1
Checking filesystem on /dev/sdc1
UUID: 26ed1033-429a-444f-97cc-ce8103db4c39
checking extents
checking free space cache
checking fs roots
checking csums
checking root refs
found 195515710524 bytes used err is 0
total csum bytes: 205915200
total tree bytes: 407355392
total fs tree bytes: 94830592
total extent tree bytes: 31588352
btree space waste bytes: 100867438
file data blocks allocated: 537492316160
 referenced 195656101888
Btrfs v3.18




(/dev/sdd1 and /dev/sdc1 are the only two btrfs file systems in this 
machine).




Oddly, when the problem with /dev/sdd1 started, problems with /dev/sdc1
were also reported, but /dev/sdc1 managed to fix itself.

Below is the complete dmesg output from when problems first started
until /dev/sdd1 went readonly with errors.

The strangest part of all of this, is that the dmesg output shows no
errors about the drive being physically bad.
(I ran badblocks -nsv on both /dev/sdd and /dev/sdc, and it confirmed
0 bad blocks for both drives).




[25581.099684] BTRFS: sdd1 checksum verify failed on 521797632 wanted
8F2F5FEC found 3E879EFE level 0
[25581.105441] BTRFS: read error corrected: ino 1 off 521797632 (dev
/dev/sdd1 sector 1035520)
[25581.105612] BTRFS: read error corrected: ino 1 off 521801728 (dev
/dev/sdd1 sector 1035528)
[25581.105784] BTRFS: read error corrected: ino 1 off 521805824 (dev
/dev/sdd1 sector 1035536)
[25581.105956] BTRFS: read error corrected: ino 1 off 521809920 (dev
/dev/sdd1 sector 1035544)
[2.799514] BTRFS: sdd1 checksum verify failed on 680296448 wanted
AB0E191F found 192D4134 level 0
[2.856199] BTRFS: read error corrected: ino 1 off 680296448 (dev
/dev/sdd1 sector 1345088)
[2.860571] BTRFS: read error corrected: ino 1 off 680300544 (dev
/dev/sdd1 sector 1345096)
[2.909634] BTRFS: read error corrected: ino 1 off 680304640 (dev
/dev/sdd1 sector 1345104)
[2.909876] BTRFS: read error corrected: ino 1 off 680308736 (dev
/dev/sdd1 sector 1345112)
[29292.777237] BTRFS: sdc1 checksum verify failed on 937738240 wanted
F4196CDA found AF30B394 level 0
[29292.778022] BTRFS: sdc1 checksum verify failed on 937738240 wanted
F4196CDA found AF30B394 level 0
[29292.781889] BTRFS: read error corrected: ino 1 off 937738240 (dev
/dev/sdc1 sector 1847904)
[29292.782054] BTRFS: read error corrected: ino 1 off 937742336 (dev
/dev/sdc1 sector 1847912)
[29292.782224] BTRFS: read error corrected: ino 1 off 937746432 (dev
/dev/sdc1 sector 1847920)
[29292.782399] BTRFS: read error corrected: ino 1 off 937750528 (dev
/dev/sdc1 sector 1847928)
[29691.731107] BTRFS: sdd1 checksum verify failed on 610877440 wanted
5A8006E7 found 1CFE4A20 level 0
[29691.791550] BTRFS: read error corrected: ino 1 off 610877440 (dev
/dev/sdd1 sector 1209504)
[29691.793252] BTRFS: read error corrected: ino 1 off 610881536 (dev
/dev/sdd1 sector 1209512)
[29691.793608] BTRFS: read error corrected: ino 1 off 610885632 (dev
/dev/sdd1 sector 1209520)
[29691.793797] BTRFS: read error corrected: ino 1 off 610889728 (dev
/dev/sdd1 sector 1209528)
[34626.017914] BTRFS: sdd1 checksum verify failed on 737181696 wanted
15D7099D found B6A2A7A9 level 0
[34626.022656] BTRFS: read error corrected: ino 1 off 737181696 (dev
/dev/sdd1 sector 1456192)
[34626.022867] BTRFS: read error corrected: ino 1 off 737185792 (dev
/dev/sdd1 sector 1456200)
[34626.023107] BTRFS: read error corrected: ino 1 off 737189888 (dev
/dev/sdd1 sector 1456208)
[34626.023314] BTRFS: read error corrected: ino 1 off 737193984 (dev
/dev/sdd1 sector 1456216)
[37057.349996] BTRFS: sdc1 checksum verify failed on 701792256 wanted
A7BD5067 found 87EF0602 level 0
[37057.424920] BTRFS: read error corrected: ino 1 off 701792256 (dev
/dev/sdc1 sector 1387072)
[37057.425178] BTRFS: read error corrected: ino 1 off 701796352 (dev
/dev/sdc1 sector 1387080)
[37057.450174] BTRFS: read error corrected: ino 1 off 701800448 (dev
/dev/sdc1 sector 1387088)
[37057.453476] BTRFS: read error corrected: ino 1 off 701804544 (dev
/dev/sdc1 sector 1387096)
[38283.714855] BTRFS: sdd1 checksum verify failed on 190169088 wanted
27D1E032 found 585B1651 level 0
[38283.715349] BTRFS: sdd1 checksum verify failed on 190169088 wanted
27D1E032 found 585B1651 level 0
[38283.724140] BTRFS: read error corrected: ino 1 off 190169088 (dev
/dev/sdd1 sector 387808)
[38283.724313] BTRFS: read 

Re: price to pay for nocow file bit?

2015-01-08 Thread Lennart Poettering
On Thu, 08.01.15 10:56, Zygo Blaxell (ce3g8...@umail.furryterror.org) wrote:

 On Wed, Jan 07, 2015 at 06:43:15PM +0100, Lennart Poettering wrote:
  Heya!
  
  Currently, systemd-journald's disk access patterns (appending to the
  end of files, then updating a few pointers in the front) result in
  awfully fragmented journal files on btrfs, which has a pretty
  negative effect on performance when accessing them.
  
  Now, to improve things a bit, I yesterday made a change to journald,
  to issue the btrfs defrag ioctl when a journal file is rotated,
  i.e. when we know that no further writes will be ever done on the
  file. 
  
  However, I wonder now if I should go one step further even, and use
  the equivalent of chattr -C (i.e. nocow) on all journal files. I am
  wondering what price I would precisely have to pay for
  that. Judging by this earlier thread:
  
  http://www.spinics.net/lists/linux-btrfs/msg33134.html
  
  it's mostly about data integrity, which is something I can live with,
  given the conservative write patterns of journald, and the fact that
  we do our own checksumming and careful data validation. I mean, if
  btrfs in this mode provides no worse data integrity semantics than
  ext4 I am fully fine with losing this feature for these files.
 
 This sounds to me like a job for fallocate with FALLOC_FL_KEEP_SIZE.

We already use fallocate(), but this is not enough on cow file
systems. With fallocate() you can certainly improve fragmentation when
appending things to a file. But on a COW file system this will help
little if we change things in the beginning of the file, since COW
means that it will then make a copy of those blocks and alter the
copy, but leave the original version unmodified. And if we do that all
the time the files get heavily fragmented, even though all the blocks
we modify have been fallocate()d initially...

 This would work on ext4, xfs, and others, and provide the same benefit
 (or even better) without filesystem-specific code.  journald would
 preallocate a contiguous chunk past the end of the file for appends,
 and

That's precisely what we do. But journald's write pattern is not
purely appending to files, it's append something to the end, then
link it up in the beginning. And for the append part we are
fine with fallocate(). It's the link up part that completely fucks
up fragmentation so far.

Lennart

-- 
Lennart Poettering, Red Hat
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: price to pay for nocow file bit?

2015-01-08 Thread Zygo Blaxell
On Wed, Jan 07, 2015 at 06:43:15PM +0100, Lennart Poettering wrote:
 Heya!
 
 Currently, systemd-journald's disk access patterns (appending to the
 end of files, then updating a few pointers in the front) result in
 awfully fragmented journal files on btrfs, which has a pretty
 negative effect on performance when accessing them.
 
 Now, to improve things a bit, I yesterday made a change to journald,
 to issue the btrfs defrag ioctl when a journal file is rotated,
 i.e. when we know that no further writes will be ever done on the
 file. 
 
 However, I wonder now if I should go one step further even, and use
 the equivalent of chattr -C (i.e. nocow) on all journal files. I am
 wondering what price I would precisely have to pay for
 that. Judging by this earlier thread:
 
 http://www.spinics.net/lists/linux-btrfs/msg33134.html
 
 it's mostly about data integrity, which is something I can live with,
 given the conservative write patterns of journald, and the fact that
 we do our own checksumming and careful data validation. I mean, if
 btrfs in this mode provides no worse data integrity semantics than
 ext4 I am fully fine with losing this feature for these files.

This sounds to me like a job for fallocate with FALLOC_FL_KEEP_SIZE.
This would work on ext4, xfs, and others, and provide the same benefit
(or even better) without filesystem-specific code.  journald would
preallocate a contiguous chunk past the end of the file for appends, and
on btrfs the first write to each block will not be COWed or compressed
(I'm hand-waving away some details here related to small writes, file
tails, and inline storage, but the end result is the same).  If there's a
configured target size for journals then allocate that amount; otherwise,
double the allocated size each time the visible file size reaches a power
of two so that the number of fragments is logarithmic over file size.

This should get you what you want without all the dangerous messing around
with data integrity controls and defragmentation.  Defragmentation has a
number of negative side-effects of its own:  it searches for free space
aggressively and holds locks that can block writes for a long time (I've
learned the hard way that this can be over 20 minutes for a 1GB file, long
enough to trigger hardware watchdog resets).  There are some other good
reasons to never defragment, but they don't arise in journald's use cases.

I, for one, use btrfs scrub to detect data corruption that occurs during
early stages of disk failure.  I'd object strongly to applications
randomly turning off data integrity features without being explicitly
configured to do so, especially those that do most of the writing.
It would create areas of the disk that are blind spots when testing for
storage corruption errors, and in journald's case those blind spots would
be among the most significant sources of data about storage corruption.

I don't really care if applications can survive corrupted data--as the
owner of the storage, I need to be aware that storage-level corruption is
happening.  I don't want to have to test different areas of the filesystem
with a dozen different application-specific tools.  That particular
insanity is one of the reasons why I now use btrfs and not ext4.

 Hence I am mostly interested in what else is lost if this flag is
 turned on by default for all journal files journald creates: 
 
 Does this have any effect on functionality? As I understood snapshots
 still work fine for files marked like that, and so do
 reflinks. Any drawback functionality-wise? Apparently file compression
 support is lost if the bit is set? (which I can live with too, journal
 files are internally compressed anyway)
 
 What about performance? Do any operations get substantially slower by
 setting this bit? For example, what happens if I take a snapshot of
 files with this bit set and then modify the file, does this result in
 a full (and hence slow) copy of the file on that occasion? 
 
 I am trying to understand the pros and cons of turning this bit on,
 before I can make this change. So far I see one big pro, but I wonder
 if there's any major con I should think about?
 
 Thanks,
 
 Lennart
 --
 To unsubscribe from this list: send the line unsubscribe linux-btrfs in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html


signature.asc
Description: Digital signature


Re: price to pay for nocow file bit?

2015-01-08 Thread Goffredo Baroncelli
On 2015-01-08 19:24, Konstantinos Skarlatos wrote:
 Anyway, given the pros and cons I have now changed journald to set
 the nocow bit on newly created journal files. When files are
 rotated (and we hence know we will never ever write to them again)
 journald tries to unset the bit again, and a defrag ioctl is
 invoked right after. btrfs currently silently ignores the attempt to
 unset the bit and leaves it set, but I figure I should try to unset it
 anyway, in case it learns to do that one day. After all, after rotating
 the files there's no reason to treat the files specially anymore...

 Can this behaviour be optional? I don't mind some fragmentation if I
 can keep having checksums and the ability for raid 1 to repair those
 files.

I agree with Konstantinos's request: please make this behavior optional.

BR
G.Baroncelli


-- 
gpg @keyserver.linux.it: Goffredo Baroncelli kreijackATinwind.it
Key fingerprint BBF5 1610 0B64 DAC6 5F7D  17B2 0EDA 9B37 8B82 E0B5
--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: price to pay for nocow file bit?

2015-01-08 Thread Roger Binns
-BEGIN PGP SIGNED MESSAGE-
Hash: SHA1

On 01/08/2015 08:53 AM, Lennart Poettering wrote:
 this will help little if we change things in the beginning of the
 file,

Have you considered changing the format so that those pointers are
stored at the end of the file, letting data always be append only?

While it is traditional to have things at the beginning as headers,
there are formats like zip where metadata is stored at the end instead
providing other benefits.

Roger

-BEGIN PGP SIGNATURE-
Version: GnuPG v1

iEYEARECAAYFAlSu68gACgkQmOOfHg372QSn5wCfaRAfI/xN3SHiDEPNMjjAuFQB
NbcAn2GCjzZyfHocF7yTKEBFdt3znD6n
=KL2f
-END PGP SIGNATURE-

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] btrfs: delete chunk allocation attemp when setting block group ro

2015-01-08 Thread Shaohua Li
The test below currently fails:
  mkfs.ext4 -F /dev/sda
  btrfs-convert /dev/sda
  mount /dev/sda /mnt
  btrfs device add -f /dev/sdb /mnt
  btrfs balance start -v -dconvert=raid1 -mconvert=raid1 /mnt

The reason is that there are some block groups with usage 0, but the whole
disk has no free space to allocate a new chunk, so we can't even set such a
block group readonly. This patch removes the chunk allocation when
setting a block group ro. For META, we already have a reserve. But for
SYSTEM, we don't, so check_system_chunk is still required.

Signed-off-by: Shaohua Li s...@fb.com
---
 fs/btrfs/extent-tree.c | 31 +++
 1 file changed, 7 insertions(+), 24 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a80b971..430101b6 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -8493,22 +8493,8 @@ static int set_block_group_ro(struct 
btrfs_block_group_cache *cache, int force)
 {
struct btrfs_space_info *sinfo = cache-space_info;
u64 num_bytes;
-   u64 min_allocable_bytes;
int ret = -ENOSPC;
 
-
-   /*
-* We need some metadata space and system metadata space for
-* allocating chunks in some corner cases until we force to set
-* it to be readonly.
-*/
-   if ((sinfo-flags 
-(BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) 
-   !force)
-   min_allocable_bytes = 1 * 1024 * 1024;
-   else
-   min_allocable_bytes = 0;
-
spin_lock(sinfo-lock);
spin_lock(cache-lock);
 
@@ -8521,8 +8507,8 @@ static int set_block_group_ro(struct 
btrfs_block_group_cache *cache, int force)
cache-bytes_super - btrfs_block_group_used(cache-item);
 
if (sinfo-bytes_used + sinfo-bytes_reserved + sinfo-bytes_pinned +
-   sinfo-bytes_may_use + sinfo-bytes_readonly + num_bytes +
-   min_allocable_bytes = sinfo-total_bytes) {
+   sinfo-bytes_may_use + sinfo-bytes_readonly + num_bytes
+   = sinfo-total_bytes) {
sinfo-bytes_readonly += num_bytes;
cache-ro = 1;
list_add_tail(cache-ro_list, sinfo-ro_bgs);
@@ -8548,14 +8534,6 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
if (IS_ERR(trans))
return PTR_ERR(trans);
 
-   alloc_flags = update_block_group_flags(root, cache-flags);
-   if (alloc_flags != cache-flags) {
-   ret = do_chunk_alloc(trans, root, alloc_flags,
-CHUNK_ALLOC_FORCE);
-   if (ret  0)
-   goto out;
-   }
-
ret = set_block_group_ro(cache, 0);
if (!ret)
goto out;
@@ -8566,6 +8544,11 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
goto out;
ret = set_block_group_ro(cache, 0);
 out:
+   if (cache-flags  BTRFS_BLOCK_GROUP_SYSTEM) {
+   alloc_flags = update_block_group_flags(root, cache-flags);
+   check_system_chunk(trans, root, alloc_flags);
+   }
+
btrfs_end_transaction(trans, root);
return ret;
 }
-- 
1.8.1

--
To unsubscribe from this list: send the line unsubscribe linux-btrfs in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html