[PATCH v2 2/3] btrfs: split parse_early_options() in two

2017-09-21 Thread Misono, Tomohiro
Extract the part related to subvol option from parse_early_options() and
move it to new parse function (parse_subvol_options()).

This is because mount_root() doesn't need to handle subvol options.

Signed-off-by: Tomohiro Misono 
---
 fs/btrfs/super.c | 75 +++-
 1 file changed, 58 insertions(+), 17 deletions(-)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 3c32677..9498743 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -448,7 +448,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char 
*options,
case Opt_subvolrootid:
case Opt_device:
/*
-* These are parsed by btrfs_parse_early_options
+* These are parsed by btrfs_parse_subvol_options
+* and btrfs_parse_early_options
 * and can be happily ignored here.
 */
break;
@@ -855,11 +856,63 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char 
*options,
  * only when we need to allocate a new super block.
  */
 static int btrfs_parse_early_options(const char *options, fmode_t flags,
-   void *holder, char **subvol_name, u64 *subvol_objectid,
-   struct btrfs_fs_devices **fs_devices)
+   void *holder, struct btrfs_fs_devices **fs_devices)
 {
substring_t args[MAX_OPT_ARGS];
char *device_name, *opts, *orig, *p;
+   int error = 0;
+
+   if (!options)
+   return 0;
+
+   /*
+* strsep changes the string, duplicate it because btrfs_parse_options
+* gets called later
+*/
+   opts = kstrdup(options, GFP_KERNEL);
+   if (!opts)
+   return -ENOMEM;
+   orig = opts;
+
+   while ((p = strsep(&opts, ",")) != NULL) {
+   int token;
+   if (!*p)
+   continue;
+
+   token = match_token(p, tokens, args);
+   switch (token) {
+   case Opt_device:
+   device_name = match_strdup(&args[0]);
+   if (!device_name) {
+   error = -ENOMEM;
+   goto out;
+   }
+   error = btrfs_scan_one_device(device_name,
+   flags, holder, fs_devices);
+   kfree(device_name);
+   if (error)
+   goto out;
+   break;
+   default:
+   break;
+   }
+   }
+
+out:
+   kfree(orig);
+   return error;
+}
+
+/*
+ * Parse mount options that are related to subvolume id
+ *
+ * The parsed value is later passed to mount_subvol()
+ */
+static int btrfs_parse_subvol_options(const char *options, fmode_t flags,
+   void *holder, char **subvol_name, u64 *subvol_objectid)
+{
+   substring_t args[MAX_OPT_ARGS];
+   char *opts, *orig, *p;
char *num = NULL;
int error = 0;
 
@@ -867,8 +920,8 @@ static int btrfs_parse_early_options(const char *options, 
fmode_t flags,
return 0;
 
/*
-* strsep changes the string, duplicate it because parse_options
-* gets called twice
+* strsep changes the string, duplicate it because
+* btrfs_parse_early_options gets called later
 */
opts = kstrdup(options, GFP_KERNEL);
if (!opts)
@@ -907,18 +960,6 @@ static int btrfs_parse_early_options(const char *options, 
fmode_t flags,
case Opt_subvolrootid:
pr_warn("BTRFS: 'subvolrootid' mount option is 
deprecated and has no effect\n");
break;
-   case Opt_device:
-   device_name = match_strdup(&args[0]);
-   if (!device_name) {
-   error = -ENOMEM;
-   goto out;
-   }
-   error = btrfs_scan_one_device(device_name,
-   flags, holder, fs_devices);
-   kfree(device_name);
-   if (error)
-   goto out;
-   break;
default:
break;
}
-- 
2.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 1/3] btrfs: change btrfs_mount() to mount_root()

2017-09-21 Thread Misono, Tomohiro
Remove the subvol-related part from btrfs_mount() and rename it to
mount_root(). Also, define a file_system_type that uses mount_root(),
for use by the third patch.

New btrfs_mount() will be introduced in the third patch.

Signed-off-by: Tomohiro Misono 
---
 fs/btrfs/super.c | 24 
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 12540b6..3c32677 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -66,6 +66,7 @@
 
 static const struct super_operations btrfs_super_ops;
 static struct file_system_type btrfs_fs_type;
+static struct file_system_type btrfs_root_fs_type;
 
 static int btrfs_remount(struct super_block *sb, int *flags, char *data);
 
@@ -1517,10 +1518,10 @@ static int setup_security_options(struct btrfs_fs_info 
*fs_info,
 /*
  * Find a superblock for the given device / mount point.
  *
- * Note:  This is based on get_sb_bdev from fs/super.c with a few additions
+ * Note:  This is based on mount_bdev from fs/super.c with a few additions
  *   for multiple device setup.  Make sure to keep it in sync.
  */
-static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
+static struct dentry *mount_root(struct file_system_type *fs_type, int flags,
const char *device_name, void *data)
 {
struct block_device *bdev = NULL;
@@ -1529,27 +1530,17 @@ static struct dentry *btrfs_mount(struct 
file_system_type *fs_type, int flags,
struct btrfs_fs_info *fs_info = NULL;
struct security_mnt_opts new_sec_opts;
fmode_t mode = FMODE_READ;
-   char *subvol_name = NULL;
-   u64 subvol_objectid = 0;
int error = 0;
 
if (!(flags & MS_RDONLY))
mode |= FMODE_WRITE;
 
error = btrfs_parse_early_options(data, mode, fs_type,
- &subvol_name, &subvol_objectid,
  &fs_devices);
if (error) {
-   kfree(subvol_name);
return ERR_PTR(error);
}
 
-   if (subvol_name || subvol_objectid != BTRFS_FS_TREE_OBJECTID) {
-   /* mount_subvol() will free subvol_name. */
-   return mount_subvol(subvol_name, subvol_objectid, flags,
-   device_name, data);
-   }
-
security_init_mnt_opts(&new_sec_opts);
if (data) {
error = parse_security_options(data, &new_sec_opts);
@@ -2133,6 +2124,15 @@ static struct file_system_type btrfs_fs_type = {
.kill_sb= btrfs_kill_super,
.fs_flags   = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA,
 };
+
+static struct file_system_type btrfs_root_fs_type = {
+   .owner  = THIS_MODULE,
+   .name   = "btrfs",
+   .mount  = mount_root,
+   .kill_sb= btrfs_kill_super,
+   .fs_flags   = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA,
+};
+
 MODULE_ALIAS_FS("btrfs");
 
 static int btrfs_control_open(struct inode *inode, struct file *file)
-- 
2.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 0/3] btrfs: cleanup mount path

2017-09-21 Thread Misono, Tomohiro
Summary:
Cleanup mount path by avoiding calling btrfs_mount() twice.
No functional change.

change to v2: split the patch into three parts.

Long Explanation:
btrfs uses mount_subtree() to mount a subvolume directly.  This function
needs a vfsmount* of device's root (/), which is a return value of
vfs_kern_mount() (therefore root has to be mounted internally anyway).

The current approach to getting root's vfsmount* at mount time is a bit tricky:
1. the mount system call calls vfs_kern_mount() on the way
2. btrfs_mount() is called 
3. btrfs_parse_early_options() parses "subvolid=" mount option and set the
   value to subvol_objectid. Otherwise, subvol_objectid has the initial
   value of 0
4. check subvol_objectid is 5 or not. This time id is not 5, and
   btrfs_mount() returns by calling mount_subvol()
5. In mount_subvol(), original mount options are modified to contain
   "subvolid=0" in setup_root_args(). Then, vfs_kern_mount() is called with
   this new options to get root's vfsmount*
6. btrfs_mount() is called again
7. btrfs_parse_early_options() parses "subvolid=0" and set 5 (instead of 0)
   to subvol_objectid
8. check subvol_objectid is 5 or not. This time id is 5 and mount_subvol()
   is not called. btrfs_mount() finishes mounting a root
9. (in mount_subvol()) using the return value of vfs_kern_mount(), it
   calls mount_subtree()
10. return subvolume's dentry

As illustrated above, calling btrfs_mount() twice complicates the problem.
Callback function of mount time (btrfs_mount()) is specified in struct
file_system_type which is passed to vfs_kern_mount(). Therefore, we can
avoid this by using another file_system_type for arguments of our
vfs_kern_mount() call. There is no need of modifying mount options.

In this approach: 
1. btrfs_mount() is called
2. parse "subvolid=" option and set the value to subvol_objectid
3. mount device's root by calling vfs_kern_mount() with different
   file_system_type specified. Then, different callback function is called
   (mount_root()). Most of this new function is the same as the original
   btrfs_mount()
4. return by calling mount_subtree()

I think this approach is the same as nfsv4, which is the only other
filesystem using mount_subtree() currently, and easy to understand.

Most of the change is done by just reorganizing the original code of
btrfs_mount()/mount_subvol() into btrfs_mount()/mount_subvol()/mount_root()

btrfs_parse_early_options() is split into two parts so that the "device="
option is not handled twice (though handling it twice causes no harm).
setup_root_args() is deleted as it is not needed anymore.

Tomohiro Misono (3):
  change btrfs_mount() to mount_root()
  split parse_early_options() in two
  introduce new btrfs_mount()

 fs/btrfs/super.c | 231 ++-
 1 file changed, 128 insertions(+), 103 deletions(-)

-- 
2.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: using fio to test btrfs compression

2017-09-21 Thread shally verma
On Wed, Sep 20, 2017 at 5:26 PM, Timofey Titovets  wrote:
> 2017-09-20 14:10 GMT+03:00 shally verma :
>> Interesting part is I dont see "encoded" under flags. I couldn't
>> understand if flags are retrieved from btrfs file metadata info. As
>> you are running on 4.14 and I am on 4.9
>>
>> So, am still under doubt, even with dd if files are getting compressed.
>>
>> What is the filesize shown if you run
>> btrfs fi du /mnt/test0.0 .. is it less or actual size?
>>
>> Is there any command that i can run to confirm file has been compressed?
>>
>> So far, I had my prints enabled in kernel/fs/btrfs/compression.c and
>> check in dmesg that code jumped to compress_page() func.
>>
>> Thanks
>> Shally
>>
>
> Okay, lets play different.
> encoded work for last several years for kernel releases, so you must see that.
>
> Reproduction script:
> #!/bin/bash -e
>
> FILE_NAME=$RANDOM$RANDOM
> TMP_DIR=$(mktemp -d)
> IMAGE_FILE="$HOME/$FILE_NAME"
>
> truncate -s 4G $IMAGE_FILE
> mkfs.btrfs -m single -L COMPRESS_TEST $IMAGE_FILE
> mount -o compress-force $IMAGE_FILE $TMP_DIR
> dd if=/dev/zero bs=128K count=2 of=$TMP_DIR/zero
> sync
> filefrag -v $TMP_DIR/zero
> umount $TMP_DIR
> rm -v $IMAGE_FILE
>
> Example output:
> ~ sudo ./btrfs_compress_test.sh
> btrfs-progs v4.13
> See http://btrfs.wiki.kernel.org for more information.
>
> Label:  COMPRESS_TEST
> UUID:   abfedc39-dd94-4105-87d6-49eedb13467f
> Node size:  16384
> Sector size:4096
> Filesystem size:4.00GiB
> Block group profiles:
>  Data: single8.00MiB
>  Metadata: single8.00MiB
>  System:   single4.00MiB
> SSD detected:   no
> Incompat features:  extref, skinny-metadata
> Number of devices:  1
> Devices:
>   IDSIZE  PATH
>1 4.00GiB  /root/322906281
>
> 2+0 records in
> 2+0 records out
> 262144 bytes (262 kB, 256 KiB) copied, 0.000197746 s, 1.3 GB/s
> Filesystem type is: 9123683e
> File size of /tmp/tmp.bDyt3EkEG5/zero is 262144 (64 blocks of 4096 bytes)
> ext: logical_offset:physical_offset: length:   expected: flags:
>   0:0..  31:   3072..  3103: 32: encoded
>   1:   32..  63:   3073..  3104: 32:   3104:
> last,encoded,eof
> /tmp/tmp.bDyt3EkEG5/zero: 2 extents found
> removed '/root/322906281'
>
> Good luck.
Here's my output - Everything is same except:

1. nodesize and sector size = 64K
2. extent length = 2
3. I  don't see "encoded" in filefrag here.

btrfs-progs v4.13
See http://btrfs.wiki.kernel.org for more information.

Label:  COMPRESS_TEST
UUID:   fad6907e-d4eb-4dbb-9014-3918a822c9ce
Node size:  65536
Sector size:65536
Filesystem size:4.00GiB
Block group profiles:
  Data: single8.00MiB
  Metadata: single8.00MiB
  System:   single4.00MiB
SSD detected:   no
Incompat features:  extref, skinny-metadata
Number of devices:  1
Devices:
   IDSIZE  PATH
1 4.00GiB  /root/2808626087

2+0 records in
2+0 records out
262144 bytes (262 kB) copied, 0.00028777 s, 911 MB/s
Filesystem type is: 9123683e
File size of /tmp/tmp.346ESCdOIi/zero is 262144 (4 blocks of 65536 bytes)
 ext: logical_offset:physical_offset: length:   expected: flags:
   0:0..   1:192..   193:  2:
   1:2..   3:193..   194:  2:194: eof
/tmp/tmp.346ESCdOIi/zero: 2 extents found
removed '/root/2808626087'

And this is my dmesg

[170127.417119] BTRFS: device label COMPRESS_TEST devid 1 transid 5 /dev/loop0
[170127.417493] BTRFS info (device loop0): force zlib compression
[170127.417496] BTRFS info (device loop0): disk space caching is enabled
[170127.417499] BTRFS info (device loop0): has skinny extents
[170127.425858] BTRFS info (device loop0): creating UUID tree

This is fio --version
fio-3.0

What do we doubt here?

Thanks
Shally

> --
> Have a nice day,
> Timofey.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] btrfs: cleanup mount path

2017-09-21 Thread Anand Jain



On 09/19/2017 10:05 AM, Misono, Tomohiro wrote:

Summary:
Cleanup mount path by avoiding calling btrfs_mount() twice.


 Right. Needs cleanup.


This is for more understandable code and no functional change.


  However, this patch diff isn't straightforward to review. Can you 
split this into smaller patches with reasonable changes?


Thanks, Anand



Explanation:
btrfs uses mount_subtree() to mount a subvolume directly.  This function
needs a vfsmount* of device's root (/), which is a return value of
vfs_kern_mount() (therefore root has to be mounted internally anyway).

Current approach of getting root's vfsmount* in mount time is a bit tricky:
1. mount systemcall calls vfs_kern_mount() on the way
2. btrfs_mount() is called
3. btrfs_parse_early_options() parses "subvolid=" mount option and set the
value to subvol_objectid. Otherwise, subvol_objectid has the initial
value of 0
4. check subvol_objectid is 5 or not. This time id is not 5, and
btrfs_mount() returns by calling mount_subvol()
5. In mount_subvol(), original mount options are modified to contain
"subvolid=0" in setup_root_args(). Then, vfs_kern_mount() is called with
this new options to get root's vfsmount*
6. btrfs_mount() is called again
7. btrfs_parse_early_options() parses "subvolid=0" and set 5 (instead of 0)
to subvol_objectid
8. check subvol_objectid is 5 or not. This time id is 5 and mount_subvol()
is not called. btrfs_mount() finishes mounting a root
9. (in mount_subvol()) using the return value of vfs_kern_mount(), it
calls mount_subtree()
10. return subvolume's dentry

As illustrated above, calling btrfs_mount() twice complicates the problem.
Callback function of mount time (btrfs_mount()) is specified in struct
file_system_type which is passed to vfs_kern_mount(). Therefore, we can
avoid this by using another file_system_type for arguments of our
vfs_kern_mount() call. There is no need of modifying mount options.

In this approach:
1. btrfs_mount() is called
2. parse "subvolid=" option and set the value to subvol_objectid
3. mount device's root by calling vfs_kern_mount() with different
file_system_type specified. Then, different callback function is called
(mount_root()). Most of this new function is the same as the original
btrfs_mount()
4. return by calling mount_subtree()

I think this approach is the same as nfsv4, which is the only other
filesystem using mount_subtree() currently, and easy to understand.

Most of the change is done by just reorganizing the original code of
btrfs_mount()/mount_subvol() into btrfs_mount()/mount_subvol()/mount_root()

btrfs_parse_early_options() is split into two parts to avoid "device="
option will be handled twice (though it cause no harm). setup_root_args()
is deleted as not needed anymore.


Signed-off-by: Tomohiro Misono 
---
  fs/btrfs/super.c | 226 ++-
  1 file changed, 123 insertions(+), 103 deletions(-)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 12540b6..3a183c0 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -66,6 +66,7 @@
  
  static const struct super_operations btrfs_super_ops;

  static struct file_system_type btrfs_fs_type;
+static struct file_system_type btrfs_root_fs_type;
  
  static int btrfs_remount(struct super_block *sb, int *flags, char *data);
  
@@ -447,7 +448,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,

case Opt_subvolrootid:
case Opt_device:
/*
-* These are parsed by btrfs_parse_early_options
+* These are parsed by btrfs_parse_subvol_options
+* and btrfs_parse_early_options
 * and can be happily ignored here.
 */
break;
@@ -854,11 +856,58 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char 
*options,
   * only when we need to allocate a new super block.
   */
  static int btrfs_parse_early_options(const char *options, fmode_t flags,
-   void *holder, char **subvol_name, u64 *subvol_objectid,
-   struct btrfs_fs_devices **fs_devices)
+   void *holder, struct btrfs_fs_devices **fs_devices)
  {
substring_t args[MAX_OPT_ARGS];
char *device_name, *opts, *orig, *p;
+   int error = 0;
+
+   if (!options)
+   return 0;
+
+   /*
+* strsep changes the string, duplicate it because btrfs_parse_options
+* gets called later
+*/
+   opts = kstrdup(options, GFP_KERNEL);
+   if (!opts)
+   return -ENOMEM;
+   orig = opts;
+
+   while ((p = strsep(&opts, ",")) != NULL) {
+   int token;
+   if (!*p)
+   continue;
+
+   token = match_token(p, tokens, args);
+   switch (token) {
+   case Opt_device:
+   

Re: defragmenting best practice?

2017-09-21 Thread Kai Krakow
Am Thu, 21 Sep 2017 22:10:13 +0200
schrieb Kai Krakow :

> Am Wed, 20 Sep 2017 07:46:52 -0400
> schrieb "Austin S. Hemmelgarn" :
> 
> > >  Fragmentation: Files with a lot of random writes can become
> > > heavily fragmented (1+ extents) causing excessive multi-second
> > > spikes of CPU load on systems with an SSD or large amount a RAM.
> > > On desktops this primarily affects application databases
> > > (including Firefox). Workarounds include manually defragmenting
> > > your home directory using btrfs fi defragment. Auto-defragment
> > > (mount option autodefrag) should solve this problem.
> > > 
> > > Upon reading that I am wondering if fragmentation in the Firefox
> > > profile is part of my issue. That's one thing I never tested
> > > previously. (BTW, this system has 256 GB of RAM and 20 cores.)
> > Almost certainly.  Most modern web browsers are brain-dead and
> > insist on using SQLite databases (or traditional DB files) for
> > everything, including the cache, and the usage for the cache in
> > particular kills performance when fragmentation is an issue.  
> 
> At least in Chrome, you can turn on simple cache backend, which, I
> think, is using many small instead of one huge file. This suit btrfs
> much better:
> 
> chrome://flags/#enable-simple-cache-backend
> 
> 
> And then I suggest also doing this (as your login user):
> 
> $ cd $HOME
> $ mv .cache .cache.old
> $ mkdir .cache
> $ lsattr +C .cache

Oops, of course that's chattr, not lsattr

> $ rsync -av .cache.old/ .cache/
> $ rm -Rf .cache.old
> 
> This makes caches for most applications nocow. Chrome performance was
> completely fixed for me by doing this.
> 
> I'm not sure where Firefox puts its cache, I only use it on very rare
> occasions. But I think it's going to .cache/mozilla last time looked
> at it.
> 
> You may want to close all apps before converting the cache directory.
> 
> Also, I don't see any downsides in making this nocow. That directory
> could easily be also completely volatile. If something breaks due to
> no longer protected by data csum, just clean it out.


-- 
Regards,
Kai

Replies to list-only preferred.

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: defragmenting best practice?

2017-09-21 Thread Dave
These are great suggestions. I will test several of them (or all of
them) and report back with my results once I have done the testing.
Thank you! This is a fantastic mailing list.

P.S. I'm inclined to stay with Firefox, but I will definitely test
Chromium vs Firefox after making a series of changes based on the
suggestions here. I would hate to see the market lose the option of
Firefox because everyone goes to Chrome/Chromium.
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] btrfs: add a flags argument to LOGICAL_INO and call it LOGICAL_INO_V2

2017-09-21 Thread Darrick J. Wong
On Thu, Sep 21, 2017 at 04:16:35PM -0400, Zygo Blaxell wrote:
> On Thu, Sep 21, 2017 at 12:59:42PM -0700, Darrick J. Wong wrote:
> > On Thu, Sep 21, 2017 at 12:10:15AM -0400, Zygo Blaxell wrote:
> > > Now that check_extent_in_eb()'s extent offset filter can be turned off,
> > > we need a way to do it from userspace.
> > > 
> > > Add a 'flags' field to the btrfs_logical_ino_args structure to disable 
> > > extent
> > > offset filtering, taking the place of one of the reserved[] fields.
> > > 
> > > Previous versions of LOGICAL_INO neglected to check whether any of the
> > > reserved fields have non-zero values.  Assigning meaning to those fields
> > > now may change the behavior of existing programs that left these fields
> > > uninitialized.
> > > 
> > > To avoid any surprises, define a new ioctl LOGICAL_INO_V2 which uses
> > > the same argument layout as LOGICAL_INO, but uses one of the reserved
> > > fields for flags.  The V2 ioctl explicitly checks that unsupported flag
> > > bits are zero so that userspace can probe for future feature bits as
> > > they are defined.  If the other reserved fields are used in the future,
> > > one of the remaining flag bits could specify that the other reserved
> > > fields are valid, so we don't need to check those for now.
> > > 
> > > Since the memory layouts and behavior of the two ioctls' arguments
> > > are almost identical, there is no need for a separate function for
> > > logical_to_ino_v2 (contrast with tree_search_v2 vs tree_search).
> > > A version parameter and an 'if' statement will suffice.
> > > 
> > > Now that we have a flags field in logical_ino_args, add a flag
> > > BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET to get the behavior we want,
> > > and pass it down the stack to iterate_inodes_from_logical.
> > > 
> > > Signed-off-by: Zygo Blaxell 
> > > ---
> > >  fs/btrfs/ioctl.c   | 21 ++---
> > >  include/uapi/linux/btrfs.h |  8 +++-
> > >  2 files changed, 25 insertions(+), 4 deletions(-)
> > > 
> > > diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
> > > index b7de32568082..2bc3a9588d1d 100644
> > > --- a/fs/btrfs/ioctl.c
> > > +++ b/fs/btrfs/ioctl.c
> > > @@ -4536,13 +4536,14 @@ static int build_ino_list(u64 inum, u64 offset, 
> > > u64 root, void *ctx)
> > >  }
> > >  
> > >  static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
> > > - void __user *arg)
> > > + void __user *arg, int version)
> > >  {
> > >   int ret = 0;
> > >   int size;
> > >   struct btrfs_ioctl_logical_ino_args *loi;
> > >   struct btrfs_data_container *inodes = NULL;
> > >   struct btrfs_path *path = NULL;
> > > + bool ignore_offset;
> > >  
> > >   if (!capable(CAP_SYS_ADMIN))
> > >   return -EPERM;
> > > @@ -4551,6 +4552,17 @@ static long btrfs_ioctl_logical_to_ino(struct 
> > > btrfs_fs_info *fs_info,
> > >   if (IS_ERR(loi))
> > >   return PTR_ERR(loi);
> > >  
> > > + if (version == 1) {
> > > + ignore_offset = false;
> > > + } else {
> > > + /* Only accept flags we have defined so far */
> > > + if (loi->flags & ~(BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET)) {
> > > + ret = -EINVAL;
> > > + goto out_loi;
> > > + }
> > > + ignore_offset = loi->flags & 
> > > BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET;
> > 
> > Please check loi->reserved[3] for zeroness so that the next person who
> > wants to add a field to btrfs_ioctl_logical_ino_args doesn't have to
> > create LOGICAL_INO_V3 for the same reason you're creating V2.
> 
> OK now I'm confused, in several distinct ways.
> 
> I wonder if you meant reserved[1] and reserved[2] there, since I'm not
> checking them (for reasons stated in the commit log--we can use flags
> to indicate whether and what values are present there).

You can do that, though that means you have to burn flag bits to light
up the remaining reserved area, which means you can't in the future
decide that a non-zero field value will turn on some new feature.  You
retain the ability to use flag bits to turn on the new field, if it's
the case that zero has a meaning.

> But that's not the bigger problem.  Maybe you did mean reserved[3], but
> there's no "reserved[3]" any more.  I shortened the reserved array from
> 4 elements to 3, so "reserved[3]" is no longer a valid memory reference.
> Also "reserved[0]" no longer refers to the same thing it once did.

Oops, sorry, that was a typo, I meant reserved[], as in 'check the whole
array via memchr_inv'.

--D

> 
> > --D
> > 
> > > + }
> > > +
> > >   path = btrfs_alloc_path();
> > >   if (!path) {
> > >   ret = -ENOMEM;
> > > @@ -4566,7 +4578,7 @@ static long btrfs_ioctl_logical_to_ino(struct 
> > > btrfs_fs_info *fs_info,
> > >   }
> > >  
> > >   ret = iterate_inodes_from_logical(loi->logical, fs_info, path,
> > > -   build_ino_list, inodes, false);
> > > +

Re: [PATCH 2/3] btrfs: add a flags argument to LOGICAL_INO and call it LOGICAL_INO_V2

2017-09-21 Thread Zygo Blaxell
On Thu, Sep 21, 2017 at 12:59:42PM -0700, Darrick J. Wong wrote:
> On Thu, Sep 21, 2017 at 12:10:15AM -0400, Zygo Blaxell wrote:
> > Now that check_extent_in_eb()'s extent offset filter can be turned off,
> > we need a way to do it from userspace.
> > 
> > Add a 'flags' field to the btrfs_logical_ino_args structure to disable 
> > extent
> > offset filtering, taking the place of one of the reserved[] fields.
> > 
> > Previous versions of LOGICAL_INO neglected to check whether any of the
> > reserved fields have non-zero values.  Assigning meaning to those fields
> > now may change the behavior of existing programs that left these fields
> > uninitialized.
> > 
> > To avoid any surprises, define a new ioctl LOGICAL_INO_V2 which uses
> > the same argument layout as LOGICAL_INO, but uses one of the reserved
> > fields for flags.  The V2 ioctl explicitly checks that unsupported flag
> > bits are zero so that userspace can probe for future feature bits as
> > they are defined.  If the other reserved fields are used in the future,
> > one of the remaining flag bits could specify that the other reserved
> > fields are valid, so we don't need to check those for now.
> > 
> > Since the memory layouts and behavior of the two ioctls' arguments
> > are almost identical, there is no need for a separate function for
> > logical_to_ino_v2 (contrast with tree_search_v2 vs tree_search).
> > A version parameter and an 'if' statement will suffice.
> > 
> > Now that we have a flags field in logical_ino_args, add a flag
> > BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET to get the behavior we want,
> > and pass it down the stack to iterate_inodes_from_logical.
> > 
> > Signed-off-by: Zygo Blaxell 
> > ---
> >  fs/btrfs/ioctl.c   | 21 ++---
> >  include/uapi/linux/btrfs.h |  8 +++-
> >  2 files changed, 25 insertions(+), 4 deletions(-)
> > 
> > diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
> > index b7de32568082..2bc3a9588d1d 100644
> > --- a/fs/btrfs/ioctl.c
> > +++ b/fs/btrfs/ioctl.c
> > @@ -4536,13 +4536,14 @@ static int build_ino_list(u64 inum, u64 offset, u64 
> > root, void *ctx)
> >  }
> >  
> >  static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
> > -   void __user *arg)
> > +   void __user *arg, int version)
> >  {
> > int ret = 0;
> > int size;
> > struct btrfs_ioctl_logical_ino_args *loi;
> > struct btrfs_data_container *inodes = NULL;
> > struct btrfs_path *path = NULL;
> > +   bool ignore_offset;
> >  
> > if (!capable(CAP_SYS_ADMIN))
> > return -EPERM;
> > @@ -4551,6 +4552,17 @@ static long btrfs_ioctl_logical_to_ino(struct 
> > btrfs_fs_info *fs_info,
> > if (IS_ERR(loi))
> > return PTR_ERR(loi);
> >  
> > +   if (version == 1) {
> > +   ignore_offset = false;
> > +   } else {
> > +   /* Only accept flags we have defined so far */
> > +   if (loi->flags & ~(BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET)) {
> > +   ret = -EINVAL;
> > +   goto out_loi;
> > +   }
> > +   ignore_offset = loi->flags & 
> > BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET;
> 
> Please check loi->reserved[3] for zeroness so that the next person who
> wants to add a field to btrfs_ioctl_logical_ino_args doesn't have to
> create LOGICAL_INO_V3 for the same reason you're creating V2.

OK now I'm confused, in several distinct ways.

I wonder if you meant reserved[1] and reserved[2] there, since I'm not
checking them (for reasons stated in the commit log--we can use flags
to indicate whether and what values are present there).

But that's not the bigger problem.  Maybe you did mean reserved[3], but
there's no "reserved[3]" any more.  I shortened the reserved array from
4 elements to 3, so "reserved[3]" is no longer a valid memory reference.
Also "reserved[0]" no longer refers to the same thing it once did.

> --D
> 
> > +   }
> > +
> > path = btrfs_alloc_path();
> > if (!path) {
> > ret = -ENOMEM;
> > @@ -4566,7 +4578,7 @@ static long btrfs_ioctl_logical_to_ino(struct 
> > btrfs_fs_info *fs_info,
> > }
> >  
> > ret = iterate_inodes_from_logical(loi->logical, fs_info, path,
> > - build_ino_list, inodes, false);
> > + build_ino_list, inodes, 
> > ignore_offset);
> > if (ret == -EINVAL)
> > ret = -ENOENT;
> > if (ret < 0)
> > @@ -4580,6 +4592,7 @@ static long btrfs_ioctl_logical_to_ino(struct 
> > btrfs_fs_info *fs_info,
> >  out:
> > btrfs_free_path(path);
> > kvfree(inodes);
> > +out_loi:
> > kfree(loi);
> >  
> > return ret;
> > @@ -5550,7 +5563,9 @@ long btrfs_ioctl(struct file *file, unsigned int
> > case BTRFS_IOC_INO_PATHS:
> > return btrfs_ioctl_ino_to_path(root, argp);
> > case BTRFS_IOC_LOGICAL_INO:
> > -   return 

Re: defragmenting best practice?

2017-09-21 Thread Kai Krakow
Am Wed, 20 Sep 2017 07:46:52 -0400
schrieb "Austin S. Hemmelgarn" :

> >  Fragmentation: Files with a lot of random writes can become
> > heavily fragmented (1+ extents) causing excessive multi-second
> > spikes of CPU load on systems with an SSD or large amount a RAM. On
> > desktops this primarily affects application databases (including
> > Firefox). Workarounds include manually defragmenting your home
> > directory using btrfs fi defragment. Auto-defragment (mount option
> > autodefrag) should solve this problem.
> > 
> > Upon reading that I am wondering if fragmentation in the Firefox
> > profile is part of my issue. That's one thing I never tested
> > previously. (BTW, this system has 256 GB of RAM and 20 cores.)  
> Almost certainly.  Most modern web browsers are brain-dead and insist
> on using SQLite databases (or traditional DB files) for everything, 
> including the cache, and the usage for the cache in particular kills 
> performance when fragmentation is an issue.

At least in Chrome, you can turn on the simple cache backend, which, I
think, uses many small files instead of one huge file. This suits btrfs
much better:

chrome://flags/#enable-simple-cache-backend


And then I suggest also doing this (as your login user):

$ cd $HOME
$ mv .cache .cache.old
$ mkdir .cache
$ lsattr +C .cache
$ rsync -av .cache.old/ .cache/
$ rm -Rf .cache.old

This makes caches for most applications nocow. Chrome performance was
completely fixed for me by doing this.

I'm not sure where Firefox puts its cache; I only use it on very rare
occasions. But I think it was going to .cache/mozilla the last time I
looked at it.

You may want to close all apps before converting the cache directory.

Also, I don't see any downsides in making this nocow. That directory
could easily be also completely volatile. If something breaks due to no
longer being protected by data csum, just clean it out.


-- 
Regards,
Kai

Replies to list-only preferred.

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] btrfs: add a flags argument to LOGICAL_INO and call it LOGICAL_INO_V2

2017-09-21 Thread Darrick J. Wong
On Thu, Sep 21, 2017 at 12:10:15AM -0400, Zygo Blaxell wrote:
> Now that check_extent_in_eb()'s extent offset filter can be turned off,
> we need a way to do it from userspace.
> 
> Add a 'flags' field to the btrfs_logical_ino_args structure to disable extent
> offset filtering, taking the place of one of the reserved[] fields.
> 
> Previous versions of LOGICAL_INO neglected to check whether any of the
> reserved fields have non-zero values.  Assigning meaning to those fields
> now may change the behavior of existing programs that left these fields
> uninitialized.
> 
> To avoid any surprises, define a new ioctl LOGICAL_INO_V2 which uses
> the same argument layout as LOGICAL_INO, but uses one of the reserved
> fields for flags.  The V2 ioctl explicitly checks that unsupported flag
> bits are zero so that userspace can probe for future feature bits as
> they are defined.  If the other reserved fields are used in the future,
> one of the remaining flag bits could specify that the other reserved
> fields are valid, so we don't need to check those for now.
> 
> Since the memory layouts and behavior of the two ioctls' arguments
> are almost identical, there is no need for a separate function for
> logical_to_ino_v2 (contrast with tree_search_v2 vs tree_search).
> A version parameter and an 'if' statement will suffice.
> 
> Now that we have a flags field in logical_ino_args, add a flag
> BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET to get the behavior we want,
> and pass it down the stack to iterate_inodes_from_logical.
> 
> Signed-off-by: Zygo Blaxell 
> ---
>  fs/btrfs/ioctl.c   | 21 ++---
>  include/uapi/linux/btrfs.h |  8 +++-
>  2 files changed, 25 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
> index b7de32568082..2bc3a9588d1d 100644
> --- a/fs/btrfs/ioctl.c
> +++ b/fs/btrfs/ioctl.c
> @@ -4536,13 +4536,14 @@ static int build_ino_list(u64 inum, u64 offset, u64 
> root, void *ctx)
>  }
>  
>  static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
> - void __user *arg)
> + void __user *arg, int version)
>  {
>   int ret = 0;
>   int size;
>   struct btrfs_ioctl_logical_ino_args *loi;
>   struct btrfs_data_container *inodes = NULL;
>   struct btrfs_path *path = NULL;
> + bool ignore_offset;
>  
>   if (!capable(CAP_SYS_ADMIN))
>   return -EPERM;
> @@ -4551,6 +4552,17 @@ static long btrfs_ioctl_logical_to_ino(struct 
> btrfs_fs_info *fs_info,
>   if (IS_ERR(loi))
>   return PTR_ERR(loi);
>  
> + if (version == 1) {
> + ignore_offset = false;
> + } else {
> + /* Only accept flags we have defined so far */
> + if (loi->flags & ~(BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET)) {
> + ret = -EINVAL;
> + goto out_loi;
> + }
> + ignore_offset = loi->flags & 
> BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET;

Please check loi->reserved[3] for zeroness so that the next person who
wants to add a field to btrfs_ioctl_logical_ino_args doesn't have to
create LOGICAL_INO_V3 for the same reason you're creating V2.

--D

> + }
> +
>   path = btrfs_alloc_path();
>   if (!path) {
>   ret = -ENOMEM;
> @@ -4566,7 +4578,7 @@ static long btrfs_ioctl_logical_to_ino(struct 
> btrfs_fs_info *fs_info,
>   }
>  
>   ret = iterate_inodes_from_logical(loi->logical, fs_info, path,
> -   build_ino_list, inodes, false);
> +   build_ino_list, inodes, 
> ignore_offset);
>   if (ret == -EINVAL)
>   ret = -ENOENT;
>   if (ret < 0)
> @@ -4580,6 +4592,7 @@ static long btrfs_ioctl_logical_to_ino(struct 
> btrfs_fs_info *fs_info,
>  out:
>   btrfs_free_path(path);
>   kvfree(inodes);
> +out_loi:
>   kfree(loi);
>  
>   return ret;
> @@ -5550,7 +5563,9 @@ long btrfs_ioctl(struct file *file, unsigned int
>   case BTRFS_IOC_INO_PATHS:
>   return btrfs_ioctl_ino_to_path(root, argp);
>   case BTRFS_IOC_LOGICAL_INO:
> - return btrfs_ioctl_logical_to_ino(fs_info, argp);
> + return btrfs_ioctl_logical_to_ino(fs_info, argp, 1);
> + case BTRFS_IOC_LOGICAL_INO_V2:
> + return btrfs_ioctl_logical_to_ino(fs_info, argp, 2);
>   case BTRFS_IOC_SPACE_INFO:
>   return btrfs_ioctl_space_info(fs_info, argp);
>   case BTRFS_IOC_SYNC: {
> diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
> index 378230c163d5..0b3de597e04f 100644
> --- a/include/uapi/linux/btrfs.h
> +++ b/include/uapi/linux/btrfs.h
> @@ -608,10 +608,14 @@ struct btrfs_ioctl_ino_path_args {
>  struct btrfs_ioctl_logical_ino_args {
>   __u64   logical;/* in */
>   __u64   size;   /* in 

Re: defragmenting best practice?

2017-09-21 Thread Sean Greenslade
On September 19, 2017 11:38:13 PM PDT, Dave  wrote:
>>On Thu 2017-08-31 (09:05), Ulli Horlacher wrote:
> 
>Here's my scenario. Some months ago I built an over-the-top powerful
>desktop computer / workstation and I was looking forward to really
>fantastic performance improvements over my 6 year old Ubuntu machine.
>I installed Arch Linux on BTRFS on the new computer (on an SSD). To my
>shock, it was no faster than my old machine. I focused a lot on
>Firefox performance because I use Firefox a lot and that was one of
>the applications in which I was most looking forward to better
>performance.
>
> 
>
>What would you guys do in this situation?

Check out profile sync daemon:

https://wiki.archlinux.org/index.php/profile-sync-daemon

It keeps the active profile files in a ramfs, periodically syncing them back to 
disk. It works quite well on my 7 year old netbook.

--Sean

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: SSD caching an existing btrfs raid1

2017-09-21 Thread Psalle

On 20/09/17 22:45, Kai Krakow wrote:

Am Wed, 20 Sep 2017 17:51:15 +0200
schrieb Psalle :


On 19/09/17 17:47, Austin S. Hemmelgarn wrote:
(...)

A better option if you can afford to remove a single device from
that array temporarily is to use bcache.  Bcache has one specific
advantage in this case, multiple backend devices can share the same
cache device. This means you don't have to carve out dedicated
cache space for each disk on the SSD and leave some unused space so
that you can add new devices if needed.  The downside is that you
can't convert each device in-place, but because you're using BTRFS,
you can still convert the volume as a whole in-place.  The
procedure for doing so looks like this:

1. Format the SSD as a bcache cache.
2. Use `btrfs device delete` to remove a single hard drive from the
array.
3. Set up the drive you just removed as a bcache backing device
bound to the cache you created in step 1.
4. Add the new bcache device to the array.
5. Repeat from step 2 until the whole array is converted.

A similar procedure can actually be used to do almost any
underlying storage conversion (for example, switching to whole disk
encryption, or adding LVM underneath BTRFS) provided all your data
can fit on one less disk than you have.

Thanks Austin, that's just great. For some reason I had discarded
bcache thinking that it would force me to rebuild from scratch, but
this kind of incremental migration is exactly what I hoped was
possible. I have plenty of space to replace the devices one by one.

I will report back my experience in a few days, I hope.

I've done it exactly that way in the past and it worked flawlessly (but
it took 24+ hours). But it was easy for me because I was also adding a
third disk to the pool, so existing stuff could easily move.

I suggest to initialize bcache to writearound mode while converting, so
your maybe terabytes of disk don't go through the SSD.

If you later decide to remove bcache or not sure about future bcache
usage, you can wrap any partition into a bcache container - just don't
connect it to a cache and it will work like a normal partition.


That is good advice. I've finished now and it seems to have gone
without a hitch. Thanks!







--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: using fio to test btrfs compression

2017-09-21 Thread Liu Bo
On Mon, Sep 18, 2017 at 01:06:45PM +0530, shally verma wrote:
> Hi
> 
> I wanted to test btrfs compression using fio command but somehow
> during fio writes, I don't see code taking route of compression blocks
> where as If I do a copy to btrfs compression enabled mount point then
> I can easily see code falling through compression.c.
> 
> Here's how I do my setup
> 
> 1. mkfs.btrfs /dev/sdb1
> 2. mount -t btrfs -o compress=zlib,compress-force /dev/sdb1 /mnt
> 3. cp  /mnt
> 4. dmesg shows print statements from compression.c and zlib.c confirming
> compression routine was invoked during write
> 5. now, copy back from btrfs mount point to home directory also shows
> decompress call invocation
> 
> Now, try same with fio commands:
> 
> fio command
> 
> fio --directory=/mnt/ --numjobs=1 --direct=0 --buffered=1
> --ioengine=libaio --group_reporting --bs=64k --rw=write --iodepth=128
> --name=test --size=10G --runtime=180 --time_based

fio by default uses fallocate (posix_falloc) to pre-allocate space for
the later writes, and PREALLOC path overrides compression path.

Like others mentioned, after fio and sync, you'll see 'encoded' in
filefrag -v your_file.

thanks,
-liubo

> 
> But it seems to write uncompressed data.
> 
> Any help here? what's missing?
> 
> Thanks
> Shally
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Help Recovering BTRFS array

2017-09-21 Thread grondinm

Hi Duncan,

I'm not sure if this will attach to my original message...

Thank you for your reply. For some reason I'm not getting list messages even
though I know I am subscribed.

I know all too well about the golden rule of data. It has bitten me a few
times. The data on this array is mostly data that I don't really care about. I
was able to copy off what I wanted. The main reason I sent it to the list was
just to see if I could somehow return the FS to a working state without having
to recreate it. I'm just surprised that all 3 copies of the super block got
corrupted. Probably my lack of understanding, but I always assumed that if one
copy got corrupted it would be replaced by a good copy, therefore leaving all
copies in a good state. Is that not the case? If it is, then what bad luck that
all 3 got messed up at the same time.

Some information i forgot to include in my original message

uname -a
Linux thebeach 4.12.13-gentoo-GMAN #1 SMP Sat Sep 16 15:28:26 ADT 2017 x86_64 
Intel(R) Core(TM) i5-2320 CPU @ 3.00GHz GenuineIntel GNU/Linux

btrfs --version
btrfs-progs v4.10.2

Anyway, thank you again for your reply. I will leave the FS intact for a few
days in case any more details could help the development of BTRFS and maybe
help avoid this happening or lead to a recovery option.

Marc


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: using fio to test btrfs compression

2017-09-21 Thread Duncan
shally verma posted on Wed, 20 Sep 2017 16:40:15 +0530 as excerpted:

> Is there any command that i can run to confirm file has been compressed?

There is the quite recently posted (and actively updated since then) 
compsize command.

https://github.com/kilobyte/compsize

-- 
Duncan - List replies preferred.   No HTML msgs.
"Every nonfree program has a lord, a master --
and if you use the program, he is your master."  Richard Stallman

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: defragmenting best practice?

2017-09-21 Thread Duncan
Dave posted on Wed, 20 Sep 2017 02:38:13 -0400 as excerpted:

> Here's my scenario. Some months ago I built an over-the-top powerful
> desktop computer / workstation and I was looking forward to really
> fantastic performance improvements over my 6 year old Ubuntu machine. I
> installed Arch Linux on BTRFS on the new computer (on an SSD). To my
> shock, it was no faster than my old machine. I focused a lot on Firefox
> performance because I use Firefox a lot and that was one of the
> applications in which I was most looking forward to better performance.
> 
> I tried everything I could think of and everything recommended to me in
> various forums (except switching to Windows) and the performance
> remained very disappointing.
> 
> Then today I read the following:
> 
> Gotchas - btrfs Wiki https://btrfs.wiki.kernel.org/index.php/Gotchas
> 
> Fragmentation: Files with a lot of random writes can become
> heavily fragmented (1+ extents) causing excessive multi-second
> spikes of CPU load on systems with an SSD or large amount of RAM. On
> desktops this primarily affects application databases (including
> Firefox). Workarounds include manually defragmenting your home directory
> using btrfs fi defragment. Auto-defragment (mount option autodefrag)
> should solve this problem.
> 
> Upon reading that I am wondering if fragmentation in the Firefox profile
> is part of my issue. That's one thing I never tested previously. (BTW,
> this system has 256 GB of RAM and 20 cores.)
> 
> Furthermore, on the same BTRFS Wiki page, it mentions the performance
> penalties of many snapshots. I am keeping 30 to 50 snapshots of the
> volume that contains the Firefox profile.
> 
> Would these two things be enough to turn top-of-the-line hardware into a
> mediocre-performing desktop system? (The system performs fine on
> benchmarks -- it's real life usage, particularly with Firefox where it
> is disappointing.)
> 
> After reading the info here, I am wondering if I should make a new
> subvolume just for my Firefox profile(s) and not use COW and/or not keep
> snapshots on it and mount it with the autodefrag option.
> 
> As part of this strategy, I could send snapshots to another disk using
> btrfs send-receive. That way I would have the benefits of snapshots
> (which are important to me), but by not keeping any snapshots on the
> live subvolume I could avoid the performance problems.
> 
> What would you guys do in this situation?

[FWIW this is my second try at a reply, my first being way too detailed 
and going off into the weeds somewhere, so I killed it.]

That's an interesting scenario indeed, and perhaps I can help, since my 
config isn't near as high end as yours, but I run firefox on btrfs on 
ssds, and have no performance complaints.  The difference is very likely 
due to one or more of the following (FWIW I'd suggest a 4-3-1-2 order, 
tho only 1 and 2 are really btrfs related):

1) I make sure I consistently mount with autodefrag, from the first mount
after the filesystem is created in order to first populate it, on.  The
filesystem never gets fragmented, forcing writes to highly fragmented 
free space, in the first place.  (With the past and current effect of the 
ssd mount option under discussion to change, it's possible I'll get more 
fragmentation in the future after ssd doesn't try so hard to find 
reasonably large free-space chunks to write into, but it has been fine so 
far.)

2) Subvolumes and snapshots seemed to me more trouble than they were 
worth, particularly since it's the same filesystem anyway, and if it's 
damaged, it'll take all the subvolumes and snapshots with it.  So I don't 
use them, preferring instead to use real partitioning and more smaller 
fully separate filesystems, some of which aren't mounted by default (and 
root mounted read-only by default), so there's little chance they'll be 
damaged in a crash or filesystem bug damage scenario.  And if there /is/ 
any damage, it's much more limited in scope since all my data eggs aren't 
in the same basket, so maintenance such as btrfs check and scrub take far 
less time (and check far less memory) than they would were it one big 
pool with snapshots.  And if recovery fails too, the backups are likewise 
small filesystems the same size as the working copies, so copying the 
data back over takes far less time as well (not to mention making the 
backups takes less time in the first place, so it's easier to regularly 
update them).

3) Austin mentioned the firefox cache.  I honestly wouldn't know on it, 
since I have firefox configured to use a tmpfs for its cache, so it 
operates at memory speed and gets cleared along with its memory at every 
reboot or tmpfs umount.  My inet speed is fast enough I don't really need 
cache anyway, but it's nice to have it, operating at memory speed, within 
a single boot session... and to have it cleared on reboot.


4) This one was the biggest one for me for awhile.

Is firefox running in multi-process mode?  

Re: Storage and snapshots as historical yearly

2017-09-21 Thread Senén Vidal Blanco
El martes, 19 de septiembre de 2017 21:33:31 (CEST) Andrei Borzenkov escribió:
> 19.09.2017 14:49, Senén Vidal Blanco пишет:
> > Perfect!! Just what I was looking for.
> > Sorry for the delay, because before doing so, I preferred to test to see
> > if it actually worked.
> > 
> > I have a doubt. The system works perfectly, but at the time of deleting
> > the
> > writing disk and merging the data on the read-only disk I fail to
> > understand the process.
> > 
> > I have tried to remove the seed bit on disk A and delete the write B as
> > you
> > mention, and so move the data to A, but tells me that disk B does not
> > exist. These are the orders I have made:
> > 
> > md127-> A
> > md126-> B
> > 
> > btrfstune -S 0 /dev/md127
> > mount /dev/md127 /mnt (I mount this disk since the md126 gives error)
> > btrfs device delete /dev/md126 /mnt
> > ERROR: error removing device '/dev/md126': No such file or directory
> > 
> > Another thing I've tried is to remove disk B without removing the seed
> > bit,
> > but it gives me the error:
> > 
> > ERROR: error removing device '/dev/md126': unable to remove the only
> > writeable device.
> > 
> > Any ideas about it?
> 
> Yes, sorry about it. Clearing seed flag on device invalidates
> filesystem. What you can do, is to rotate devices. I.e. remove
> /dev/md126, set seed flag on md127 and add md126 back.
> 
> I actually tested it and it works for me.
> 

OK thanks

Now I see how it works :))

With the commands:
mount /dev/md126 /mnt
btrfs device remove /dev/md127 /mnt
We remove the read-only array (A) from the BTRFS system and in doing so pass 
all the information from (A) to (B) read-write to mix them.

From what I see is not bad since both (A) and (B) are still operational. (A) 
with last year and (B) with everything current.

Finally with this other commands:
btrfstune -S 1 /dev/md126
mount /dev/md126 /mnt
btrfs device add -f /dev/md127 /mnt
we activate the seed bit in md126 (B) and add the (A) in read-write mode, 
where the new files will be archived and (B) as store until the following year 
and (A) do clean to fill in it new data.

I have tried to rotate twice to see if it goes well and smoothly.

Just comment that I see two small problems to this:

1. The transfer of data from (A) to (B) when removing the read-only disk takes 
quite a while and more the more it has stored in the history. It would be nice 
if the process were reversed, since in (B) there are fewer "data" stored. 
Also, I could not use it monthly or daily for this reason.

2. My idea was to have a disk A larger than B, where I would save the
historical data, so that for B I could use a smaller and somewhat
faster disk. If the decoupling process removed the read-write device rather
than the read-only one and passed its data to A, that would be ideal for this case.

On the other hand, as an anecdote only, and perhaps for lack of experience or 
knowledge, I have used the entire linux system in BTRFS (@ and @home) format 
and a single partition md126 to have the system bootable and running simply by 
attaching the disk to the computer in degraded mode (swap outside the raid , 
which I'm not so bad: P). This has made that by rotating disks A and B I have 
had some problems with grub and fstab at boot, which I had to overcome by 
making changes to the boot configurations and some more botches.

I'm going to see a couple more things and if there's any way I can combine 
this with snapshots and see if the bulb will light up. If I do not get it I 
will try with the other filesystems that you have suggested to me. Although 
honestly, I like BTRFS more than the other alternatives, I already use BTRFS 
on 5 computers and it goes very well.

Greetings.

> > Thank you very much for the reply.
> > Greetings.
> > 
> > El martes, 12 de septiembre de 2017 6:34:15 (CEST) Andrei Borzenkov 
escribió:
> >> 11.09.2017 21:17, Senén Vidal Blanco пишет:
> >>> I am trying to implement a system that stores the data in a unit (A)
> >>> with
> >>> BTRFS format that is untouchable and that future files and folders
> >>> created
> >>> or modified are stored in another physical unit (B) with BTRFS format.
> >>> Each year the new files will be moved to store A and start over.
> >>> 
> >>> The idea is that a duplicate of disk A can be made to keep it in a safe
> >>> place and that the files stored there can not be modified until the
> >>> mixture of (A) and (B) is made.
> >> 
> >> This can probably be achieved using seed device. Mark original device as
> >> seed and all changes will go to another writable device, similar to
> >> overlay; then remove seed bit from original device, "btrfs device remove
> >> writable" device and it should relocate its content back. Rinse and
> >> repeat.

-- 
Senén Vidal Blanco - SGISoft S.L.
 
Tlf.: 986413322 - 660923711
GPG ID 466431A8AF01F99A
http://www.sgisoft.com/
--
 


signature.asc
Description: This is a digitally signed message part.


Re: [PATCH] fstests: btrfs/150 regression test for reading compressed data

2017-09-21 Thread Lu Fengqi
On Wed, Sep 20, 2017 at 05:52:43PM -0600, Liu Bo wrote:
>We had a bug in btrfs compression code which could end up with a
>kernel panic.
>
>This is adding a regression test for the bug and I've also sent a
>kernel patch to fix the bug.
>
>The patch is "Btrfs: fix kernel oops while reading compressed data".
>
>Signed-off-by: Liu Bo 
>---
> tests/btrfs/150 | 102 
> tests/btrfs/150.out |   3 ++
> tests/btrfs/group   |   1 +
> 3 files changed, 106 insertions(+)
> create mode 100755 tests/btrfs/150
> create mode 100644 tests/btrfs/150.out
>
>diff --git a/tests/btrfs/150 b/tests/btrfs/150
>new file mode 100755
>index 000..834be51
>--- /dev/null
>+++ b/tests/btrfs/150
>@@ -0,0 +1,102 @@
>+#! /bin/bash
>+# FS QA Test btrfs/150
>+#
>+# This is a regression test which ends up with a kernel oops in btrfs.
>+# It occurs when btrfs's read repair happens while reading a compressed
>+# extent.
>+# The patch for this is 
>+# x
>+#
>+#---
>+# Copyright (c) 2017 Liu Bo.  All Rights Reserved.
>+#
>+# This program is free software; you can redistribute it and/or
>+# modify it under the terms of the GNU General Public License as
>+# published by the Free Software Foundation.
>+#
>+# This program is distributed in the hope that it would be useful,
>+# but WITHOUT ANY WARRANTY; without even the implied warranty of
>+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>+# GNU General Public License for more details.
>+#
>+# You should have received a copy of the GNU General Public License
>+# along with this program; if not, write the Free Software Foundation,
>+# Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
>+#---
>+#
>+
>+seq=`basename $0`
>+seqres=$RESULT_DIR/$seq
>+echo "QA output created by $seq"
>+
>+here=`pwd`
>+tmp=/tmp/$$
>+status=1  # failure is the default!
>+trap "_cleanup; exit \$status" 0 1 2 3 15
>+
>+_cleanup()
>+{
>+  cd /
>+  rm -f $tmp.*
>+}
>+
>+# get standard environment, filters and checks
>+. ./common/rc
>+. ./common/filter
>+
>+# remove previous $seqres.full before test
>+rm -f $seqres.full
>+
>+# real QA test starts here
>+
>+# Modify as appropriate.
>+_supported_fs btrfs
>+_supported_os Linux
>+_require_scratch
>+_require_fail_make_request
>+_require_scratch_dev_pool 2 
>+
>+SYSFS_BDEV=`_sysfs_dev $SCRATCH_DEV`
>+enable_io_failure()
>+{
>+echo 100 > $DEBUGFS_MNT/fail_make_request/probability
>+echo 1000 > $DEBUGFS_MNT/fail_make_request/times

What does 1000 mean? Enough failures?
Why not set times to -1?

>+echo 0 > $DEBUGFS_MNT/fail_make_request/verbose
>+echo 1 > $SYSFS_BDEV/make-it-fail
>+}
>+
>+disable_io_failure()
>+{
>+echo 0 > $SYSFS_BDEV/make-it-fail
>+echo 0 > $DEBUGFS_MNT/fail_make_request/probability
>+echo 0 > $DEBUGFS_MNT/fail_make_request/times
>+}
>+
>+_scratch_pool_mkfs "-d raid1 -b 1G" >> $seqres.full 2>&1
>+
>+# It doesn't matter which compression algorithm we use.
>+_scratch_mount -ocompress
>+
>+# Create a file with all data being compressed
>+$XFS_IO_PROG -f -c "pwrite -W 0 8K" $SCRATCH_MNT/foobar | _filter_xfs_io
>+
>+# Raid1 consists of two copies and btrfs decides which copy to read by 
>reader's
>+# %pid.  Now we inject errors to copy #1 and copy #0 is good.  We want to read
>+# the bad copy to trigger read-repair.
>+while true; do
>+  disable_io_failure
>+  # invalidate the page cache
>+  $XFS_IO_PROG -f -c "fadvise -d 0 128K" $SCRATCH_MNT/foobar | 
>_filter_xfs_io
>+
>+  enable_io_failure
>+  od -x $SCRATCH_MNT/foobar > /dev/null &
>+  pid=$!
>+  wait
>+  [ $((pid % 2)) == 1 ] && break
>+done
>+
>+disable_io_failure
>+
>+# success, all done
>+status=0
>+exit
>diff --git a/tests/btrfs/150.out b/tests/btrfs/150.out
>new file mode 100644
>index 000..c492c24
>--- /dev/null
>+++ b/tests/btrfs/150.out
>@@ -0,0 +1,3 @@
>+QA output created by 150
>+wrote 8192/8192 bytes at offset 0
>+XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
>diff --git a/tests/btrfs/group b/tests/btrfs/group
>index 70c3f05..b70a122 100644
>--- a/tests/btrfs/group
>+++ b/tests/btrfs/group
>@@ -152,3 +152,4 @@
> 147 auto quick send
> 148 auto quick rw
> 149 auto quick send compress
>+150 auto quick
>-- 
>2.5.0
>
>--
>To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
>the body of a message to majord...@vger.kernel.org
>More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
>

-- 
Thanks,
Lu


--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: SSD caching an existing btrfs raid1

2017-09-21 Thread Paul Jones
> -Original Message-
> From: linux-btrfs-ow...@vger.kernel.org [mailto:linux-btrfs-
> ow...@vger.kernel.org] On Behalf Of Kai Krakow
> Sent: Thursday, 21 September 2017 6:45 AM
> To: linux-btrfs@vger.kernel.org
> Subject: Re: SSD caching an existing btrfs raid1
> 
> Am Wed, 20 Sep 2017 17:51:15 +0200
> schrieb Psalle :
> 
> > On 19/09/17 17:47, Austin S. Hemmelgarn wrote:
> > (...)
> > >
> > > A better option if you can afford to remove a single device from
> > > that array temporarily is to use bcache.  Bcache has one specific
> > > advantage in this case, multiple backend devices can share the same
> > > cache device. This means you don't have to carve out dedicated cache
> > > space for each disk on the SSD and leave some unused space so that
> > > you can add new devices if needed.  The downside is that you can't
> > > convert each device in-place, but because you're using BTRFS, you
> > > can still convert the volume as a whole in-place.  The procedure for
> > > doing so looks like this:
> > >
> > > 1. Format the SSD as a bcache cache.
> > > 2. Use `btrfs device delete` to remove a single hard drive from the
> > > array.
> > > 3. Set up the drive you just removed as a bcache backing device
> > > bound to the cache you created in step 1.
> > > 4. Add the new bcache device to the array.
> > > 5. Repeat from step 2 until the whole array is converted.
> > >
> > > A similar procedure can actually be used to do almost any underlying
> > > storage conversion (for example, switching to whole disk encryption,
> > > or adding LVM underneath BTRFS) provided all your data can fit on
> > > one less disk than you have.
> >
> > Thanks Austin, that's just great. For some reason I had discarded
> > bcache thinking that it would force me to rebuild from scratch, but
> > this kind of incremental migration is exactly what I hoped was
> > possible. I have plenty of space to replace the devices one by one.
> >
> > I will report back my experience in a few days, I hope.
> 
> I've done it exactly that way in the past and it worked flawlessly (but it 
> took
> 24+ hours). But it was easy for me because I was also adding a third disk to
> the pool, so existing stuff could easily move.

Device delete takes freaking ages! I would avoid using it if you can. Device 
replace is much faster.


Paul.




N�r��yb�X��ǧv�^�)޺{.n�+{�n�߲)w*jg����ݢj/���z�ޖ��2�ޙ&�)ߡ�a�����G���h��j:+v���w��٥