Re: [PATCH] btrfs: delete chunk allocation attemp when setting block group ro
On Thu, 08 Jan 2015 13:23:13 -0800, Shaohua Li wrote: Below test will fail currently: mkfs.ext4 -F /dev/sda btrfs-convert /dev/sda mount /dev/sda /mnt btrfs device add -f /dev/sdb /mnt btrfs balance start -v -dconvert=raid1 -mconvert=raid1 /mnt The reason is there are some block groups with usage 0, but the whole disk hasn't free space to allocate new chunk, so we even can't set such block group readonly. This patch deletes the chunk allocation when setting block group ro. For META, we already have reserve. But for SYSTEM, we don't have, so the check_system_chunk is still required. Signed-off-by: Shaohua Li s...@fb.com --- fs/btrfs/extent-tree.c | 31 +++ 1 file changed, 7 insertions(+), 24 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a80b971..430101b6 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -8493,22 +8493,8 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force) { struct btrfs_space_info *sinfo = cache-space_info; u64 num_bytes; - u64 min_allocable_bytes; int ret = -ENOSPC; - - /* - * We need some metadata space and system metadata space for - * allocating chunks in some corner cases until we force to set - * it to be readonly. 
- */ - if ((sinfo-flags - (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) - !force) - min_allocable_bytes = 1 * 1024 * 1024; - else - min_allocable_bytes = 0; - spin_lock(sinfo-lock); spin_lock(cache-lock); @@ -8521,8 +8507,8 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force) cache-bytes_super - btrfs_block_group_used(cache-item); if (sinfo-bytes_used + sinfo-bytes_reserved + sinfo-bytes_pinned + - sinfo-bytes_may_use + sinfo-bytes_readonly + num_bytes + - min_allocable_bytes = sinfo-total_bytes) { + sinfo-bytes_may_use + sinfo-bytes_readonly + num_bytes + = sinfo-total_bytes) { sinfo-bytes_readonly += num_bytes; cache-ro = 1; list_add_tail(cache-ro_list, sinfo-ro_bgs); @@ -8548,14 +8534,6 @@ int btrfs_set_block_group_ro(struct btrfs_root *root, if (IS_ERR(trans)) return PTR_ERR(trans); - alloc_flags = update_block_group_flags(root, cache-flags); - if (alloc_flags != cache-flags) { - ret = do_chunk_alloc(trans, root, alloc_flags, - CHUNK_ALLOC_FORCE); - if (ret 0) - goto out; - } - ret = set_block_group_ro(cache, 0); if (!ret) goto out; @@ -8566,6 +8544,11 @@ int btrfs_set_block_group_ro(struct btrfs_root *root, goto out; ret = set_block_group_ro(cache, 0); out: + if (cache-flags BTRFS_BLOCK_GROUP_SYSTEM) { + alloc_flags = update_block_group_flags(root, cache-flags); + check_system_chunk(trans, root, alloc_flags); Please consider the case that the following patch fixed 199c36eaa95077a47ae1bc55532fc0fbeb80cc95 If there is no free device space, check_system_chunk can not allocate new system metadata chunk, so when we run final step of the chunk allocation to update the device item and insert the new chunk item, we would fail. Thanks Miao + } + btrfs_end_transaction(trans, root); return ret; } -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] btrfs: delete chunk allocation attemp when setting block group ro
On Fri, Jan 09, 2015 at 09:01:57AM +0800, Miao Xie wrote: On Thu, 08 Jan 2015 13:23:13 -0800, Shaohua Li wrote: Below test will fail currently: mkfs.ext4 -F /dev/sda btrfs-convert /dev/sda mount /dev/sda /mnt btrfs device add -f /dev/sdb /mnt btrfs balance start -v -dconvert=raid1 -mconvert=raid1 /mnt The reason is there are some block groups with usage 0, but the whole disk hasn't free space to allocate new chunk, so we even can't set such block group readonly. This patch deletes the chunk allocation when setting block group ro. For META, we already have reserve. But for SYSTEM, we don't have, so the check_system_chunk is still required. Signed-off-by: Shaohua Li s...@fb.com --- fs/btrfs/extent-tree.c | 31 +++ 1 file changed, 7 insertions(+), 24 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a80b971..430101b6 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -8493,22 +8493,8 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force) { struct btrfs_space_info *sinfo = cache-space_info; u64 num_bytes; - u64 min_allocable_bytes; int ret = -ENOSPC; - - /* -* We need some metadata space and system metadata space for -* allocating chunks in some corner cases until we force to set -* it to be readonly. 
-*/ - if ((sinfo-flags -(BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) - !force) - min_allocable_bytes = 1 * 1024 * 1024; - else - min_allocable_bytes = 0; - spin_lock(sinfo-lock); spin_lock(cache-lock); @@ -8521,8 +8507,8 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force) cache-bytes_super - btrfs_block_group_used(cache-item); if (sinfo-bytes_used + sinfo-bytes_reserved + sinfo-bytes_pinned + - sinfo-bytes_may_use + sinfo-bytes_readonly + num_bytes + - min_allocable_bytes = sinfo-total_bytes) { + sinfo-bytes_may_use + sinfo-bytes_readonly + num_bytes + = sinfo-total_bytes) { sinfo-bytes_readonly += num_bytes; cache-ro = 1; list_add_tail(cache-ro_list, sinfo-ro_bgs); @@ -8548,14 +8534,6 @@ int btrfs_set_block_group_ro(struct btrfs_root *root, if (IS_ERR(trans)) return PTR_ERR(trans); - alloc_flags = update_block_group_flags(root, cache-flags); - if (alloc_flags != cache-flags) { - ret = do_chunk_alloc(trans, root, alloc_flags, -CHUNK_ALLOC_FORCE); - if (ret 0) - goto out; - } - ret = set_block_group_ro(cache, 0); if (!ret) goto out; @@ -8566,6 +8544,11 @@ int btrfs_set_block_group_ro(struct btrfs_root *root, goto out; ret = set_block_group_ro(cache, 0); out: + if (cache-flags BTRFS_BLOCK_GROUP_SYSTEM) { + alloc_flags = update_block_group_flags(root, cache-flags); + check_system_chunk(trans, root, alloc_flags); Please consider the case that the following patch fixed 199c36eaa95077a47ae1bc55532fc0fbeb80cc95 If there is no free device space, check_system_chunk can not allocate new system metadata chunk, so when we run final step of the chunk allocation to update the device item and insert the new chunk item, we would fail. So the relocation will always fail in this case. The check just makes the failure earlier, right? We don't have the BUG_ON in do_chunk_alloc() currently. 
Thanks, Shaohua -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] btrfs: fix raid56 scrub failed in xfstests btrfs/072
The xfstests btrfs/072 reports uncorrectable read errors in dmesg, because scrub forgets to use commit_root for parity scrub routine and scrub attempts to scrub those extents items whose contents are not fully on disk. To fix it, we just add the @search_commit_root flag back. Signed-off-by: Gui Hecheng guihc.f...@cn.fujitsu.com Signed-off-by: Qu Wenruo quwen...@cn.fujitsu.com --- fs/btrfs/scrub.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index f2bb13a..aa8ff75 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3065,6 +3065,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, path->search_commit_root = 1; path->skip_locking = 1; + ppath->search_commit_root = 1; + ppath->skip_locking = 1; /* * trigger the readahead for extent tree csum tree and wait for * completion. During readahead, the scrub is officially paused -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] btrfs: delete chunk allocation attemp when setting block group ro
On Thu, 08 Jan 2015 18:06:50 -0800, Shaohua Li wrote: On Fri, Jan 09, 2015 at 09:01:57AM +0800, Miao Xie wrote: On Thu, 08 Jan 2015 13:23:13 -0800, Shaohua Li wrote: Below test will fail currently: mkfs.ext4 -F /dev/sda btrfs-convert /dev/sda mount /dev/sda /mnt btrfs device add -f /dev/sdb /mnt btrfs balance start -v -dconvert=raid1 -mconvert=raid1 /mnt The reason is there are some block groups with usage 0, but the whole disk hasn't free space to allocate new chunk, so we even can't set such block group readonly. This patch deletes the chunk allocation when setting block group ro. For META, we already have reserve. But for SYSTEM, we don't have, so the check_system_chunk is still required. Signed-off-by: Shaohua Li s...@fb.com --- fs/btrfs/extent-tree.c | 31 +++ 1 file changed, 7 insertions(+), 24 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a80b971..430101b6 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -8493,22 +8493,8 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force) { struct btrfs_space_info *sinfo = cache-space_info; u64 num_bytes; - u64 min_allocable_bytes; int ret = -ENOSPC; - - /* -* We need some metadata space and system metadata space for -* allocating chunks in some corner cases until we force to set -* it to be readonly. 
-*/ - if ((sinfo->flags & -(BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) && - !force) - min_allocable_bytes = 1 * 1024 * 1024; - else - min_allocable_bytes = 0; - spin_lock(&sinfo->lock); spin_lock(&cache->lock); [SNIP] ret = set_block_group_ro(cache, 0); if (!ret) goto out; @@ -8566,6 +8544,11 @@ int btrfs_set_block_group_ro(struct btrfs_root *root, goto out; ret = set_block_group_ro(cache, 0); out: + if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) { + alloc_flags = update_block_group_flags(root, cache->flags); + check_system_chunk(trans, root, alloc_flags); Please consider the case that the following patch fixed 199c36eaa95077a47ae1bc55532fc0fbeb80cc95 If there is no free device space, check_system_chunk can not allocate new system metadata chunk, so when we run final step of the chunk allocation to update the device item and insert the new chunk item, we would fail. So the relocation will always fail in this case. The check just makes the failure earlier, right? We don't have the BUG_ON in do_chunk_alloc() currently. The final step of the chunk allocation is a delayed operation, we must make sure it can be done successfully, or we would abort the transaction, make the filesystem readonly and lose the data that is written into the filesystem before we do balance, it would make the users uncomfortable. With this patch, we will set the block group successfully at the first time we invoke set_block_group_ro(). But if the block group that will be set to RO is the only system metadata block group in the filesystem, and there is no device space to allocate a new one, that is we have no space to deal with the pending final step of chunk allocation, so the problem I said above will happen. Thanks Miao -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] btrfs-progs: make btrfs qgroups show human readable sizes
make btrfs qgroups show human readable sizes, using -h option, example: qgroupid rfer excl max_rfer max_excl parent child -- - 0/5 299.58MiB299.58MiB400.00MiB0.00B1/1 --- 0/265299.58MiB16.00KiB 0.00B320.00MiB1/1 --- 0/266299.58MiB16.00KiB 350.00MiB0.00B--- --- 1/1 599.16MiB299.59MiB800.00MiB0.00B--- 0/5,0/265 Signed-off-by: Fan Chengniang fancn.f...@cn.fujitsu.com --- Documentation/btrfs-qgroup.txt | 2 ++ cmds-qgroup.c | 6 +- qgroup.c | 46 ++ qgroup.h | 1 + 4 files changed, 46 insertions(+), 9 deletions(-) diff --git a/Documentation/btrfs-qgroup.txt b/Documentation/btrfs-qgroup.txt index 3e13373..6a3d649 100644 --- a/Documentation/btrfs-qgroup.txt +++ b/Documentation/btrfs-qgroup.txt @@ -69,6 +69,8 @@ print child qgroup id. print max referenced size of qgroup. -e print max exclusive size of qgroup. +-h +print sizes in human readable format (e.g., 1KiB 234MiB 2GiB). -F list all qgroups which impact the given path(include ancestral qgroups) -f diff --git a/cmds-qgroup.c b/cmds-qgroup.c index 957fbc9..c2bd0a3 100644 --- a/cmds-qgroup.c +++ b/cmds-qgroup.c @@ -212,6 +212,7 @@ static const char * const cmd_qgroup_show_usage[] = { -c print child qgroup id, -r print max referenced size of qgroup, -e print max exclusive size of qgroup, + -h print sizes in human readable format (e.g., 1KiB 234MiB 2GiB), -F list all qgroups which impact the given path (include ancestral qgroups), -f list all qgroups which impact the given path @@ -246,7 +247,7 @@ static int cmd_qgroup_show(int argc, char **argv) optind = 1; while (1) { - c = getopt_long(argc, argv, pcreFf, + c = getopt_long(argc, argv, pcrehFf, long_options, NULL); if (c 0) break; @@ -267,6 +268,9 @@ static int cmd_qgroup_show(int argc, char **argv) btrfs_qgroup_setup_print_column( BTRFS_QGROUP_MAX_EXCL); break; + case 'h': + btrfs_qgroup_setup_human_readable(); + break; case 'F': filter_flag |= 0x1; break; diff --git a/qgroup.c b/qgroup.c index 1a4866c..5cb239e 100644 --- a/qgroup.c +++ b/qgroup.c @@ -20,6 +20,7 @@ #include 
sys/ioctl.h #include ctree.h #include ioctl.h +#include utils.h #define BTRFS_QGROUP_NFILTERS_INCREASE (2 * BTRFS_QGROUP_FILTER_MAX) #define BTRFS_QGROUP_NCOMPS_INCREASE (2 * BTRFS_QGROUP_COMP_MAX) @@ -80,53 +81,62 @@ static struct { char *name; char *column_name; int need_print; + int human_readable; int max_len; } btrfs_qgroup_columns[] = { { .name = qgroupid, .column_name= Qgroupid, .need_print = 1, + .human_readable = 0, .max_len= 8, }, { .name = rfer, .column_name= Rfer, .need_print = 1, - .max_len= 4, + .human_readable = 0, + .max_len= 12, }, { .name = excl, .column_name= Excl, .need_print = 1, - .max_len= 4, + .human_readable = 0, + .max_len= 12, }, { .name = max_rfer, .column_name= Max_rfer, .need_print = 0, - .max_len= 8, + .human_readable = 0, + .max_len= 12, }, { .name = max_excl, .column_name= Max_excl, .need_print = 0, - .max_len= 8, + .human_readable = 0, + .max_len= 12, }, { .name = parent, .column_name= Parent, .need_print = 0, + .human_readable = 0, .max_len= 7, }, { .name = child, .column_name= Child, .need_print = 0, + .human_readable = 0, .max_len= 5, }, { .name = NULL, .column_name= NULL, .need_print = 0, +
[PATCH] btrfs-progs: fix some format errors in doc
Signed-off-by: Fan Chengniang fancn.f...@cn.fujitsu.com --- Documentation/btrfs-mount.txt| 2 +- Documentation/btrfs-property.txt | 14 +++--- Documentation/btrfs-replace.txt | 3 +-- Documentation/btrfs-scrub.txt| 4 +--- 4 files changed, 10 insertions(+), 13 deletions(-) diff --git a/Documentation/btrfs-mount.txt b/Documentation/btrfs-mount.txt index 4fb7137..8cf7a0b 100644 --- a/Documentation/btrfs-mount.txt +++ b/Documentation/btrfs-mount.txt @@ -91,7 +91,7 @@ MOUNT OPTIONS operations). This was previously the behavior only when a snapshot is created. -*inode_cache*: +*inode_cache*:: Enable free inode number caching. Defaults to off due to an overflow problem when the free space crcs don't fit inside a single page. diff --git a/Documentation/btrfs-property.txt b/Documentation/btrfs-property.txt index 28ede4b..8b9b7f0 100644 --- a/Documentation/btrfs-property.txt +++ b/Documentation/btrfs-property.txt @@ -28,10 +28,16 @@ A btrfs object, which is set by object, can be a btrfs filesystem itself, a btrfs subvolume, an inode(file or directory) inside btrfs, or a device on which a btrfs exists. + +The '-t type' option can be used to explicitly +specify what type of object you meant. This is only needed when a +property could be set for more then one object type. ++ +Possible types are 's[ubvol]', 'f[ilesystem]', 'i[node]' and 'd[evice]'. ++ Set the name of property by 'name'. If no 'name' is specified, all properties for the given object are printed. 'name' is one of the followings. -+ + ro read-only flag of subvolume: true or false label @@ -39,12 +45,6 @@ label of device compression compression setting for an inode: lzo, zlib, or (empty string) -The '-t type' option can be used to explicitly -specify what type of object you meant. This is only needed when a -property could be set for more then one object type. -+ -Possible types are 's[ubvol]', 'f[ilesystem]', 'i[node]' and 'd[evice]'. 
- *list* [-t type] object:: Lists available properties with their descriptions for the given object. + diff --git a/Documentation/btrfs-replace.txt b/Documentation/btrfs-replace.txt index 7402484..b2a21b9 100644 --- a/Documentation/btrfs-replace.txt +++ b/Documentation/btrfs-replace.txt @@ -13,9 +13,8 @@ DESCRIPTION --- *btrfs replace* is used to replace btrfs managed devices with other device. -Note: this is not currently supported for RAID5/6 profiles and must use the +NOTE: this is not currently supported for RAID5/6 profiles and must use the device add/delete workaround. - It is recommended to see `btrfs-device`(8) for more details about btrfs device management. diff --git a/Documentation/btrfs-scrub.txt b/Documentation/btrfs-scrub.txt index c0fafca..6ebce60 100644 --- a/Documentation/btrfs-scrub.txt +++ b/Documentation/btrfs-scrub.txt @@ -59,9 +59,7 @@ Read only mode. Do not attempt to correct anything. -R Raw print mode. Print full data instead of summary. -c ioprio_class -Set IO priority class (see - ionice (1) -manpage). +Set IO priority class (see `ionice`(1) manpage). -n ioprio_classdata Set IO priority classdata (see `ionice`(1) manpage). -f -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: ssd mode on rotational media
On 2015-01-07 13:55, Kyle Gates wrote: What issues would arise if ssd mode is activated because of a block layer setting the rotational flag to zero? This happens for me running btrfs on bcache. Would it be beneficial to pass the no_ssd flag? Thanks, Kyle In theory, it would result in a greater degree of high-level fragmentation. You should (in theory) get marginally better performance by using no_ssd (or using a udev rule to set rotational to 1, as the flag is writable). Ideally, bcache shouldn't set the rotational flag to 0 unless the backing store (the big device that the cache is reading from) also has it set. Also, ideally, we should be looking at the type of block device, not just whether the rotational flag is 0 or not, as there are a number of cases where rotational is 0 that enabling ssd will make things perform worse (Xen/Virtio block devices come immediately to mind). smime.p7s Description: S/MIME Cryptographic Signature
Re: price to pay for nocow file bit?
On Wed, Jan 7, 2015 at 1:10 PM, Josef Bacik jba...@fb.com wrote: On 01/07/2015 12:43 PM, Lennart Poettering wrote: Heya! Currently, systemd-journald's disk access patterns (appending to the end of files, then updating a few pointers in the front) result in awfully fragmented journal files on btrfs, which has a pretty negative effect on performance when accessing them. I've been wondering if mount -o autodefrag would deal with this problem but I haven't had the chance to look into it. I've been using autodefrag and haven't run into journal corruptions that I can attribute to btrfs since the last one was fixed over a year ago. Chris Mason has suggested preference to use of autodefrag for this use case rather than xattr +C. But I don't know the time frame for autodefrag by default, it's come up a couple times but it's not the default yet. I've found autodefrag journals are less than 200 fragments, and average between 50-150 fragments. Without it, this spirals into thousands quite quickly. Searches don't seem slower when journal files are made of a few extents vs ~ 100, but beyond several hundred let alone several thousand it becomes noticeable. A somewhat minor negative of +C: In case of RAID 1 or higher and silent data corruption, there will be no Btrfs detection due to lack of checksum and therefore no correction. In the case a drive reports a read error then it's corrected, same as with md or lvm raid1+. -- Chris Murphy -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: price to pay for nocow file bit?
-BEGIN PGP SIGNED MESSAGE- Hash: SHA1 Chris Murphy schreef op 08-01-15 om 09:24: On Wed, Jan 7, 2015 at 1:10 PM, Josef Bacik jba...@fb.com wrote: On 01/07/2015 12:43 PM, Lennart Poettering wrote: Heya! Currently, systemd-journald's disk access patterns (appending to the end of files, then updating a few pointers in the front) result in awfully fragmented journal files on btrfs, which has a pretty negative effect on performance when accessing them. I've been wondering if mount -o autodefrag would deal with this problem but I haven't had the chance to look into it. I've been using autodefrag and haven't run into journal corruptions that I can attribute to btrfs since the last one was fixed over a year ago. Chris Mason has suggested preference to use of autodefrag for this use case rather than xattr +C. But I don't know the time frame for autodefrag by default, it's come up a couple times but it's not the default yet. Same here, no issues with using autodefrag and journals. regards, Koen -BEGIN PGP SIGNATURE- Version: GnuPG v1.4.5 (Darwin) Comment: GPGTools - http://gpgtools.org iD8DBQFUrkFVMkyGM64RGpERAgGKAJ9pmXA4STYx6sUJP5HBALcUCkfMqwCeNhzR 8v4u6bvhtFZYxYbGDiHghps= =4MPU -END PGP SIGNATURE- -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: BTRFS free space handling still needs more work: Hangs again (no complete lockups, just tasks stuck for some time)
Am Donnerstag, 8. Januar 2015, 05:45:56 schrieben Sie: Martin Steigerwald posted on Wed, 07 Jan 2015 20:08:50 +0100 as excerpted: No BTRFS developers commented yet on this, neither in this thread nor in the bug report at kernel.org I made. Just a quick general note on this point... There has in the past (and I believe referenced on the wiki) been dev comment to the effect that on the list they tend to find particular reports/threads and work on them until they find and either fix the issue or (when not urgent) decide it must wait for something else, first. During the time they're busy pursuing such a report, they don't read others on the list very closely, and such list-only bug reports may thus get dropped on the floor and never worked on. The recommendation, then, is to report it to the list, and if not picked up right away and you plan on being around in a few weeks/months when they potentially get to it, file a bug on it, so it doesn't get dropped on the floor. Duncan, I *did* file a bug. [Bug 90401] New: btrfs kworker thread uses up 100% of a Sandybridge core for minutes on random write into big file https://bugzilla.kernel.org/show_bug.cgi?id=90401 -- Martin 'Helios' Steigerwald - http://www.Lichtvoll.de GPG: 03B0 0D6C 0040 0710 4AFA B82F 991B EAAC A599 84C7 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: price to pay for nocow file bit?
On Wed, 07.01.15 15:10, Josef Bacik (jba...@fb.com) wrote: On 01/07/2015 12:43 PM, Lennart Poettering wrote: Heya! Currently, systemd-journald's disk access patterns (appending to the end of files, then updating a few pointers in the front) result in awfully fragmented journal files on btrfs, which has a pretty negative effect on performance when accessing them. I've been wondering if mount -o autodefrag would deal with this problem but I haven't had the chance to look into it. Hmm, I am kinda interested in a solution that I can just implement in systemd/journald now and that will then just make things work for people suffering by the problem. I mean, I can hardly make systemd patch the mount options of btrfs just because I place a journal file on some fs... Is autodefrag supposed to become a default one day? Anyway, given the pros and cons I have now changed journald to set the nocow bit on newly created journal files. When files are rotated (and we hence know we will never ever write again to them) the bit is tried to be unset again, and a defrag ioctl will be invoked right after. btrfs currently silently ignores that we unset the bit, and leaves it set, but I figure i should try to unset it anyway, in case it learns that one day. After all, after rotating the files there's no reason to treat the files special anymore... I'll keep an eye on this, and see if I still get user complaints about it. Should autodefrag become default eventually we can get rid of this code in journald again. One question regarding the btrfs defrag ioctl: playing around with it it appears to be asynchronous, the defrag request is simply queued and the ioctl returns immediately. Which is great for my usecase. However I was wondering if it always was async like this? I googled a bit, and found reports that defrag might take a while, but I am not sure if those reports were about the ioctl taking so long, or the effect of defrag actually hitting the disk... 
Lennart -- Lennart Poettering, Red Hat -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Fix Penguin Penalty 18th December2014 ( mail-archive.com )
Dear Sir Did your website get hit by Google Penguin update on October 17th 2014? What basically is Google Penguin Update? It is actually a code name for Google algorithm which aims at decreasing your websites search engine rankings that violate Googles guidelines by using black hat SEO techniques to rank your webpage by giving number of spammy links to the page. We are one of those few SEO companies that can help you avoid penalties from Google Updates like Penguin and Panda. Our clients have survived all the previous and present updates with ease. They have never been hit because we use 100% white hat SEO techniques to rank Webpages. Simple thing that we do to keep websites away from any Penguin or Panda penalties is follow Google guidelines and we give Google users the best answers to their queries. If you are looking to increase the quality of your websites and to get more targeted traffic or save your websites from these Google penalties email us back with your interest. We will be glad to serve you and help you grow your business. Regards Taniya S SEO Manager ( TOB ) B7 Green Avenue, Amritsar 143001 Punjab NO CLICK in the subject to STOP EMAILS -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: price to pay for nocow file bit?
On 8/1/2015 3:30 μμ, Lennart Poettering wrote: On Wed, 07.01.15 15:10, Josef Bacik (jba...@fb.com) wrote: On 01/07/2015 12:43 PM, Lennart Poettering wrote: Heya! Currently, systemd-journald's disk access patterns (appending to the end of files, then updating a few pointers in the front) result in awfully fragmented journal files on btrfs, which has a pretty negative effect on performance when accessing them. I've been wondering if mount -o autodefrag would deal with this problem but I haven't had the chance to look into it. Hmm, I am kinda interested in a solution that I can just implement in systemd/journald now and that will then just make things work for people suffering by the problem. I mean, I can hardly make systemd patch the mount options of btrfs just because I place a journal file on some fs... Is autodefrag supposed to become a default one day? Anyway, given the pros and cons I have now changed journald to set the nocow bit on newly created journal files. When files are rotated (and we hence know we will never ever write again to them) the bit is tried to be unset again, and a defrag ioctl will be invoked right after. btrfs currently silently ignores that we unset the bit, and leaves it set, but I figure i should try to unset it anyway, in case it learns that one day. After all, after rotating the files there's no reason to treat the files special anymore... Can this behaviour be optional? I don't mind some fragmentation if I can keep having checksums and the ability for raid 1 to repair those files. I'll keep an eye on this, and see if I still get user complaints about it. Should autodefrag become default eventually we can get rid of this code in journald again. One question regarding the btrfs defrag ioctl: playing around with it it appears to be asynchronous, the defrag request is simply queued and the ioctl returns immediately. Which is great for my usecase. However I was wondering if it always was async like this? 
I googled a bit, and found reports that defrag might take a while, but I am not sure if those reports were about the ioctl taking so long, or the effect of defrag actually hitting the disk... Lennart -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[GIT PULL] Btrfs fixes
Hi Linus, We have a small queue of btrfs fixes in my for-linus branch: git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs.git for-linus None of these are huge, but my commit does fix a regression from 3.18 that could cause lost files during log replay. This also adds Dave Sterba to the list of Btrfs maintainers. It doesn't mean we're doing things differently, but Dave has really been helping with the maintainer workload for years. Chris Mason (2) commits (+9/-0): Btrfs: don't delay inode ref updates during log replay (+8/-0) Btrfs: add more maintainers (+1/-0) Wang Shilong (1) commits (+3/-1): Btrfs: call inode_dec_link_count() on mkdir error path Filipe Manana (1) commits (+10/-3): Btrfs: correctly get tree level in tree_backref_for_extent Josef Bacik (1) commits (+6/-6): Btrfs: abort transaction if we don't find the block group Dan Carpenter (1) commits (+1/-1): Btrfs, scrub: uninitialized variable in scrub_extent_for_parity() Total: (6) commits (+29/-11) MAINTAINERS | 1 + fs/btrfs/backref.c | 13 ++--- fs/btrfs/delayed-inode.c | 8 fs/btrfs/extent-tree.c | 12 ++-- fs/btrfs/inode.c | 4 +++- fs/btrfs/scrub.c | 2 +- 6 files changed, 29 insertions(+), 11 deletions(-) -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: price to pay for nocow file bit?
On Thu, Jan 08, 2015 at 05:53:21PM +0100, Lennart Poettering wrote: On Thu, 08.01.15 10:56, Zygo Blaxell (ce3g8...@umail.furryterror.org) wrote: On Wed, Jan 07, 2015 at 06:43:15PM +0100, Lennart Poettering wrote: Heya! Currently, systemd-journald's disk access patterns (appending to the end of files, then updating a few pointers in the front) result in awfully fragmented journal files on btrfs, which has a pretty negative effect on performance when accessing them. Now, to improve things a bit, I yesterday made a change to journald, to issue the btrfs defrag ioctl when a journal file is rotated, i.e. when we know that no further writes will be ever done on the file. However, I wonder now if I should go one step further even, and use the equivalent of chattr -C (i.e. nocow) on all journal files. I am wondering what price I would precisely have to pay for that. Judging by this earlier thread: http://www.spinics.net/lists/linux-btrfs/msg33134.html it's mostly about data integrity, which is something I can live with, given the conservative write patterns of journald, and the fact that we do our own checksumming and careful data validation. I mean, if btrfs in this mode provides no worse data integrity semantics than ext4 I am fully fine with losing this feature for these files. This sounds to me like a job for fallocate with FALLOC_FL_KEEP_SIZE. We already use fallocate(), but this is not enough on cow file systems. With fallocate() you can certainly improve fragmentation when appending things to a file. But on a COW file system this will help little if we change things in the beginning of the file, since COW means that it will then make a copy of those blocks and alter the copy, but leave the original version unmodified. And if we do that all the time the files get heavily fragmented, even though all the blocks we modify have been fallocate()d initially... Hmmm...it seems the handwaving about tail-packing that I was previously ignoring is important after all. 
A few quick tests with filefrag show that btrfs isn't doing full tail-packing, only small file allocation (i.e. files smaller than 4096 bytes get stored inline, and nothing else does, not even sparse files with a single 1-byte extent at offset != 0). Thus the inline storage avoids fragmentation only to the minimum extent possible. Short appends to the end of the file effectively become modifications of the last block of the file. That triggers CoW on the append, and if we're doing lots of tiny writes the file becomes extremely fragmented (exactly the worst case of one fragment per block). A mix of big and small appends seems to use fallocated space for those writes that cover complete blocks, which is arguably worse than not fallocating at all. So fallocate will not help until btrfs learns to do tail-packing, or some other way to avoid this problem. This would work on ext4, xfs, and others, and provide the same benefit (or even better) without filesystem-specific code. journald would preallocate a contiguous chunk past the end of the file for appends, and That's precisely what we do. But journald's write pattern is not purely appending to files, it's append something to the end, then link it up in the beginning. And for the append part we are fine with fallocate(). It's the link up part that completely fucks up fragmentation so far. Wrong theory but same result. The writes at the beginning just keep replacing a single extent over and over, which has a worst-case effect of adding a single fragment to the beginning of a file that would not otherwise be fragmented. The appends are causing fragmentation all by themselves. :-P Lennart -- Lennart Poettering, Red Hat -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html signature.asc Description: Digital signature
'parent transid verify failed' for 13637 missing transactions, resulting in 'BTRFS: Transaction aborted'
Hi, I have a btrfs volume in RAID0 across 2 SSDs which has (for no apparent reason) become corrupted. Although I am able to mount the partition, there are several messages displayed in the kernel log when doing so. I have copied the files off the file system, but would like to know if they can be relied upon or not (and if not, which ones are corrupt). I would also like to know if the file system itself is recoverable, or should be erased entirely and replaced. I have tried 'btrfs check --repair' and btrfs-zero-log to no avail. The SMART data for both drives suggests there are no issues with the hardware. Thanks in advance. Distro: Sabayon amd64 Kernel in use when corruption occurred: 3.17.4 Kernel in use when collecting diagnostic info: 3.16.0-23-generic (Ubuntu livecd) Btrfs-progs version: 3.18 btrfs fi df: (Used space is incorrect - should be at least 30 GB) Data, RAID0: total=93.16GiB, used=25.19MiB System, RAID1: total=32.00MiB, used=16.00KiB Metadata, RAID1: total=8.01GiB, used=73.81MiB unknown, single: total=16.00MiB, used=16.00KiB btrfs fi show: (truncated to show relevant filesystem only) Label: none uuid: d75ecf88-9b18-4ca6-8fd4-7bda0630de9b Total devices 2 FS bytes used 73.81MiB devid1 size 54.62GiB used 54.62GiB path /dev/sda1 devid2 size 54.62GiB used 54.62GiB path /dev/sdb1 Kernel log when mounting file system: [ 106.564009] BTRFS info (device sda1): disk space caching is enabled [ 106.577597] BTRFS: detected SSD devices, enabling SSD mode [ 106.578440] BTRFS: checking UUID tree [ 106.581198] parent transid verify failed on 168079851520 wanted 6329580 found 6343217 [ 106.581857] parent transid verify failed on 168079851520 wanted 6329580 found 6343217 [ 106.581880] BTRFS warning (device sda1): btrfs_uuid_tree_iterate failed -12 When unmounting: [ 113.814408] [ cut here ] [ 113.814454] WARNING: CPU: 0 PID: 3872 at /build/buildd/linux-3.16.0/fs/btrfs/extent-tree.c:5956 __btrfs_free_extent+0x675/0xc00 [btrfs]() [ 113.814460] Modules linked in: joydev 
btrfs dm_crypt xor snd_hda_codec_hdmi raid6_pq dm_multipath scsi_dh kvm_amd kvm snd_seq_midi snd_hda_codec_realtek snd_seq_midi_event snd_hda_codec_generic snd_rawmidi edac_core snd_hda_intel snd_hda_controller k10temp serio_raw edac_mce_amd snd_seq snd_hda_codec bnep snd_hwdep rfcomm snd_seq_device snd_pcm bluetooth snd_timer snd 6lowpan_iphc sp5100_tco soundcore i2c_piix4 shpchp mac_hid parport_pc ppdev lp parport squashfs overlayfs nls_utf8 isofs jfs xfs libcrc32c reiserfs dm_mirror dm_region_hash dm_log hid_generic nouveau mxm_wmi video i2c_algo_bit ttm usbhid drm_kms_helper pata_acpi firewire_ohci tg3 hid firewire_core r8169 drm ahci ptp crc_itu_t mii pata_jmicron libahci pps_core wmi [ 113.814558] CPU: 0 PID: 3872 Comm: umount Tainted: GW 3.16.0-23-generic #31-Ubuntu [ 113.814564] Hardware name: Gigabyte Technology Co., Ltd. GA-870A-UD3/GA-870A-UD3, BIOS F5 08/01/2011 [ 113.814569] 0009 8800bd5afa28 8177fcbc [ 113.814577] 8800bd5afa60 8106fd8d 00218f175000 8800cb98f000 [ 113.814584] 8800a80e9000 fffe 8800bd5afa70 [ 113.814591] Call Trace: [ 113.814605] [8177fcbc] dump_stack+0x45/0x56 [ 113.814615] [8106fd8d] warn_slowpath_common+0x7d/0xa0 [ 113.814623] [8106fe6a] warn_slowpath_null+0x1a/0x20 [ 113.814651] [c0d15345] __btrfs_free_extent+0x675/0xc00 [btrfs] [ 113.814661] [811c16a6] ? 
__slab_free+0xa6/0x320 [ 113.814690] [c0d1a044] __btrfs_run_delayed_refs+0x424/0x11e0 [btrfs] [ 113.814721] [c0d1edf3] btrfs_run_delayed_refs.part.64+0x73/0x270 [btrfs] [ 113.814750] [c0d1f51d] btrfs_write_dirty_block_groups+0x46d/0x710 [btrfs] [ 113.814784] [c0d2d64d] commit_cowonly_roots+0x18d/0x240 [btrfs] [ 113.814818] [c0d301ad] btrfs_commit_transaction.part.22+0x49d/0x970 [btrfs] [ 113.814852] [c0d2f27a] btrfs_commit_transaction+0x3a/0x80 [btrfs] [ 113.814875] [c0cfe760] btrfs_sync_fs+0x50/0xc0 [btrfs] [ 113.814884] [81211a82] sync_filesystem+0x72/0xb0 [ 113.814891] [811e2d50] generic_shutdown_super+0x30/0xf0 [ 113.814897] [811e30a2] kill_anon_super+0x12/0x20 [ 113.814920] [c0d01e86] btrfs_kill_super+0x16/0x90 [btrfs] [ 113.814926] [811e3429] deactivate_locked_super+0x49/0x60 [ 113.814932] [811e3874] deactivate_super+0x64/0x70 [ 113.814940] [812015ef] mntput_no_expire+0xdf/0x180 [ 113.814947] [81202bac] SyS_umount+0x8c/0x100 [ 113.814954] [81787ced] system_call_fastpath+0x1a/0x1f [ 113.814959] ---[ end trace 328a5b6c02402780 ]--- [ 113.814967] BTRFS info (device sda1): leaf 104182874112 total ptrs 209 free space 75 [ 113.814973] item 0 key (140680462336 168 16384) itemoff 16232 itemsize 51 [ 113.814978] extent refs 1
Re: BTRFS: Transaction aborted (error -5)
Hi All, Just wondering if someone could help me figure out what happened to this file system? As you see below, the kernel reports btrfs errors on both drives, but was able to correct the errors on the first drive. The second drive not so lucky. However, running 'badblocks -nsv' (the non-destructive read-write test) on both drives shows both drive free of bad blocks. Both drives are 1TB 7200RPM spinners. And, as Chris indicated earlier, its strange that the file system gets different check sums for the same block... Thanks, Dyweni On 2015-01-06 09:15, Dyweni - BTRFS wrote: Hi, BTRFS check on /dev/sdc1 reveals everything looks ok: # btrfs check /dev/sdc1 Checking filesystem on /dev/sdc1 UUID: 26ed1033-429a-444f-97cc-ce8103db4c39 checking extents checking free space cache checking fs roots checking csums checking root refs found 195515710524 bytes used err is 0 total csum bytes: 205915200 total tree bytes: 407355392 total fs tree bytes: 94830592 total extent tree bytes: 31588352 btree space waste bytes: 100867438 file data blocks allocated: 537492316160 referenced 195656101888 Btrfs v3.18 (/dev/sdd1 and /dev/sdc1 are the only two btrfs file systems in this machine). Oddly, when the problem with /dev/sdd1 started, problems with /dev/sdc1 were also reported, but /dev/sdc1 managed to fix itself. Below is the complete dmesg output from when problems first started until /dev/sdd1 went readonly with errors. The strangest part of all of this, is that the dmesg output shows no errors about the drive being physically bad. (I ran badblocks -nsv on both /dev/sdd and /dev/sdc, and it confirmed 0 bad blocks for both drives). 
[25581.099684] BTRFS: sdd1 checksum verify failed on 521797632 wanted 8F2F5FEC found 3E879EFE level 0 [25581.105441] BTRFS: read error corrected: ino 1 off 521797632 (dev /dev/sdd1 sector 1035520) [25581.105612] BTRFS: read error corrected: ino 1 off 521801728 (dev /dev/sdd1 sector 1035528) [25581.105784] BTRFS: read error corrected: ino 1 off 521805824 (dev /dev/sdd1 sector 1035536) [25581.105956] BTRFS: read error corrected: ino 1 off 521809920 (dev /dev/sdd1 sector 1035544) [2.799514] BTRFS: sdd1 checksum verify failed on 680296448 wanted AB0E191F found 192D4134 level 0 [2.856199] BTRFS: read error corrected: ino 1 off 680296448 (dev /dev/sdd1 sector 1345088) [2.860571] BTRFS: read error corrected: ino 1 off 680300544 (dev /dev/sdd1 sector 1345096) [2.909634] BTRFS: read error corrected: ino 1 off 680304640 (dev /dev/sdd1 sector 1345104) [2.909876] BTRFS: read error corrected: ino 1 off 680308736 (dev /dev/sdd1 sector 1345112) [29292.777237] BTRFS: sdc1 checksum verify failed on 937738240 wanted F4196CDA found AF30B394 level 0 [29292.778022] BTRFS: sdc1 checksum verify failed on 937738240 wanted F4196CDA found AF30B394 level 0 [29292.781889] BTRFS: read error corrected: ino 1 off 937738240 (dev /dev/sdc1 sector 1847904) [29292.782054] BTRFS: read error corrected: ino 1 off 937742336 (dev /dev/sdc1 sector 1847912) [29292.782224] BTRFS: read error corrected: ino 1 off 937746432 (dev /dev/sdc1 sector 1847920) [29292.782399] BTRFS: read error corrected: ino 1 off 937750528 (dev /dev/sdc1 sector 1847928) [29691.731107] BTRFS: sdd1 checksum verify failed on 610877440 wanted 5A8006E7 found 1CFE4A20 level 0 [29691.791550] BTRFS: read error corrected: ino 1 off 610877440 (dev /dev/sdd1 sector 1209504) [29691.793252] BTRFS: read error corrected: ino 1 off 610881536 (dev /dev/sdd1 sector 1209512) [29691.793608] BTRFS: read error corrected: ino 1 off 610885632 (dev /dev/sdd1 sector 1209520) [29691.793797] BTRFS: read error corrected: ino 1 off 610889728 (dev /dev/sdd1 
sector 1209528) [34626.017914] BTRFS: sdd1 checksum verify failed on 737181696 wanted 15D7099D found B6A2A7A9 level 0 [34626.022656] BTRFS: read error corrected: ino 1 off 737181696 (dev /dev/sdd1 sector 1456192) [34626.022867] BTRFS: read error corrected: ino 1 off 737185792 (dev /dev/sdd1 sector 1456200) [34626.023107] BTRFS: read error corrected: ino 1 off 737189888 (dev /dev/sdd1 sector 1456208) [34626.023314] BTRFS: read error corrected: ino 1 off 737193984 (dev /dev/sdd1 sector 1456216) [37057.349996] BTRFS: sdc1 checksum verify failed on 701792256 wanted A7BD5067 found 87EF0602 level 0 [37057.424920] BTRFS: read error corrected: ino 1 off 701792256 (dev /dev/sdc1 sector 1387072) [37057.425178] BTRFS: read error corrected: ino 1 off 701796352 (dev /dev/sdc1 sector 1387080) [37057.450174] BTRFS: read error corrected: ino 1 off 701800448 (dev /dev/sdc1 sector 1387088) [37057.453476] BTRFS: read error corrected: ino 1 off 701804544 (dev /dev/sdc1 sector 1387096) [38283.714855] BTRFS: sdd1 checksum verify failed on 190169088 wanted 27D1E032 found 585B1651 level 0 [38283.715349] BTRFS: sdd1 checksum verify failed on 190169088 wanted 27D1E032 found 585B1651 level 0 [38283.724140] BTRFS: read error corrected: ino 1 off 190169088 (dev /dev/sdd1 sector 387808) [38283.724313] BTRFS: read
Re: price to pay for nocow file bit?
On Thu, 08.01.15 10:56, Zygo Blaxell (ce3g8...@umail.furryterror.org) wrote: On Wed, Jan 07, 2015 at 06:43:15PM +0100, Lennart Poettering wrote: Heya! Currently, systemd-journald's disk access patterns (appending to the end of files, then updating a few pointers in the front) result in awfully fragmented journal files on btrfs, which has a pretty negative effect on performance when accessing them. Now, to improve things a bit, I yesterday made a change to journald, to issue the btrfs defrag ioctl when a journal file is rotated, i.e. when we know that no further writes will be ever done on the file. However, I wonder now if I should go one step further even, and use the equivalent of chattr -C (i.e. nocow) on all journal files. I am wondering what price I would precisely have to pay for that. Judging by this earlier thread: http://www.spinics.net/lists/linux-btrfs/msg33134.html it's mostly about data integrity, which is something I can live with, given the conservative write patterns of journald, and the fact that we do our own checksumming and careful data validation. I mean, if btrfs in this mode provides no worse data integrity semantics than ext4 I am fully fine with losing this feature for these files. This sounds to me like a job for fallocate with FALLOC_FL_KEEP_SIZE. We already use fallocate(), but this is not enough on cow file systems. With fallocate() you can certainly improve fragmentation when appending things to a file. But on a COW file system this will help little if we change things in the beginning of the file, since COW means that it will then make a copy of those blocks and alter the copy, but leave the original version unmodified. And if we do that all the time the files get heavily fragmented, even though all the blocks we modify have been fallocate()d initially... This would work on ext4, xfs, and others, and provide the same benefit (or even better) without filesystem-specific code. 
journald would preallocate a contiguous chunk past the end of the file for appends, and That's precisely what we do. But journald's write pattern is not purely appending to files, it's append something to the end, then link it up in the beginning. And for the append part we are fine with fallocate(). It's the link up part that completely fucks up fragmentation so far. Lennart -- Lennart Poettering, Red Hat -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: price to pay for nocow file bit?
On Wed, Jan 07, 2015 at 06:43:15PM +0100, Lennart Poettering wrote: Heya! Currently, systemd-journald's disk access patterns (appending to the end of files, then updating a few pointers in the front) result in awfully fragmented journal files on btrfs, which has a pretty negative effect on performance when accessing them. Now, to improve things a bit, I yesterday made a change to journald, to issue the btrfs defrag ioctl when a journal file is rotated, i.e. when we know that no further writes will be ever done on the file. However, I wonder now if I should go one step further even, and use the equivalent of chattr -C (i.e. nocow) on all journal files. I am wondering what price I would precisely have to pay for that. Judging by this earlier thread: http://www.spinics.net/lists/linux-btrfs/msg33134.html it's mostly about data integrity, which is something I can live with, given the conservative write patterns of journald, and the fact that we do our own checksumming and careful data validation. I mean, if btrfs in this mode provides no worse data integrity semantics than ext4 I am fully fine with losing this feature for these files. This sounds to me like a job for fallocate with FALLOC_FL_KEEP_SIZE. This would work on ext4, xfs, and others, and provide the same benefit (or even better) without filesystem-specific code. journald would preallocate a contiguous chunk past the end of the file for appends, and on btrfs the first write to each block will not be COWed or compressed (I'm hand-waving away some details here related to small writes, file tails, and inline storage, but the end result is the same). If there's a configured target size for journals then allocate that amount; otherwise, double the allocated size each time the visible file size reaches a power of two so that the number of fragments is logarithmic over file size. This should get you what you want without all the dangerous messing around with data integrity controls and defragmentation. 
Defragmentation has a number of negative side-effects of its own: it searches for free space aggressively and holds locks that can block writes for a long time (I've learned the hard way that this can be over 20 minutes for a 1GB file, long enough to trigger hardware watchdog resets). There are some other good reasons to never defragment, but they don't arise in journald's use cases. I, for one, use btrfs scrub to detect data corruption that occurs during early stages of disk failure. I'd object strongly to applications randomly turning off data integrity features without being explicitly configured to do so, especially those that do most of the writing. It would create areas of the disk that are blind spots when testing for storage corruption errors, and in journald's case those blind spots would be among the most significant sources of data about storage corruption. I don't really care if applications can survive corrupted data--as the owner of the storage, I need to be aware that storage-level corruption is happening. I don't want to have to test different areas of the filesystem with a dozen different application-specific tools. That particular insanity is one of the reasons why I now use btrfs and not ext4. Hence I am mostly interested in what else is lost if this flag is turned on by default for all journal files journald creates: Does this have any effect on functionality? As I understood snapshots still work fine for files marked like that, and so do reflinks. Any drawback functionality-wise? Apparently file compression support is lost if the bit is set? (which I can live with too, journal files are internally compressed anyway) What about performance? Do any operations get substantially slower by setting this bit? For example, what happens if I take a snapshot of files with this bit set and then modify the file, does this result in a full (and hence slow) copy of the file on that occasion? 
I am trying to understand the pros and cons of turning this bit on, before I can make this change. So far I see one big pro, but I wonder if there's any major con I should think about? Thanks, Lennart -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html signature.asc Description: Digital signature
Re: price to pay for nocow file bit?
On 2015-01-08 19:24, Konstantinos Skarlatos wrote: Anyway, given the pros and cons I have now changed journald to set the nocow bit on newly created journal files. When files are rotated (and we hence know we will never ever write to them again) we try to unset the bit again, and a defrag ioctl will be invoked right after. btrfs currently silently ignores that we unset the bit, and leaves it set, but I figure I should try to unset it anyway, in case it learns that one day. After all, after rotating the files there's no reason to treat the files specially anymore... Can this behaviour be optional? I don't mind some fragmentation if I can keep having checksums and the ability for raid 1 to repair those files. I agree with Konstantinos's request: please let this behavior be optional. BR G.Baroncelli -- gpg @keyserver.linux.it: Goffredo Baroncelli kreijackATinwind.it Key fingerprint BBF5 1610 0B64 DAC6 5F7D 17B2 0EDA 9B37 8B82 E0B5 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: price to pay for nocow file bit?
-----BEGIN PGP SIGNED MESSAGE----- Hash: SHA1 On 01/08/2015 08:53 AM, Lennart Poettering wrote: this will help little if we change things in the beginning of the file, Have you considered changing the format so that those pointers are stored at the end of the file, letting data always be append only? While it is traditional to have things at the beginning as headers, there are formats like zip where metadata is stored at the end instead providing other benefits. Roger -----BEGIN PGP SIGNATURE----- Version: GnuPG v1 iEYEARECAAYFAlSu68gACgkQmOOfHg372QSn5wCfaRAfI/xN3SHiDEPNMjjAuFQB NbcAn2GCjzZyfHocF7yTKEBFdt3znD6n =KL2f -----END PGP SIGNATURE----- -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] btrfs: delete chunk allocation attempt when setting block group ro
Below test will fail currently: mkfs.ext4 -F /dev/sda btrfs-convert /dev/sda mount /dev/sda /mnt btrfs device add -f /dev/sdb /mnt btrfs balance start -v -dconvert=raid1 -mconvert=raid1 /mnt The reason is there are some block groups with usage 0, but the whole disk hasn't free space to allocate new chunk, so we even can't set such block group readonly. This patch deletes the chunk allocation when setting block group ro. For META, we already have reserve. But for SYSTEM, we don't have, so the check_system_chunk is still required. Signed-off-by: Shaohua Li <s...@fb.com> --- fs/btrfs/extent-tree.c | 31 +++ 1 file changed, 7 insertions(+), 24 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a80b971..430101b6 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -8493,22 +8493,8 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force) { struct btrfs_space_info *sinfo = cache->space_info; u64 num_bytes; - u64 min_allocable_bytes; int ret = -ENOSPC; - - /* -* We need some metadata space and system metadata space for -* allocating chunks in some corner cases until we force to set -* it to be readonly. 
-*/ - if ((sinfo->flags & -(BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) && - !force) - min_allocable_bytes = 1 * 1024 * 1024; - else - min_allocable_bytes = 0; - spin_lock(&sinfo->lock); spin_lock(&cache->lock); @@ -8521,8 +8507,8 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force) cache->bytes_super - btrfs_block_group_used(&cache->item); if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + - sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes + - min_allocable_bytes <= sinfo->total_bytes) { + sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes + <= sinfo->total_bytes) { sinfo->bytes_readonly += num_bytes; cache->ro = 1; list_add_tail(&cache->ro_list, &sinfo->ro_bgs); @@ -8548,14 +8534,6 @@ int btrfs_set_block_group_ro(struct btrfs_root *root, if (IS_ERR(trans)) return PTR_ERR(trans); - alloc_flags = update_block_group_flags(root, cache->flags); - if (alloc_flags != cache->flags) { - ret = do_chunk_alloc(trans, root, alloc_flags, -CHUNK_ALLOC_FORCE); - if (ret < 0) - goto out; - } - ret = set_block_group_ro(cache, 0); if (!ret) goto out; @@ -8566,6 +8544,11 @@ int btrfs_set_block_group_ro(struct btrfs_root *root, goto out; ret = set_block_group_ro(cache, 0); out: + if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) { + alloc_flags = update_block_group_flags(root, cache->flags); + check_system_chunk(trans, root, alloc_flags); + } + btrfs_end_transaction(trans, root); return ret; } -- 1.8.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html