[PATCH v2 1/4] Btrfs: fix wrong send_in_progress accounting
Steps to reproduce: # mkfs.btrfs -f /dev/sda8 # mount /dev/sda8 /mnt # btrfs sub snapshot -r /mnt /mnt/snap1 # btrfs sub snapshot -r /mnt /mnt/snap2 # btrfs send /mnt/snap1 -p /mnt/snap2 -f /mnt/1 # dmesg The problem is that we will sort clone roots(include @send_root), it might push @send_root before thus @send_root's @send_in_progress will be decreased twice. Cc: David Sterba dste...@suse.cz Signed-off-by: Wang Shilong wangsl.f...@cn.fujitsu.com --- Changelog v1-v2: on the right way to fix the problem.(thanks to David) --- fs/btrfs/send.c | 16 +--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index bff0b1a..5b69785 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -4752,6 +4752,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) u32 i; u64 *clone_sources_tmp = NULL; int clone_sources_to_rollback = 0; + int sort_clone_roots = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -4942,6 +4943,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) sort(sctx-clone_roots, sctx-clone_roots_cnt, sizeof(*sctx-clone_roots), __clone_root_cmp_sort, NULL); + sort_clone_roots = 1; ret = send_subvol(sctx); if (ret 0) @@ -4957,11 +4959,19 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) } out: - for (i = 0; sctx i clone_sources_to_rollback; i++) - btrfs_root_dec_send_in_progress(sctx-clone_roots[i].root); + if (sort_clone_roots) { + for (i = 0; i sctx-clone_roots_cnt; i++) + btrfs_root_dec_send_in_progress( + sctx-clone_roots[i].root); + } else { + for (i = 0; sctx i clone_sources_to_rollback; i++) + btrfs_root_dec_send_in_progress( + sctx-clone_roots[i].root); + + btrfs_root_dec_send_in_progress(send_root); + } if (sctx !IS_ERR_OR_NULL(sctx-parent_root)) btrfs_root_dec_send_in_progress(sctx-parent_root); - btrfs_root_dec_send_in_progress(send_root); kfree(arg); vfree(clone_sources_tmp); -- 1.8.3.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2 2/4] Btrfs: fix protection between send and root deletion
We should gurantee that parent and clone roots can not be destroyed during send, for this we have two ideas. 1.by holding @subvol_sem, this might be a nightmare, because it will block all subvolumes deletion for a long time. 2.Miao pointed out we can reuse @send_in_progress, that mean we will skip snapshot deletion if root sending is in progress. Here we adopt the second approach since it won't block other subvolumes deletion for a long time. Besides in btrfs_clean_one_deleted_snapshot(), we only check first root , if this root is involved in send, we return directly rather than continue to check.There are several reasons about it: 1.this case happen seldomly. 2.after sending,cleaner thread can continue to drop that root. 3.make code simple Cc: David Sterba dste...@suse.cz Signed-off-by: Wang Shilong wangsl.f...@cn.fujitsu.com Reviewed-by: Miao Xie mi...@cn.fujitsu.com --- Changelog v1-v2: use right root to check(pointed by David) --- fs/btrfs/send.c| 16 fs/btrfs/transaction.c | 13 + 2 files changed, 29 insertions(+) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 5b69785..4e2461b 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -4753,6 +4753,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) u64 *clone_sources_tmp = NULL; int clone_sources_to_rollback = 0; int sort_clone_roots = 0; + int index; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -4893,8 +4894,12 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) key.objectid = clone_sources_tmp[i]; key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; + + index = srcu_read_lock(fs_info-subvol_srcu); + clone_root = btrfs_read_fs_root_no_name(fs_info, key); if (IS_ERR(clone_root)) { + srcu_read_unlock(fs_info-subvol_srcu, index); ret = PTR_ERR(clone_root); goto out; } @@ -4903,10 +4908,13 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) clone_root-send_in_progress++; if (!btrfs_root_readonly(clone_root)) { spin_unlock(clone_root-root_item_lock); + srcu_read_unlock(fs_info-subvol_srcu, index); ret = -EPERM; goto out; } spin_unlock(clone_root-root_item_lock); + srcu_read_unlock(fs_info-subvol_srcu, index); + sctx-clone_roots[i].root = clone_root; } vfree(clone_sources_tmp); @@ -4917,19 +4925,27 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) key.objectid = arg-parent_root; key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; + + index = srcu_read_lock(fs_info-subvol_srcu); + sctx-parent_root = btrfs_read_fs_root_no_name(fs_info, key); if (IS_ERR(sctx-parent_root)) { + srcu_read_unlock(fs_info-subvol_srcu, index); ret = PTR_ERR(sctx-parent_root); goto out; } + spin_lock(sctx-parent_root-root_item_lock); sctx-parent_root-send_in_progress++; if (!btrfs_root_readonly(sctx-parent_root)) { spin_unlock(sctx-parent_root-root_item_lock); + srcu_read_unlock(fs_info-subvol_srcu, index); ret = -EPERM; goto out; } spin_unlock(sctx-parent_root-root_item_lock); + + srcu_read_unlock(fs_info-subvol_srcu, index); } /* diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 30d11bb..1ef8e28 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1971,6 +1971,19 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root) } root = list_first_entry(fs_info-dead_roots, struct btrfs_root, root_list); + /* +* Make sure root is not involved in send, +* if we fail with first root, we return +* directly rather than continue. +*/ + spin_lock(root-root_item_lock); + if (root-send_in_progress) { + spin_unlock(fs_info-trans_lock); + spin_unlock(root-root_item_lock); + return 0; + } + spin_unlock(root-root_item_lock); + list_del_init(root-root_list); spin_unlock(fs_info-trans_lock); -- 1.8.3.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a
[PATCH v2 4/4] Btrfs: handle EAGAIN case properly in btrfs_drop_snapshot()
We may return early in btrfs_drop_snapshot(), we shouldn't call btrfs_std_err() for this case, fix it. Cc: sta...@vger.kernel.org Signed-off-by: Wang Shilong wangsl.f...@cn.fujitsu.com --- Changelog v1-v2: cc stable --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 4fa1437..81650cf 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -8272,7 +8272,7 @@ out: */ if (!for_reloc root_dropped == false) btrfs_add_dead_root(root); - if (err) + if (err err != -EAGAIN) btrfs_std_error(root-fs_info, err); return err; } -- 1.8.3.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v2 3/4] Btrfs: remove unnecessary transaction commit before send
We will finish orphan cleanups during snapshot, so we don't have to commit transaction here. Signed-off-by: Wang Shilong wangsl.f...@cn.fujitsu.com Reviewed-by: Miao Xie mi...@cn.fujitsu.com --- Changelog v1-v2: none --- fs/btrfs/send.c | 29 - 1 file changed, 29 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 4e2461b..591063d 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -4776,35 +4776,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) WARN_ON(send_root-orphan_cleanup_state != ORPHAN_CLEANUP_DONE); /* -* If we just created this root we need to make sure that the orphan -* cleanup has been done and committed since we search the commit root, -* so check its commit root transid with our otransid and if they match -* commit the transaction to make sure everything is updated. -*/ - down_read(send_root-fs_info-extent_commit_sem); - if (btrfs_header_generation(send_root-commit_root) == - btrfs_root_otransid(send_root-root_item)) { - struct btrfs_trans_handle *trans; - - up_read(send_root-fs_info-extent_commit_sem); - - trans = btrfs_attach_transaction_barrier(send_root); - if (IS_ERR(trans)) { - if (PTR_ERR(trans) != -ENOENT) { - ret = PTR_ERR(trans); - goto out; - } - /* ENOENT means theres no transaction */ - } else { - ret = btrfs_commit_transaction(trans, send_root); - if (ret) - goto out; - } - } else { - up_read(send_root-fs_info-extent_commit_sem); - } - - /* * Userspace tools do the checks and warn the user if it's * not RO. */ -- 1.8.3.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC] lib: raid: New RAID library supporting up to six parities
Hi Neil, On 01/07, NeilBrown wrote: To do the same with up to six failures, it's now required some kind of sort function. So I would probably just make sure we always process the block is the right order. Then sorting would be irrelevant. But as I say, I haven't fiddled with the code, so maybe that would end up being more complex. If the the async_tx and btrfs layers can always provide indexes in order, for sure this sort function can be removed. I agree with you that it would be a better design. In fact, the raid library never uses it directly. It's just provided to help the callers that for whatever reason cannot provide such indexes in order. And seeing these swap operations between faila/failb in both btrfs and async_tx, made me to assume that the order is not always correct. I don't see the above as sorting faila and failb, but rather determining which one is first. Once you know which one is first, the remainder follow in order. The new raid library, like the existing raid6, requires to have the indexes of the failed blocks in order. With only two indexes faila/failb this means faila failb and sorting is equivalent to find the first one. But with up to six failures, is like to have six fail variables: faila/failb/failc/faild/faile/failf and the raid library requires them to be in order as: faila failb failc faild faile failf. In this general case a sort function is the only one that gives the guarantee to fulfill this requirement whatever is the original order. Ciao, Andrea -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: btrfs-transaction blocked for more than 120 seconds
On 2014/01/06 12:57 AM, Roman Mamedov wrote: Did you align your partitions to accommodate for the 4K sector of the EARS? I had, yes. I had to do a lot of research to get the array working optimally. I didn't need to repartition the spare so this carried over to its being used as an OS disk. I actually lost the Green array twice - and learned some valuable lessons: 1. I had an 8-port SCSI card which was dropping the disks due to the timeout issue mentioned by Chris. That caused the first array failure. Technically all the data was on the disks - but temporarily irrecoverable as disks were constantly being dropped. I made a mistake during ddrescue which simultaneously destroyed two disks' data, meaning that the recovery operation was finally for nought. The only consolation was that I had very little data at the time and none of it was irreplaceable. 2. After replacing the SCSI card with two 4-port SATA cards, a few months later I still had a double-failure (the second failure being during the RAID5 rebuild). This time it was only due to bad disks and a lack of scrubbing/early warning - clearly my own fault. Having learnt these lessons, I'm now a big fan of scrubbing and backups. ;) I'm also pushing for RAID15 wherever data is mission-critical. I simply don't trust the reliability of disks any more and I also better understand how, by having more and/or larger disks in a RAID5/6 array, the overall reliability of that array array plummets. -- __ Brendan Hide http://swiftspirit.co.za/ http://www.webafrica.co.za/?AFF1E97 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC v2 2/2] fs: btrfs: Extends btrfs/raid56 to support up to six parities
Hi Chris, On 01/06, Chris Mason wrote: Neat. The faila/failb were always my least favorite part of the btrfs code ;) Did you test just raid5/6 or also the higher parity counts? At this stage no real testing was made with btrfs. The intention of this btrfs patch is mainly to get feedback on the new raid inteface, and see if it matches the needs of btrfs, and if it results in cleaner code than before. And besides the removal of the faila/failb variables, something other that likely you can appreciate is the removal of all the P/Q logic from btrfs, that it's now replaced with a single raid_rec() call. After the raid interface stabilizes, this patch can be used as starting point for a real btrfs patch. But at now it's just to show some example code of what kind of modification btrfs will need. Ciao, Andrea -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Is anyone using btrfs send/receive howto?
I read different howtos on the wiki and oracle docs, but I can't get it to work: legolas:/mnt/btrfs_pool1# btrfs subvolume snapshot -r tmp tmp_read_only_new Create a readonly snapshot of 'tmp' in './tmp_read_only_new' legolas:/mnt/btrfs_pool1# sync legolas:/mnt/btrfs_pool1# btrfs send tmp_read_only_new | btrfs receive /mnt/btrfs_pool2/ At subvol tmp_read_only_new At subvol tmp_read_only_new # Make a new snapshot later and try to sync it: legolas:/mnt/btrfs_pool1# mv tmp_read_only_new tmp_read_only legolas:/mnt/btrfs_pool1# btrfs subvolume snapshot -r tmp tmp_read_only_new Create a readonly snapshot of 'tmp' in './tmp_read_only_new' legolas:/mnt/btrfs_pool1# btrfs send -p tmp_read_only tmp_read_only_new | btrfs receive /mnt/btrfs_pool2/ At subvol tmp_read_only_new At snapshot tmp_read_only_new ERROR: creating snapshot tmp_read_only_new - tmp_read_only_new failed. File exists This is where I get stuck: Obviously /mnt/btrfs_pool2/tmp_read_only_new already exists since it's the reference backup volume. What am I doing wrong? Thanks, Marc -- A mouse is a device used to point at the xterm you want to type in - A.S.R. Microsoft is to operating systems what McDonalds is to gourmet cooking Home page: http://marc.merlins.org/ | PGP 1024R/763BE901 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Is anyone using btrfs send/receive howto?
On Tue, Jan 07, 2014 at 02:49:51AM -0800, Marc MERLIN wrote: I read different howtos on the wiki and oracle docs, but I can't get it to work: legolas:/mnt/btrfs_pool1# btrfs subvolume snapshot -r tmp tmp_read_only_new Create a readonly snapshot of 'tmp' in './tmp_read_only_new' legolas:/mnt/btrfs_pool1# sync legolas:/mnt/btrfs_pool1# btrfs send tmp_read_only_new | btrfs receive /mnt/btrfs_pool2/ At subvol tmp_read_only_new At subvol tmp_read_only_new # Make a new snapshot later and try to sync it: legolas:/mnt/btrfs_pool1# mv tmp_read_only_new tmp_read_only legolas:/mnt/btrfs_pool1# btrfs subvolume snapshot -r tmp tmp_read_only_new Create a readonly snapshot of 'tmp' in './tmp_read_only_new' legolas:/mnt/btrfs_pool1# btrfs send -p tmp_read_only tmp_read_only_new | btrfs receive /mnt/btrfs_pool2/ At subvol tmp_read_only_new At snapshot tmp_read_only_new ERROR: creating snapshot tmp_read_only_new - tmp_read_only_new failed. File exists This is where I get stuck: Obviously /mnt/btrfs_pool2/tmp_read_only_new already exists since it's the reference backup volume. What am I doing wrong? You need to move /mnt/btrfs_pool2/tmp_read_only_new to a different name as well. The send stream contains the name of the subvolume it wants to create, so it's trying to create a subvolume called tmp_read_only_new in /mnt/btrfs_pool2, and there's one there already. Hugo. -- === Hugo Mills: hugo@... carfax.org.uk | darksatanic.net | lug.org.uk === PGP key: 65E74AC0 from wwwkeys.eu.pgp.net or http://www.carfax.org.uk --- It's against my programming to impersonate a deity! --- signature.asc Description: Digital signature
Re: [RFC v2 0/2] New RAID library supporting up to six parities
On 01/06, joystick wrote: Just by looking at the Subjects, it seems patch number 0/1 is missing. It might have not gotten through to the lists, or be a numbering mistake. The patch files can be also downloaded from: http://snapraid.sourceforge.net/linux/v2/ Sorry about that, Does your code also support (shortcut) RMW as opposed to RCW, for all parities? At now no. But it can be easily extended to add the parity computation of a single disk in a set of already computed parities and then provide the required support for RMW. With SSSE3 this can be implemented in a very efficient way. Ciao, Andrea -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC v2 0/2] New RAID library supporting up to six parities
Hi, It seems that the patch was to big for some linux lists. If you miss some patch files, you can download them also at: http://snapraid.sourceforge.net/linux/v2/ Sorry about that. Ciao, Andrea -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH v4] Btrfs: add support for inode properties
This change adds infrastructure to allow for generic properties for inodes. Properties are name/value pairs that can be associated with inodes for different purposes. They are stored as xattrs with the prefix btrfs. Properties can be inherited - this means when a directory inode has inheritable properties set, these are added to new inodes created under that directory. Further, subvolumes can also have properties associated with them, and they can be inherited from their parent subvolume. Naturally, directory properties have priority over subvolume properties (in practice a subvolume property is just a regular property associated with the root inode, objectid 256, of the subvolume's fs tree). This change also adds one specific property implementation, named compression, whose values can be lzo or zlib and it's an inheritable property. The corresponding changes to btrfs-progs were also implemented. A patch with xfstests for this feature will follow once there's agreement on this change/feature. Further, the script at the bottom of this commit message was used to do some benchmarks to measure any performance penalties of this feature. Basically the tests correspond to: Test 1 - create a filesystem and mount it with compress-force=lzo, then sequentially create N files of 64Kb each, measure how long it took to create the files, unmount the filesystem, mount the filesystem and perform an 'ls -lha' against the test directory holding the N files, and report the time the command took. Test 2 - create a filesystem and don't use any compression option when mounting it - instead set the compression property of the subvolume's root to 'lzo'. Then create N files of 64Kb, and report the time it took. The unmount the filesystem, mount it again and perform an 'ls -lha' like in the former test. This means every single file ends up with a property (xattr) associated to it. Test 3 - same as test 2, but uses 4 properties - 3 are duplicates of the compression property, have no real effect other than adding more work when inheriting properties and taking more btree leaf space. Test 4 - same as test 3 but with 10 properties per file. Results (in seconds, and averages of 5 runs each), for different N numbers of files follow. * Without properties (test 1) file creation timels -lha time 10 000 files 3.49 0.76 100 000 files47.19 8.37 1 000 000 files 518.51 107.06 * With 1 property (compression property set to lzo - test 2) file creation timels -lha time 10 000 files 3.630.93 100 000 files48.569.74 1 000 000 files 537.72 125.11 * With 4 properties (test 3) file creation timels -lha time 10 000 files 3.941.20 100 000 files52.14 11.48 1 000 000 files 572.70 142.13 * With 10 properties (test 4) file creation timels -lha time 10 000 files 4.611.35 100 000 files58.86 13.83 1 000 000 files 656.01 177.61 The increased latencies with properties are essencialy because of: *) When creating an inode, we now synchronously write 1 more item (an xattr item) for each property inherited from the parent dir (or subvolume). This could be done in an asynchronous way such as we do for dir intex items (delayed-inode.c), which could help reduce the file creation latency; *) With properties, we now have larger fs trees. For this particular test each xattr item uses 75 bytes of leaf space in the fs tree. This could be less by using a new item for xattr items, instead of the current btrfs_dir_item, since we could cut the 'location' and 'type' fields (saving 18 bytes) and maybe 'transid' too (saving a total of 26 bytes per xattr item) from the btrfs_dir_item type. Also tried batching the xattr insertions (ignoring proper hash collision handling, since it didn't exist) when creating files that inherit properties from their parent inode/subvolume, but the end results were (surprisingly) essentially the same. Test script: $ cat test.pl #!/usr/bin/perl -w use strict; use Time::HiRes qw(time); use constant NUM_FILES = 10_000; use constant FILE_SIZES = (64 * 1024); use constant DEV = '/dev/sdb4'; use constant MNT_POINT = '/home/fdmanana/btrfs-tests/dev'; use constant TEST_DIR = (MNT_POINT . '/testdir'); system(mkfs.btrfs, -l, 16384, -f, DEV) == 0 or die mkfs.btrfs failed!; # following line for testing without properties #system(mount, -o, compress-force=lzo, DEV, MNT_POINT) == 0 or die mount failed!; # following 2 lines for testing with properties system(mount, DEV, MNT_POINT) == 0 or die mount failed!; system(btrfs, prop,
Re: [PATCH 0/5] Add support for object properties
On Sat, Nov 23, 2013 at 12:52 AM, David Sterba dste...@suse.cz wrote: On Tue, Nov 12, 2013 at 01:41:41PM +, Filipe David Borba Manana wrote: This is a revised version of the original proposal/work from Alexander Block to introduce a generic framework to set properties on btrfs filesystem objects (inodes, subvolumes, filesystems, devices). Currently the command group looks like this: btrfs prop set [-t type] /path/to/object name value btrfs prop get [-t type] /path/to/object [name] (omitting name dumps all) btrfs prop list [-t type] /path/to/object (lists properties with description) The type is used to explicitly specify what type of object you mean. This is necessary in case the object+property combination is ambiguous. For example '/path/to/fs/root' could mean the root subvolume, the directory inode or the filesystem itself. Normally, btrfs-progs will try to detect the type automatically. The generic commandline UI still looks ok to me, storing properties as xattr’s is also ok (provided that we can capture the “btrfs.” xattr namespace). I’ll dump my thoughts and questions about the rest. 1) Where are stored properties that are not directly attached to an inode? Ie. whole-filesystem and device. How can I access props for a subvolume that is not currently reachable in the directory tree? A fs or device props must be accessible any time, eg. no matter which subvolume is currently mounted. This should be probably a special case where the property can be queried from any inode but will be internally routed to eg. toplevel subvolume that will store the respective xattrs. 2) if a property’s object can be ambiguous, how is that distinguished in the xattrs? We don’t have a list of props yet, so I’m trying to use one that hopefully makes some sense. The example here can be persistent-mount-options that are attached to fs and a subvolume. The fs-wide props will apply when a different subvolume is explicitly mounted. Assuming that the xattrs are stored with the toplevel subvolume, the fs-wide and per-subvolume property must have a differnt xattr name (another option is to store fs-wide elsewhere). So the question is, if we should encode the property object into the xattr name directly. Eg.: btrfs.fs.persistent_mount btrfs.subvol.persistent_mount or if the fs-wide uses a reserved naming scheme that would appear as xattr named btrfs.persistent_mount but the value would differ if queried with ‘-t fs or ‘-t subvolume’. 3) property precedence, interaction with mount options The precedence should follow the rule of the least surprise, ie. if I set eg. a compression of a file to zlib, set the subvolume compression type to ‘none’ and have fs-wide mount the filesystem with compress=lzo, I’m expecting that the file will use ‘zlib’. The generic rule says that mount options (that have a corresponding property) take precedence. There may be exceptions. What if there are no specific mount options, but the subvolume has eg. compression set? Should new files inherit that automatically or only if explicitly marked as such? The background concern is how far do I need to look and whether this could be a potential performance problem. The lookup chain can be long: inode - dir - subvol - fs, plus mount options along the path where applicable. If the xattr values are cached in structs that are accessed anyway (containing subvol, fs_info) and updated when property xattr changes, this could work. 4) behaviour of tools that see the btrfs-specific props What would cp or rsync do when copying the xattrs to a different filesystem? They may simply ignore it and just try to copy the user. namespace, I haven’t done any research here. 5) properties to consider, naming, xattr name namespaces, inheritable props Here’s a list of properties that I’ve collected so far: filesystem-wide - persistent mount options - default raid allocation profile - default compression parameters - label - scrub ioprio subvolume - persistent mount options - default raid allocation profile - default compression parameters - writable device - writable/ro/dead - hot-spare - preferred for metadata - preferred data reads - preferred data writes - no new allocations if possible - speed profile, io priorities - allocation group - from project idea Chunk allocation groups - ssd cache, L2ARC - scrub ioprio file, directory - compression parameters - raid allocation profile details on compression parameters: - compression algo - compression level - desired compression ratio target - file compressibility hint - from project idea “Compression updates” - compression container type - dtto The props should be basically self explanatory. This is not a complete or finalized list, feel free to suggest what you think should/not be here. The number of
Re: correct way to rollback a root filesystem?
Jim Salter wrote (ao): I tried a kernel upgrade with moderately disastrous (non-btrfs-related) results this morning; after the kernel upgrade Xorg was completely borked beyond my ability to get it working properly again through any normal means. I do have hourly snapshots being taken by cron, though, so I'm successfully X'ing again on the machine in question right now. It was quite a fight getting back to where I started even so, though - I'm embarassed to admit I finally ended up just doing a cp --reflink=all /mnt/@/.snapshots/snapshotname /mnt/@/ from the initramfs BusyBox prompt. Which WORKED well enough, but obviously isn't ideal. I tried the btrfs sub set-default command - again from BusyBox - and it didn't seem to want to work for me; I got an inappropriate ioctl error (which may be because I tried to use / instead of /mnt, where the root volume was CURRENTLY mounted, as an argument?). Before that, I'd tried setting subvol=@root (which is the writeable snapshot I created from the original read-only hourly snapshot I had) in GRUB and in fstab... but that's what landed me in BusyBox to begin with. When I DID mount the filesystem in BusyBox on /mnt, I saw that @ and @home were listed under /mnt, but no other directories were - which explains why mounting -o subvol=@root didn't work. I guess the question is, WHY couldn't I see @root in there, since I had a working, readable, writeable snapshot which showed its own name as root when doing a btrfs sub show /.snapshots/root ? I don't quite get how your setup is. In my setup, all subvolumes and snapshots are under /.root/ # cat /etc/fstab LABEL=panda / btrfs subvol=rootvolume,space_cache,inode_cache,compress=lzo,ssd 0 0 LABEL=panda /home btrfs subvol=home 0 0 LABEL=panda /root btrfs subvol=root 0 0 LABEL=panda /varbtrfs subvol=var 0 0 LABEL=panda /holdingbtrfs subvol=.holding 0 0 LABEL=panda /.root btrfs subvolid=0 0 0 /Varlib /var/libnonebind 0 0 In case of an OS upgrade gone wrong, I would mount subvolid=0, move subvolume 'rootvolume' out of the way, and move (rename) the last known good snapshot to 'rootvolume'. Not sure if that works though. Never tried. Sander -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 00/21] Consolidate Posix ACL implementation V3
On Fri, Dec 20, 2013 at 05:16:35AM -0800, Christoph Hellwig wrote: This series consolidates the various cut'n'pasted Posix ACL implementations into a single common one based on the -get_acl method Linus added a while ago and a new -set_acl counterpart. This remove ~1800 lines of code and provides a single place to implement various nasty little gems of the semantics. Any more comments? Al, what's the best way to make sure this doesn't miss 3.14? -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/3] Btrfs: rework qgroup accounting
On 12/21/2013 03:56 AM, Wang Shilong wrote: Hello Josef, Though i know there are still problems related to qgroup(for example removing snapshot will beak qgroup accounting).I did a simple test about your patch.. # btrfs quota enable /mnt # dd if=/dev/zero of=/mnt/data bs=4k count=102400 oflag=direct # btrfs sub snapshot /mnt/ /mnt/snap1 # btrfs sub snapshot /mnt /mnt/snap2 # btrfs sub delete /mnt/snap1 /mnt/snap2 # sync # rm -rf /mnt/data # sync # dmesg # btrfs qgroup show /mnt Firstly, qgroup accounting is wrong, this is maybe expected because efficient fs tree removal. However, from dmesg, i get the WARNING: WARNING: CPU: 1 PID: 2650 at fs/btrfs/qgroup.c:1486 btrfs_delayed_qgroup_accounting I did not take a deep look at codes, but i think you will be interested in taking a look at this. ^_^ Ok I finally sat down to look at this and it is because subvol deletion screws up quota accounting no matter what. I just added this WARN_ON() to catch actual mistakes, it just so happens it gets tripped when the snapshot deletion stuff happens too. I'm going to put this on the I'll deal with it later list since it is an existing issue with or without my patch. Thanks, Josef -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] xfstests: kill lib/random.c
Hey Gents, On Mon, Jan 06, 2014 at 03:46:58PM -0600, Eric Sandeen wrote: On 1/6/14, 3:42 PM, Josef Bacik wrote: On 01/06/2014 04:32 PM, Eric Sandeen wrote: On 1/6/14, 1:58 PM, Josef Bacik wrote: I was trying to reproduce something with fsx and I noticed that no matter what seed I set I was getting the same file. Come to find out we are overloading random() with our own custom horribleness for some unknown reason. So nuke the damn thing from orbit and rely on glibc's random(). With this fix the -S option actually does something with fsx. Thanks, Hm, old comments seem to indicate that this was done handwave to make random behave the same on different architectures (i.e. same result from same seed, I guess?) I . . . don't know if that is true of glibc's random(), is it? I'd like to dig into the history just a bit before we yank this, just to be sure. I think that if we need the output to match based on a predictable random() output then we've lost already. We shouldn't be checking for specific output (like inode numbers or sizes etc) that are dependant on random()'s behaviour, and if we are we need to fix those tests. So even if that is why it was put in place originally I'd say it is high time we ripped it out and fixed up any tests that rely on this behaviour. Thanks, Yeah, you're probably right. And the ancient xfstests history seems to be lost in the mists of time, at least as far as I can see. So I'm ok with this but let's let Dave SGI chime in too just to be certain. I did not have success locating the history prior to what we have posted on oss. I agree that it was likely added so that tests that expose output from random into golden output files will have the same results across arches. Maybe this is still of concern for folks who use a different c library with the kernel. Looks there are quite a few callers. IMO if we're going to remove this we should fix the tests first. -Ben -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] xfstests: kill lib/random.c
On 1/7/14, 2:01 PM, Ben Myers wrote: Hey Gents, On Mon, Jan 06, 2014 at 03:46:58PM -0600, Eric Sandeen wrote: On 1/6/14, 3:42 PM, Josef Bacik wrote: On 01/06/2014 04:32 PM, Eric Sandeen wrote: On 1/6/14, 1:58 PM, Josef Bacik wrote: I was trying to reproduce something with fsx and I noticed that no matter what seed I set I was getting the same file. Come to find out we are overloading random() with our own custom horribleness for some unknown reason. So nuke the damn thing from orbit and rely on glibc's random(). With this fix the -S option actually does something with fsx. Thanks, Hm, old comments seem to indicate that this was done handwave to make random behave the same on different architectures (i.e. same result from same seed, I guess?) I . . . don't know if that is true of glibc's random(), is it? I'd like to dig into the history just a bit before we yank this, just to be sure. I think that if we need the output to match based on a predictable random() output then we've lost already. We shouldn't be checking for specific output (like inode numbers or sizes etc) that are dependant on random()'s behaviour, and if we are we need to fix those tests. So even if that is why it was put in place originally I'd say it is high time we ripped it out and fixed up any tests that rely on this behaviour. Thanks, Yeah, you're probably right. And the ancient xfstests history seems to be lost in the mists of time, at least as far as I can see. So I'm ok with this but let's let Dave SGI chime in too just to be certain. I did not have success locating the history prior to what we have posted on oss. I agree that it was likely added so that tests that expose output from random into golden output files will have the same results across arches. Maybe this is still of concern for folks who use a different c library with the kernel. Looks there are quite a few callers. IMO if we're going to remove this we should fix the tests first. Or first, determine if they really need fixing. Did you find tests which actually contain the random results in the golden output? -Eric -Ben ___ xfs mailing list x...@oss.sgi.com http://oss.sgi.com/mailman/listinfo/xfs -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] xfstests: kill lib/random.c
On 1/7/14, 2:10 PM, Eric Sandeen wrote: On 1/7/14, 2:01 PM, Ben Myers wrote: Hey Gents, On Mon, Jan 06, 2014 at 03:46:58PM -0600, Eric Sandeen wrote: On 1/6/14, 3:42 PM, Josef Bacik wrote: On 01/06/2014 04:32 PM, Eric Sandeen wrote: On 1/6/14, 1:58 PM, Josef Bacik wrote: I was trying to reproduce something with fsx and I noticed that no matter what seed I set I was getting the same file. Come to find out we are overloading random() with our own custom horribleness for some unknown reason. So nuke the damn thing from orbit and rely on glibc's random(). With this fix the -S option actually does something with fsx. Thanks, Hm, old comments seem to indicate that this was done handwave to make random behave the same on different architectures (i.e. same result from same seed, I guess?) I . . . don't know if that is true of glibc's random(), is it? I'd like to dig into the history just a bit before we yank this, just to be sure. I think that if we need the output to match based on a predictable random() output then we've lost already. We shouldn't be checking for specific output (like inode numbers or sizes etc) that are dependant on random()'s behaviour, and if we are we need to fix those tests. So even if that is why it was put in place originally I'd say it is high time we ripped it out and fixed up any tests that rely on this behaviour. Thanks, Yeah, you're probably right. And the ancient xfstests history seems to be lost in the mists of time, at least as far as I can see. So I'm ok with this but let's let Dave SGI chime in too just to be certain. I did not have success locating the history prior to what we have posted on oss. I agree that it was likely added so that tests that expose output from random into golden output files will have the same results across arches. Maybe this is still of concern for folks who use a different c library with the kernel. Looks there are quite a few callers. IMO if we're going to remove this we should fix the tests first. Or first, determine if they really need fixing. Did you find tests which actually contain the random results in the golden output? This should be easy enough to test by just hacking the lib/random.c with a new starting seed, right? -Eric -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [block:for-3.14/core] kernel BUG at fs/bio.c:1748
Thanks Fengguang. Final patch with added comment. BTW, fengguang mentioned that git-am has trouble with the inline patch and quilt import worked fine for him... In btrfs_end_bio(), we increment bi_remaining if is_orig_bio. If not, we restore the orig_bio but failed to increment bi_remaining for orig_bio, which triggers a BUG_ON later when bio_endio is called. Fix is to increment bi_remaining when we restore the orig bio as well. Reported-and-Tested-by: Fengguang wu fengguang...@intel.com CC: Kent Overstreet k...@daterainc.com CC: Jens Axboe ax...@kernel.dk CC: Chris Mason c...@fb.com Signed-off-by: Muthukumar Ratty mut...@gmail.com --- fs/btrfs/volumes.c |8 ++-- 1 files changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 37972d5..34aba2b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5297,9 +5297,13 @@ static void btrfs_end_bio(struct bio *bio, int err) if (!is_orig_bio) { bio_put(bio); bio = bbio-orig_bio; - } else { - atomic_inc(bio-bi_remaining); } +/* + * We have original bio now. So increment bi_remaining to + * account for it in endio + */ + atomic_inc(bio-bi_remaining); + bio-bi_private = bbio-private; bio-bi_end_io = bbio-end_io; btrfs_io_bio(bio)-mirror_num = bbio-mirror_num; - -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] xfstests: kill lib/random.c
On 01/07/2014 03:10 PM, Eric Sandeen wrote: On 1/7/14, 2:01 PM, Ben Myers wrote: Hey Gents, On Mon, Jan 06, 2014 at 03:46:58PM -0600, Eric Sandeen wrote: On 1/6/14, 3:42 PM, Josef Bacik wrote: On 01/06/2014 04:32 PM, Eric Sandeen wrote: On 1/6/14, 1:58 PM, Josef Bacik wrote: I was trying to reproduce something with fsx and I noticed that no matter what seed I set I was getting the same file. Come to find out we are overloading random() with our own custom horribleness for some unknown reason. So nuke the damn thing from orbit and rely on glibc's random(). With this fix the -S option actually does something with fsx. Thanks, Hm, old comments seem to indicate that this was done handwave to make random behave the same on different architectures (i.e. same result from same seed, I guess?) I . . . don't know if that is true of glibc's random(), is it? I'd like to dig into the history just a bit before we yank this, just to be sure. I think that if we need the output to match based on a predictable random() output then we've lost already. We shouldn't be checking for specific output (like inode numbers or sizes etc) that are dependant on random()'s behaviour, and if we are we need to fix those tests. So even if that is why it was put in place originally I'd say it is high time we ripped it out and fixed up any tests that rely on this behaviour. Thanks, Yeah, you're probably right. And the ancient xfstests history seems to be lost in the mists of time, at least as far as I can see. So I'm ok with this but let's let Dave SGI chime in too just to be certain. I did not have success locating the history prior to what we have posted on oss. I agree that it was likely added so that tests that expose output from random into golden output files will have the same results across arches. Maybe this is still of concern for folks who use a different c library with the kernel. Looks there are quite a few callers. IMO if we're going to remove this we should fix the tests first. Or first, determine if they really need fixing. Did you find tests which actually contain the random results in the golden output? I looked through stuff when I ripped it out and I couldn't find anything that relied on consistent numbers, we tend to filter all that stuff out. I think ripping it out and waiting to see if somebody complains is a good way to figure out if things need fixing, but that may be less than friendly ;). Thanks, Josef -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Btrfs-progs: deal with invalid key orderings and bad orphan items V2
A user had a fs where the objectid of an orphan item was not the actual orphan item objectid. This screwed up fsck because the block has keys in the wrong order, also the fs scanning stuff will freak out because we have an inode with nlink 0 and no orphan item. So this patch is pretty big but is all related. 1) Deal with bad key ordering. We can easily fix this up, so fix the checking stuff to tell us exactly what it found when it said there was a problem. Then if it's bad key ordering we can reorder the keys and restart the scan. 2) Deal with bad keys. If we find an orphan item with the wrong objectid it's likely to screw with stuff, so keep track of these sort of things with a bad_item list and just run through and delete any objects that don't make sense. So far we just do this for orphan items but we could extend this as new stuff pops up. 3) Deal with missing orphan items. This is easy, if we have a file with i_nlink set to 0 and no orphan item we can just add an orphan item. 4) Add the infrastructure to corrupt actual key values. Needed this to create a test image to verify I was fixing things properly. This patch fixes the corrupt image I'm adding and passes the other make test tests. Thanks, Signed-off-by: Josef Bacik jba...@fb.com --- V1-V2: Fix btrfs-corrupt-block so it will actually compile. btrfs-corrupt-block.c | 109 ++- cmds-check.c| 331 ctree.c | 135 +++-- ctree.h | 32 ++- file-item.c | 2 +- tests/fsck-tests/003-bad-orphan-key.img | Bin 0 - 4096 bytes 6 files changed, 509 insertions(+), 100 deletions(-) create mode 100644 tests/fsck-tests/003-bad-orphan-key.img diff --git a/btrfs-corrupt-block.c b/btrfs-corrupt-block.c index f0c14a9..10cae00 100644 --- a/btrfs-corrupt-block.c +++ b/btrfs-corrupt-block.c @@ -105,6 +105,8 @@ static void print_usage(void) specify -i for the inode and -f for the field to corrupt)\n); fprintf(stderr, \t-m The metadata block to corrupt (must also specify -f for the field to corrupt)\n); + fprintf(stderr, \t-K The key to corrupt in the format + num,num,num (must also specify -f for the field)\n); fprintf(stderr, \t-f The field in the item to corrupt\n); exit(1); } @@ -306,6 +308,13 @@ enum btrfs_metadata_block_field { BTRFS_METADATA_BLOCK_BAD, }; +enum btrfs_key_field { + BTRFS_KEY_OBJECTID, + BTRFS_KEY_TYPE, + BTRFS_KEY_OFFSET, + BTRFS_KEY_BAD, +}; + static enum btrfs_inode_field convert_inode_field(char *field) { if (!strncmp(field, isize, FIELD_BUF_LEN)) @@ -328,6 +337,17 @@ convert_metadata_block_field(char *field) return BTRFS_METADATA_BLOCK_BAD; } +static enum btrfs_key_field convert_key_field(char *field) +{ + if (!strncmp(field, objectid, FIELD_BUF_LEN)) + return BTRFS_KEY_OBJECTID; + if (!strncmp(field, type, FIELD_BUF_LEN)) + return BTRFS_KEY_TYPE; + if (!strncmp(field, offset, FIELD_BUF_LEN)) + return BTRFS_KEY_OFFSET; + return BTRFS_KEY_BAD; +} + static u64 generate_u64(u64 orig) { u64 ret; @@ -337,6 +357,73 @@ static u64 generate_u64(u64 orig) return ret; } +static u8 generate_u8(u8 orig) +{ + u8 ret; + do { + ret = rand(); + } while (ret == orig); + return ret; +} + +static int corrupt_key(struct btrfs_root *root, struct btrfs_key *key, + char *field) +{ + enum btrfs_key_field corrupt_field = convert_key_field(field); + struct btrfs_path *path; + struct btrfs_trans_handle *trans; + int ret; + + root = root-fs_info-fs_root; + if (corrupt_field == BTRFS_KEY_BAD) { + fprintf(stderr, Invalid field %s\n, field); + return -EINVAL; + } + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + btrfs_free_path(path); + return PTR_ERR(trans); + } + + ret = btrfs_search_slot(trans, root, key, path, 0, 1); + if (ret 0) + goto out; + if (ret 0) { + fprintf(stderr, Couldn't find the key to corrupt\n); + ret = -ENOENT; + goto out; + } + + switch (corrupt_field) { + case BTRFS_KEY_OBJECTID: + key-objectid = generate_u64(key-objectid); + break; + case BTRFS_KEY_TYPE: + key-type = generate_u8(key-type); + break; + case BTRFS_KEY_OFFSET: + key-offset = generate_u64(key-objectid); + break; + default: + fprintf(stderr, Invalid field %s, %d\n, field, +
Re: [block:for-3.14/core] kernel BUG at fs/bio.c:1748
On Tue, 2014-01-07 at 12:15 -0800, Muthu Kumar wrote: Thanks Fengguang. Final patch with added comment. BTW, fengguang mentioned that git-am has trouble with the inline patch and quilt import worked fine for him... In btrfs_end_bio(), we increment bi_remaining if is_orig_bio. If not, we restore the orig_bio but failed to increment bi_remaining for orig_bio, which triggers a BUG_ON later when bio_endio is called. Fix is to increment bi_remaining when we restore the orig bio as well. Hi everyone, Which git tree is this against? Just Jens or some extra code too? I'll run some tests here. My original patch is below (it's slightly different from Muthu's). Btrfs is sometimes calling bio_endio twice on the same bio while we chain things. This makes sure we don't trip over new assertions in fs/bio.c Signed-off-by: Chris Mason c...@fb.com diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 7fcac70..5b30360 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -2289,6 +2289,10 @@ static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status) block = next_block; } while (NULL != block); + /* +* since we're not using bio_endio here, we don't need to worry about +* the remaining count +*/ bp-bi_end_io(bp, bio_error_status); } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 62176ad..786ddac 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1684,7 +1684,7 @@ static void end_workqueue_fn(struct btrfs_work *work) bio-bi_private = end_io_wq-private; bio-bi_end_io = end_io_wq-end_io; kfree(end_io_wq); - bio_endio(bio, error); + bio_endio_nodec(bio, error); } static int cleaner_kthread(void *arg) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index ef48947..a31448f 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5284,9 +5284,17 @@ static void btrfs_end_bio(struct bio *bio, int err) } } - if (bio == bbio-orig_bio) + if (bio == bbio-orig_bio) { is_orig_bio = 1; + /* +* eventually we will call the bi_endio for the original bio, +* make sure that we've properly bumped bi_remaining to reflect +* our chain of endios here +*/ + atomic_inc(bio-bi_remaining); + } + if (atomic_dec_and_test(bbio-stripes_pending)) { if (!is_orig_bio) { bio_put(bio); -- To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Btrfs-progs: bail if we find errors in the extent tree
For some reason we weren't exiting if there were errors in the extent tree, which means we could have errors in just the extent tree and things like xfstests would keep going because we completely throw away the return value. Thanks, Signed-off-by: Josef Bacik jba...@fb.com --- cmds-check.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cmds-check.c b/cmds-check.c index 689fe6c..cbabfdc 100644 --- a/cmds-check.c +++ b/cmds-check.c @@ -6409,8 +6409,10 @@ int cmd_check(int argc, char **argv) goto out; } ret = check_chunks_and_extents(root); - if (ret) + if (ret) { fprintf(stderr, Errors found in extent allocation tree or chunk allocation\n); + goto out; + } fprintf(stderr, checking free space cache\n); ret = check_space_cache(root); -- 1.8.3.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: btrfsck does not fix
Hello, I ran btrfsck on my volume with the repair option. When I re-run it, I get the same errors as before. It mounts without errors? So why then btrfsck/btrfs repair? What precipitated the repair? I don't know what caused the damage, but a check revealed this: Checking filesystem on /dev/sdb1 UUID: 989306aa-d291-4752-8477-0baf94f8c42f checking extents Extent back ref already exists for 2994950590464 parent 863072366592 root 0 Extent back ref already exists for 2994950836224 parent 863072366592 root 0 Extent back ref already exists for 862762737664 parent 863072366592 root 0 Extent back ref already exists for 2994950877184 parent 863072366592 [...] Incorrect global backref count on 2995767250944 found 1 wanted 2 backpointer mismatch on [2995767250944 4096] ref mismatch on [2995767304192 4096] extent item 1, found 2 Incorrect global backref count on 2995767304192 found 1 wanted 2 backpointer mismatch on [2995767304192 4096] ref mismatch on [2995768258560 4096] extent item 1, found 2 Incorrect global backref count on 2995768258560 found 1 wanted 2 backpointer mismatch on [2995768258560 4096] ref mismatch on [2995768459264 4096] extent item 1, found 2 Incorrect global backref count on 2995768459264 found 1 wanted 2 backpointer mismatch on [2995768459264 4096] Errors found in extent allocation tree or chunk allocation ref mismatch on [2995768459264 4096] extent item 1, found 2 Incorrect global backref count on 2995768459264 found 1 wanted 2 backpointer mismatch on [2995768459264 4096] Errors found in extent allocation tree or chunk allocation checking free space cache checking fs roots root 256 inode 9579 errors 100, file extent discount root 256 inode 9580 errors 100, file extent discount root 256 inode 14258 errors 100, file extent discount root 256 inode 14259 errors 100, file extent discount root inode 9579 errors 100, file extent discount root inode 9580 errors 100, file extent discount root inode 14258 errors 100, file extent discount root inode 14259 errors 100, file extent discount found 1993711951581 bytes used err is 1 total csum bytes: 4560615360 total tree bytes: 5643403264 total fs tree bytes: 139776000 total extent tree bytes: 263602176 btree space waste bytes: 504484726 file data blocks allocated: 6557032402944 referenced 6540949323776 Btrfs v3.12 This made me run btrfsck with the repair option: Extent back ref already exists for 2994950590464 parent 863072366592 root 0 ref mismatch on [32935936 4096] extent item 1, found 2 repair deleting extent record: key 32935936 168 4096 adding new tree backref on start 32935936 len 4096 parent 2994784206848 root 2994784206848 Incorrect global backref count on 32935936 found 1 wanted 2 backpointer mismatch on [32935936 4096] ref mismatch on [32997376 4096] extent item 1, found 2 repair deleting extent record: key 32997376 168 4096 adding new tree backref on start 32997376 len 4096 parent 2994824708096 root 2994824708096 Incorrect global backref count on 32997376 found 1 wanted 2 backpointer mismatch on [32997376 4096] Incorrect global backref count on 8988365651968 found 1 wanted 0 backpointer mismatch on [8988365651968 4096] repaired damaged extent references checking free space cache checking fs roots root 256 inode 9579 errors 100, file extent discount root 256 inode 9580 errors 100, file extent discount root 256 inode 14258 errors 100, file extent discount root 256 inode 14259 errors 100, file extent discount root inode 9579 errors 100, file extent discount root inode 9580 errors 100, file extent discount root inode 14258 errors 100, file extent discount root inode 14259 errors 100, file extent discount enabling repair mode Checking filesystem on /dev/sdc1 UUID: 989306aa-d291-4752-8477-0baf94f8c42f cache and super generation don't match, space cache will be invalidated found 827360733827 bytes used err is 1 total csum bytes: 4446455380 total tree bytes: 5506977792 total fs tree bytes: 137293824 total extent tree bytes: 258691072 btree space waste bytes: 496921489 file data blocks allocated: 6440132583424 referenced 6424163344384 Btrfs v3.12 After this, I ran a check without the repair option again and the same errors persist. Greetings, Hendrik -- Hendrik Friedel Auf dem Brink 12 28844 Weyhe Mobil 0178 1874363 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] xfstests: kill lib/random.c
On Tue, Jan 07, 2014 at 02:10:15PM -0600, Eric Sandeen wrote: On 1/7/14, 2:01 PM, Ben Myers wrote: Hey Gents, On Mon, Jan 06, 2014 at 03:46:58PM -0600, Eric Sandeen wrote: On 1/6/14, 3:42 PM, Josef Bacik wrote: On 01/06/2014 04:32 PM, Eric Sandeen wrote: On 1/6/14, 1:58 PM, Josef Bacik wrote: I was trying to reproduce something with fsx and I noticed that no matter what seed I set I was getting the same file. Come to find out we are overloading random() with our own custom horribleness for some unknown reason. So nuke the damn thing from orbit and rely on glibc's random(). With this fix the -S option actually does something with fsx. Thanks, Hm, old comments seem to indicate that this was done handwave to make random behave the same on different architectures (i.e. same result from same seed, I guess?) I . . . don't know if that is true of glibc's random(), is it? I'd like to dig into the history just a bit before we yank this, just to be sure. I think that if we need the output to match based on a predictable random() output then we've lost already. We shouldn't be checking for specific output (like inode numbers or sizes etc) that are dependant on random()'s behaviour, and if we are we need to fix those tests. So even if that is why it was put in place originally I'd say it is high time we ripped it out and fixed up any tests that rely on this behaviour. Thanks, Yeah, you're probably right. And the ancient xfstests history seems to be lost in the mists of time, at least as far as I can see. So I'm ok with this but let's let Dave SGI chime in too just to be certain. I did not have success locating the history prior to what we have posted on oss. I agree that it was likely added so that tests that expose output from random into golden output files will have the same results across arches. Maybe this is still of concern for folks who use a different c library with the kernel. Looks there are quite a few callers. IMO if we're going to remove this we should fix the tests first. Or first, determine if they really need fixing. Did you find tests which actually contain the random results in the golden output? At one point random.c was modified because it was returning different test results on i386 and ia64 with test 007. Looks like nametest.c is a good candidate. -Ben -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Btrfs-progs: bail if we find errors in the extent tree
On 01/07/2014 03:34 PM, Josef Bacik wrote: For some reason we weren't exiting if there were errors in the extent tree, which means we could have errors in just the extent tree and things like xfstests would keep going because we completely throw away the return value. Thanks, Signed-off-by: Josef Bacik jba...@fb.com --- cmds-check.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cmds-check.c b/cmds-check.c index 689fe6c..cbabfdc 100644 --- a/cmds-check.c +++ b/cmds-check.c @@ -6409,8 +6409,10 @@ int cmd_check(int argc, char **argv) goto out; } ret = check_chunks_and_extents(root); - if (ret) + if (ret) { fprintf(stderr, Errors found in extent allocation tree or chunk allocation\n); + goto out; + } fprintf(stderr, checking free space cache\n); ret = check_space_cache(root); Sigh ignore this, it is causing other problems, I'm going to have to figure out a better solution to this. Thanks, Josef -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 5/6] Btrfs: use flags instead of the bool variants in delayed node
On Tue, Jan 07, 2014 at 11:59:18AM +0800, Miao Xie wrote: But I read a discuss about the use of boolean type, some developers suggested us to use bitfields instead of bool, because the bitfields can work better, and they are more flexible, less misuse than bool. https://lkml.org/lkml/2013/9/1/154 I was searching for pros/cons of the bools but haven't found this one nor anything useful, though not all of the advantages of bitfields apply in our case. I've compared the generated assembly with just this patch, and the difference is effectively only the lock prefix for set/clear, the read side has no significant penalty: old: mov0x128(%rbx),%esi new: mov0x120(%rbx),%rax test $0x1,%al old set: movb $0x1,0x120(%rbx) new: lock orb $0x1,0x120(%rbx) The delayed_node structure is relatively short lived, is reused frequently and I haven't seen any contention points where the lock prefix would hurt. So, ok to merge it. -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] xfstests: kill lib/random.c
On 01/07/2014 03:40 PM, Ben Myers wrote: On Tue, Jan 07, 2014 at 02:10:15PM -0600, Eric Sandeen wrote: On 1/7/14, 2:01 PM, Ben Myers wrote: Hey Gents, On Mon, Jan 06, 2014 at 03:46:58PM -0600, Eric Sandeen wrote: On 1/6/14, 3:42 PM, Josef Bacik wrote: On 01/06/2014 04:32 PM, Eric Sandeen wrote: On 1/6/14, 1:58 PM, Josef Bacik wrote: I was trying to reproduce something with fsx and I noticed that no matter what seed I set I was getting the same file. Come to find out we are overloading random() with our own custom horribleness for some unknown reason. So nuke the damn thing from orbit and rely on glibc's random(). With this fix the -S option actually does something with fsx. Thanks, Hm, old comments seem to indicate that this was done handwave to make random behave the same on different architectures (i.e. same result from same seed, I guess?) I . . . don't know if that is true of glibc's random(), is it? I'd like to dig into the history just a bit before we yank this, just to be sure. I think that if we need the output to match based on a predictable random() output then we've lost already. We shouldn't be checking for specific output (like inode numbers or sizes etc) that are dependant on random()'s behaviour, and if we are we need to fix those tests. So even if that is why it was put in place originally I'd say it is high time we ripped it out and fixed up any tests that rely on this behaviour. Thanks, Yeah, you're probably right. And the ancient xfstests history seems to be lost in the mists of time, at least as far as I can see. So I'm ok with this but let's let Dave SGI chime in too just to be certain. I did not have success locating the history prior to what we have posted on oss. I agree that it was likely added so that tests that expose output from random into golden output files will have the same results across arches. Maybe this is still of concern for folks who use a different c library with the kernel. Looks there are quite a few callers. IMO if we're going to remove this we should fix the tests first. Or first, determine if they really need fixing. Did you find tests which actually contain the random results in the golden output? At one point random.c was modified because it was returning different test results on i386 and ia64 with test 007. Looks like nametest.c is a good candidate. Ugh you're right. Just ignore this patch for now, I'll be in the corner banging my head against the wall. Thanks, Josef -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] xfstests: kill lib/random.c
On Tue, 2014-01-07 at 16:17 -0500, Josef Bacik wrote: On 01/07/2014 03:40 PM, Ben Myers wrote: On Tue, Jan 07, 2014 at 02:10:15PM -0600, Eric Sandeen wrote: On 1/7/14, 2:01 PM, Ben Myers wrote: Hey Gents, On Mon, Jan 06, 2014 at 03:46:58PM -0600, Eric Sandeen wrote: On 1/6/14, 3:42 PM, Josef Bacik wrote: On 01/06/2014 04:32 PM, Eric Sandeen wrote: On 1/6/14, 1:58 PM, Josef Bacik wrote: I was trying to reproduce something with fsx and I noticed that no matter what seed I set I was getting the same file. Come to find out we are overloading random() with our own custom horribleness for some unknown reason. So nuke the damn thing from orbit and rely on glibc's random(). With this fix the -S option actually does something with fsx. Thanks, Hm, old comments seem to indicate that this was done handwave to make random behave the same on different architectures (i.e. same result from same seed, I guess?) I . . . don't know if that is true of glibc's random(), is it? I'd like to dig into the history just a bit before we yank this, just to be sure. I think that if we need the output to match based on a predictable random() output then we've lost already. We shouldn't be checking for specific output (like inode numbers or sizes etc) that are dependant on random()'s behaviour, and if we are we need to fix those tests. So even if that is why it was put in place originally I'd say it is high time we ripped it out and fixed up any tests that rely on this behaviour. Thanks, Yeah, you're probably right. And the ancient xfstests history seems to be lost in the mists of time, at least as far as I can see. So I'm ok with this but let's let Dave SGI chime in too just to be certain. I did not have success locating the history prior to what we have posted on oss. I agree that it was likely added so that tests that expose output from random into golden output files will have the same results across arches. Maybe this is still of concern for folks who use a different c library with the kernel. Looks there are quite a few callers. IMO if we're going to remove this we should fix the tests first. Or first, determine if they really need fixing. Did you find tests which actually contain the random results in the golden output? At one point random.c was modified because it was returning different test results on i386 and ia64 with test 007. Looks like nametest.c is a good candidate. Ugh you're right. Just ignore this patch for now, I'll be in the corner banging my head against the wall. Thanks, For now we can just use srandom? -chris -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] Skip non-regular files in recursive defrag
On Mon, Jan 6, 2014 at 4:36 PM, David Sterba dste...@suse.cz wrote: On Wed, Jan 01, 2014 at 03:10:25PM +0100, Pascal VITOUX wrote: --- cmds-filesystem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmds-filesystem.c b/cmds-filesystem.c index 1c1926b..979dbd9 100644 --- a/cmds-filesystem.c +++ b/cmds-filesystem.c @@ -646,7 +646,7 @@ static int defrag_callback(const char *fpath, const struct stat *sb, int e = 0; int fd = 0; - if (typeflag == FTW_F) { + if ((typeflag == FTW_F) S_ISREG(sb-st_mode)) { The ftw(3) documentation states that FTW_F gets the regular files, do you have a reproducer whre this does not work as expected? Yes, this can be tested with the example code provided in ftw(3) and compiled with glibc 2.18. In fact, the ftw(3) page also states in the NOTES part that, under Linux, FTW_F is used for all objects that are not a directory. Some libc versions are specified ( libc4, libc5 and glic 2.0.6 ) but it's still true in glibc 2.18 The ftw's man page is less precise than the libc manual, as you can see here : http://www.gnu.org/software/libc/manual/html_node/Working-with-Directory-Trees.html I think the check could be placed into cmd_defrag, there's a st_mode check for directories already. Defragmenting makes most sense for regular files (though there's the option to defrag portions of the metadata b-trees), so it's fine to handle just directories and regular files in userspace. I think so, too. And for a single file defrag the check should return to the user an explicit error message instead of the 'defrag range ioctl not supported' error. If you're going to resend the patch, please add 'btrfs-progs: ' string to the subject, write a short changlog and add your Signed-off-by line. thanks, david Ok, thanks for these instructions. I will resend it soon. Pascal. -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
YOU HAVE RECEIVED A DONATION
Hello, This message is to inform you that you have just received a cash donation of $460,000 from us. You are advised to write us an email once you get this message through the following Email: dawesda...@hotmail.com for more information on how you can redeem your donated sum. Best of luck. Dave Angela Dawes. Email: dawesda...@gmail.com Correo Electrónico - Universidad de Nariño Pasto - Nariño - Colombia www.udenar.edu.co N�r��yb�X��ǧv�^�){.n�+{�n�߲)w*jg����ݢj/���z�ޖ��2�ޙ�)ߡ�a�����G���h��j:+v���w��٥
Re: btrfs-transaction blocked for more than 120 seconds
On Fri, Jan 03, 2014 at 09:34:10PM +, Duncan wrote: IIRC someone also mentioned problems with autodefrag and an about 3/4 gig systemd journal. My gut feeling (IOW, *NOT* benchmarked!) is that double- digit MiB files should /normally/ be fine, but somewhere in the lower triple digits, write-magnification could well become an issue, depending of course on exactly how much active writing the app is doing into the file. When I defrag'ed my 83GB vm file with 156222 extents, it was not in use or being written to. As I said there's more work going into tuning autodefrag ATM, but as it is, I couldn't really recommend making it a global default... tho maybe a distro could enable it by default on a no-VM desktop system (as opposed to a server). Certainly I'd recommend most desktop types enable it. I use VMs on my desktop :) but point taken. On Sun, Jan 05, 2014 at 10:09:38AM -0700, Chris Murphy wrote: gandalfthegreat:/var/local/nobck/VirtualBox VMs/Win7# filefrag Win7.vdi Win7.vdi: 156222 extents found Considering how virtualbox works, that's hardly surprising. I haven't read anything so far indicating defrag applies to the VM container use case, rather nodatacow via xattr +C is the way to go. At least for now. Yep, I'll convert the file, but since I found a pretty severe performance problem, does anyone care to get details off my system before I make the problem go away for me? It's better than a panic or corrupt data. So far the best combination To be honest, I'd have taken a panic, it would have saved me 2H of waiting for a laptop to recover when it was never going to recover :( Data corruption, sure, obviously :) I've found, open to other suggestions though, is +C xattr on So you're saying that defragmentation has known performance problems that can't get fixed for now, and that the solution is not to get fragmented or recreate the relevant files. If so, I'll go ahead, I just wanted to make sure I didn't have useful debug state before clearing my problem. This may already be a known problem but it's worth sysrq+w, and then dmesg and posting those results if you haven't already. No, I had not yet, but I'll do this. On Sun, Jan 05, 2014 at 01:44:25PM -0700, Duncan wrote: [I normally try to reply directly to list but don't believe I've seen this there yet, but got it direct-mailed so will reply-all in response.] I like direct Cc on replies, makes my filter and mutt coloring happier :) Dupes with the same message-id are what procmail and others were written for :) I now believe the lockup must be due to processing the hundreds of thousands of extents on all those snapshots, too, in addition to doing That's a good call. I do have this: gandalfthegreat:/mnt/btrfs_pool1# ls var var/ var_hourly_20140105_16:00:01/ var_daily_20140102_00:01:01/ var_hourly_20140105_17:00:26/ var_daily_20140103_00:59:28/ var_weekly_20131208_00:02:02/ var_daily_20140104_00:01:01/ var_weekly_20131215_00:02:01/ var_daily_20140105_00:33:14/ var_weekly_20131229_00:02:02/ var_hourly_20140105_05:00:01/ var_weekly_20140105_00:33:14/ it on the main volume. I don't actually make very extensive use of snapshots here anyway, so I didn't think about that aspect originally, but that's gotta be what's throwing the real spanner in the works, turning a possibly long but workable normal defrag (O(1)) into a lockup scenario (O(n)) where virtually no progress is made as currently coded. That is indeed what I'm seeing, so it's very possible you're right. Marc -- A mouse is a device used to point at the xterm you want to type in - A.S.R. Microsoft is to operating systems what McDonalds is to gourmet cooking Home page: http://marc.merlins.org/ | PGP 1024R/763BE901 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v4 00/18] Replace btrfs_workers with kernel workqueue based btrfs_workqueue
On Mon, 23 Dec 2013 10:35:04 +0800 , Qu Wenruo wrote: On fri, 20 Dec 2013 05:30:48 -0800, Josef Bacik wrote: On 12/19/2013 07:08 PM, Qu Wenruo wrote: I'm sorry but I failed to reproduce the problem. Btrfs/012 in xfstests has been run for serveral hours but nothing happened. Would you please give me some more details about the environment or the panic backtrace? Ok so it wasn't that test, it was just ./check -g auto. It would sometimes die at btrfs/012, other times it would make it as far as generic/083 before it keeled over. I bisected it down to btrfs: Replace fs_info-workers with btrfs_workqueue which of course is just the first patch that you start using the new code which isn't helpful. My dmesg from one of my runs is here http://ur1.ca/g867b All of the initial panics looked exactly the same. I'm just going to kick the series out for now while you track down the problem. Thanks, Josef -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Thanks for your dmesg a lot. It's quite sad that I still failed to reproduce the same bug on all my test environment during the weekend. But according to the dmesg, which is expired now though, I made the following assumption: There is a possibility that out of order execution made the work-wq = wq sentence executed behind the queue_work() call, and the normal_work_helper is executed in another CPU instantly, which caused the problem. -- static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq, struct btrfs_work *work) { unsigned long flags; work-wq = wq; /* There is no memory barrier ensure the work-wq = wq is executed before queue_work */ thresh_queue_hook(wq); if (work-ordered_func) { spin_lock_irqsave(wq-list_lock, flags); list_add_tail(work-ordered_list, wq-ordered_list); spin_unlock_irqrestore(wq-list_lock, flags); } queue_work(wq-normal_wq, work-normal_work); } -- The memory order problem involves compiler or even CPU hardware (AMD CPUS seems to have a weaker memory order than Intel), so to simulate the bug, I used the following codes tried to reproduce the bug, and the result is pretty much the same as your dmesg. -- static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq, struct btrfs_work *work) { unsigned long flags; int in_order = get_random_int() % 1000 if (in_order) work-wq = wq; thresh_queue_hook(wq); if (work-ordered_func) { spin_lock_irqsave(wq-list_lock, flags); list_add_tail(work-ordered_list, wq-ordered_list); spin_unlock_irqrestore(wq-list_lock, flags); } queue_work(wq-normal_wq, work-normal_work); /* Try make the codes out of order since the Intel CPU seems obey the memory order quite well */ if (!in_order) work-wq = wq; } -- The fix is as easy as just adding a smp_wmb() behind the work-wq = wq sentence, but since I failed to reproduce the bug, I can't confirm whether the fix is good or not. I know it's impolite but would you please first try the smp_wmb() first to confirm the fix? If the fix is good I'll send the formal V5 patchset. Thanks Qu Happy new year! Very sorry for disturbing but ithas been some time before my last replay and no feedback since then (mainly because of the new year holiday). If itis convenient for you,would you please give some feedback about the smp_wmb() fix ? Also incase that smp_wmb() doesn't work, it would be very nice of you to also try smp_mb() fix. Thanks Qu -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html