Re: [PATCH 2/4] btrfs-progs: fix segment fault when exec btrfs-debug-tree as non-root
On 20 February 2014 02:49, Gui Hecheng guihc.f...@cn.fujitsu.com wrote: When exec btrfs-debug-tree as non-root user, we get a segment fault. Because the btrfs_scan_block_devices return a success 0 when we fail to open a device. Now we just return the errno if this case happens. Signed-off-by: Gui Hecheng guihc.f...@cn.fujitsu.com --- utils.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/utils.c b/utils.c index 97e23d5..1878abc 100644 --- a/utils.c +++ b/utils.c @@ -1517,7 +1517,8 @@ scan_again: scans++; goto scan_again; } - return 0; + + return errno ? -errno : 0; } u64 parse_size(char *s) -- 1.8.1.4 Hi Gui, This strikes me as not not right because errno is only documented as being set when open() returns -1 on failure. In the success case errno is not set so you can't assume it will be 0. I think the following might work: 1) Initilase ret = 0 at the start of btrfs_scan_block_devices() 2) In the open(fullpath, O_RDONLY) failure case set ret = fd 3) return ret Thanks, Mike -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/9] Btrfs: use ACCESS_ONCE to prevent the optimize accesses to -last_trans_log_full_commit
-last_trans_log_full_commit may be changed by the other tasks without lock, so we need prevent the compiler from the optimize access just like tmp = fs_info-last_trans_log_full_commit if (tmp == ...) ... do something if (tmp == ...) ... In fact, we need get the new value of -last_trans_log_full_commit during the second access. Fix it by ACCESS_ONCE(). Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/tree-log.c | 17 ++--- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 7c449c6..5a4e10b 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2375,14 +2375,14 @@ static int wait_log_commit(struct btrfs_trans_handle *trans, wait, TASK_UNINTERRUPTIBLE); mutex_unlock(root-log_mutex); - if (root-fs_info-last_trans_log_full_commit != + if (ACCESS_ONCE(root-fs_info-last_trans_log_full_commit) != trans-transid root-log_transid transid + 2 atomic_read(root-log_commit[index])) schedule(); finish_wait(root-log_commit_wait[index], wait); mutex_lock(root-log_mutex); - } while (root-fs_info-last_trans_log_full_commit != + } while (ACCESS_ONCE(root-fs_info-last_trans_log_full_commit) != trans-transid root-log_transid transid + 2 atomic_read(root-log_commit[index])); return 0; @@ -2392,12 +2392,12 @@ static void wait_for_writer(struct btrfs_trans_handle *trans, struct btrfs_root *root) { DEFINE_WAIT(wait); - while (root-fs_info-last_trans_log_full_commit != + while (ACCESS_ONCE(root-fs_info-last_trans_log_full_commit) != trans-transid atomic_read(root-log_writers)) { prepare_to_wait(root-log_writer_wait, wait, TASK_UNINTERRUPTIBLE); mutex_unlock(root-log_mutex); - if (root-fs_info-last_trans_log_full_commit != + if (ACCESS_ONCE(root-fs_info-last_trans_log_full_commit) != trans-transid atomic_read(root-log_writers)) schedule(); mutex_lock(root-log_mutex); @@ -2456,7 +2456,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, } /* bail out if we need to do a full commit */ - if (root-fs_info-last_trans_log_full_commit == trans-transid) { + if (ACCESS_ONCE(root-fs_info-last_trans_log_full_commit) == + trans-transid) { ret = -EAGAIN; btrfs_free_logged_extents(log, log_transid); mutex_unlock(root-log_mutex); @@ -2515,7 +2516,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_unlock(log_root_tree-log_mutex); goto out; } - root-fs_info-last_trans_log_full_commit = trans-transid; + ACCESS_ONCE(root-fs_info-last_trans_log_full_commit) = + trans-transid; btrfs_wait_marked_extents(log, log-dirty_log_pages, mark); btrfs_free_logged_extents(log, log_transid); mutex_unlock(log_root_tree-log_mutex); @@ -2547,7 +2549,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * now that we've moved on to the tree of log tree roots, * check the full commit flag again */ - if (root-fs_info-last_trans_log_full_commit == trans-transid) { + if (ACCESS_ONCE(root-fs_info-last_trans_log_full_commit) == + trans-transid) { blk_finish_plug(plug); btrfs_wait_marked_extents(log, log-dirty_log_pages, mark); btrfs_free_logged_extents(log, log_transid); -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 6/9] Btrfs: use signed integer instead of unsigned long integer for log transid
The log trans id is initialized to be 0 every time we create a log tree, and the log tree need be re-created after a new transaction is started, it means the log trans id is unlikely to be a huge number, so we can use signed integer instead of unsigned long integer to save a bit space. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/btrfs_inode.h | 14 +++--- fs/btrfs/ctree.h | 4 ++-- fs/btrfs/tree-log.c| 4 ++-- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 8fed212..c9a2444 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -109,14 +109,17 @@ struct btrfs_inode { u64 last_trans; /* -* log transid when this inode was last modified +* transid that last logged this inode */ - u64 last_sub_trans; + u64 logged_trans; /* -* transid that last logged this inode +* log transid when this inode was last modified */ - u64 logged_trans; + int last_sub_trans; + + /* a local copy of root's last_log_commit */ + int last_log_commit; /* total number of bytes pending delalloc, used by stat to calc the * real block usage of the file @@ -155,9 +158,6 @@ struct btrfs_inode { /* flags field from the on disk inode */ u32 flags; - /* a local copy of root's last_log_commit */ - unsigned long last_log_commit; - /* * Counters to keep track of the number of extent item's we may use due * to delalloc and such. outstanding_extents is the number of extent diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index c45a10a..76b6bff 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1742,8 +1742,8 @@ struct btrfs_root { atomic_t log_writers; atomic_t log_commit[2]; atomic_t log_batch; - unsigned long log_transid; - unsigned long last_log_commit; + int log_transid; + int last_log_commit; pid_t log_start_pid; u64 objectid; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c9f2479..c27e2c9 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2362,7 +2362,7 @@ static int update_log_root(struct btrfs_trans_handle *trans, } static int wait_log_commit(struct btrfs_trans_handle *trans, - struct btrfs_root *root, unsigned long transid) + struct btrfs_root *root, int transid) { DEFINE_WAIT(wait); int index = transid % 2; @@ -2434,7 +2434,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, int ret; struct btrfs_root *log = root-log_root; struct btrfs_root *log_root_tree = root-fs_info-log_root_tree; - unsigned long log_transid = 0; + int log_transid = 0; struct blk_plug plug; mutex_lock(root-log_mutex); -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 4/9] Btrfs: use bitfield instead of integer data type for the some variants in btrfs_root
Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/ctree.c | 25 ++--- fs/btrfs/ctree.h | 39 +-- fs/btrfs/disk-io.c | 33 +++-- fs/btrfs/extent-tree.c | 6 +++--- fs/btrfs/file.c| 4 +++- fs/btrfs/inode.c | 29 ++--- fs/btrfs/ioctl.c | 4 ++-- fs/btrfs/relocation.c | 17 + fs/btrfs/root-tree.c | 2 +- fs/btrfs/transaction.c | 33 + fs/btrfs/tree-defrag.c | 2 +- fs/btrfs/tree-log.c| 9 + 12 files changed, 109 insertions(+), 94 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index cbd3a7d..9067d79 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -224,7 +224,8 @@ static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) static void add_root_to_dirty_list(struct btrfs_root *root) { spin_lock(root-fs_info-trans_lock); - if (root-track_dirty list_empty(root-dirty_list)) { + if (test_bit(BTRFS_ROOT_TRACK_DIRTY, root-state) + list_empty(root-dirty_list)) { list_add(root-dirty_list, root-fs_info-dirty_cowonly_roots); } @@ -246,9 +247,10 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, int level; struct btrfs_disk_key disk_key; - WARN_ON(root-ref_cows trans-transid != - root-fs_info-running_transaction-transid); - WARN_ON(root-ref_cows trans-transid != root-last_trans); + WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, root-state) + trans-transid != root-fs_info-running_transaction-transid); + WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, root-state) + trans-transid != root-last_trans); level = btrfs_header_level(buf); if (level == 0) @@ -997,14 +999,14 @@ int btrfs_block_can_be_shared(struct btrfs_root *root, * snapshot and the block was not allocated by tree relocation, * we know the block is not shared. */ - if (root-ref_cows + if (test_bit(BTRFS_ROOT_REF_COWS, root-state) buf != root-node buf != root-commit_root (btrfs_header_generation(buf) = btrfs_root_last_snapshot(root-root_item) || btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) return 1; #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 - if (root-ref_cows + if (test_bit(BTRFS_ROOT_REF_COWS, root-state) btrfs_header_backref_rev(buf) BTRFS_MIXED_BACKREF_REV) return 1; #endif @@ -1146,9 +1148,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, btrfs_assert_tree_locked(buf); - WARN_ON(root-ref_cows trans-transid != - root-fs_info-running_transaction-transid); - WARN_ON(root-ref_cows trans-transid != root-last_trans); + WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, root-state) + trans-transid != root-fs_info-running_transaction-transid); + WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, root-state) + trans-transid != root-last_trans); level = btrfs_header_level(buf); @@ -1193,7 +1196,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, return ret; } - if (root-ref_cows) { + if (test_bit(BTRFS_ROOT_REF_COWS, root-state)) { ret = btrfs_reloc_cow_block(trans, root, buf, cow); if (ret) return ret; @@ -1556,7 +1559,7 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans, !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) !(root-root_key.objectid != BTRFS_TREE_RELOC_OBJECTID btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) - !root-force_cow) + !test_bit(BTRFS_ROOT_REF_COWS, root-state)) return 0; return 1; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 277f627..c45a10a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1683,6 +1683,26 @@ struct btrfs_fs_info { }; /* + * The state of btrfs root + */ +/* + * btrfs_record_root_in_trans is a multi-step process, + * and it can race with the balancing code. But the + * race is very small, and only the first time the root + * is added to each transaction. So IN_TRANS_SETUP + * is used to tell us when more checks are required + */ +#define BTRFS_ROOT_IN_TRANS_SETUP 0 +#define BTRFS_ROOT_REF_COWS1 +#define BTRFS_ROOT_TRACK_DIRTY 2 +#define BTRFS_ROOT_IN_RADIX3 +#define BTRFS_ROOT_DUMMY_ROOT 4 +#define BTRFS_ROOT_ORPHAN_ITEM_INSERTED5 +#define BTRFS_ROOT_DEFRAG_RUNNING 6 +#define BTRFS_ROOT_FORCE_COW 7 +#define BTRFS_ROOT_MULTI_LOG_TASKS 8 + +/* * in ram representation of the tree. extent_root is used for all allocations * and
[PATCH 3/9] Btrfs: don't start the log transaction if the log tree init fails
The old code would start the log transaction even the log tree init failed, it was unnecessary. Fix it. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/tree-log.c | 26 ++ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 8a03b39..ca960ad 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -139,7 +139,6 @@ static int start_log_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root) { int ret; - int err = 0; mutex_lock(root-log_mutex); if (root-log_root) { @@ -155,24 +154,27 @@ static int start_log_trans(struct btrfs_trans_handle *trans, mutex_unlock(root-log_mutex); return 0; } - root-log_multiple_pids = false; - root-log_start_pid = current-pid; + + ret = 0; mutex_lock(root-fs_info-tree_log_mutex); - if (!root-fs_info-log_root_tree) { + if (!root-fs_info-log_root_tree) ret = btrfs_init_log_root_tree(trans, root-fs_info); - if (ret) - err = ret; - } - if (err == 0 !root-log_root) { + mutex_unlock(root-fs_info-tree_log_mutex); + if (ret) + goto out; + + if (!root-log_root) { ret = btrfs_add_log_tree(trans, root); if (ret) - err = ret; + goto out; } - mutex_unlock(root-fs_info-tree_log_mutex); + root-log_multiple_pids = false; + root-log_start_pid = current-pid; atomic_inc(root-log_batch); atomic_inc(root-log_writers); +out: mutex_unlock(root-log_mutex); - return err; + return ret; } /* @@ -4116,7 +4118,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, ret = start_log_trans(trans, root); if (ret) - goto end_trans; + goto end_no_trans; ret = btrfs_log_inode(trans, root, inode, inode_only); if (ret) -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 8/9] Btrfs: fix skipped error handle when log sync failed
It is possible that many tasks sync the log tree at the same time, but only one task can do the sync work, the others will wait for it. But those wait tasks didn't get the result of the log sync, and returned 0 when they ended the wait. It caused those tasks skipped the error handle, and the serious problem was they told the users the file sync succeeded but in fact they failed. This patch fixes this problem by introducing a log context structure, we insert it into the a global list. When the sync fails, we will set the error number of every log context in the list, then the waiting tasks get the error number of the log context and handle the error if need. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/ctree.h| 1 + fs/btrfs/disk-io.c | 2 + fs/btrfs/file.c | 9 +++-- fs/btrfs/tree-log.c | 114 fs/btrfs/tree-log.h | 16 +++- 5 files changed, 111 insertions(+), 31 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 76b6bff..1bfce32 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1739,6 +1739,7 @@ struct btrfs_root { struct mutex log_mutex; wait_queue_head_t log_writer_wait; wait_queue_head_t log_commit_wait[2]; + struct list_head log_ctxs[2]; atomic_t log_writers; atomic_t log_commit[2]; atomic_t log_batch; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index cc0af87..f85f83b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1197,6 +1197,8 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, init_waitqueue_head(root-log_writer_wait); init_waitqueue_head(root-log_commit_wait[0]); init_waitqueue_head(root-log_commit_wait[1]); + INIT_LIST_HEAD(root-log_ctxs[0]); + INIT_LIST_HEAD(root-log_ctxs[1]); atomic_set(root-log_commit[0], 0); atomic_set(root-log_commit[1], 0); atomic_set(root-log_writers, 0); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 76ff550..0660e16 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1858,8 +1858,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) struct dentry *dentry = file-f_path.dentry; struct inode *inode = dentry-d_inode; struct btrfs_root *root = BTRFS_I(inode)-root; - int ret = 0; struct btrfs_trans_handle *trans; + struct btrfs_log_ctx ctx; + int ret = 0; bool full_sync = 0; trace_btrfs_sync_file(file, datasync); @@ -1953,7 +1954,9 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) } trans-sync = true; - ret = btrfs_log_dentry_safe(trans, root, dentry); + btrfs_init_log_ctx(ctx); + + ret = btrfs_log_dentry_safe(trans, root, dentry, ctx); if (ret 0) { /* Fallthrough and commit/free transaction. */ ret = 1; @@ -1973,7 +1976,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (ret != BTRFS_NO_LOG_SYNC) { if (!ret) { - ret = btrfs_sync_log(trans, root); + ret = btrfs_sync_log(trans, root, ctx); if (!ret) { ret = btrfs_end_transaction(trans, root); goto out; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 4326da9..3289eb5 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -136,8 +136,10 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, * syncing the tree wait for us to finish */ static int start_log_trans(struct btrfs_trans_handle *trans, - struct btrfs_root *root) + struct btrfs_root *root, + struct btrfs_log_ctx *ctx) { + int index; int ret; mutex_lock(root-log_mutex); @@ -157,6 +159,10 @@ static int start_log_trans(struct btrfs_trans_handle *trans, atomic_inc(root-log_batch); atomic_inc(root-log_writers); + if (ctx) { + index = root-log_transid % 2; + list_add_tail(ctx-list, root-log_ctxs[index]); + } mutex_unlock(root-log_mutex); return 0; } @@ -178,6 +184,10 @@ static int start_log_trans(struct btrfs_trans_handle *trans, root-log_start_pid = current-pid; atomic_inc(root-log_batch); atomic_inc(root-log_writers); + if (ctx) { + index = root-log_transid % 2; + list_add_tail(ctx-list, root-log_ctxs[index]); + } out: mutex_unlock(root-log_mutex); return ret; @@ -2367,12 +2377,11 @@ static int update_log_root(struct btrfs_trans_handle *trans, return ret; } -static int wait_log_commit(struct btrfs_trans_handle *trans, -
[PATCH 7/9] Btrfs: stop joining the log transaction if sync log fails
If the log sync fails, there is something wrong in the log tree, we should not continue to join the log transaction and log the metadata. What we should do is to do a full commit. This patch fixes this problem by setting -last_trans_log_full_commit to the current transaction id, it will tell the tasks not to join the log transaction, and do a full commit. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/tree-log.c | 16 ++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c27e2c9..4326da9 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -142,6 +142,12 @@ static int start_log_trans(struct btrfs_trans_handle *trans, mutex_lock(root-log_mutex); if (root-log_root) { + if (ACCESS_ONCE(root-fs_info-last_trans_log_full_commit) == + trans-transid) { + ret = -EAGAIN; + goto out; + } + if (!root-log_start_pid) { root-log_start_pid = current-pid; clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, root-state); @@ -2488,6 +2494,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, blk_finish_plug(plug); btrfs_abort_transaction(trans, root, ret); btrfs_free_logged_extents(log, log_transid); + ACCESS_ONCE(root-fs_info-last_trans_log_full_commit) = + trans-transid; mutex_unlock(root-log_mutex); goto out; } @@ -2520,13 +2528,13 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, if (ret) { blk_finish_plug(plug); + ACCESS_ONCE(root-fs_info-last_trans_log_full_commit) = + trans-transid; if (ret != -ENOSPC) { btrfs_abort_transaction(trans, root, ret); mutex_unlock(log_root_tree-log_mutex); goto out; } - ACCESS_ONCE(root-fs_info-last_trans_log_full_commit) = - trans-transid; btrfs_wait_marked_extents(log, log-dirty_log_pages, mark); btrfs_free_logged_extents(log, log_transid); mutex_unlock(log_root_tree-log_mutex); @@ -2572,6 +2580,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, EXTENT_DIRTY | EXTENT_NEW); blk_finish_plug(plug); if (ret) { + ACCESS_ONCE(root-fs_info-last_trans_log_full_commit) = + trans-transid; btrfs_abort_transaction(trans, root, ret); btrfs_free_logged_extents(log, log_transid); mutex_unlock(log_root_tree-log_mutex); @@ -2600,6 +2610,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, */ ret = write_ctree_super(trans, root-fs_info-tree_root, 1); if (ret) { + ACCESS_ONCE(root-fs_info-last_trans_log_full_commit) = + trans-transid; btrfs_abort_transaction(trans, root, ret); goto out_wake_log_root; } -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 9/9] Btrfs: just wait or commit our own log sub-transaction
We might commit the log sub-transaction which didn't contain the metadata we logged. It was because we didn't record the log transid and just select the current log sub-transaction to commit, but the right one might be committed by the other task already. Actually, we needn't do anything and it is safe that we go back directly in this case. This patch improves the log sync by the above idea. We record the transid of the log sub-transaction in which we log the metadata, and the transid of the log sub-transaction we have committed. If the committed transid is = the transid we record when logging the metadata, we just go back. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/ctree.h| 3 +++ fs/btrfs/disk-io.c | 2 ++ fs/btrfs/tree-log.c | 63 ++--- fs/btrfs/tree-log.h | 2 ++ 4 files changed, 47 insertions(+), 23 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1bfce32..dc72c1f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1744,6 +1744,9 @@ struct btrfs_root { atomic_t log_commit[2]; atomic_t log_batch; int log_transid; + /* No matter the commit succeeds or not*/ + int log_transid_committed; + /* Just be updated when the commit succeeds. */ int last_log_commit; pid_t log_start_pid; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f85f83b..6699483 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1206,6 +1206,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, atomic_set(root-orphan_inodes, 0); atomic_set(root-refs, 1); root-log_transid = 0; + root-log_transid_committed = -1; root-last_log_commit = 0; if (fs_info) extent_io_tree_init(root-dirty_log_pages, @@ -1419,6 +1420,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, WARN_ON(root-log_root); root-log_root = log_root; root-log_transid = 0; + root-log_transid_committed = -1; root-last_log_commit = 0; return 0; } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 3289eb5..ffee158 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -162,6 +162,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans, if (ctx) { index = root-log_transid % 2; list_add_tail(ctx-list, root-log_ctxs[index]); + ctx-log_transid = root-log_transid; } mutex_unlock(root-log_mutex); return 0; @@ -187,6 +188,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans, if (ctx) { index = root-log_transid % 2; list_add_tail(ctx-list, root-log_ctxs[index]); + ctx-log_transid = root-log_transid; } out: mutex_unlock(root-log_mutex); @@ -2393,13 +2395,13 @@ static void wait_log_commit(struct btrfs_trans_handle *trans, wait, TASK_UNINTERRUPTIBLE); mutex_unlock(root-log_mutex); - if (root-log_transid transid + 2 + if (root-log_transid_committed transid atomic_read(root-log_commit[index])) schedule(); finish_wait(root-log_commit_wait[index], wait); mutex_lock(root-log_mutex); - } while (root-log_transid transid + 2 + } while (root-log_transid_committed transid atomic_read(root-log_commit[index])); } @@ -2476,18 +2478,24 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, struct blk_plug plug; mutex_lock(root-log_mutex); - log_transid = root-log_transid; - index1 = root-log_transid % 2; + log_transid = ctx-log_transid; + if (root-log_transid_committed = log_transid) { + mutex_unlock(root-log_mutex); + return ctx-log_ret; + } + + index1 = log_transid % 2; if (atomic_read(root-log_commit[index1])) { - wait_log_commit(trans, root, root-log_transid); + wait_log_commit(trans, root, log_transid); mutex_unlock(root-log_mutex); return ctx-log_ret; } + ASSERT(log_transid == root-log_transid); atomic_set(root-log_commit[index1], 1); /* wait for previous tree log sync to complete */ if (atomic_read(root-log_commit[(index1 + 1) % 2])) - wait_log_commit(trans, root, root-log_transid - 1); + wait_log_commit(trans, root, log_transid - 1); while (1) { int batch = atomic_read(root-log_batch); @@ -2544,9 +2552,16 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, */ mutex_unlock(root-log_mutex); + btrfs_init_log_ctx(root_log_ctx); + mutex_lock(log_root_tree-log_mutex);
[PATCH 2/9] Btrfs: fix the skipped transaction commit during the file sync
We may abort the wait earlier if -last_trans_log_full_commit was set to the current transaction id, at this case, we need commit the current transaction instead of the log sub-transaction. But the current code didn't tell the caller to do it (return 0, not -EAGAIN). Fix it. Signed-off-by: Miao Xie mi...@cn.fujitsu.com --- fs/btrfs/tree-log.c | 26 -- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 5a4e10b..8a03b39 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2364,6 +2364,7 @@ static int wait_log_commit(struct btrfs_trans_handle *trans, { DEFINE_WAIT(wait); int index = transid % 2; + int ret = 0; /* * we only allow two pending log transactions at a time, @@ -2371,21 +2372,26 @@ static int wait_log_commit(struct btrfs_trans_handle *trans, * current transaction, we're done */ do { + if (ACCESS_ONCE(root-fs_info-last_trans_log_full_commit) == + trans-transid) { + ret = -EAGAIN; + break; + } + prepare_to_wait(root-log_commit_wait[index], wait, TASK_UNINTERRUPTIBLE); mutex_unlock(root-log_mutex); - if (ACCESS_ONCE(root-fs_info-last_trans_log_full_commit) != - trans-transid root-log_transid transid + 2 + if (root-log_transid transid + 2 atomic_read(root-log_commit[index])) schedule(); finish_wait(root-log_commit_wait[index], wait); mutex_lock(root-log_mutex); - } while (ACCESS_ONCE(root-fs_info-last_trans_log_full_commit) != -trans-transid root-log_transid transid + 2 + } while (root-log_transid transid + 2 atomic_read(root-log_commit[index])); - return 0; + + return ret; } static void wait_for_writer(struct btrfs_trans_handle *trans, @@ -2433,15 +2439,16 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, log_transid = root-log_transid; index1 = root-log_transid % 2; if (atomic_read(root-log_commit[index1])) { - wait_log_commit(trans, root, root-log_transid); + ret = wait_log_commit(trans, root, root-log_transid); mutex_unlock(root-log_mutex); - return 0; + return ret; } atomic_set(root-log_commit[index1], 1); /* wait for previous tree log sync to complete */ if (atomic_read(root-log_commit[(index1 + 1) % 2])) wait_log_commit(trans, root, root-log_transid - 1); + while (1) { int batch = atomic_read(root-log_batch); /* when we're on an ssd, just kick the log commit out */ @@ -2529,11 +2536,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, if (atomic_read(log_root_tree-log_commit[index2])) { blk_finish_plug(plug); btrfs_wait_marked_extents(log, log-dirty_log_pages, mark); - wait_log_commit(trans, log_root_tree, - log_root_tree-log_transid); + ret = wait_log_commit(trans, log_root_tree, + log_root_tree-log_transid); btrfs_free_logged_extents(log, log_transid); mutex_unlock(log_root_tree-log_mutex); - ret = 0; goto out; } atomic_set(log_root_tree-log_commit[index2], 1); -- 1.8.1.4 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Meaning of no_csum field when scrubbing with -R option
Sebastian Ochmann posted on Wed, 19 Feb 2014 13:58:17 +0100 as excerpted: So my question is, why does scrub show a high (i.e. non-zero) value for no_csum? I never enabled nodatasum or a similar option. I've no answer but have wondered that myself. So hopefully you get an answer I can read too. =:^) -- Duncan - List replies preferred. No HTML msgs. Every nonfree program has a lord, a master -- and if you use the program, he is your master. Richard Stallman -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Meaning of no_csum field when scrubbing with -R option
On 02/20/2014 06:31 PM, Duncan wrote: Sebastian Ochmann posted on Wed, 19 Feb 2014 13:58:17 +0100 as excerpted: So my question is, why does scrub show a high (i.e. non-zero) value for no_csum? I never enabled nodatasum or a similar option. Did you enable nodatacow option? if nodatacow option is enabled, data checksums will be also disabled at the same time. Thanks, Wang I've no answer but have wondered that myself. So hopefully you get an answer I can read too. =:^) -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Incremental backup over writable snapshot
Kai Krakow posted on Wed, 19 Feb 2014 21:20:23 +0100 as excerpted: Duncan had a nice example in this list how to migrate directories to subvolumes by using shallow copies: mv dir dir.old btrfs sub create dir cp -a -- reflink=always dir.old/. dir/. rm -Rf dir.old. FWIW, that was someone else. I remember seeing it and I may well have been involved in some aspect of the discussion and thus might have quoted it, but my particular use-case doesn't involve a lot of subvolumes or snapshots, so I don't typically get quite that deep into the command- detail in subvolume discussions as I've simply not had the necessary personal experience in that area to properly discuss at that level. (Tho it's certainly typical of what I might post in other areas, just not that one.) -- Duncan - List replies preferred. No HTML msgs. Every nonfree program has a lord, a master -- and if you use the program, he is your master. Richard Stallman -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Meaning of no_csum field when scrubbing with -R option
Wang Shilong posted on Thu, 20 Feb 2014 18:51:10 +0800 as excerpted: On 02/20/2014 06:31 PM, Duncan wrote: Sebastian Ochmann posted on Wed, 19 Feb 2014 13:58:17 +0100 as excerpted: So my question is, why does scrub show a high (i.e. non-zero) value for no_csum? I never enabled nodatasum or a similar option. Did you enable nodatacow option? if nodatacow option is enabled, data checksums will be also disabled at the same time. I have not. Everything here is standard checksummed COW, which is why I wondered why I was seeing the no_csum results. Tho in my case there's few enough I thought it might have been triggered by some other abnormality. (In particular, I was suspending-to-ram for awhile, and if I left the system suspended too long, the disks wouldn't wake up fast enough on resume and one of the raid1 pair would often drop out, forcing the filesystem to read-only and generally a pretty quick reboot, after which I'd have to do a scrub to sync the raid1 back up. I had guessed the no- csum instances might have been half-written at the suspend, but it bothered me. That was a couple kernels ago, tho. I decided to quit risking the suspend-to-ram since I was sometimes having to reboot anyway and I did end up with a file or two corrupted (even after scrub) as a result, so I've not had that issue or had any other scrub errors in some months now. I've thought I should try it again and see if it works better now, but I haven't done so yet.) -- Duncan - List replies preferred. No HTML msgs. Every nonfree program has a lord, a master -- and if you use the program, he is your master. Richard Stallman -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Meaning of \no_csum\ field when scrubbing with -R option
Hello, Sebastian Ochmann posted on Wed, 19 Feb 2014 13:58:17 +0100 as excerpted: So my question is, why does scrub show a high (i.e. non-zero) value for no_csum? I never enabled nodatasum or a similar option. Did you enable nodatacow option? if nodatacow option is enabled, data checksums will be also disabled at the same time. No, never, not even on single files. Some additional info: The filesystem is only a few weeks old (even though I see similar results on an older filesystem as well), it's my root filesystem, and as mount options I use rw,noatime,ssd,discard,space_cache (it's on a SSD). Kernel version is 3.12.9. Best regards, Sebastian -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2] btrfs: send: lower memory requirements in common case
On Wed, Feb 5, 2014 at 3:17 PM, David Sterba dste...@suse.cz wrote: The fs_path structure uses an inline buffer and falls back to a chain of allocations, but vmalloc is not necessary because PATH_MAX fits into PAGE_SIZE. The size of fs_path has been reduced to 256 bytes from PAGE_SIZE, usually 4k. Experimental measurements show that most paths on a single filesystem do not exceed 200 bytes, and these get stored into the inline buffer directly, which is now 230 bytes. Longer paths are kmalloced when needed. Signed-off-by: David Sterba dste...@suse.cz --- v2: - intel build test reports that krealloc should not reuse the buffer for return value, though it's not a problem in our case, a failed allocation leads to immediate return, let's use a temporary variable to keep the check happy fs/btrfs/send.c | 106 +++--- 1 files changed, 37 insertions(+), 69 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 1f09c74e1c1f..dd0b02adb1e6 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -57,7 +57,12 @@ struct fs_path { unsigned short reversed:1; char inline_buf[]; }; - char pad[PAGE_SIZE]; + /* +* Average path length does not exceed 200 bytes, we'll have +* better packing in the slab and higher chance to satisfy +* a allocation later during send. +*/ + char pad[256]; }; }; #define FS_PATH_INLINE_SIZE \ @@ -262,12 +267,8 @@ static void fs_path_free(struct fs_path *p) { if (!p) return; - if (p-buf != p-inline_buf) { - if (is_vmalloc_addr(p-buf)) - vfree(p-buf); - else - kfree(p-buf); - } + if (p-buf != p-inline_buf) + kfree(p-buf); kfree(p); } @@ -287,40 +288,31 @@ static int fs_path_ensure_buf(struct fs_path *p, int len) if (p-buf_len = len) return 0; - path_len = p-end - p-start; - old_buf_len = p-buf_len; - len = PAGE_ALIGN(len); - + /* +* First time the inline_buf does not suffice +*/ if (p-buf == p-inline_buf) { - tmp_buf = kmalloc(len, GFP_NOFS | __GFP_NOWARN); - if (!tmp_buf) { - tmp_buf = vmalloc(len); - if (!tmp_buf) - return -ENOMEM; - } - memcpy(tmp_buf, p-buf, p-buf_len); - p-buf = tmp_buf; - p-buf_len = len; + p-buf = kmalloc(len, GFP_NOFS); + if (!p-buf) + return -ENOMEM; + /* +* The real size of the buffer is bigger, this will let the +* fast path happen most of the time +*/ + p-buf_len = ksize(p-buf); } else { - if (is_vmalloc_addr(p-buf)) { - tmp_buf = vmalloc(len); - if (!tmp_buf) - return -ENOMEM; - memcpy(tmp_buf, p-buf, p-buf_len); - vfree(p-buf); - } else { - tmp_buf = krealloc(p-buf, len, GFP_NOFS); - if (!tmp_buf) { - tmp_buf = vmalloc(len); - if (!tmp_buf) - return -ENOMEM; - memcpy(tmp_buf, p-buf, p-buf_len); - kfree(p-buf); - } - } - p-buf = tmp_buf; - p-buf_len = len; + char *tmp; + + tmp = krealloc(p-buf, len, GFP_NOFS); + if (!tmp) + return -ENOMEM; + p-buf = tmp; + p-buf_len = ksize(p-buf); } + + path_len = p-end - p-start; + old_buf_len = p-buf_len; Hi Dave, I think this is not correct here. old_buf_len doesn't get assigned the old buffer's length but instead the new length. So this assignment should be before the if-then-else that allocates/reallocates the path buffer, just like it was done previously before this change. Or is there anything I missed here? thanks + if (p-reversed) { tmp_buf = p-buf + old_buf_len - path_len - 1; p-end = p-buf + p-buf_len - 1; @@ -911,9 +903,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_dir_item *di; struct btrfs_key di_key; char *buf = NULL; - char *buf2 = NULL; - int buf_len; - int buf_virtual = 0; + const int buf_len = PATH_MAX;
btrfs device add hangs
Hello, I am noticing some weird behavior with btrfs device add. The command does not return/finish although there is no disk activity in iotop. I have compiled a gist [1] with the details, which I would gladly update, if anyone knows what additional information I can provide. kind regards, Niklas Fischer [1] https://gist.github.com/niklasfi/9111967 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Meaning of \no_csum\ field when scrubbing with -R option
On 02/20/2014 07:25 PM, Sebastian Ochmann wrote: Hello, Sebastian Ochmann posted on Wed, 19 Feb 2014 13:58:17 +0100 as excerpted: So my question is, why does scrub show a high (i.e. non-zero) value for no_csum? I never enabled nodatasum or a similar option. Did you enable nodatacow option? if nodatacow option is enabled, data checksums will be also disabled at the same time. [SNIP] So i have found why we have such strange things from debugging, you can have a try as the following steps: # mkfs.btrfs -f /dev/sda9 # mount /dev/sda9 /mnt # btrfs scrub start -BR /mnt scrub done for 02dd3326-959f-4602-9baa-aa9ed99ac2e5 scrub started at Thu Feb 20 20:31:46 2014 and finished after 0 seconds data_extents_scrubbed: 1 tree_extents_scrubbed: 16 data_bytes_scrubbed: 65536 tree_bytes_scrubbed: 262144 read_errors: 0 csum_errors: 0 verify_errors: 0 no_csum: 16 --not equal 0 for a newly mkfs. csum_discards: 0 super_errors: 0 malloc_errors: 0 uncorrectable_errors: 0 unverified_errors: 0 corrected_errors: 0 last_physical: 467140608 # btrfs-debug-tree /dev/sda9 By debuging tree, we found there is a EXTENT_ITEM in extent tree for newly mkfs filesystem which we have written anything yet. item 2 key (12582912 EXTENT_ITEM 65536) itemoff 16182 itemsize 53 extent refs 1 gen 5 flags 1 extent data backref root 1 objectid 256 offset 0 count 1 item 3 key (12582912 BLOCK_GROUP_ITEM 8388608) itemoff 16158 itemsize 24 At the same time, Let's see Csum tree debug output: checksum tree key (CSUM_TREE ROOT_ITEM 0) leaf 29425664 items 0 free space 16283 generation 4 owner 7 fs uuid 02dd3326-959f-4602-9baa-aa9ed99ac2e5 So there is not corresponding checksum item for that DATA extent item.. This can explain why scrub output no_sum count! For reasons, It may be reserved data space without checksum for other purpose! So if this is true, i don't think this is harm for common users unless it is designed unexpectedly! Thanks, Wang -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] btrfs-progs: don't remove BTRFS_BLOCK_GROUP_DUP in chunk type
During restoring of image (-r using btrfs-image) we zero out RAID profile in chunk type but forget to save BTRFS_BLOCK_GROUP_DUP if present. This results in some false messages being printed by btrfsck. $ ./mkfs.btrfs /dev/sdb2 -f $ ./btrfs-image /dev/sdb2 btrfs_image_output $ ./btrfs-image -r btrfs_image_output disk-image $ ./btrfsck disk-image Checking filesystem on disk-image UUID: e644be2d-7701-4bd4-8804-7487f560d2a7 checking extents Chunk[256, 228, 20971520]: length(8388608), offset(20971520), type(2) mismatch with block group[20971520, 192, 8388608]: offset(8388608), objectid(20971520), flags(34) Chunk[256, 228, 29360128]: length(1073741824), offset(29360128), type(4) mismatch with block group[29360128, 192, 1073741824]: offset(1073741824), objectid(29360128), flags(36) Block group[20971520, 8388608] (flags = 34) didn't find the relative chunk. Block group[29360128, 1073741824] (flags = 36) didn't find the relative chunk. Even though ./btrfsck on /dev/sdb2 seemed fine. This is due to type mismatch above and type mismatch occured because we zero'ed out BTRFS_BLOCK_GROUP_DUP while handling chunk trees. Signed-off-by: Rakesh Pandit rak...@tuxera.com --- btrfs-image.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/btrfs-image.c b/btrfs-image.c index 7bcfc06..2be54a7 100644 --- a/btrfs-image.c +++ b/btrfs-image.c @@ -1521,7 +1521,8 @@ static int fixup_chunk_tree_block(struct mdrestore_struct *mdres, type = btrfs_stack_chunk_type(chunk); type = (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM | -BTRFS_BLOCK_GROUP_METADATA); +BTRFS_BLOCK_GROUP_METADATA | +BTRFS_BLOCK_GROUP_DUP); btrfs_set_stack_chunk_type(chunk, type); btrfs_set_stack_chunk_num_stripes(chunk, 1); -- 1.8.3.1 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
btrfs-raid10 - stripes or blocks?
Hi list - Can anybody tell me whether btrfs-raid10 reads and writes a stripe at a time, like traditional raid10, or whether it reads and writes individual redundant blocks like btrfs-raid1, but just locks particular disks as mirror pairs, and/or locks the order in which mirror pairs should be written to in sequence? It occurs to me that this might be an important distinction for workloads with very high small random I/O - stripe reads and writes being at a disadvantage to individual blocks being read and written, when the requests are smaller than the stripe. Apologies again if this is a question that should have an obvious answer, but as long as it took me to figure out that btrfs-raid1 is NOT much like traditional raid1, it seemed worth asking. -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 1/4] Btrfs-progs: new helper to parse string to u64 for btrfs
On Wed, Feb 19, 2014 at 07:17:51PM +0800, Wang Shilong wrote: +u64 btrfs_strtoull(char *str, int base) +{ + u64 value; + char *ptr_parse_end = NULL; + char *ptr_str_end = str + strlen(str); + + value = strtoull(str, ptr_parse_end, base); + if (ptr_parse_end != ptr_str_end) { + fprintf(stderr, ERROR: %s is not an invalid unsigned long long integer.\n, + str); + exit(1); Calling exit() in a helper function makes is unsuitable as a generic helper or for use from any library function, though all the places where it's used are in main() so it's similar to usage(), and makes handling errors in command line arguments easier. I'm still concerned about the generic function name, but don't have an idea how to fix it atm. + } + if (value == ULONG_MAX) { + fprintf(stderr, ERROR: %s is out of range.\n, str); + exit(1); + } + return value; +} -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2 1/4] Btrfs-progs: new helper to parse string to u64 for btrfs
On 2/19/14, 7:43 PM, Wang Shilong wrote: On 02/20/2014 09:39 AM, Eric Sandeen wrote: On 2/19/14, 7:30 PM, Wang Shilong wrote: ... +/* + * if we pass a negative number to strtoull, + * it will return an unexpected number to us, + * so let's do the check ourselves firstly. + */ +if (str[0] == '-') { +fprintf(stderr, ERROR: %s may be negative value.\n, str); well, it _is_ a negative value right? (vs. may be) So perhaps: fprintf(stderr, ERROR: %s: negative value is invalid.\n, str); I use may be because the following case: -123, -abcd. something like these, these string are invalid, but they are not negative number...So i have not thought a better idea to tell user what is wrong with input.:-) Ok; well, sorry for being nitpicky. :) But user error messages probably should be very clear and unambiguous; we may as well do this right. So what about this: Do strtoull first, and *if* it passes numeric parsing, but str[0] == '-', *then* say ERROR: %s: negative value is invalid. -Eric -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] btrfs: wake up transaction thread upon remount
Now that we can adjust the commit interval with a remount, we need to wake up the transaction thread or else he will continue to sleep until the previous transaction interval has elapsed before waking up. So, if we go from a large commit interval to something smaller, the transaction thread will not wake up until the large interval has expired. This also causes the cleaner thread to stay sleeping, since it gets woken up by the transaction thread. Fix it by simply waking up the transaction thread during a remount. Signed-off-by: Justin Maggard jmaggar...@gmail.com --- fs/btrfs/super.c |1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index d04db81..426b7c6 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1479,6 +1479,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) sb-s_flags = ~MS_RDONLY; } out: + wake_up_process(fs_info-transaction_kthread); btrfs_remount_cleanup(fs_info, old_opts); return 0; -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] btrfs: Allow forced conversion of metadata to dup profile on multiple devices
On Wed, Feb 19, 2014 at 11:10:41AM -0500, Austin S Hemmelgarn wrote: Currently, btrfs balance start fails when trying to convert metadata or system chunks to dup profile on filesystems with multiple devices. This requires that a conversion from a multi-device filesystem to a single device filesystem use the following methodology: 1. btrfs balance start -dconvert=single -mconvert=single \ -sconvert=single -f / 2. btrfs device delete / /dev/sdx 3. btrfs balance start -mconvert=dup -sconvert=dup / This results in a period of time (possibly very long if the devices are big) where you don't have the protection guarantees of multiple copies of metadata chunks. After applying this patch, one can instead use the following methodology for conversion from a multi-device filesystem to a single device filesystem: 1. btrfs balance start -dconvert=single -mconvert=dup \ -sconvert=dup -f / 2. btrfs device delete / /dev/sdx This greatly reduces the chances of the operation causing data loss due to a read error during the device delete. Signed-off-by: Austin S. Hemmelgarn ahferro...@gmail.com Reviewed-by: David Sterba dste...@suse.cz Sounds useful. The muliple devices + DUP is allowed setup when the device is added, this patch only adds the 'delete' counterpart. The imroved data loss protection during the process is a good thing. -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v3 1/4] Btrfs-progs: new helper to parse string to u64 for btrfs
On Thu, Feb 20, 2014 at 10:29:51AM +0800, Wang Shilong wrote: There are many places that need parse string to u64 for btrfs commands, in fact, we do such things *too casually*, using atoi/atol/atoll..is not right at all, and even we don't check whether it is a valid string. Let's do everything more gracefully, we introduce a new helper arg_strtou64() which will do all the necessary checks.If we fail to parse string to u64, we will output message and exit directly, this is something like what usage() is doing. It is ok to not return erro to it's caller, because this function should be called when parsing arg (just like usage!) Signed-off-by: Wang Shilong wangsl.f...@cn.fujitsu.com This version looks good to me, thanks. -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 5/8] Add command btrfs filesystem disk-usage
On Mon, Feb 17, 2014 at 07:09:55PM +0100, Goffredo Baroncelli wrote: I have to agree with Chris: looking at the output of btrfs fi disk-usage $ sudo ./btrfs filesystem disk-usage -t /mnt/btrfs1/ Data DataMetadata Metadata System System Single RAID6 Single RAID5Single RAID5 Unallocated /dev/vdb 8.00MB 1.00GB 8.00MB 1.00GB 4.00MB 4.00MB 97.98GB /dev/vdc - 1.00GB- 1.00GB - 4.00MB 98.00GB /dev/vdd - 1.00GB- 1.00GB - 4.00MB 98.00GB /dev/vde - 1.00GB- 1.00GB - 4.00MB 98.00GB == === == === === Total8.00MB 2.00GB 8.00MB 3.00GB 4.00MB 12.00MB391.97GB Used 0.00 11.25MB 0.00 36.00KB 0.00 4.00KB it is hard to tell that this can be named filesystem usage. I think that details or examine is a better name. Regarding btrfs device usage, it seems to me more coherent. But as reported before consistency also matters, so now I am inclined to use detail (or examine) also for btrfs device I'm for 'btrfs filesystem usage' and 'btrfs device usage', IMHO intuitive, easy to type and remember. -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH][BTRFS-PROGS][v4] Enhance btrfs fi df
On Thu, Feb 13, 2014 at 08:18:10PM +0100, Goffredo Baroncelli wrote: space (if the next chunk are allocated as SINGLE) or the minimum one ( if the next chunks are allocated as DUP/RAID1/RAID10). The other two commands show the chunks in the disks. $ sudo btrfs filesystem disk-usage /mnt/btrfs1/ Data,Single: Size:8.00MB, Used:0.00 /dev/vdb 8.00MB The information about per-device usage can be enhanced and there's enough space to print that: * allocated in chunks (the number above) * actually used (simiar to what 'btrfs fi show' prints as 'used') I don't see a reason why it would not fit here, nor any other place where this can be obtained. There is the cumulative number of 'Used' for all devices, but I'd like to see it per-device as well. or in tabular format $ sudo ./btrfs filesystem disk-usage -t /mnt/btrfs1/ Data DataMetadata Metadata System System Single RAID6 Single RAID5Single RAID5 Unallocated /dev/vdb 8.00MB 1.00GB 8.00MB 1.00GB 4.00MB 4.00MB 97.98GB /dev/vdc - 1.00GB- 1.00GB - 4.00MB 98.00GB /dev/vdd - 1.00GB- 1.00GB - 4.00MB 98.00GB /dev/vde - 1.00GB- 1.00GB - 4.00MB 98.00GB == === == === === Total8.00MB 2.00GB 8.00MB 3.00GB 4.00MB 12.00MB391.97GB Used 0.00 11.25MB 0.00 36.00KB 0.00 4.00KB These are the most complete output, where it is possible to know which disk a chunk uses and the usage of every chunk. Though not per-device, similar to the above, but the tabular output is limited compared to the sequential output. Not sure what to do here. Finally the last command shows which chunks a disk hosts: $ sudo ./btrfs device disk-usage /mnt/btrfs1/ /dev/vdb100.00GB Data,Single: 8.00MB Data,RAID6: 1.00GB Metadata,Single: 8.00MB Metadata,RAID5: 1.00GB System,Single:4.00MB System,RAID5: 4.00MB Unallocated: 97.98GB /dev/vdc100.00GB Data,RAID6: 1.00GB Metadata,RAID5: 1.00GB System,RAID5: 4.00MB Unallocated: 98.00GB /dev/vdd100.00GB Data,RAID6: 1.00GB Metadata,RAID5: 1.00GB System,RAID5: 4.00MB Unallocated: 98.00GB /dev/vde100.00GB Data,RAID6: 1.00GB Metadata,RAID5: 1.00GB System,RAID5: 4.00MB Unallocated: 98.00GB More or less are the same information above, only grouped by disk. Ie. it's only a variant of the 'filesystem usage' where it is grouped by blockgroup type. Why doesn't 'btrfs device usage' take a device instead of the whole filesystem? This seems counterintuitive. It should be possible to ask for a device by it or path. Also, I'd like to see all useful information about the device: * id, path, uuid, ... whatever * physical device size * size visible by the filesystem * space allocated in chunks * space actually used Unfortunately I don't have any information about the chunk usage per disk basis. And I'm missing it. Is it a fundamental problem or just not addressed in this patchset? Finally I have to point out that the command btrfs fi df previous didn't require the root capability, now with my patches it is required, because I need to know some info about the chunks so I need to use the BTRFS_IOC_TREE_SEARCH. I think that there are the following possibilities: 1) accept this regresssion 2) remove the command btrfs fi df and leave only btrfs fi disk-usage and btrfs dev disk-usage 3) adding a new ioctl which could be used without root capability. Of course this ioctl would return only a subset of the BTRFS_IOC_TREE_SEARCH info I think that the 3) would be the long term solution. I am not happy about the 1), so as short term solution I think that we should go with the 2). What do you think ? No sorry, 1) is not acceptable. We can live with this limitation only during development so we're not blocked by some new ioctl development. No for 2), 'fi df' is useful as and widely used in existing scripts. Yes for 3), we may also export the information through the existing ioctls if possible (eg. IOC_FS_INFO). -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 4/8] Allow use of get_device_info()
On Thu, Feb 13, 2014 at 08:19:44PM +0100, Goffredo Baroncelli wrote: Allow the use of get_device_info() for different units. FYI, I'll move this patch to integration base, it's a simple and isolated change. -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH][BTRFS-PROGS][v4] Enhance btrfs fi df
-BEGIN PGP SIGNED MESSAGE- Hash: SHA1 On 02/20/2014 01:08 PM, David Sterba wrote: On Thu, Feb 13, 2014 at 08:18:10PM +0100, Goffredo Baroncelli wrote: space (if the next chunk are allocated as SINGLE) or the minimum one ( if the next chunks are allocated as DUP/RAID1/RAID10). The other two commands show the chunks in the disks. $ sudo btrfs filesystem disk-usage /mnt/btrfs1/ Data,Single: Size:8.00MB, Used:0.00 /dev/vdb 8.00MB The information about per-device usage can be enhanced and there's enough space to print that: * allocated in chunks (the number above) * actually used (simiar to what 'btrfs fi show' prints as 'used') I don't see a reason why it would not fit here, nor any other place where this can be obtained. There is the cumulative number of 'Used' for all devices, but I'd like to see it per-device as well. or in tabular format $ sudo ./btrfs filesystem disk-usage -t /mnt/btrfs1/ Data Data Metadata Metadata System System Single RAID6 Single RAID5 Single RAID5 Unallocated /dev/vdb 8.00MB 1.00GB 8.00MB 1.00GB 4.00MB 4.00MB 97.98GB /dev/vdc - 1.00GB- 1.00GB - 4.00MB 98.00GB /dev/vdd - 1.00GB- 1.00GB - 4.00MB 98.00GB /dev/vde - 1.00GB- 1.00GB - 4.00MB 98.00GB == === == === === Total8.00MB 2.00GB 8.00MB 3.00GB 4.00MB 12.00MB391.97GB Used 0.00 11.25MB 0.00 36.00KB 0.00 4.00KB These are the most complete output, where it is possible to know which disk a chunk uses and the usage of every chunk. Though not per-device, similar to the above, but the tabular output is limited compared to the sequential output. Not sure what to do here. Finally the last command shows which chunks a disk hosts: $ sudo ./btrfs device disk-usage /mnt/btrfs1/ /dev/vdb 100.00GB Data,Single: 8.00MB Data,RAID6: 1.00GB Metadata,Single: 8.00MB Metadata,RAID5: 1.00GB System,Single:4.00MB System,RAID5: 4.00MB Unallocated: 97.98GB /dev/vdc 100.00GB Data,RAID6: 1.00GB Metadata,RAID5: 1.00GB System,RAID5: 4.00MB Unallocated: 98.00GB /dev/vdd 100.00GB Data,RAID6: 1.00GB Metadata,RAID5: 1.00GB System,RAID5: 4.00MB Unallocated: 98.00GB /dev/vde 100.00GB Data,RAID6: 1.00GB Metadata,RAID5: 1.00GB System,RAID5: 4.00MB Unallocated: 98.00GB More or less are the same information above, only grouped by disk. Ie. it's only a variant of the 'filesystem usage' where it is grouped by blockgroup type. Why doesn't 'btrfs device usage' take a device instead of the whole filesystem? This seems counterintuitive. It should be possible to ask for a device by it or path. Also, I'd like to see all useful information about the device: * id, path, uuid, ... whatever * physical device size * size visible by the filesystem * space allocated in chunks * space actually used Unfortunately I don't have any information about the chunk usage per disk basis. And I'm missing it. Is it a fundamental problem or just not addressed in this patchset? Finally I have to point out that the command btrfs fi df previous didn't require the root capability, now with my patches it is required, because I need to know some info about the chunks so I need to use the BTRFS_IOC_TREE_SEARCH. I think that there are the following possibilities: 1) accept this regresssion 2) remove the command btrfs fi df and leave only btrfs fi disk-usage and btrfs dev disk-usage 3) adding a new ioctl which could be used without root capability. Of course this ioctl would return only a subset of the BTRFS_IOC_TREE_SEARCH info I think that the 3) would be the long term solution. I am not happy about the 1), so as short term solution I think that we should go with the 2). What do you think ? No sorry, 1) is not acceptable. We can live with this limitation only during development so we're not blocked by some new ioctl development. No for 2), 'fi df' is useful as and widely used in existing scripts. Yes for 3), we may also export the information through the existing ioctls if possible (eg. IOC_FS_INFO). For _right now_ I'd say just not do the raid56 stuff if we don't notice any raid56 chunks from the normal load_space_info, and then if there are raid56 we try and run the tree search ioctl and notice if we get back EPERM or whatever you get when you don't have permissions. Then just spit out as much information that you can about the fs with a little note at the bottom that available calculation isn't 100% and you need to run as root if you want that info. Then what we could do is add another flag type for the existing SPACE_INFO ioctl to spit out the information you need about the raid5/6 chunks and then just test for those flags and
Re: [PATCH][BTRFS-PROGS][v4] Enhance btrfs fi df
Hi David, below my comments On 02/20/2014 07:08 PM, David Sterba wrote: On Thu, Feb 13, 2014 at 08:18:10PM +0100, Goffredo Baroncelli wrote: space (if the next chunk are allocated as SINGLE) or the minimum one ( if the next chunks are allocated as DUP/RAID1/RAID10). The other two commands show the chunks in the disks. $ sudo btrfs filesystem disk-usage /mnt/btrfs1/ Data,Single: Size:8.00MB, Used:0.00 /dev/vdb 8.00MB The information about per-device usage can be enhanced and there's enough space to print that: * allocated in chunks (the number above) This is the value already written. * actually used (simiar to what 'btrfs fi show' prints as 'used') The used is a value returned by the ioctl BTRFS_IOC_SPACE_INFO, which returns this value per group-block basis. See below. I don't see a reason why it would not fit here, nor any other place where this can be obtained. There is the cumulative number of 'Used' for all devices, but I'd like to see it per-device as well. or in tabular format $ sudo ./btrfs filesystem disk-usage -t /mnt/btrfs1/ Data DataMetadata Metadata System System Single RAID6 Single RAID5Single RAID5 Unallocated /dev/vdb 8.00MB 1.00GB 8.00MB 1.00GB 4.00MB 4.00MB 97.98GB /dev/vdc - 1.00GB- 1.00GB - 4.00MB 98.00GB /dev/vdd - 1.00GB- 1.00GB - 4.00MB 98.00GB /dev/vde - 1.00GB- 1.00GB - 4.00MB 98.00GB == === == === === Total8.00MB 2.00GB 8.00MB 3.00GB 4.00MB 12.00MB391.97GB Used 0.00 11.25MB 0.00 36.00KB 0.00 4.00KB These are the most complete output, where it is possible to know which disk a chunk uses and the usage of every chunk. Though not per-device, similar to the above, but the tabular output is limited compared to the sequential output. Not sure what to do here. The tabular is to have a friendly summary of how the filesystem is span on the different disks. I suggest to not add more info in the tabular Finally the last command shows which chunks a disk hosts: $ sudo ./btrfs device disk-usage /mnt/btrfs1/ /dev/vdb 100.00GB Data,Single: 8.00MB Data,RAID6: 1.00GB Metadata,Single: 8.00MB Metadata,RAID5: 1.00GB System,Single:4.00MB System,RAID5: 4.00MB Unallocated: 97.98GB /dev/vdc 100.00GB Data,RAID6: 1.00GB Metadata,RAID5: 1.00GB System,RAID5: 4.00MB Unallocated: 98.00GB /dev/vdd 100.00GB Data,RAID6: 1.00GB Metadata,RAID5: 1.00GB System,RAID5: 4.00MB Unallocated: 98.00GB /dev/vde 100.00GB Data,RAID6: 1.00GB Metadata,RAID5: 1.00GB System,RAID5: 4.00MB Unallocated: 98.00GB More or less are the same information above, only grouped by disk. Ie. it's only a variant of the 'filesystem usage' where it is grouped by blockgroup type. Why doesn't 'btrfs device usage' take a device instead of the whole filesystem? This seems counterintuitive. It should be possible to ask for a device by it or path. If a device is passed, what you would expect as output: the list of all the devices involved,or only the one(s) passed ? In other terms, the device passed has to act also as filter, or it is only a different way to indicate the path ? Also, I'd like to see all useful information about the device: * id, path, uuid, ... whatever ok, we can add a -v switch to add these further information * physical device size ok, it is already written * size visible by the filesystem Could you be more explicit ? a) For each chunk (in each disk) or b) for each disk ? And how this information per disk basis would be useful ? Example (supposing 4 disks) a) /dev/vde 100.00GB Data,RAID6: Disk size: 1.00GB, FS size: 512.00MB Metadata,RAID5: Disk size: 1.00GB, FS size: 750.00MB System,RAID5: Disk size: 4.00MB, FS size; 3.00MB Unallocated: Disk size: 98.00GB b) /dev/vde Disk size: 100.00GB, FS size: 1.28GB Data,RAID6: 1.00GB Metadata,RAID5: 1.00GB System,RAID5: 4.00MB Unallocated: 98.00GB * space allocated in chunks ok, it is already written * space actually used See my other comments I am not against to add further information to btrfs dev [disk-]usage, but I liked the idea to add the minimum information to avoid ambiguities: - the number on the left of the device is the device size - the number on the left of the block group is the space allocated on the disk
Re: [PATCH][BTRFS-PROGS][v4] Enhance btrfs fi df
Hi Josef, On 02/20/2014 07:32 PM, Josef Bacik wrote: Yes for 3), we may also export the information through the existing ioctls if possible (eg. IOC_FS_INFO). For _right now_ I'd say just not do the raid56 stuff if we don't notice any raid56 chunks from the normal load_space_info, and then if there are raid56 we try and run the tree search ioctl and notice if we get back EPERM or whatever you get when you don't have permissions. Then just spit out as much information that you can about the fs with a little note at the bottom that available calculation isn't 100% and you need to run as root if you want that info. Then what we could do is add another flag type for the existing SPACE_INFO ioctl to spit out the information you need about the raid5/6 chunks and then just test for those flags and make the adjustment necessary. This way we avoid adding yet another ioctl and stuff will still work nicely for old kernels that don't have the updated ioctl. Thanks, Josef Good suggestion ! I will take it -- gpg @keyserver.linux.it: Goffredo Baroncelli (kreijackATinwind.it Key fingerprint BBF5 1610 0B64 DAC6 5F7D 17B2 0EDA 9B37 8B82 E0B5 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Btrfs: correctly determine if blocks are shared in btrfs_compare_trees
Just comparing the pointers (logical disk addresses) of the btree nodes is not completely bullet proof, we have to check if their generation numbers match too. It is guaranteed that a COW operation will result in a block with a different logical disk address than the original block's address, but over time we can reuse that former logical disk address. For example, creating a 2Gb filesystem on a loop device, and having a script running in a loop always updating the access timestamp of a file, resulted in the same logical disk address being reused for the same fs btree block in about only 4 minutes. This could make us skip entire subtrees when doing an incremental send (which is currently the only user of btrfs_compare_trees). However the odds of getting 2 blocks at the same tree level, with the same logical disk address, equal first slot keys and different generations, should hopefully be very low. Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- fs/btrfs/ctree.c | 11 ++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index cbd3a7d..88d1b1e 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -5376,6 +5376,8 @@ int btrfs_compare_trees(struct btrfs_root *left_root, int advance_right; u64 left_blockptr; u64 right_blockptr; + u64 left_gen; + u64 right_gen; u64 left_start_ctransid; u64 right_start_ctransid; u64 ctransid; @@ -5640,7 +5642,14 @@ int btrfs_compare_trees(struct btrfs_root *left_root, right_blockptr = btrfs_node_blockptr( right_path-nodes[right_level], right_path-slots[right_level]); - if (left_blockptr == right_blockptr) { + left_gen = btrfs_node_ptr_generation( + left_path-nodes[left_level], + left_path-slots[left_level]); + right_gen = btrfs_node_ptr_generation( + right_path-nodes[right_level], + right_path-slots[right_level]); + if (left_blockptr == right_blockptr + left_gen == right_gen) { /* * As we're on a shared block, don't * allow to go deeper. -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Incremental backup over writable snapshot
Duncan 1i5t5.dun...@cox.net schrieb: Duncan had a nice example in this list how to migrate directories to subvolumes by using shallow copies: mv dir dir.old btrfs sub create dir cp -a -- reflink=always dir.old/. dir/. rm -Rf dir.old. FWIW, that was someone else. I remember seeing it and I may well have been involved in some aspect of the discussion and thus might have quoted it, but my particular use-case doesn't involve a lot of subvolumes or snapshots, so I don't typically get quite that deep into the command- detail in subvolume discussions as I've simply not had the necessary personal experience in that area to properly discuss at that level. (Tho it's certainly typical of what I might post in other areas, just not that one.) Oh sorry... I just was testing if you are following the list closely... ;-) I didn't as you can see. :-) -- Replies to list only preferred. -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Incremental backup over writable snapshot
GEO 1g2e...@gmail.com schrieb: @Kai Krakow: I accept your opinion and thank you for your answer. However I have special reasons doing so. I could name you a few use cases. For example I do not need to backup search indexes as they mess up over time, so I simple recreate the cache in case of a new install. I know most of the settings I set and I know exactly what missing directories break what in case of deletion, because I have tried so various times. I tried to keep it neutral to help people not to try applying your special case as an idea for their own backup which may have a total different requirement. This is not supposed to be a system backup, or a home backup, but a backup of my data (documents, videos etc.). I know hidden directories contain mails etc. but I know exactly where my mails are (most of them imap anyway) and I would include them in the backup. So I am looking for a different use case. I may be wrong but it sounds like you approach the problem from the wrong direction. If only selective data is important to you for backup: Why not put your important data in a single subvolume and backup that? You could install some compatibility symlinks to keep consistency with the well known directory structure... This is how I usually handle such corner cases. A small script like recreate_symlinks.sh which you also put into the backup will help you in case of restoring. -- Replies to list only preferred. -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] Btrfs: fix send issuing outdated paths for utimes, chown and chmod
When doing an incremental send, if we had a directory pending a move/rename operation and none of its parents, except for the immediate parent, were pending a move/rename, after processing the directory's references, we would be issuing utimes, chown and chmod intructions against am outdated path - a path which matched the one in the parent root. This change also simplifies a bit the code that deals with building a path for a directory which has a move/rename operation delayed. Steps to reproduce: $ mkfs.btrfs -f /dev/sdb3 $ mount /dev/sdb3 /mnt/btrfs $ mkdir -p /mnt/btrfs/a/b/c/d/e $ mkdir /mnt/btrfs/a/b/c/f $ chmod 0777 /mnt/btrfs/a/b/c/d/e $ btrfs subvolume snapshot -r /mnt/btrfs /mnt/btrfs/snap1 $ btrfs send /mnt/btrfs/snap1 -f /tmp/base.send $ mv /mnt/btrfs/a/b/c/f /mnt/btrfs/a/b/f2 $ mv /mnt/btrfs/a/b/c/d/e /mnt/btrfs/a/b/f2/e2 $ mv /mnt/btrfs/a/b/c /mnt/btrfs/a/b/c2 $ mv /mnt/btrfs/a/b/c2/d /mnt/btrfs/a/b/c2/d2 $ chmod 0700 /mnt/btrfs/a/b/f2/e2 $ btrfs subvolume snapshot -r /mnt/btrfs /mnt/btrfs/snap2 $ btrfs send -p /mnt/btrfs/snap1 /mnt/btrfs/snap2 -f /tmp/incremental.send $ umount /mnt/btrfs $ mkfs.btrfs -f /dev/sdb3 $ mount /dev/sdb3 /mnt/btrfs $ btrfs receive /mnt/btrfs -f /tmp/base.send $ btrfs receive /mnt/btrfs -f /tmp/incremental.send The second btrfs receive command failed with: ERROR: chmod a/b/c/d/e failed. No such file or directory A test case for xfstests follows. Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- fs/btrfs/send.c | 31 --- 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 46c6b54..298e25d 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -2000,7 +2000,6 @@ static void name_cache_free(struct send_ctx *sctx) */ static int __get_cur_name_and_parent(struct send_ctx *sctx, u64 ino, u64 gen, -int skip_name_cache, u64 *parent_ino, u64 *parent_gen, struct fs_path *dest) @@ -2010,8 +2009,6 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, struct btrfs_path *path = NULL; struct name_cache_entry *nce = NULL; - if (skip_name_cache) - goto get_ref; /* * First check if we already did a call to this function with the same * ino/gen. If yes, check if the cache entry is still up-to-date. If yes @@ -2056,12 +2053,11 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, goto out_cache; } -get_ref: /* * Depending on whether the inode was already processed or not, use * send_root or parent_root for ref lookup. */ - if (ino sctx-send_progress !skip_name_cache) + if (ino sctx-send_progress) ret = get_first_ref(sctx-send_root, ino, parent_ino, parent_gen, dest); else @@ -2085,8 +2081,6 @@ get_ref: goto out; ret = 1; } - if (skip_name_cache) - goto out; out_cache: /* @@ -2154,7 +2148,6 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, u64 parent_inode = 0; u64 parent_gen = 0; int stop = 0; - int skip_name_cache = 0; name = fs_path_alloc(); if (!name) { @@ -2162,9 +2155,6 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, goto out; } - if (is_waiting_for_move(sctx, ino)) - skip_name_cache = 1; - dest-reversed = 1; fs_path_reset(dest); @@ -2179,16 +2169,19 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, break; } - ret = __get_cur_name_and_parent(sctx, ino, gen, skip_name_cache, - parent_inode, parent_gen, name); + if (is_waiting_for_move(sctx, ino)) { + ret = get_first_ref(sctx-parent_root, ino, + parent_inode, parent_gen, name); + } else { + ret = __get_cur_name_and_parent(sctx, ino, gen, + parent_inode, + parent_gen, name); + if (ret) + stop = 1; + } + if (ret 0) goto out; - if (ret) - stop = 1; - - if (!skip_name_cache - is_waiting_for_move(sctx, parent_inode)) - skip_name_cache = 1; ret = fs_path_add_path(dest, name); if (ret
[PATCH] xfstests: add test btrfs/042 for btrfs incremental send
Regression test for a btrfs incremental send issue where invalid paths for utimes, chown and chmod operations were sent to the send stream, causing btrfs receive to fail. If a directory had a move/rename operation delayed, and none of its parent directories, except for the immediate one, had delayed move/rename operations, after processing the directory's references, the incremental send code would issue invalid paths for utimes, chown and chmod operations. This issue is fixed by the following linux kernel btrfs patch: Btrfs: fix send issuing outdated paths for utimes, chown and chmod Signed-off-by: Filipe David Borba Manana fdman...@gmail.com --- tests/btrfs/042 | 133 +++ tests/btrfs/042.out |1 + tests/btrfs/group |1 + 3 files changed, 135 insertions(+) create mode 100755 tests/btrfs/042 create mode 100644 tests/btrfs/042.out diff --git a/tests/btrfs/042 b/tests/btrfs/042 new file mode 100755 index 000..b38db31 --- /dev/null +++ b/tests/btrfs/042 @@ -0,0 +1,133 @@ +#! /bin/bash +# FS QA Test No. btrfs/042 +# +# Regression test for a btrfs incremental send issue where under certain +# scenarios invalid paths for utimes, chown and chmod operations were sent +# to the send stream, causing btrfs receive to fail. +# +# If a directory had a move/rename operation delayed, and none of its parent +# directories, except for the immediate one, had delayed move/rename operations, +# after processing the directory's references, the incremental send code would +# issue invalid paths for utimes, chown and chmod operations. +# +# This issue is fixed by the following linux kernel btrfs patch: +# +# Btrfs: fix send issuing outdated paths for utimes, chown and chmod +# +#--- +# Copyright (c) 2014 Filipe Manana. All Rights Reserved. +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +#--- +# + +seq=`basename $0` +seqres=$RESULT_DIR/$seq +echo QA output created by $seq + +here=`pwd` +tmp=`mktemp -d` +status=1 # failure is the default! +trap _cleanup; exit \$status 0 1 2 3 15 + +_cleanup() +{ +rm -fr $tmp +} + +# get standard environment, filters and checks +. ./common/rc +. ./common/filter + +# real QA test starts here +_supported_fs btrfs +_supported_os Linux +_require_scratch +_need_to_be_root + +FSSUM_PROG=$here/src/fssum +[ -x $FSSUM_PROG ] || _notrun fssum not built + +rm -f $seqres.full + +_scratch_mkfs /dev/null 21 +_scratch_mount + +umask 0 +mkdir -p $SCRATCH_MNT/a/b/c/d/e +mkdir $SCRATCH_MNT/a/b/c/f +echo 'ola ' $SCRATCH_MNT/a/b/c/d/e/file.txt +chmod 0777 $SCRATCH_MNT/a/b/c/d/e + +# Filesystem looks like: +# +# . (ino 256) +# |-- a/ (ino 257) +# |-- b/ (ino 258) +# |-- c/ (ino 259) +# |-- d/ (ino 260) +# | |-- e/ (ino 261) +# | |-- file.txt(ino 262) +# | +# |-- f/ (ino 263) + +_run_btrfs_util_prog subvolume snapshot -r $SCRATCH_MNT $SCRATCH_MNT/mysnap1 + +echo 'mundo' $SCRATCH_MNT/a/b/c/d/e/file.txt +mv $SCRATCH_MNT/a/b/c/d/e/file.txt $SCRATCH_MNT/a/b/c/d/e/file2.txt +mv $SCRATCH_MNT/a/b/c/f $SCRATCH_MNT/a/b/f2 +mv $SCRATCH_MNT/a/b/c/d/e $SCRATCH_MNT/a/b/f2/e2 +mv $SCRATCH_MNT/a/b/c $SCRATCH_MNT/a/b/c2 +mv $SCRATCH_MNT/a/b/c2/d $SCRATCH_MNT/a/b/c2/d2 +chmod 0700 $SCRATCH_MNT/a/b/f2/e2 + +# Filesystem now looks like: +# +# . (ino 256) +# |-- a/ (ino 257) +# |-- b/ (ino 258) +# |-- c2/(ino 259) +# | |-- d2/(ino 260) +# | +# |-- f2/(ino 263) +# |-- e2 (ino 261) +# |-- file2.txt (ino 263) + +_run_btrfs_util_prog subvolume snapshot -r $SCRATCH_MNT $SCRATCH_MNT/mysnap2 + +run_check $FSSUM_PROG -A -f -w $tmp/1.fssum $SCRATCH_MNT/mysnap1 +run_check $FSSUM_PROG -A -f -w $tmp/2.fssum -x $SCRATCH_MNT/mysnap2/mysnap1 \ + $SCRATCH_MNT/mysnap2 + +_run_btrfs_util_prog send $SCRATCH_MNT/mysnap1 -f $tmp/1.snap +_run_btrfs_util_prog send -p $SCRATCH_MNT/mysnap1 $SCRATCH_MNT/mysnap2 \ +
Re: [PATCH 1/9] dcache: move d_splice_alias
On Tue, Feb 18, 2014 at 03:28:57PM -0500, J. Bruce Fields wrote: From: J. Bruce Fields bfie...@redhat.com Just a trivial move to locate it near (similar) d_materialise_unique code and save some forward references in a following patch. Signed-off-by: J. Bruce Fields bfie...@redhat.com Looks good, Reviewed-by: Christoph Hellwig h...@lst.de -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 2/9] dcache: close d_move race in d_splice_alias
On Tue, Feb 18, 2014 at 03:28:58PM -0500, J. Bruce Fields wrote: From: J. Bruce Fields bfie...@redhat.com d_splice_alias will d_move an IS_ROOT() directory dentry into place if one exists. This should be safe as long as the dentry remains IS_ROOT, but I can't see what guarantees that: once we drop the i_lock all we hold here is the i_mutex on an unrelated parent directory. Instead copy the logic of d_materialise_unique. Looks good, Reviewed-by: Christoph Hellwig h...@lst.de -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 5/9] dcache: d_obtain_alias callers don't all want DISCONNECTED
- return d_obtain_alias(inode); + return d_obtain_alias_root(inode); Can we call this d_obtain_root or similar, please? -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v4 1/4] Btrfs-progs: new helper to parse string to u64 for btrfs
There are many places that need parse string to u64 for btrfs commands, in fact, we do such things *too casually*, using atoi/atol/atoll..is not right at all, and even we don't check whether it is a valid string. Let's do everything more gracefully, we introduce a new helper arg_strtou64() which will do all the necessary checks.If we fail to parse string to u64, we will output message and exit directly, this is something like what usage() is doing. It is ok to not return erro to it's caller, because this function should be called when parsing arg (just like usage!) Signed-off-by: Wang Shilong wangsl.f...@cn.fujitsu.com --- v3-v4: move prefix '-' check after strtoull call, which will make negative output more precise to common users!(Thanks to Eric Sandeen) --- utils.c | 33 + utils.h | 1 + 2 files changed, 34 insertions(+) diff --git a/utils.c b/utils.c index 97e23d5..9120877 100644 --- a/utils.c +++ b/utils.c @@ -1520,6 +1520,39 @@ scan_again: return 0; } +/* + * This function should be only used when parsing + * command arg, it won't return error to it's + * caller and rather exit directly just like usage(). + */ +u64 arg_strtou64(const char *str) +{ + u64 value; + char *ptr_parse_end = NULL; + + value = strtoull(str, ptr_parse_end, 0); + if (ptr_parse_end *ptr_parse_end != '\0') { + fprintf(stderr, ERROR: %s is not a valid numeric value.\n, + str); + exit(1); + } + /* +* if we pass a negative number to strtoull, +* it will return an unexpected number to us, +* so let's do the check ourselves. +*/ + if (str[0] == '-') { + fprintf(stderr, ERROR: %s: negative value is invalid.\n, + str); + exit(1); + } + if (value == ULLONG_MAX) { + fprintf(stderr, ERROR: %s is too large.\n, str); + exit(1); + } + return value; +} + u64 parse_size(char *s) { int i; diff --git a/utils.h b/utils.h index 04b8c45..a201085 100644 --- a/utils.h +++ b/utils.h @@ -71,6 +71,7 @@ int pretty_size_snprintf(u64 size, char *str, size_t str_bytes); int get_mountpt(char *dev, char *mntpt, size_t size); int btrfs_scan_block_devices(int run_ioctl); u64 parse_size(char *s); +u64 arg_strtou64(const char *str); int open_file_or_dir(const char *fname, DIR **dirstream); void close_file_or_dir(int fd, DIR *dirstream); int get_fs_info(char *path, struct btrfs_ioctl_fs_info_args *fi_args, -- 1.9.0 -- To unsubscribe from this list: send the line unsubscribe linux-btrfs in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html