Hi Wang, On 01.05.2013 09:29, Wang Shilong wrote: > Hello Jan, > >> If qgroup tracking is out of sync, a rescan operation can be started. It >> iterates the complete extent tree and recalculates all qgroup tracking data. >> This is an expensive operation and should not be used unless required. >> >> A filesystem under rescan can still be umounted. The rescan continues on the >> next mount. Status information is provided with a separate ioctl while a >> rescan operation is in progress. >> >> Signed-off-by: Jan Schmidt <list.bt...@jan-o-sch.net> >> --- >> fs/btrfs/ctree.h | 17 ++- >> fs/btrfs/disk-io.c | 5 + >> fs/btrfs/ioctl.c | 83 ++++++++++-- >> fs/btrfs/qgroup.c | 318 >> ++++++++++++++++++++++++++++++++++++++++++-- >> include/uapi/linux/btrfs.h | 12 ++- >> 5 files changed, 400 insertions(+), 35 deletions(-) >> >> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h >> index 412c306..e4f28a6 100644 >> --- a/fs/btrfs/ctree.h >> +++ b/fs/btrfs/ctree.h >> @@ -1021,9 +1021,9 @@ struct btrfs_block_group_item { >> */ >> #define BTRFS_QGROUP_STATUS_FLAG_ON (1ULL << 0) >> /* >> - * SCANNING is set during the initialization phase >> + * RESCAN is set during the initialization phase >> */ >> -#define BTRFS_QGROUP_STATUS_FLAG_SCANNING (1ULL << 1) >> +#define BTRFS_QGROUP_STATUS_FLAG_RESCAN (1ULL << 1) >> /* >> * Some qgroup entries are known to be out of date, >> * either because the configuration has changed in a way that >> @@ -1052,7 +1052,7 @@ struct btrfs_qgroup_status_item { >> * only used during scanning to record the progress >> * of the scan. It contains a logical address >> */ >> - __le64 scan; >> + __le64 rescan; >> } __attribute__ ((__packed__)); >> >> struct btrfs_qgroup_info_item { >> @@ -1603,6 +1603,11 @@ struct btrfs_fs_info { >> /* used by btrfs_qgroup_record_ref for an efficient tree traversal */ >> u64 qgroup_seq; >> >> + /* qgroup rescan items */ >> + struct mutex qgroup_rescan_lock; /* protects the progress item */ >> + struct btrfs_key qgroup_rescan_progress; >> + struct btrfs_workers qgroup_rescan_workers; >> + >> /* filesystem state */ >> unsigned long fs_state; >> >> @@ -2888,8 +2893,8 @@ BTRFS_SETGET_FUNCS(qgroup_status_version, struct >> btrfs_qgroup_status_item, >> version, 64); >> BTRFS_SETGET_FUNCS(qgroup_status_flags, struct btrfs_qgroup_status_item, >> flags, 64); >> -BTRFS_SETGET_FUNCS(qgroup_status_scan, struct btrfs_qgroup_status_item, >> - scan, 64); >> +BTRFS_SETGET_FUNCS(qgroup_status_rescan, struct btrfs_qgroup_status_item, >> + rescan, 64); >> >> /* btrfs_qgroup_info_item */ >> BTRFS_SETGET_FUNCS(qgroup_info_generation, struct btrfs_qgroup_info_item, >> @@ -3834,7 +3839,7 @@ int btrfs_quota_enable(struct btrfs_trans_handle >> *trans, >> struct btrfs_fs_info *fs_info); >> int btrfs_quota_disable(struct btrfs_trans_handle *trans, >> struct btrfs_fs_info *fs_info); >> -int btrfs_quota_rescan(struct btrfs_fs_info *fs_info); >> +int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info); >> int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, >> struct btrfs_fs_info *fs_info, u64 src, u64 dst); >> int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, >> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c >> index 7717363..63e9348 100644 >> --- a/fs/btrfs/disk-io.c >> +++ b/fs/btrfs/disk-io.c >> @@ -2010,6 +2010,7 @@ static void btrfs_stop_all_workers(struct >> btrfs_fs_info *fs_info) >> btrfs_stop_workers(&fs_info->caching_workers); >> btrfs_stop_workers(&fs_info->readahead_workers); >> btrfs_stop_workers(&fs_info->flush_workers); >> + btrfs_stop_workers(&fs_info->qgroup_rescan_workers); >> } >> >> /* helper to cleanup tree roots */ >> @@ -2301,6 +2302,7 @@ int open_ctree(struct super_block *sb, >> fs_info->qgroup_seq = 1; >> fs_info->quota_enabled = 0; >> fs_info->pending_quota_state = 0; >> + mutex_init(&fs_info->qgroup_rescan_lock); >> >> btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); >> btrfs_init_free_cluster(&fs_info->data_alloc_cluster); >> @@ -2529,6 +2531,8 @@ int open_ctree(struct super_block *sb, >> btrfs_init_workers(&fs_info->readahead_workers, "readahead", >> fs_info->thread_pool_size, >> &fs_info->generic_worker); >> + btrfs_init_workers(&fs_info->qgroup_rescan_workers, "qgroup-rescan", 1, >> + &fs_info->generic_worker); >> >> /* >> * endios are largely parallel and should have a very >> @@ -2563,6 +2567,7 @@ int open_ctree(struct super_block *sb, >> ret |= btrfs_start_workers(&fs_info->caching_workers); >> ret |= btrfs_start_workers(&fs_info->readahead_workers); >> ret |= btrfs_start_workers(&fs_info->flush_workers); >> + ret |= btrfs_start_workers(&fs_info->qgroup_rescan_workers); >> if (ret) { >> err = -ENOMEM; >> goto fail_sb_buffer; >> diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c >> index d0af96a..5e93bb8 100644 >> --- a/fs/btrfs/ioctl.c >> +++ b/fs/btrfs/ioctl.c >> @@ -3701,12 +3701,10 @@ static long btrfs_ioctl_quota_ctl(struct file *file, >> void __user *arg) >> } >> >> down_write(&root->fs_info->subvol_sem); >> - if (sa->cmd != BTRFS_QUOTA_CTL_RESCAN) { >> - trans = btrfs_start_transaction(root->fs_info->tree_root, 2); >> - if (IS_ERR(trans)) { >> - ret = PTR_ERR(trans); >> - goto out; >> - } >> + trans = btrfs_start_transaction(root->fs_info->tree_root, 2); >> + if (IS_ERR(trans)) { >> + ret = PTR_ERR(trans); >> + goto out; >> } >> >> switch (sa->cmd) { >> @@ -3716,9 +3714,6 @@ static long btrfs_ioctl_quota_ctl(struct file *file, >> void __user *arg) >> case BTRFS_QUOTA_CTL_DISABLE: >> ret = btrfs_quota_disable(trans, root->fs_info); >> break; >> - case BTRFS_QUOTA_CTL_RESCAN: >> - ret = btrfs_quota_rescan(root->fs_info); >> - break; >> default: >> ret = -EINVAL; >> break; >> @@ -3727,11 +3722,9 @@ static long btrfs_ioctl_quota_ctl(struct file *file, >> void __user *arg) >> if (copy_to_user(arg, sa, sizeof(*sa))) >> ret = -EFAULT; >> >> - if (trans) { >> - err = btrfs_commit_transaction(trans, root->fs_info->tree_root); >> - if (err && !ret) >> - ret = err; >> - } >> + err = btrfs_commit_transaction(trans, root->fs_info->tree_root); >> + if (err && !ret) >> + ret = err; >> out: >> kfree(sa); >> up_write(&root->fs_info->subvol_sem); >> @@ -3886,6 +3879,64 @@ drop_write: >> return ret; >> } >> >> +static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg) >> +{ >> + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; >> + struct btrfs_ioctl_quota_rescan_args *qsa; >> + int ret; >> + >> + if (!capable(CAP_SYS_ADMIN)) >> + return -EPERM; >> + >> + ret = mnt_want_write_file(file); >> + if (ret) >> + return ret; >> + >> + qsa = memdup_user(arg, sizeof(*qsa)); >> + if (IS_ERR(qsa)) { >> + ret = PTR_ERR(qsa); >> + goto drop_write; >> + } >> + >> + if (qsa->flags) { >> + ret = -EINVAL; >> + goto out; >> + } >> + >> + ret = btrfs_qgroup_rescan(root->fs_info); >> + >> +out: >> + kfree(qsa); >> +drop_write: >> + mnt_drop_write_file(file); >> + return ret; >> +} >> + >> +static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user >> *arg) >> +{ >> + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; >> + struct btrfs_ioctl_quota_rescan_args *qsa; >> + int ret = 0; >> + >> + if (!capable(CAP_SYS_ADMIN)) >> + return -EPERM; >> + >> + qsa = kzalloc(sizeof(*qsa), GFP_NOFS); >> + if (!qsa) >> + return -ENOMEM; >> + >> + if (root->fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { >> + qsa->flags = 1; >> + qsa->progress = root->fs_info->qgroup_rescan_progress.objectid; >> + } >> + >> + if (copy_to_user(arg, qsa, sizeof(*qsa))) >> + ret = -EFAULT; >> + >> + kfree(qsa); >> + return ret; >> +} >> + >> static long btrfs_ioctl_set_received_subvol(struct file *file, >> void __user *arg) >> { >> @@ -4124,6 +4175,10 @@ long btrfs_ioctl(struct file *file, unsigned int >> return btrfs_ioctl_qgroup_create(file, argp); >> case BTRFS_IOC_QGROUP_LIMIT: >> return btrfs_ioctl_qgroup_limit(file, argp); >> + case BTRFS_IOC_QUOTA_RESCAN: >> + return btrfs_ioctl_quota_rescan(file, argp); >> + case BTRFS_IOC_QUOTA_RESCAN_STATUS: >> + return btrfs_ioctl_quota_rescan_status(file, argp); >> case BTRFS_IOC_DEV_REPLACE: >> return btrfs_ioctl_dev_replace(root, argp); >> case BTRFS_IOC_GET_FSLABEL: >> diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c >> index c50e5a5..664d457 100644 >> --- a/fs/btrfs/qgroup.c >> +++ b/fs/btrfs/qgroup.c >> @@ -31,13 +31,13 @@ >> #include "locking.h" >> #include "ulist.h" >> #include "backref.h" >> +#include "extent_io.h" >> >> /* TODO XXX FIXME >> * - subvol delete -> delete when ref goes to 0? delete limits also? >> * - reorganize keys >> * - compressed >> * - sync >> - * - rescan >> * - copy also limits on subvol creation >> * - limit >> * - caches fuer ulists >> @@ -98,6 +98,14 @@ struct btrfs_qgroup_list { >> struct btrfs_qgroup *member; >> }; >> >> +struct qgroup_rescan { >> + struct btrfs_work work; >> + struct btrfs_fs_info *fs_info; >> +}; >> + >> +static void qgroup_rescan_start(struct btrfs_fs_info *fs_info, >> + struct qgroup_rescan *qscan); >> + >> /* must be called with qgroup_ioctl_lock held */ >> static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info, >> u64 qgroupid) >> @@ -298,7 +306,20 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info >> *fs_info) >> } >> fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, >> ptr); >> - /* FIXME read scan element */ >> + fs_info->qgroup_rescan_progress.objectid = >> + btrfs_qgroup_status_rescan(l, ptr); >> + if (fs_info->qgroup_flags & >> + BTRFS_QGROUP_STATUS_FLAG_RESCAN) { >> + struct qgroup_rescan *qscan = >> + kmalloc(sizeof(*qscan), GFP_NOFS); >> + if (!qscan) { >> + ret = -ENOMEM; >> + goto out; >> + } >> + fs_info->qgroup_rescan_progress.type = 0; >> + fs_info->qgroup_rescan_progress.offset = 0; >> + qgroup_rescan_start(fs_info, qscan); >> + } >> goto next1; >> } >> >> @@ -719,7 +740,8 @@ static int update_qgroup_status_item(struct >> btrfs_trans_handle *trans, >> ptr = btrfs_item_ptr(l, slot, struct btrfs_qgroup_status_item); >> btrfs_set_qgroup_status_flags(l, ptr, fs_info->qgroup_flags); >> btrfs_set_qgroup_status_generation(l, ptr, trans->transid); >> - /* XXX scan */ >> + btrfs_set_qgroup_status_rescan(l, ptr, >> + fs_info->qgroup_rescan_progress.objectid); >> >> btrfs_mark_buffer_dirty(l); >> >> @@ -830,7 +852,7 @@ int btrfs_quota_enable(struct btrfs_trans_handle *trans, >> fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON | >> BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; >> btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags); >> - btrfs_set_qgroup_status_scan(leaf, ptr, 0); >> + btrfs_set_qgroup_status_rescan(leaf, ptr, 0); >> >> btrfs_mark_buffer_dirty(leaf); >> >> @@ -944,10 +966,11 @@ out: >> return ret; >> } >> >> -int btrfs_quota_rescan(struct btrfs_fs_info *fs_info) >> +static void qgroup_dirty(struct btrfs_fs_info *fs_info, >> + struct btrfs_qgroup *qgroup) >> { >> - /* FIXME */ >> - return 0; >> + if (list_empty(&qgroup->dirty)) >> + list_add(&qgroup->dirty, &fs_info->dirty_qgroups); >> } >> >> int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, >> @@ -1155,13 +1178,6 @@ out: >> return ret; >> } >> >> -static void qgroup_dirty(struct btrfs_fs_info *fs_info, >> - struct btrfs_qgroup *qgroup) >> -{ >> - if (list_empty(&qgroup->dirty)) >> - list_add(&qgroup->dirty, &fs_info->dirty_qgroups); >> -} >> - >> /* >> * btrfs_qgroup_record_ref is called when the ref is added or deleted. it >> puts >> * the modification into a list that's later used by btrfs_end_transaction to >> @@ -1388,6 +1404,15 @@ int btrfs_qgroup_account_ref(struct >> btrfs_trans_handle *trans, >> BUG(); >> } >> >> + mutex_lock(&fs_info->qgroup_rescan_lock); >> + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { >> + if (fs_info->qgroup_rescan_progress.objectid <= node->bytenr) { >> + mutex_unlock(&fs_info->qgroup_rescan_lock); >> + return 0; >> + } >> + } >> + mutex_unlock(&fs_info->qgroup_rescan_lock); >> + >> /* >> * the delayed ref sequence number we pass depends on the direction of >> * the operation. for add operations, we pass (node->seq - 1) to skip >> @@ -1401,7 +1426,15 @@ int btrfs_qgroup_account_ref(struct >> btrfs_trans_handle *trans, >> if (ret < 0) >> return ret; >> >> + mutex_lock(&fs_info->qgroup_rescan_lock); >> spin_lock(&fs_info->qgroup_lock); >> + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { >> + if (fs_info->qgroup_rescan_progress.objectid <= node->bytenr) { >> + ret = 0; >> + goto unlock; >> + } >> + } >> + >> quota_root = fs_info->quota_root; >> if (!quota_root) >> goto unlock; >> @@ -1443,6 +1476,7 @@ int btrfs_qgroup_account_ref(struct btrfs_trans_handle >> *trans, >> >> unlock: >> spin_unlock(&fs_info->qgroup_lock); >> + mutex_unlock(&fs_info->qgroup_rescan_lock); > > > Why do you hold qgroup_rescan_lock when doing qgroup accounting here? > I can understand that we hold qgroup_rescan_lock when we update > qgroup_flag(at first in qgroup_account_ref()), > However, is it necessary that we hold qgroup_rescan_lock when we are doing > qgroup > accounting step1,2,3?? > > Or am i missing something here?
We need the lock for the check added above. This check needs the mutex lock, while the three accounting steps need a spin lock (which was not modified by my patch). We cannot call mutex_unlock while holding a spin lock, because mutex_unlock might schedule. Thanks, -Jan > Thanks, > Wang -- To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html