[Devel] [PATCH rh7] ploop: fix freeze/thaw ioctls
Current implementation suffers from several problems: 1) If someone, e.g. another instance of push-backup tool, mistakenly attempts to freeze ploop while its thawing is in progress, we can end up in double freeze. 2) After initiating thawing, there is no way to find it out by sysctl or /sys. 3) Handling PLOOP_S_FROZEN bit is not synchronized with ploop STOP/CLEAR ioctls. It's not nice if ploop releases bdev keeping it in frozen state. The patch fixes the above in a straightforward way: more descriptive plo->freeze_state, visible via /sys/block/ploopN/pstate/freeze_state, and special checks in ioctl-s to ensure that freeze/thaw is allowed only on running ploops and that thaw must precede ploop STOP. https://jira.sw.ru/browse/PSBM-49699 Signed-off-by: Maxim Patlasov --- drivers/block/ploop/dev.c | 34 ++ drivers/block/ploop/sysfs.c |6 ++ include/linux/ploop/ploop.h |8 +++- 3 files changed, 43 insertions(+), 5 deletions(-) diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c index 3dc94ca..81d463f 100644 --- a/drivers/block/ploop/dev.c +++ b/drivers/block/ploop/dev.c @@ -3905,6 +3905,13 @@ static int ploop_stop(struct ploop_device * plo, struct block_device *bdev) return -EBUSY; } + if (plo->freeze_state != PLOOP_F_NORMAL) { + if (printk_ratelimit()) + printk(KERN_INFO "stop ploop%d failed (freeze_state=%d)\n", + plo->index, plo->freeze_state); + return -EBUSY; + } + clear_bit(PLOOP_S_PUSH_BACKUP, &plo->state); ploop_pb_stop(plo->pbd, true); @@ -4819,15 +4826,21 @@ static int ploop_freeze(struct ploop_device *plo, struct block_device *bdev) { struct super_block *sb = plo->sb; - if (test_bit(PLOOP_S_FROZEN, &plo->state)) + if (!test_bit(PLOOP_S_RUNNING, &plo->state)) + return -EINVAL; + + if (plo->freeze_state == PLOOP_F_FROZEN) return 0; + if (plo->freeze_state == PLOOP_F_THAWING) + return -EBUSY; + sb = freeze_bdev(bdev); if (sb && IS_ERR(sb)) return PTR_ERR(sb); plo->sb = sb; - set_bit(PLOOP_S_FROZEN, &plo->state); + plo->freeze_state = PLOOP_F_FROZEN; return 0; } 
@@ -4836,16 +4849,29 @@ static int ploop_thaw(struct ploop_device *plo, struct block_device *bdev) struct super_block *sb = plo->sb; int err; - if (!test_bit(PLOOP_S_FROZEN, &plo->state)) + if (!test_bit(PLOOP_S_RUNNING, &plo->state)) + return -EINVAL; + + if (plo->freeze_state == PLOOP_F_NORMAL) return 0; + if (plo->freeze_state == PLOOP_F_THAWING) + return -EBUSY; + plo->sb = NULL; - clear_bit(PLOOP_S_FROZEN, &plo->state); + plo->freeze_state = PLOOP_F_THAWING; mutex_unlock(&plo->ctl_mutex); err = thaw_bdev(bdev, sb); mutex_lock(&plo->ctl_mutex); + BUG_ON(plo->freeze_state != PLOOP_F_THAWING); + + if (!err) + plo->freeze_state = PLOOP_F_NORMAL; + else + plo->freeze_state = PLOOP_F_FROZEN; + return err; } diff --git a/drivers/block/ploop/sysfs.c b/drivers/block/ploop/sysfs.c index d6dcc83..71b2a20 100644 --- a/drivers/block/ploop/sysfs.c +++ b/drivers/block/ploop/sysfs.c @@ -425,6 +425,11 @@ static ssize_t print_push_backup_uuid(struct ploop_device * plo, char * page) return snprintf(page, PAGE_SIZE, "%pUB\n", uuid); } +static u32 show_freeze_state(struct ploop_device * plo) +{ + return plo->freeze_state; +} + #define _TUNE_U32(_name) \ static u32 show_##_name(struct ploop_device * plo) \ { \ @@ -507,6 +512,7 @@ static struct attribute *state_attributes[] = { _A3(cookie), _A3(push_backup_uuid), _A(open_count), + _A(freeze_state), NULL }; diff --git a/include/linux/ploop/ploop.h b/include/linux/ploop/ploop.h index 7864edf..8ab4477 100644 --- a/include/linux/ploop/ploop.h +++ b/include/linux/ploop/ploop.h @@ -61,7 +61,12 @@ enum { (for minor mgmt only) */ PLOOP_S_ONCE, /* An event (e.g. 
printk once) happened */ PLOOP_S_PUSH_BACKUP,/* Push_backup is in progress */ - PLOOP_S_FROZEN /* Frozen PLOOP_IOC_FREEZE */ +}; + +enum { + PLOOP_F_NORMAL, /* Default: not yet freezed or unfrozen */ + PLOOP_F_FROZEN, /* Frozen PLOOP_IOC_FREEZE */ + PLOOP_F_THAWING,/* thaw_bdev is in progress */ }; struct ploop_snapdata @@ -411,6 +416,7 @@ struct ploop_device struct request_queue*queue; struct task_struct *thread; struct super_block *sb; + int freeze_state; struct rb_node link;
[Devel] [PATCH] ext4: improve ext4lazyinit scalability
ext4lazyinit is a global thread. This thread performs itable initialization. It basically does the following: ext4_lazyinit_thread ->mutex_lock(&eli->li_list_mtx); ->ext4_run_li_request(elr) ->ext4_init_inode_table-> Do a lot of IO if list is large. And when new mounts/umounts arrive they have to block on ->li_list_mtx because lazy_thread holds it during the full walk procedure. ext4_fill_super ->ext4_register_li_request ->mutex_lock(&ext4_li_info->li_list_mtx); ->list_add(&elr->lr_request, &ext4_li_info >li_request_list); In my case mount takes 40 minutes on a server with 36 * 4Tb HDD. A user may easily face this in case of a very slow dev (/dev/mmcblkXXX). Even more: if one of the filesystems was frozen, lazyinit_thread will simply block on sb_start_write(), so other mounts/umounts will be stuck forever. This patch changes the logic as follows: - grab ->s_umount read sem before processing a new li_request; after that it is safe to drop list_mtx because all callers of li_remove_request hold ->s_umount for write. 
- li_thread skips frozen SB's Locking: Locking order is asserted by the umount path as follows: s_umount ->li_list_mtx, so the only way to grab ->s_umount inside li_thread is via down_read_trylock https://jira.sw.ru/browse/PSBM-49658 Signed-off-by: Dmitry Monakhov --- fs/ext4/super.c | 53 - 1 file changed, 36 insertions(+), 17 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 3822a5a..0ee193f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2635,7 +2635,6 @@ static int ext4_run_li_request(struct ext4_li_request *elr) sb = elr->lr_super; ngroups = EXT4_SB(sb)->s_groups_count; - sb_start_write(sb); for (group = elr->lr_next_group; group < ngroups; group++) { gdp = ext4_get_group_desc(sb, group, NULL); if (!gdp) { @@ -2662,8 +2661,6 @@ static int ext4_run_li_request(struct ext4_li_request *elr) elr->lr_next_sched = jiffies + elr->lr_timeout; elr->lr_next_group = group + 1; } - sb_end_write(sb); - return ret; } @@ -2713,9 +2710,9 @@ static struct task_struct *ext4_lazyinit_task; static int ext4_lazyinit_thread(void *arg) { struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg; - struct list_head *pos, *n; struct ext4_li_request *elr; unsigned long next_wakeup, cur; + LIST_HEAD(request_list); BUG_ON(NULL == eli); @@ -2728,21 +2725,43 @@ cont_thread: mutex_unlock(&eli->li_list_mtx); goto exit_thread; } - - list_for_each_safe(pos, n, &eli->li_request_list) { - elr = list_entry(pos, struct ext4_li_request, -lr_request); - - if (time_after_eq(jiffies, elr->lr_next_sched)) { - if (ext4_run_li_request(elr) != 0) { - /* error, remove the lazy_init job */ - ext4_remove_li_request(elr); - continue; + list_splice_init(&eli->li_request_list, &request_list); + while (!list_empty(&request_list)) { + int err = 0; + int progress = 0; + + elr = list_entry(request_list.next, +struct ext4_li_request, lr_request); + list_move(request_list.next, &eli->li_request_list); + if (time_before(jiffies, elr->lr_next_sched)) { + if (time_before(elr->lr_next_sched, 
next_wakeup)) + next_wakeup = elr->lr_next_sched; + continue; + } + if (down_read_trylock(&elr->lr_super->s_umount)) { + if (sb_start_write_trylock(elr->lr_super)) { + progress = 1; + /* We holds sb->s_umount, sb can not +* be removed from the list, it is +* now safe to drop li_list_mtx +*/ + mutex_unlock(&eli->li_list_mtx); + err = ext4_run_li_request(elr); + sb_end_write(elr->lr_super); + mutex_lock(&eli->li_list_mtx); } + up_read((&elr->lr_super->s_umount)); + } + /* error, remove the lazy_init job */ + if (err) { + ext4_remove_li_request(elr); + cont
Re: [Devel] [PATCH rh7] ploop: release plo->ctl_mutex for thaw_bdev in PLOOP_IOC_THAW handler
On 07/15/2016 01:10 AM, Vladimir Davydov wrote: diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c index d52975eaaa36..3dc94ca5c393 100644 --- a/drivers/block/ploop/dev.c +++ b/drivers/block/ploop/dev.c @@ -4839,11 +4839,12 @@ static int ploop_thaw(struct ploop_device *plo, struct block_device *bdev) if (!test_bit(PLOOP_S_FROZEN, &plo->state)) return 0; + plo->sb = NULL; + clear_bit(PLOOP_S_FROZEN, &plo->state); + + mutex_unlock(&plo->ctl_mutex); Since this point nothing in ploop state hints that the device is still frozen. Someone (another instance of backup tool?) may mistakenly try to freeze it again (before we call thaw_bdev) and succeed in it (because the S_FROZEN bit is already cleared). The result would be "double freeze" that we tried to avoid by initial patch. The fix must be simple, I'll send a patch soon. Thanks, Maxim err = thaw_bdev(bdev, sb); - if (!err) { - plo->sb = NULL; - clear_bit(PLOOP_S_FROZEN, &plo->state); - } + mutex_lock(&plo->ctl_mutex); return err; } ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [NEW KERNEL] 3.10.0-327.18.2.vz7.14.26 (rhel7)
Changelog: OpenVZ kernel rh7-3.10.0-327.18.2.vz7.14.26 * fix ploop backup deadlock Generated changelog: * Fri Jul 15 2016 Vladimir Davydov [3.10.0-327.18.2.vz7.14.26] - ploop: release plo->ctl_mutex for thaw_bdev in PLOOP_IOC_THAW handler (Vladimir Davydov) [PSBM-49699] Built packages: http://kojistorage.eng.sw.ru/packages/vzkernel/3.10.0/327.18.2.vz7.14.26/ ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL7 COMMIT] ploop: release plo->ctl_mutex for thaw_bdev in PLOOP_IOC_THAW handler
The commit is pushed to "branch-rh7-3.10.0-327.18.2.vz7.14.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-327.18.2.vz7.14.25 --> commit b81bf870a6c314b664977ee9b6747cf93e78bcc3 Author: Vladimir Davydov Date: Fri Jul 15 13:34:36 2016 +0400 ploop: release plo->ctl_mutex for thaw_bdev in PLOOP_IOC_THAW handler Recent patch to ploop 91a74e3b91a ("ploop: add PLOOP_IOC_FREEZE and PLOOP_IOC_THAW ioctls") introduced the following deadlock: Thread 1: [] sleep_on_buffer+0xe/0x20 [] __sync_dirty_buffer+0xb8/0xe0 [] sync_dirty_buffer+0x13/0x20 [] ext4_commit_super+0x1b0/0x240 [ext4] [] ext4_unfreeze+0x2d/0x40 [ext4] [] thaw_super+0x3f/0xb0 [] thaw_bdev+0x65/0x80 [] ploop_ioctl+0x6d0/0x29f0 [ploop] [] blkdev_ioctl+0x2df/0x770 [] block_ioctl+0x41/0x50 [] do_vfs_ioctl+0x255/0x4f0 [] SyS_ioctl+0x54/0xa0 [] system_call_fastpath+0x16/0x1b [] 0x Thread 2: [] ploop_pb_get_pending+0x163/0x290 [ploop] [] ploop_push_backup_io_get.isra.26+0x81/0x1b0 [ploop] [] ploop_push_backup_io+0x15b/0x260 [ploop] [] ploop_ioctl+0xe96/0x29f0 [ploop] [] blkdev_ioctl+0x2df/0x770 [] block_ioctl+0x41/0x50 [] do_vfs_ioctl+0x255/0x4f0 [] SyS_ioctl+0x54/0xa0 [] system_call_fastpath+0x16/0x1b [] 0x E.g. thread 1 is thawing ploop with PLOOP_IOC_THAW ioctl which holds plo->ctl_mutex during its work. To thaw itself, ext4 has to commit some data. This commit triggers push backup out-of-order request which must be processed and acked by userspace to be completed. But userspace can't process it, because ploop_pb_get_pending() wants the same mutex. Thus, deadlock. Fix the deadlock by releasing the mutex before calling thaw_bdev and reacquiring it after thaw_bdev is done. 
https://jira.sw.ru/browse/PSBM-49699 Reported-by: Pavel Borzenkov Signed-off-by: Vladimir Davydov Cc: Maxim Patlasov --- drivers/block/ploop/dev.c | 9 + 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c index d52975eaaa36..3dc94ca5c393 100644 --- a/drivers/block/ploop/dev.c +++ b/drivers/block/ploop/dev.c @@ -4839,11 +4839,12 @@ static int ploop_thaw(struct ploop_device *plo, struct block_device *bdev) if (!test_bit(PLOOP_S_FROZEN, &plo->state)) return 0; + plo->sb = NULL; + clear_bit(PLOOP_S_FROZEN, &plo->state); + + mutex_unlock(&plo->ctl_mutex); err = thaw_bdev(bdev, sb); - if (!err) { - plo->sb = NULL; - clear_bit(PLOOP_S_FROZEN, &plo->state); - } + mutex_lock(&plo->ctl_mutex); return err; } ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH rh7] ploop: release plo->ctl_mutex for thaw_bdev in PLOOP_IOC_THAW handler
Recent patch to ploop 91a74e3b91a ("ploop: add PLOOP_IOC_FREEZE and PLOOP_IOC_THAW ioctls") introduced the following deadlock: Thread 1: [] sleep_on_buffer+0xe/0x20 [] __sync_dirty_buffer+0xb8/0xe0 [] sync_dirty_buffer+0x13/0x20 [] ext4_commit_super+0x1b0/0x240 [ext4] [] ext4_unfreeze+0x2d/0x40 [ext4] [] thaw_super+0x3f/0xb0 [] thaw_bdev+0x65/0x80 [] ploop_ioctl+0x6d0/0x29f0 [ploop] [] blkdev_ioctl+0x2df/0x770 [] block_ioctl+0x41/0x50 [] do_vfs_ioctl+0x255/0x4f0 [] SyS_ioctl+0x54/0xa0 [] system_call_fastpath+0x16/0x1b [] 0x Thread 2: [] ploop_pb_get_pending+0x163/0x290 [ploop] [] ploop_push_backup_io_get.isra.26+0x81/0x1b0 [ploop] [] ploop_push_backup_io+0x15b/0x260 [ploop] [] ploop_ioctl+0xe96/0x29f0 [ploop] [] blkdev_ioctl+0x2df/0x770 [] block_ioctl+0x41/0x50 [] do_vfs_ioctl+0x255/0x4f0 [] SyS_ioctl+0x54/0xa0 [] system_call_fastpath+0x16/0x1b [] 0x E.g. thread 1 is thawing ploop with PLOOP_IOC_THAW ioctl which holds plo->ctl_mutex during its work. To thaw itself, ext4 has to commit some data. This commit triggers push backup out-of-order request which must be processed and acked by userspace to be completed. But userspace can't process it, because ploop_pb_get_pending() wants the same mutex. Thus, deadlock. Fix the deadlock by releasing the mutex before calling thaw_bdev and reacquiring it after thaw_bdev is done. 
https://jira.sw.ru/browse/PSBM-49699 Reported-by: Pavel Borzenkov Signed-off-by: Vladimir Davydov Cc: Maxim Patlasov --- drivers/block/ploop/dev.c | 9 + 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/block/ploop/dev.c b/drivers/block/ploop/dev.c index d52975eaaa36..3dc94ca5c393 100644 --- a/drivers/block/ploop/dev.c +++ b/drivers/block/ploop/dev.c @@ -4839,11 +4839,12 @@ static int ploop_thaw(struct ploop_device *plo, struct block_device *bdev) if (!test_bit(PLOOP_S_FROZEN, &plo->state)) return 0; + plo->sb = NULL; + clear_bit(PLOOP_S_FROZEN, &plo->state); + + mutex_unlock(&plo->ctl_mutex); err = thaw_bdev(bdev, sb); - if (!err) { - plo->sb = NULL; - clear_bit(PLOOP_S_FROZEN, &plo->state); - } + mutex_lock(&plo->ctl_mutex); return err; } -- 2.1.4 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel