Hi Jarod, On Wed, Apr 1, 2015 at 4:47 AM, Jarod Wilson <ja...@redhat.com> wrote: > If losetup is called with the -P option, it sets a flag to have the > resulting loop block device scanned for partitions. Unfortunately, due > to the way flags are passed in from userspace, there's first a > loop_set_fd() call, which does no partition scanning, then a > loop_set_status() call, where the partition scanning should kick in. > However, particularly on a system with slow I/O (such as a file-backed > vm), there's a race between the loop_set_status() call and udev poking the > device, which leads to partition scanning failing with an -EBUSY (passed > up from block/ioctl.c's blkdev_reread_part()) because the block_device's > bd_mutex is already held by udev calling blkdev_open(), which grabs > bd_mutex, and then in turn calls lo_open(), which then in turn tries to > grab lo_ctl_mutex, which we're holding in all loop ioctls.
IMO, lo_ctl_mutex can be avoided in lo_open(), and '--lo->lo_refcnt' can be moved out of the lock in lo_release() meantime. > > To combat this, if we discover bd_mutex is locked, we know partition > scanning will fail, and its probably because of udev, so we can > temporarily drop the lo_ctl_mutex ourselves to try to let udev do its > thing, then grab it back, and hopefully then successfully scan partitions. Even with above change, blkdev_reread_part() still might return -EBUSY, and there is no hurt to retry several times. > > Testing shows a definite improvement to partition scanning success when > calling losetup -fP file-image over and over (with matching losetup -D > too, of course), but still not to 100% success, I'm still getting the > occasional failure, which is typically due to an -EBUSY trying to rescan > partitions on loop device removal. This one is because bd_mutex has been held in release path already. Would you mind testing the attached patch which implements the above idea? Thanks, Ming Lei > > CC: Jens Axboe <ax...@fb.com> > CC: Ming Lei <ming....@canonical.com> > CC: Mike Galbraith <bitbuc...@online.de> > CC: Kent Overstreet <k...@daterainc.com> > CC: Mikulas Patocka <mpato...@redhat.com> > Signed-off-by: Jarod Wilson <ja...@redhat.com> > --- > drivers/block/loop.c | 48 ++++++++++++++++++++++++++++++++++++++++++++---- > 1 file changed, 44 insertions(+), 4 deletions(-) > > diff --git a/drivers/block/loop.c b/drivers/block/loop.c > index d1f168b..b30e32c 100644 > --- a/drivers/block/loop.c > +++ b/drivers/block/loop.c > @@ -75,6 +75,7 @@ > #include <linux/sysfs.h> > #include <linux/miscdevice.h> > #include <linux/falloc.h> > +#include <linux/delay.h> > #include "loop.h" > > #include <asm/uaccess.h> > @@ -529,6 +530,45 @@ static int loop_flush(struct loop_device *lo) > } > > /* > + * Re-reading partitions can fail with an -EBUSY return from block/ioctl.c's > + * blkdev_reread_part(), which calls mutex_trylock on the bd_mutex. Now, udev > + * is calling blkdev_open, which first grabs bd_mutex, then lo_ctl_mutex via > + * lo_open, which occasionally happens before partition scanning, and will > + * prevent partition scanning from ever being successful unless we give up > + * the lo_ctl_mutex temporarily. > + */ > +static void loop_reread_partitions(struct loop_device *lo, > + struct block_device *bdev) > +{ > + int rc; > + int retry = 5; > + > + pr_debug("%s: firing for loop%d (%s)\n", > + __func__, lo->lo_number, lo->lo_file_name); > + > + /* > + * If no lo_device, we were (probably) called from loop_clr_fd(), and > + * retries never seem to help, so don't retry. > + */ > + if (!lo->lo_device) > + retry = 1; > + > + while (mutex_is_locked(&bdev->bd_mutex) && retry > 0) { > + mutex_unlock(&lo->lo_ctl_mutex); > + msleep(50); > + mutex_lock(&lo->lo_ctl_mutex); > + retry--; > + pr_debug("%s: unlocked lo_ctl temporarily (retries left: > %d)\n", > + __func__, retry); > + } > + > + rc = ioctl_by_bdev(bdev, BLKRRPART, 0); > + if (rc) > + pr_warn("%s: partition scan of loop%d (%s) failed (rc=%d)\n", > + __func__, lo->lo_number, lo->lo_file_name, rc); > +} > + > +/* > * loop_change_fd switched the backing store of a loopback device to > * a new file. This is useful for operating system installers to free up > * the original file and in High Availability environments to switch to > @@ -576,7 +616,7 @@ static int loop_change_fd(struct loop_device *lo, struct > block_device *bdev, > > fput(old_file); > if (lo->lo_flags & LO_FLAGS_PARTSCAN) > - ioctl_by_bdev(bdev, BLKRRPART, 0); > + loop_reread_partitions(lo, bdev); > return 0; > > out_putf: > @@ -807,7 +847,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t > mode, > if (part_shift) > lo->lo_flags |= LO_FLAGS_PARTSCAN; > if (lo->lo_flags & LO_FLAGS_PARTSCAN) > - ioctl_by_bdev(bdev, BLKRRPART, 0); > + loop_reread_partitions(lo, bdev); > > /* Grab the block_device to prevent its destruction after we > * put /dev/loopXX inode. Later in loop_clr_fd() we bdput(bdev). > @@ -920,7 +960,7 @@ static int loop_clr_fd(struct loop_device *lo) > /* This is safe: open() is still holding a reference. */ > module_put(THIS_MODULE); > if (lo->lo_flags & LO_FLAGS_PARTSCAN && bdev) > - ioctl_by_bdev(bdev, BLKRRPART, 0); > + loop_reread_partitions(lo, bdev); > lo->lo_flags = 0; > if (!part_shift) > lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN; > @@ -995,7 +1035,7 @@ loop_set_status(struct loop_device *lo, const struct > loop_info64 *info) > !(lo->lo_flags & LO_FLAGS_PARTSCAN)) { > lo->lo_flags |= LO_FLAGS_PARTSCAN; > lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN; > - ioctl_by_bdev(lo->lo_device, BLKRRPART, 0); > + loop_reread_partitions(lo, lo->lo_device); > } > > lo->lo_encrypt_key_size = info->lo_encrypt_key_size; > -- > 1.8.3.1 >
From 9aca30919e20ce7e24a97c3d1639a654eace5ba2 Mon Sep 17 00:00:00 2001 From: Ming Lei <ming....@canonical.com> Date: Tue, 31 Mar 2015 16:47:15 -0400 Subject: [PATCH] block/loop: fix race between open/release and reread part When another task(such as udev) is trying to open/close loop block, blkdev_reread_part() may return -EBUSY because bd_mutex is required for both open and release. The worse thing is that lo_open() need to hold lo_ctl_mutex, so the task trying to open loop can't open the loop disk until lo_ctl_mutex is released when rereading part is completed with failure. This patch trys to fixing the issue by the following approach: 1) introduce lo_open_mutex to protect lo_refcnt and avoid acquiring lo_ctl_mutex in lo_open(): - for open vs. add/del loop, no any problem because of loop_index_mutex - lo_open_mutex is used for syncing open() and loop_clr_fd() - both open() and release() are protected by bd_mutex 2) don't hold lo_ctl_mutex for decreasing/checking lo_refcnt in lo_release() 3) when reread part is run inside release path, rescan partition directly since bd_mutex is held already 4) retry several times if blkdev_reread_part() still returns -EBUSY. CC: Jens Axboe <ax...@fb.com> CC: Ming Lei <ming....@canonical.com> CC: Mike Galbraith <bitbuc...@online.de> CC: Kent Overstreet <k...@daterainc.com> CC: Mikulas Patocka <mpato...@redhat.com> Reported-by: Jarod Wilson <ja...@redhat.com> Signed-off-by: Ming Lei <ming....@canonical.com> --- block/partition-generic.c | 1 + drivers/block/loop.c | 88 +++++++++++++++++++++++++++++++++++++++------ drivers/block/loop.h | 1 + 3 files changed, 80 insertions(+), 10 deletions(-) diff --git a/block/partition-generic.c b/block/partition-generic.c index 0d9e5f9..9efa667 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -528,6 +528,7 @@ rescan: free_partitions(state); return 0; } +EXPORT_SYMBOL(rescan_partitions); int invalidate_partitions(struct gendisk *disk, struct block_device *bdev) { diff --git a/drivers/block/loop.c b/drivers/block/loop.c index d1f168b..9a8d52a 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -75,6 +75,8 @@ #include <linux/sysfs.h> #include <linux/miscdevice.h> #include <linux/falloc.h> +#include <linux/delay.h> +#include <linux/genhd.h> #include "loop.h" #include <asm/uaccess.h> @@ -528,6 +530,52 @@ static int loop_flush(struct loop_device *lo) return loop_switch(lo, NULL); } +static int loop_reread_part_no_lock(struct block_device *bdev) +{ + struct gendisk *disk = bdev->bd_disk; + int res; + + if (!disk_part_scan_enabled(disk) || bdev != bdev->bd_contains) + return -EINVAL; + res = rescan_partitions(disk, bdev); + return res; +} + +/* + * Re-reading partitions can fail with an -EBUSY return from + * block/ioctl.c's blkdev_reread_part(), which calls mutex_trylock + * on the bd_mutex, which can be held in both open/release handler, + * so retry several times for the sake of safety. + */ +static void loop_reread_partitions(struct loop_device *lo, + struct block_device *bdev) +{ + int rc, in_release; + int retry = 5; + + + mutex_lock(&lo->lo_open_mutex); + in_release = lo->lo_refcnt == 0; + mutex_unlock(&lo->lo_open_mutex); + + /* bd_mutex has been held already in release path */ + if (in_release) { + rc = loop_reread_part_no_lock(bdev); + goto out; + } + + while (retry-- > 0) { + rc = ioctl_by_bdev(bdev, BLKRRPART, 0); + if (rc != -EBUSY) + break; + msleep(50); + } + out: + if (rc) + pr_warn("%s: partition scan of loop%d (%s) failed (rc=%d)\n", + __func__, lo->lo_number, lo->lo_file_name, rc); +} + /* * loop_change_fd switched the backing store of a loopback device to * a new file. This is useful for operating system installers to free up @@ -576,7 +624,7 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, fput(old_file); if (lo->lo_flags & LO_FLAGS_PARTSCAN) - ioctl_by_bdev(bdev, BLKRRPART, 0); + loop_reread_partitions(lo, bdev); return 0; out_putf: @@ -807,7 +855,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, if (part_shift) lo->lo_flags |= LO_FLAGS_PARTSCAN; if (lo->lo_flags & LO_FLAGS_PARTSCAN) - ioctl_by_bdev(bdev, BLKRRPART, 0); + loop_reread_partitions(lo, bdev); /* Grab the block_device to prevent its destruction after we * put /dev/loopXX inode. Later in loop_clr_fd() we bdput(bdev). @@ -879,14 +927,18 @@ static int loop_clr_fd(struct loop_device *lo) * <dev>/do something like mkfs/losetup -d <dev> causing the losetup -d * command to fail with EBUSY. */ + mutex_lock(&lo->lo_open_mutex); if (lo->lo_refcnt > 1) { + mutex_unlock(&lo->lo_open_mutex); lo->lo_flags |= LO_FLAGS_AUTOCLEAR; mutex_unlock(&lo->lo_ctl_mutex); return 0; } - if (filp == NULL) + if (filp == NULL) { + mutex_unlock(&lo->lo_open_mutex); return -EINVAL; + } spin_lock_irq(&lo->lo_lock); lo->lo_state = Lo_rundown; @@ -919,8 +971,17 @@ static int loop_clr_fd(struct loop_device *lo) lo->lo_state = Lo_unbound; /* This is safe: open() is still holding a reference. */ module_put(THIS_MODULE); + + /* + * Unlock open_mutex for avoiding -EBUSY of rereading part: + * - try to acquire bd_mutex from reread part + * - another task is opening the loop with holding bd_mutex + * and trys to acquire open_mutex + */ + mutex_unlock(&lo->lo_open_mutex); + if (lo->lo_flags & LO_FLAGS_PARTSCAN && bdev) - ioctl_by_bdev(bdev, BLKRRPART, 0); + loop_reread_partitions(lo, bdev); lo->lo_flags = 0; if (!part_shift) lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN; @@ -995,7 +1056,7 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) !(lo->lo_flags & LO_FLAGS_PARTSCAN)) { lo->lo_flags |= LO_FLAGS_PARTSCAN; lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN; - ioctl_by_bdev(lo->lo_device, BLKRRPART, 0); + loop_reread_partitions(lo, lo->lo_device); } lo->lo_encrypt_key_size = info->lo_encrypt_key_size; @@ -1376,9 +1437,9 @@ static int lo_open(struct block_device *bdev, fmode_t mode) goto out; } - mutex_lock(&lo->lo_ctl_mutex); + mutex_lock(&lo->lo_open_mutex); lo->lo_refcnt++; - mutex_unlock(&lo->lo_ctl_mutex); + mutex_unlock(&lo->lo_open_mutex); out: mutex_unlock(&loop_index_mutex); return err; @@ -1387,13 +1448,16 @@ out: static void lo_release(struct gendisk *disk, fmode_t mode) { struct loop_device *lo = disk->private_data; - int err; + int err, ref; - mutex_lock(&lo->lo_ctl_mutex); + mutex_lock(&lo->lo_open_mutex); + ref = --lo->lo_refcnt; + mutex_unlock(&lo->lo_open_mutex); - if (--lo->lo_refcnt) + if (ref) goto out; + mutex_lock(&lo->lo_ctl_mutex); if (lo->lo_flags & LO_FLAGS_AUTOCLEAR) { /* * In autoclear mode, stop the loop thread @@ -1646,6 +1710,7 @@ static int loop_add(struct loop_device **l, int i) disk->flags |= GENHD_FL_NO_PART_SCAN; disk->flags |= GENHD_FL_EXT_DEVT; mutex_init(&lo->lo_ctl_mutex); + mutex_init(&lo->lo_open_mutex); lo->lo_number = i; spin_lock_init(&lo->lo_lock); disk->major = LOOP_MAJOR; @@ -1763,11 +1828,14 @@ static long loop_control_ioctl(struct file *file, unsigned int cmd, mutex_unlock(&lo->lo_ctl_mutex); break; } + mutex_lock(&lo->lo_open_mutex); if (lo->lo_refcnt > 0) { ret = -EBUSY; + mutex_unlock(&lo->lo_open_mutex); mutex_unlock(&lo->lo_ctl_mutex); break; } + mutex_unlock(&lo->lo_open_mutex); lo->lo_disk->private_data = NULL; mutex_unlock(&lo->lo_ctl_mutex); idr_remove(&loop_index_idr, lo->lo_number); diff --git a/drivers/block/loop.h b/drivers/block/loop.h index 301c27f..1b4acf2 100644 --- a/drivers/block/loop.h +++ b/drivers/block/loop.h @@ -59,6 +59,7 @@ struct loop_device { bool write_started; int lo_state; struct mutex lo_ctl_mutex; + struct mutex lo_open_mutex; struct request_queue *lo_queue; struct blk_mq_tag_set tag_set; -- 1.7.9.5