Hi,
在 2026/6/24 18:13, Chen Cheng 写道:
> From: Chen Cheng <[email protected]>
>
> Saving the memalloc token per-mddev in mddev->noio_flag causes
> PF_MEMALLOC_NOIO to leak into task A, while task B restores a token
> that it never saved.
>
> scenario:
>
> task A                       mddev                task B
> ======                       =======              ============
> write suspend_lo
>   mddev_suspend()
>                              suspended == 0
>                              drain active_io
>                              suspended = 1
>   A: noio_A = memalloc_noio_save()
>   A returns with PF_MEMALLOC_NOIO set
>
>                                                   write suspend_hi
>                                                     mddev_suspend()
>                              suspended == 1
>                              suspended = 2
>                                                     B returns (no save)
>
>   mddev_resume()
>                              suspended = 1
>                              not last resume
>   A returns
>   A still has PF_MEMALLOC_NOIO <-- leaked
>
>                                                     mddev_resume()
>                              suspended = 0
>
>                                                     memalloc_noio_restore(noio_A)
>                                                     (restores A's token in B)
>
> Fixed by:
> - return each caller's noio_flags from mddev_suspend()
> - pass that token back into mddev_resume()
> - update the suspend-and-lock helpers to carry the token
> - store the token in struct raid_set for dm-raid paths where suspend
> and resume are paired across callbacks
>
> Validation:
> The reproducer repeatedly updates the array's suspend_lo and suspend_hi
> sysfs attributes from many concurrent userspace workers, which makes
> multiple tasks call mddev_suspend()/mddev_resume() concurrently.
>
> Each worker:
> - reads its initial /proc/self/stat flags and verifies that
>   PF_MEMALLOC_NOIO is not already set
> - writes 0 to either suspend_lo or suspend_hi
> - immediately reads its own task flags again
> - reports success (i.e. the leak was reproduced) if
>   flags & PF_MEMALLOC_NOIO is true after the write returns
>
> Link: https://github.com/chencheng-fnnas/reproducer/blob/main/repro-md-noio-token-leak.sh
>
> Fixes: 78f57ef9d50a ("md: use memalloc scope APIs in mddev_suspend()/mddev_resume()")
>
> Signed-off-by: Chen Cheng <[email protected]>
> ---
> drivers/md/dm-raid.c | 7 ++--
> drivers/md/md-autodetect.c | 5 ++-
> drivers/md/md-bitmap.c | 12 +++---
> drivers/md/md.c | 85 ++++++++++++++++++++++----------------
> drivers/md/md.h | 23 ++++++-----
> drivers/md/raid5-cache.c | 11 +++--
> drivers/md/raid5.c | 25 ++++++-----
> 7 files changed, 97 insertions(+), 71 deletions(-)
>
> diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
> index 8f5a5e1342a9..d89207e3722a 100644
> --- a/drivers/md/dm-raid.c
> +++ b/drivers/md/dm-raid.c
> @@ -239,10 +239,11 @@ struct raid_set {
> int raid_disks;
> int delta_disks;
> int data_offset;
> int raid10_copies;
> int requested_bitmap_chunk_sectors;
> + unsigned int suspend_noio_flags;
>
> struct mddev md;
> struct raid_type *raid_type;
>
> sector_t array_sectors;
> @@ -3251,11 +3252,11 @@ static int raid_ctr(struct dm_target *ti, unsigned
> int argc, char **argv)
> /* Start raid set read-only and assumed clean to change in
> raid_resume() */
> rs->md.ro = MD_RDONLY;
> rs->md.in_sync = 1;
>
> /* Has to be held on running the array */
> - mddev_suspend_and_lock_nointr(&rs->md);
> + mddev_suspend_and_lock_nointr(&rs->md, &rs->suspend_noio_flags);
>
> /* Keep array frozen until resume. */
> md_frozen_sync_thread(&rs->md);
>
> r = md_run(&rs->md);
> @@ -3863,11 +3864,11 @@ static void raid_postsuspend(struct dm_target *ti)
> /*
> * sync_thread must be stopped during suspend, and writes have
> * to be stopped before suspending to avoid deadlocks.
> */
> md_stop_writes(&rs->md);
> - mddev_suspend(&rs->md, false);
> + mddev_suspend(&rs->md, false, &rs->suspend_noio_flags);
> rs->md.ro = MD_RDONLY;
> }
> clear_bit(MD_DM_SUSPENDING, &mddev->flags);
>
> }
> @@ -4141,11 +4142,11 @@ static void raid_resume(struct dm_target *ti)
>
> lockdep_is_held(&mddev->reconfig_mutex)));
> clear_bit(RT_FLAG_RS_FROZEN, &rs->runtime_flags);
> mddev->ro = MD_RDWR;
> mddev->in_sync = 0;
> md_unfrozen_sync_thread(mddev);
> - mddev_unlock_and_resume(mddev);
> + mddev_unlock_and_resume(mddev, rs->suspend_noio_flags);
> }
> }
For mdraid, the changes look fine. However, for dm-raid, consider for example:

  dmsetup suspend ...

In this case, the array stays suspended while the task has already returned
to user space. Take a look at the commit that introduced
memalloc_noio_save(): it is supposed to be called for mdraid arrays, to
allocate memory while the array is suspended; I don't see why dm-raid needs
it.

So I'd suggest simply bypassing memalloc_noio_save() for dm-raid, where
mddev_suspend() is not paired with mddev_resume() in the same task context.
>
> static struct target_type raid_target = {
> .name = "raid",
> diff --git a/drivers/md/md-autodetect.c b/drivers/md/md-autodetect.c
> index 4b80165afd23..58e062cd0580 100644
> --- a/drivers/md/md-autodetect.c
> +++ b/drivers/md/md-autodetect.c
> @@ -126,10 +126,11 @@ static void __init md_setup_drive(struct md_setup_args
> *args)
> dev_t devices[MD_SB_DISKS + 1], mdev;
> struct mdu_array_info_s ainfo = { };
> struct mddev *mddev;
> int err = 0, i;
> char name[16];
> + unsigned int noio_flags;
>
> if (args->partitioned) {
> mdev = MKDEV(mdp_major, args->minor << MdpMinorShift);
> sprintf(name, "md_d%d", args->minor);
> } else {
> @@ -173,11 +174,11 @@ static void __init md_setup_drive(struct md_setup_args
> *args)
> if (IS_ERR(mddev)) {
> pr_err("md: md_alloc failed - cannot start array %s\n", name);
> return;
> }
>
> - err = mddev_suspend_and_lock(mddev);
> + err = mddev_suspend_and_lock(mddev, &noio_flags);
> if (err) {
> pr_err("md: failed to lock array %s\n", name);
> goto out_mddev_put;
> }
>
> @@ -219,11 +220,11 @@ static void __init md_setup_drive(struct md_setup_args
> *args)
> if (!err)
> err = do_md_run(mddev);
> if (err)
> pr_warn("md: starting %s failed\n", name);
> out_unlock:
> - mddev_unlock_and_resume(mddev);
> + mddev_unlock_and_resume(mddev, noio_flags);
> out_mddev_put:
> mddev_put(mddev);
> }
>
> static int __init raid_setup(char *str)
> diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
> index 028b9ca8ce52..74b7f569a3f4 100644
> --- a/drivers/md/md-bitmap.c
> +++ b/drivers/md/md-bitmap.c
> @@ -2620,13 +2620,14 @@ location_show(struct mddev *mddev, char *page)
> }
>
> static ssize_t
> location_store(struct mddev *mddev, const char *buf, size_t len)
> {
> + unsigned int noio_flags;
> int rv;
>
> - rv = mddev_suspend_and_lock(mddev);
> + rv = mddev_suspend_and_lock(mddev, &noio_flags);
> if (rv)
> return rv;
>
> if (mddev->pers) {
> if (mddev->recovery || mddev->sync_thread) {
> @@ -2711,11 +2712,11 @@ location_store(struct mddev *mddev, const char *buf,
> size_t len)
> set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
> md_wakeup_thread(mddev->thread);
> }
> rv = 0;
> out:
> - mddev_unlock_and_resume(mddev);
> + mddev_unlock_and_resume(mddev, noio_flags);
> if (rv)
> return rv;
> return len;
>
> merge_err:
> @@ -2831,17 +2832,18 @@ backlog_store(struct mddev *mddev, const char *buf,
> size_t len)
> {
> unsigned long backlog;
> unsigned long old_mwb = mddev->bitmap_info.max_write_behind;
> struct md_rdev *rdev;
> bool has_write_mostly = false;
> + unsigned int noio_flags;
> int rv = kstrtoul(buf, 10, &backlog);
> if (rv)
> return rv;
> if (backlog > COUNTER_MAX)
> return -EINVAL;
>
> - rv = mddev_suspend_and_lock(mddev);
> + rv = mddev_suspend_and_lock(mddev, &noio_flags);
> if (rv)
> return rv;
>
> /*
> * Without write mostly device, it doesn't make sense to set
> @@ -2854,11 +2856,11 @@ backlog_store(struct mddev *mddev, const char *buf,
> size_t len)
> }
> }
> if (!has_write_mostly) {
> pr_warn_ratelimited("%s: can't set backlog, no write mostly
> device available\n",
> mdname(mddev));
> - mddev_unlock(mddev);
> + mddev_unlock_and_resume(mddev, noio_flags);
> return -EINVAL;
> }
>
> mddev->bitmap_info.max_write_behind = backlog;
> if (!backlog && mddev->serial_info_pool) {
> @@ -2871,11 +2873,11 @@ backlog_store(struct mddev *mddev, const char *buf,
> size_t len)
> mddev_create_serial_pool(mddev, rdev);
> }
> if (old_mwb != backlog)
> bitmap_update_sb(mddev->bitmap);
>
> - mddev_unlock_and_resume(mddev);
> + mddev_unlock_and_resume(mddev, noio_flags);
> return len;
> }
>
> static struct md_sysfs_entry bitmap_backlog =
> __ATTR(backlog, S_IRUGO|S_IWUSR, backlog_show, backlog_store);
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 1377c407614c..86d938dee50a 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -459,11 +459,12 @@ static void md_submit_bio(struct bio *bio)
>
> /*
> * Make sure no new requests are submitted to the device, and any requests
> that
> * have been submitted are completely handled.
> */
> -int mddev_suspend(struct mddev *mddev, bool interruptible)
> +int mddev_suspend(struct mddev *mddev, bool interruptible,
> + unsigned int *noio_flags)
> {
> int err = 0;
>
> /*
> * hold reconfig_mutex to wait for normal io will deadlock, because
> @@ -478,10 +479,11 @@ int mddev_suspend(struct mddev *mddev, bool
> interruptible)
> mutex_lock(&mddev->suspend_mutex);
> if (err)
> return err;
>
> if (mddev->suspended) {
> + *noio_flags = memalloc_noio_save();
> WRITE_ONCE(mddev->suspended, mddev->suspended + 1);
> mutex_unlock(&mddev->suspend_mutex);
> return 0;
> }
>
> @@ -515,31 +517,30 @@ int mddev_suspend(struct mddev *mddev, bool
> interruptible)
> * prevent deadlock.
> */
> WRITE_ONCE(mddev->suspended, mddev->suspended + 1);
>
> /* restrict memory reclaim I/O during raid array is suspend */
> - mddev->noio_flag = memalloc_noio_save();
> + *noio_flags = memalloc_noio_save();
>
> mutex_unlock(&mddev->suspend_mutex);
> return 0;
> }
> EXPORT_SYMBOL_GPL(mddev_suspend);
>
> -static void __mddev_resume(struct mddev *mddev, bool recovery_needed)
> +static void __mddev_resume(struct mddev *mddev, bool recovery_needed,
> + unsigned int noio_flags)
> {
> lockdep_assert_not_held(&mddev->reconfig_mutex);
>
> mutex_lock(&mddev->suspend_mutex);
> + memalloc_noio_restore(noio_flags);
> WRITE_ONCE(mddev->suspended, mddev->suspended - 1);
> if (mddev->suspended) {
> mutex_unlock(&mddev->suspend_mutex);
> return;
> }
>
> - /* entred the memalloc scope from mddev_suspend() */
> - memalloc_noio_restore(mddev->noio_flag);
> -
> percpu_ref_resurrect(&mddev->active_io);
> wake_up(&mddev->sb_wait);
>
> if (recovery_needed)
> set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
> @@ -547,13 +548,13 @@ static void __mddev_resume(struct mddev *mddev, bool
> recovery_needed)
> md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
>
> mutex_unlock(&mddev->suspend_mutex);
> }
>
> -void mddev_resume(struct mddev *mddev)
> +void mddev_resume(struct mddev *mddev, unsigned int noio_flags)
> {
> - return __mddev_resume(mddev, true);
> + return __mddev_resume(mddev, true, noio_flags);
> }
> EXPORT_SYMBOL_GPL(mddev_resume);
>
> /* sync bdev before setting device to readonly or stopping raid*/
> static int mddev_set_closing_and_sync_blockdev(struct mddev *mddev, int
> opener_num)
> @@ -3737,10 +3738,11 @@ rdev_attr_store(struct kobject *kobj, struct
> attribute *attr,
> {
> struct rdev_sysfs_entry *entry = container_of(attr, struct
> rdev_sysfs_entry, attr);
> struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
> struct kernfs_node *kn = NULL;
> bool suspend = false;
> + unsigned int noio_flags = 0;
> ssize_t rv;
> struct mddev *mddev = READ_ONCE(rdev->mddev);
>
> if (!entry->store)
> return -EIO;
> @@ -3756,17 +3758,17 @@ rdev_attr_store(struct kobject *kobj, struct
> attribute *attr,
> cmd_match(page, "writemostly") ||
> cmd_match(page, "-writemostly"))
> suspend = true;
> }
>
> - rv = suspend ? mddev_suspend_and_lock(mddev) : mddev_lock(mddev);
> + rv = suspend ? mddev_suspend_and_lock(mddev, &noio_flags) :
> mddev_lock(mddev);
> if (!rv) {
> if (rdev->mddev == NULL)
> rv = -ENODEV;
> else
> rv = entry->store(rdev, page, length);
> - suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev);
> + suspend ? mddev_unlock_and_resume(mddev, noio_flags) :
> mddev_unlock(mddev);
> }
>
> if (kn)
> sysfs_unbreak_active_protection(kn);
>
> @@ -4049,15 +4051,16 @@ level_store(struct mddev *mddev, const char *buf,
> size_t len)
> size_t slen = len;
> struct md_personality *pers, *oldpers;
> long level;
> void *priv, *oldpriv;
> struct md_rdev *rdev;
> + unsigned int noio_flags;
>
> if (slen == 0 || slen >= sizeof(clevel))
> return -EINVAL;
>
> - rv = mddev_suspend_and_lock(mddev);
> + rv = mddev_suspend_and_lock(mddev, &noio_flags);
> if (rv)
> return rv;
>
> if (mddev->pers == NULL) {
> memcpy(mddev->clevel, buf, slen);
> @@ -4231,11 +4234,11 @@ level_store(struct mddev *mddev, const char *buf,
> size_t len)
> md_update_sb(mddev, 1);
> sysfs_notify_dirent_safe(mddev->sysfs_level);
> md_new_event();
> rv = len;
> out_unlock:
> - mddev_unlock_and_resume(mddev);
> + mddev_unlock_and_resume(mddev, noio_flags);
> return rv;
> }
>
> static struct md_sysfs_entry md_level =
> __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
> @@ -4410,17 +4413,18 @@ static int update_raid_disks(struct mddev *mddev, int
> raid_disks);
>
> static ssize_t
> raid_disks_store(struct mddev *mddev, const char *buf, size_t len)
> {
> unsigned int n;
> + unsigned int noio_flags;
> int err;
>
> err = kstrtouint(buf, 10, &n);
> if (err < 0)
> return err;
>
> - err = mddev_suspend_and_lock(mddev);
> + err = mddev_suspend_and_lock(mddev, &noio_flags);
> if (err)
> return err;
> if (mddev->pers) {
> if (n != mddev->raid_disks)
> err = update_raid_disks(mddev, n);
> @@ -4442,11 +4446,11 @@ raid_disks_store(struct mddev *mddev, const char
> *buf, size_t len)
> mddev->raid_disks = n;
> mddev->reshape_backwards = (mddev->delta_disks < 0);
> } else
> mddev->raid_disks = n;
> out_unlock:
> - mddev_unlock_and_resume(mddev);
> + mddev_unlock_and_resume(mddev, noio_flags);
> return err ? err : len;
> }
> static struct md_sysfs_entry md_raid_disks =
> __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
>
> @@ -4822,10 +4826,11 @@ new_dev_store(struct mddev *mddev, const char *buf,
> size_t len)
> char *e;
> int major = simple_strtoul(buf, &e, 10);
> int minor;
> dev_t dev;
> struct md_rdev *rdev;
> + unsigned int noio_flags;
> int err;
>
> if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
> return -EINVAL;
> minor = simple_strtoul(e+1, &e, 10);
> @@ -4834,11 +4839,11 @@ new_dev_store(struct mddev *mddev, const char *buf,
> size_t len)
> dev = MKDEV(major, minor);
> if (major != MAJOR(dev) ||
> minor != MINOR(dev))
> return -EOVERFLOW;
>
> - err = mddev_suspend_and_lock(mddev);
> + err = mddev_suspend_and_lock(mddev, &noio_flags);
> if (err)
> return err;
> if (mddev->persistent) {
> rdev = md_import_device(dev, mddev->major_version,
> mddev->minor_version);
> @@ -4855,18 +4860,18 @@ new_dev_store(struct mddev *mddev, const char *buf,
> size_t len)
> rdev = md_import_device(dev, -2, -1);
> else
> rdev = md_import_device(dev, -1, -1);
>
> if (IS_ERR(rdev)) {
> - mddev_unlock_and_resume(mddev);
> + mddev_unlock_and_resume(mddev, noio_flags);
> return PTR_ERR(rdev);
> }
> err = bind_rdev_to_array(rdev, mddev);
> out:
> if (err)
> export_rdev(rdev);
> - mddev_unlock_and_resume(mddev);
> + mddev_unlock_and_resume(mddev, noio_flags);
> if (!err)
> md_new_event();
> return err ? err : len;
> }
>
> @@ -5257,28 +5262,29 @@ static int mddev_start_reshape(struct mddev *mddev)
> static ssize_t
> action_store(struct mddev *mddev, const char *page, size_t len)
> {
> int ret;
> enum sync_action action;
> + unsigned int noio_flags = 0;
>
> if (!mddev->pers || !mddev->pers->sync_request)
> return -EINVAL;
>
> action = md_sync_action_by_name(page);
> retry:
> if (work_busy(&mddev->sync_work))
> flush_work(&mddev->sync_work);
>
> ret = (action == ACTION_RESHAPE) ?
> - mddev_suspend_and_lock(mddev) :
> + mddev_suspend_and_lock(mddev, &noio_flags) :
> mddev_lock(mddev);
> if (ret)
> return ret;
>
> if (work_busy(&mddev->sync_work)) {
> if (action == ACTION_RESHAPE)
> - mddev_unlock_and_resume(mddev);
> + mddev_unlock_and_resume(mddev, noio_flags);
> else
> mddev_unlock(mddev);
> goto retry;
> }
>
> @@ -5349,11 +5355,11 @@ action_store(struct mddev *mddev, const char *page,
> size_t len)
> sysfs_notify_dirent_safe(mddev->sysfs_action);
> ret = len;
>
> out:
> if (action == ACTION_RESHAPE)
> - mddev_unlock_and_resume(mddev);
> + mddev_unlock_and_resume(mddev, noio_flags);
> else
> mddev_unlock(mddev);
> return ret;
> }
>
> @@ -5640,24 +5646,25 @@ suspend_lo_show(struct mddev *mddev, char *page)
>
> static ssize_t
> suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
> {
> unsigned long long new;
> + unsigned int noio_flags;
> int err;
>
> err = kstrtoull(buf, 10, &new);
> if (err < 0)
> return err;
> if (new != (sector_t)new)
> return -EINVAL;
>
> - err = mddev_suspend(mddev, true);
> + err = mddev_suspend(mddev, true, &noio_flags);
> if (err)
> return err;
>
> WRITE_ONCE(mddev->suspend_lo, new);
> - mddev_resume(mddev);
> + mddev_resume(mddev, noio_flags);
>
> return len;
> }
> static struct md_sysfs_entry md_suspend_lo =
> __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
> @@ -5671,24 +5678,25 @@ suspend_hi_show(struct mddev *mddev, char *page)
>
> static ssize_t
> suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
> {
> unsigned long long new;
> + unsigned int noio_flags;
> int err;
>
> err = kstrtoull(buf, 10, &new);
> if (err < 0)
> return err;
> if (new != (sector_t)new)
> return -EINVAL;
>
> - err = mddev_suspend(mddev, true);
> + err = mddev_suspend(mddev, true, &noio_flags);
> if (err)
> return err;
>
> WRITE_ONCE(mddev->suspend_hi, new);
> - mddev_resume(mddev);
> + mddev_resume(mddev, noio_flags);
>
> return len;
> }
> static struct md_sysfs_entry md_suspend_hi =
> __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
> @@ -5928,19 +5936,20 @@ static ssize_t serialize_policy_show(struct mddev
> *mddev, char *page)
> static ssize_t
> serialize_policy_store(struct mddev *mddev, const char *buf, size_t len)
> {
> int err;
> bool value;
> + unsigned int noio_flags;
>
> err = kstrtobool(buf, &value);
> if (err)
> return err;
>
> if (value == test_bit(MD_SERIALIZE_POLICY, &mddev->flags))
> return len;
>
> - err = mddev_suspend_and_lock(mddev);
> + err = mddev_suspend_and_lock(mddev, &noio_flags);
> if (err)
> return err;
> if (mddev->pers == NULL || (mddev->pers->head.id != ID_RAID1)) {
> pr_err("md: serialize_policy is only effective for raid1\n");
> err = -EINVAL;
> @@ -5953,11 +5962,11 @@ serialize_policy_store(struct mddev *mddev, const
> char *buf, size_t len)
> } else {
> mddev_destroy_serial_pool(mddev, NULL);
> clear_bit(MD_SERIALIZE_POLICY, &mddev->flags);
> }
> unlock:
> - mddev_unlock_and_resume(mddev);
> + mddev_unlock_and_resume(mddev, noio_flags);
> return err ?: len;
> }
>
> static struct md_sysfs_entry md_serialize_policy =
> __ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show,
> @@ -6263,21 +6272,22 @@ EXPORT_SYMBOL_GPL(mddev_stack_new_rdev);
>
> /* update the optimal I/O size after a reshape */
> void mddev_update_io_opt(struct mddev *mddev, unsigned int nr_stripes)
> {
> struct queue_limits lim;
> + unsigned int noio_flags;
>
> if (mddev_is_dm(mddev))
> return;
>
> /* don't bother updating io_opt if we can't suspend the array */
> - if (mddev_suspend(mddev, false) < 0)
> + if (mddev_suspend(mddev, false, &noio_flags) < 0)
> return;
> lim = queue_limits_start_update(mddev->gendisk->queue);
> lim.io_opt = lim.io_min * nr_stripes;
> queue_limits_commit_update(mddev->gendisk->queue, &lim);
> - mddev_resume(mddev);
> + mddev_resume(mddev, noio_flags);
> }
> EXPORT_SYMBOL_GPL(mddev_update_io_opt);
>
> static void mddev_delayed_delete(struct work_struct *ws)
> {
> @@ -7255,10 +7265,11 @@ static void autorun_array(struct mddev *mddev)
> */
> static void autorun_devices(int part)
> {
> struct md_rdev *rdev0, *rdev, *tmp;
> struct mddev *mddev;
> + unsigned int noio_flags;
>
> pr_info("md: autorun ...\n");
> while (!list_empty(&pending_raid_disks)) {
> int unit;
> dev_t dev;
> @@ -7295,27 +7306,27 @@ static void autorun_devices(int part)
>
> mddev = md_alloc(dev, NULL);
> if (IS_ERR(mddev))
> break;
>
> - if (mddev_suspend_and_lock(mddev))
> + if (mddev_suspend_and_lock(mddev, &noio_flags))
> pr_warn("md: %s locked, cannot run\n", mdname(mddev));
> else if (mddev->raid_disks || mddev->major_version
> || !list_empty(&mddev->disks)) {
> pr_warn("md: %s already running, cannot run %pg\n",
> mdname(mddev), rdev0->bdev);
> - mddev_unlock_and_resume(mddev);
> + mddev_unlock_and_resume(mddev, noio_flags);
> } else {
> pr_debug("md: created %s\n", mdname(mddev));
> mddev->persistent = 1;
> rdev_for_each_list(rdev, tmp, &candidates) {
> list_del_init(&rdev->same_set);
> if (bind_rdev_to_array(rdev, mddev))
> export_rdev(rdev);
> }
> autorun_array(mddev);
> - mddev_unlock_and_resume(mddev);
> + mddev_unlock_and_resume(mddev, noio_flags);
> }
> /* on success, candidates will be empty, on error
> * it won't...
> */
> rdev_for_each_list(rdev, tmp, &candidates) {
> @@ -8329,10 +8340,11 @@ static int __md_set_array_info(struct mddev *mddev,
> void __user *argp)
>
> static int md_ioctl(struct block_device *bdev, blk_mode_t mode,
> unsigned int cmd, unsigned long arg)
> {
> int err = 0;
> + unsigned int noio_flags = 0;
> void __user *argp = (void __user *)arg;
> struct mddev *mddev = NULL;
>
> err = md_ioctl_valid(cmd);
> if (err)
> @@ -8380,11 +8392,11 @@ static int md_ioctl(struct block_device *bdev,
> blk_mode_t mode,
> }
>
> if (!md_is_rdwr(mddev))
> flush_work(&mddev->sync_work);
>
> - err = md_ioctl_need_suspend(cmd) ? mddev_suspend_and_lock(mddev) :
> + err = md_ioctl_need_suspend(cmd) ? mddev_suspend_and_lock(mddev,
> &noio_flags) :
> mddev_lock(mddev);
> if (err) {
> pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n",
> err, cmd);
> goto out;
> @@ -8511,11 +8523,11 @@ static int md_ioctl(struct block_device *bdev,
> blk_mode_t mode,
> unlock:
> if (mddev->hold_active == UNTIL_IOCTL &&
> err != -EINVAL)
> mddev->hold_active = 0;
>
> - md_ioctl_need_suspend(cmd) ? mddev_unlock_and_resume(mddev) :
> + md_ioctl_need_suspend(cmd) ? mddev_unlock_and_resume(mddev, noio_flags)
> :
> mddev_unlock(mddev);
>
> out:
> if (cmd == STOP_ARRAY_RO || (err && cmd == STOP_ARRAY))
> clear_bit(MD_CLOSING, &mddev->flags);
> @@ -10180,20 +10192,21 @@ static bool md_choose_sync_action(struct mddev
> *mddev, int *spares)
> static void md_start_sync(struct work_struct *ws)
> {
> struct mddev *mddev = container_of(ws, struct mddev, sync_work);
> int spares = 0;
> bool suspend = false;
> + unsigned int noio_flags = 0;
> char *name;
>
> /*
> * If reshape is still in progress, spares won't be added or removed
> * from conf until reshape is done.
> */
> if (mddev->reshape_position == MaxSector &&
> md_spares_need_change(mddev)) {
> suspend = true;
> - mddev_suspend(mddev, false);
> + mddev_suspend(mddev, false, &noio_flags);
> }
>
> mddev_lock_nointr(mddev);
> if (!md_is_rdwr(mddev)) {
> /*
> @@ -10237,11 +10250,11 @@ static void md_start_sync(struct work_struct *ws)
> * not set it again. Otherwise, we may cause issue like this one:
> * https://bugzilla.kernel.org/show_bug.cgi?id=218200
> * Therefore, use __mddev_resume(mddev, false).
> */
> if (suspend)
> - __mddev_resume(mddev, false);
> + __mddev_resume(mddev, false, noio_flags);
> md_wakeup_thread(mddev->sync_thread);
> sysfs_notify_dirent_safe(mddev->sysfs_action);
> md_new_event();
> return;
>
> @@ -10257,11 +10270,11 @@ static void md_start_sync(struct work_struct *ws)
> * not set it again. Otherwise, we may cause issue like this one:
> * https://bugzilla.kernel.org/show_bug.cgi?id=218200
> * Therefore, use __mddev_resume(mddev, false).
> */
> if (suspend)
> - __mddev_resume(mddev, false);
> + __mddev_resume(mddev, false, noio_flags);
>
> wake_up(&resync_wait);
> if (test_and_clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
> mddev->sysfs_action)
> sysfs_notify_dirent_safe(mddev->sysfs_action);
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index d8daf0f75cbb..3337cd21eb30 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -619,11 +619,10 @@ struct mddev {
> mempool_t *serial_info_pool;
> void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
> struct md_cluster_info *cluster_info;
> struct md_cluster_operations *cluster_ops;
> unsigned int good_device_nr; /* good device num
> within cluster raid */
> - unsigned int noio_flag; /* for memalloc scope API */
>
> /*
> * Temporarily store rdev that will be finally removed when
> * reconfig_mutex is unlocked, protected by reconfig_mutex.
> */
> @@ -953,12 +952,13 @@ extern void md_stop(struct mddev *mddev);
> extern void md_stop_writes(struct mddev *mddev);
> extern int md_rdev_init(struct md_rdev *rdev);
> extern void md_rdev_clear(struct md_rdev *rdev);
>
> extern bool md_handle_request(struct mddev *mddev, struct bio *bio);
> -extern int mddev_suspend(struct mddev *mddev, bool interruptible);
> -extern void mddev_resume(struct mddev *mddev);
> +extern int mddev_suspend(struct mddev *mddev, bool interruptible,
> + unsigned int *noio_flags);
> +extern void mddev_resume(struct mddev *mddev, unsigned int noio_flags);
> extern void md_idle_sync_thread(struct mddev *mddev);
> extern void md_frozen_sync_thread(struct mddev *mddev);
> extern void md_unfrozen_sync_thread(struct mddev *mddev);
>
> extern void md_update_sb(struct mddev *mddev, int force);
> @@ -999,35 +999,38 @@ static inline void mddev_check_write_zeroes(struct
> mddev *mddev, struct bio *bio
> if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
> !bio->bi_bdev->bd_disk->queue->limits.max_write_zeroes_sectors)
> mddev->gendisk->queue->limits.max_write_zeroes_sectors = 0;
> }
>
> -static inline int mddev_suspend_and_lock(struct mddev *mddev)
> +static inline int mddev_suspend_and_lock(struct mddev *mddev,
> + unsigned int *noio_flags)
> {
> int ret;
>
> - ret = mddev_suspend(mddev, true);
> + ret = mddev_suspend(mddev, true, noio_flags);
> if (ret)
> return ret;
>
> ret = mddev_lock(mddev);
> if (ret)
> - mddev_resume(mddev);
> + mddev_resume(mddev, *noio_flags);
>
> return ret;
> }
>
> -static inline void mddev_suspend_and_lock_nointr(struct mddev *mddev)
> +static inline void mddev_suspend_and_lock_nointr(struct mddev *mddev,
> + unsigned int *noio_flags)
> {
> - mddev_suspend(mddev, false);
> + mddev_suspend(mddev, false, noio_flags);
> mddev_lock_nointr(mddev);
> }
>
> -static inline void mddev_unlock_and_resume(struct mddev *mddev)
> +static inline void mddev_unlock_and_resume(struct mddev *mddev,
> + unsigned int noio_flags)
> {
> mddev_unlock(mddev);
> - mddev_resume(mddev);
> + mddev_resume(mddev, noio_flags);
> }
>
> struct mdu_array_info_s;
> struct mdu_disk_info_s;
>
> diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
> index 7b7546bfa21f..6f8e3a624456 100644
> --- a/drivers/md/raid5-cache.c
> +++ b/drivers/md/raid5-cache.c
> @@ -693,13 +693,15 @@ static void r5c_disable_writeback_async(struct
> work_struct *work)
> !READ_ONCE(conf->log) ||
> !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
>
> log = READ_ONCE(conf->log);
> if (log) {
> - mddev_suspend(mddev, false);
> + unsigned int noio_flags;
> +
> + mddev_suspend(mddev, false, &noio_flags);
> log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
> - mddev_resume(mddev);
> + mddev_resume(mddev, noio_flags);
> }
> }
>
> static void r5l_submit_current_io(struct r5l_log *log)
> {
> @@ -2603,10 +2605,11 @@ EXPORT_SYMBOL(r5c_journal_mode_set);
> static ssize_t r5c_journal_mode_store(struct mddev *mddev,
> const char *page, size_t length)
> {
> int mode = ARRAY_SIZE(r5c_journal_mode_str);
> size_t len = length;
> + unsigned int noio_flags;
> int ret;
>
> if (len < 2)
> return -EINVAL;
>
> @@ -2615,15 +2618,15 @@ static ssize_t r5c_journal_mode_store(struct mddev
> *mddev,
>
> while (mode--)
> if (strlen(r5c_journal_mode_str[mode]) == len &&
> !strncmp(page, r5c_journal_mode_str[mode], len))
> break;
> - ret = mddev_suspend_and_lock(mddev);
> + ret = mddev_suspend_and_lock(mddev, &noio_flags);
> if (ret)
> return ret;
> ret = r5c_journal_mode_set(mddev, mode);
> - mddev_unlock_and_resume(mddev);
> + mddev_unlock_and_resume(mddev, noio_flags);
> return ret ?: length;
> }
>
> struct md_sysfs_entry
> r5c_journal_mode = __ATTR(journal_mode, 0644,
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index 65ae7d8930fc..6062c4b62cc8 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -6992,11 +6992,11 @@ raid5_show_stripe_size(struct mddev *mddev, char
> *page)
> #if PAGE_SIZE != DEFAULT_STRIPE_SIZE
> static ssize_t
> raid5_store_stripe_size(struct mddev *mddev, const char *page, size_t len)
> {
> struct r5conf *conf;
> - unsigned long new;
> + unsigned long new, noio_flags;
> int err;
> int size;
>
> if (len >= PAGE_SIZE)
> return -EINVAL;
> @@ -7011,11 +7011,11 @@ raid5_store_stripe_size(struct mddev *mddev, const
> char *page, size_t len)
> if (new % DEFAULT_STRIPE_SIZE != 0 ||
> new > PAGE_SIZE || new == 0 ||
> new != roundup_pow_of_two(new))
> return -EINVAL;
>
> - err = mddev_suspend_and_lock(mddev);
> + err = mddev_suspend_and_lock(mddev, &noio_flags);
> if (err)
> return err;
>
> conf = mddev->private;
> if (!conf) {
> @@ -7049,11 +7049,11 @@ raid5_store_stripe_size(struct mddev *mddev, const
> char *page, size_t len)
> err = -ENOMEM;
> }
> mutex_unlock(&conf->cache_size_mutex);
>
> out_unlock:
> - mddev_unlock_and_resume(mddev);
> + mddev_unlock_and_resume(mddev, noio_flags);
> return err ?: len;
> }
>
> static struct md_sysfs_entry
> raid5_stripe_size = __ATTR(stripe_size, 0644,
> @@ -7127,19 +7127,20 @@ raid5_show_skip_copy(struct mddev *mddev, char *page)
> static ssize_t
> raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
> {
> struct r5conf *conf;
> unsigned long new;
> + unsigned int noio_flags;
> int err;
>
> if (len >= PAGE_SIZE)
> return -EINVAL;
> if (kstrtoul(page, 10, &new))
> return -EINVAL;
> new = !!new;
>
> - err = mddev_suspend_and_lock(mddev);
> + err = mddev_suspend_and_lock(mddev, &noio_flags);
> if (err)
> return err;
> conf = mddev->private;
> if (!conf)
> err = -ENODEV;
> @@ -7152,11 +7153,11 @@ raid5_store_skip_copy(struct mddev *mddev, const char
> *page, size_t len)
> lim.features |= BLK_FEAT_STABLE_WRITES;
> else
> lim.features &= ~BLK_FEAT_STABLE_WRITES;
> err = queue_limits_commit_update(q, &lim);
> }
> - mddev_unlock_and_resume(mddev);
> + mddev_unlock_and_resume(mddev, noio_flags);
> return err ?: len;
> }
>
> static struct md_sysfs_entry
> raid5_skip_copy = __ATTR(skip_copy, S_IRUGO | S_IWUSR,
> @@ -7195,10 +7196,11 @@ static int alloc_thread_groups(struct r5conf *conf,
> int cnt,
> static ssize_t
> raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t
> len)
> {
> struct r5conf *conf;
> unsigned int new;
> + unsigned int noio_flags;
> int err;
> struct r5worker_group *new_groups, *old_groups;
> int group_cnt;
>
> if (len >= PAGE_SIZE)
> @@ -7207,16 +7209,16 @@ raid5_store_group_thread_cnt(struct mddev *mddev,
> const char *page, size_t len)
> return -EINVAL;
> /* 8192 should be big enough */
> if (new > 8192)
> return -EINVAL;
>
> - err = mddev_suspend_and_lock(mddev);
> + err = mddev_suspend_and_lock(mddev, &noio_flags);
> if (err)
> return err;
> conf = mddev->private;
> if (!conf) {
> - mddev_unlock_and_resume(mddev);
> + mddev_unlock_and_resume(mddev, noio_flags);
> return -ENODEV;
> }
> raid5_quiesce(mddev, true);
>
> if (new != conf->worker_cnt_per_group) {
> @@ -7237,11 +7239,11 @@ raid5_store_group_thread_cnt(struct mddev *mddev,
> const char *page, size_t len)
> kfree(old_groups);
> }
> }
>
> raid5_quiesce(mddev, false);
> - mddev_unlock_and_resume(mddev);
> + mddev_unlock_and_resume(mddev, noio_flags);
>
> return err ?: len;
> }
>
> static struct md_sysfs_entry
> @@ -8940,18 +8942,19 @@ static void *raid6_takeover(struct mddev *mddev)
> }
>
> static int raid5_change_consistency_policy(struct mddev *mddev, const char
> *buf)
> {
> struct r5conf *conf;
> + unsigned int noio_flags;
> int err;
>
> - err = mddev_suspend_and_lock(mddev);
> + err = mddev_suspend_and_lock(mddev, &noio_flags);
> if (err)
> return err;
> conf = mddev->private;
> if (!conf) {
> - mddev_unlock_and_resume(mddev);
> + mddev_unlock_and_resume(mddev, noio_flags);
> return -ENODEV;
> }
>
> if (strncmp(buf, "ppl", 3) == 0) {
> /* ppl only works with RAID 5 */
> @@ -8990,11 +8993,11 @@ static int raid5_change_consistency_policy(struct
> mddev *mddev, const char *buf)
> }
>
> if (!err)
> md_update_sb(mddev, 1);
>
> - mddev_unlock_and_resume(mddev);
> + mddev_unlock_and_resume(mddev, noio_flags);
>
> return err;
> }
>
> static int raid5_start(struct mddev *mddev)
--
Thanks,
Kuai