Hi,

在 2026/6/24 18:13, Chen Cheng 写道:
> From: Chen Cheng <[email protected]>
>
> Saving the memalloc token in the mddev-scoped mddev->noio_flag causes
> PF_MEMALLOC_NOIO to leak into task A, while task B restores a token that
> it never saved.
>
> scenario:
>
> task A                          mddev                         task B
> ======                          =======                       ============
> write suspend_lo
> mddev_suspend()
>                                  suspended == 0
>                                  drain active_io
>                                  suspended = 1
> A: noio_A = memalloc_noio_save()
> A returns with PF_MEMALLOC_NOIO set
>
>                                                                write suspend_hi
>                                                                mddev_suspend()
>                                  suspended == 1
>                                  suspended = 2
>                                                                B returns
>                                                                (no save)
>
> mddev_resume()
>                                  suspended = 1
>                                  not last resume
> A returns
> A still has PF_MEMALLOC_NOIO   <-- leaked
>
>                                                                mddev_resume()
>                                  suspended = 0
>                                                                memalloc_noio_restore(noio_A)
>                                                                (restores A's token in B)
>
> Fixed by:
>    - return each caller's noio_flags from mddev_suspend()
>    - pass that token back into mddev_resume()
>    - update the suspend-and-lock helpers to carry the token
>    - store the token in struct raid_set for dm-raid paths where suspend
>      and resume are paired across callbacks
>
> Validation:
> The reproducer repeatedly updates the array's suspend_lo and suspend_hi
> sysfs attributes from many concurrent userspace workers, which makes
> multiple tasks call mddev_suspend()/mddev_resume() concurrently.
>
> Each worker:
>    - reads its initial /proc/self/stat flags and verifies that
>      PF_MEMALLOC_NOIO is not already set
>    - writes 0 to either suspend_lo or suspend_hi
>    - immediately reads its own task flags again
>    - reports a successful reproduction if flags & PF_MEMALLOC_NOIO is
>      non-zero after the write returns
>
> Link: https://github.com/chencheng-fnnas/reproducer/blob/main/repro-md-noio-token-leak.sh
>
> Fixes: 78f57ef9d50a ("md: use memalloc scope APIs in mddev_suspend()/mddev_resume()")
>
> Signed-off-by: Chen Cheng <[email protected]>
> ---
>   drivers/md/dm-raid.c       |  7 ++--
>   drivers/md/md-autodetect.c |  5 ++-
>   drivers/md/md-bitmap.c     | 12 +++---
>   drivers/md/md.c            | 85 ++++++++++++++++++++++----------------
>   drivers/md/md.h            | 23 ++++++-----
>   drivers/md/raid5-cache.c   | 11 +++--
>   drivers/md/raid5.c         | 25 ++++++-----
>   7 files changed, 97 insertions(+), 71 deletions(-)
>
> diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
> index 8f5a5e1342a9..d89207e3722a 100644
> --- a/drivers/md/dm-raid.c
> +++ b/drivers/md/dm-raid.c
> @@ -239,10 +239,11 @@ struct raid_set {
>       int raid_disks;
>       int delta_disks;
>       int data_offset;
>       int raid10_copies;
>       int requested_bitmap_chunk_sectors;
> +     unsigned int suspend_noio_flags;
>   
>       struct mddev md;
>       struct raid_type *raid_type;
>   
>       sector_t array_sectors;
> @@ -3251,11 +3252,11 @@ static int raid_ctr(struct dm_target *ti, unsigned 
> int argc, char **argv)
>       /* Start raid set read-only and assumed clean to change in 
> raid_resume() */
>       rs->md.ro = MD_RDONLY;
>       rs->md.in_sync = 1;
>   
>       /* Has to be held on running the array */
> -     mddev_suspend_and_lock_nointr(&rs->md);
> +     mddev_suspend_and_lock_nointr(&rs->md, &rs->suspend_noio_flags);
>   
>       /* Keep array frozen until resume. */
>       md_frozen_sync_thread(&rs->md);
>   
>       r = md_run(&rs->md);
> @@ -3863,11 +3864,11 @@ static void raid_postsuspend(struct dm_target *ti)
>               /*
>                * sync_thread must be stopped during suspend, and writes have
>                * to be stopped before suspending to avoid deadlocks.
>                */
>               md_stop_writes(&rs->md);
> -             mddev_suspend(&rs->md, false);
> +             mddev_suspend(&rs->md, false, &rs->suspend_noio_flags);
>               rs->md.ro = MD_RDONLY;
>       }
>       clear_bit(MD_DM_SUSPENDING, &mddev->flags);
>   
>   }
> @@ -4141,11 +4142,11 @@ static void raid_resume(struct dm_target *ti)
>                                                      
> lockdep_is_held(&mddev->reconfig_mutex)));
>               clear_bit(RT_FLAG_RS_FROZEN, &rs->runtime_flags);
>               mddev->ro = MD_RDWR;
>               mddev->in_sync = 0;
>               md_unfrozen_sync_thread(mddev);
> -             mddev_unlock_and_resume(mddev);
> +             mddev_unlock_and_resume(mddev, rs->suspend_noio_flags);
>       }
>   }

For mdraid, the changes look fine. However, for dm-raid, consider for example:

dmsetup suspend ...

In this case the array stays suspended after the task has returned to user
space. Looking at the commit that introduced memalloc_noio_save(), it is
meant to be called for mdraid arrays, so that memory can be allocated while
the array is suspended; I don't see why dm-raid needs it.
So I suggest simply bypassing memalloc_noio_save() for dm-raid, where
mddev_suspend() is not paired with mddev_resume() in the same task context.

>   
>   static struct target_type raid_target = {
>       .name = "raid",
> diff --git a/drivers/md/md-autodetect.c b/drivers/md/md-autodetect.c
> index 4b80165afd23..58e062cd0580 100644
> --- a/drivers/md/md-autodetect.c
> +++ b/drivers/md/md-autodetect.c
> @@ -126,10 +126,11 @@ static void __init md_setup_drive(struct md_setup_args 
> *args)
>       dev_t devices[MD_SB_DISKS + 1], mdev;
>       struct mdu_array_info_s ainfo = { };
>       struct mddev *mddev;
>       int err = 0, i;
>       char name[16];
> +     unsigned int noio_flags;
>   
>       if (args->partitioned) {
>               mdev = MKDEV(mdp_major, args->minor << MdpMinorShift);
>               sprintf(name, "md_d%d", args->minor);
>       } else {
> @@ -173,11 +174,11 @@ static void __init md_setup_drive(struct md_setup_args 
> *args)
>       if (IS_ERR(mddev)) {
>               pr_err("md: md_alloc failed - cannot start array %s\n", name);
>               return;
>       }
>   
> -     err = mddev_suspend_and_lock(mddev);
> +     err = mddev_suspend_and_lock(mddev, &noio_flags);
>       if (err) {
>               pr_err("md: failed to lock array %s\n", name);
>               goto out_mddev_put;
>       }
>   
> @@ -219,11 +220,11 @@ static void __init md_setup_drive(struct md_setup_args 
> *args)
>       if (!err)
>               err = do_md_run(mddev);
>       if (err)
>               pr_warn("md: starting %s failed\n", name);
>   out_unlock:
> -     mddev_unlock_and_resume(mddev);
> +     mddev_unlock_and_resume(mddev, noio_flags);
>   out_mddev_put:
>       mddev_put(mddev);
>   }
>   
>   static int __init raid_setup(char *str)
> diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
> index 028b9ca8ce52..74b7f569a3f4 100644
> --- a/drivers/md/md-bitmap.c
> +++ b/drivers/md/md-bitmap.c
> @@ -2620,13 +2620,14 @@ location_show(struct mddev *mddev, char *page)
>   }
>   
>   static ssize_t
>   location_store(struct mddev *mddev, const char *buf, size_t len)
>   {
> +     unsigned int noio_flags;
>       int rv;
>   
> -     rv = mddev_suspend_and_lock(mddev);
> +     rv = mddev_suspend_and_lock(mddev, &noio_flags);
>       if (rv)
>               return rv;
>   
>       if (mddev->pers) {
>               if (mddev->recovery || mddev->sync_thread) {
> @@ -2711,11 +2712,11 @@ location_store(struct mddev *mddev, const char *buf, 
> size_t len)
>               set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
>               md_wakeup_thread(mddev->thread);
>       }
>       rv = 0;
>   out:
> -     mddev_unlock_and_resume(mddev);
> +     mddev_unlock_and_resume(mddev, noio_flags);
>       if (rv)
>               return rv;
>       return len;
>   
>   merge_err:
> @@ -2831,17 +2832,18 @@ backlog_store(struct mddev *mddev, const char *buf, 
> size_t len)
>   {
>       unsigned long backlog;
>       unsigned long old_mwb = mddev->bitmap_info.max_write_behind;
>       struct md_rdev *rdev;
>       bool has_write_mostly = false;
> +     unsigned int noio_flags;
>       int rv = kstrtoul(buf, 10, &backlog);
>       if (rv)
>               return rv;
>       if (backlog > COUNTER_MAX)
>               return -EINVAL;
>   
> -     rv = mddev_suspend_and_lock(mddev);
> +     rv = mddev_suspend_and_lock(mddev, &noio_flags);
>       if (rv)
>               return rv;
>   
>       /*
>        * Without write mostly device, it doesn't make sense to set
> @@ -2854,11 +2856,11 @@ backlog_store(struct mddev *mddev, const char *buf, 
> size_t len)
>               }
>       }
>       if (!has_write_mostly) {
>               pr_warn_ratelimited("%s: can't set backlog, no write mostly 
> device available\n",
>                                   mdname(mddev));
> -             mddev_unlock(mddev);
> +             mddev_unlock_and_resume(mddev, noio_flags);
>               return -EINVAL;
>       }
>   
>       mddev->bitmap_info.max_write_behind = backlog;
>       if (!backlog && mddev->serial_info_pool) {
> @@ -2871,11 +2873,11 @@ backlog_store(struct mddev *mddev, const char *buf, 
> size_t len)
>                       mddev_create_serial_pool(mddev, rdev);
>       }
>       if (old_mwb != backlog)
>               bitmap_update_sb(mddev->bitmap);
>   
> -     mddev_unlock_and_resume(mddev);
> +     mddev_unlock_and_resume(mddev, noio_flags);
>       return len;
>   }
>   
>   static struct md_sysfs_entry bitmap_backlog =
>   __ATTR(backlog, S_IRUGO|S_IWUSR, backlog_show, backlog_store);
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 1377c407614c..86d938dee50a 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -459,11 +459,12 @@ static void md_submit_bio(struct bio *bio)
>   
>   /*
>    * Make sure no new requests are submitted to the device, and any requests 
> that
>    * have been submitted are completely handled.
>    */
> -int mddev_suspend(struct mddev *mddev, bool interruptible)
> +int mddev_suspend(struct mddev *mddev, bool interruptible,
> +               unsigned int *noio_flags)
>   {
>       int err = 0;
>   
>       /*
>        * hold reconfig_mutex to wait for normal io will deadlock, because
> @@ -478,10 +479,11 @@ int mddev_suspend(struct mddev *mddev, bool 
> interruptible)
>               mutex_lock(&mddev->suspend_mutex);
>       if (err)
>               return err;
>   
>       if (mddev->suspended) {
> +             *noio_flags = memalloc_noio_save();
>               WRITE_ONCE(mddev->suspended, mddev->suspended + 1);
>               mutex_unlock(&mddev->suspend_mutex);
>               return 0;
>       }
>   
> @@ -515,31 +517,30 @@ int mddev_suspend(struct mddev *mddev, bool 
> interruptible)
>        * prevent deadlock.
>        */
>       WRITE_ONCE(mddev->suspended, mddev->suspended + 1);
>   
>       /* restrict memory reclaim I/O during raid array is suspend */
> -     mddev->noio_flag = memalloc_noio_save();
> +     *noio_flags = memalloc_noio_save();
>   
>       mutex_unlock(&mddev->suspend_mutex);
>       return 0;
>   }
>   EXPORT_SYMBOL_GPL(mddev_suspend);
>   
> -static void __mddev_resume(struct mddev *mddev, bool recovery_needed)
> +static void __mddev_resume(struct mddev *mddev, bool recovery_needed,
> +                        unsigned int noio_flags)
>   {
>       lockdep_assert_not_held(&mddev->reconfig_mutex);
>   
>       mutex_lock(&mddev->suspend_mutex);
> +     memalloc_noio_restore(noio_flags);
>       WRITE_ONCE(mddev->suspended, mddev->suspended - 1);
>       if (mddev->suspended) {
>               mutex_unlock(&mddev->suspend_mutex);
>               return;
>       }
>   
> -     /* entred the memalloc scope from mddev_suspend() */
> -     memalloc_noio_restore(mddev->noio_flag);
> -
>       percpu_ref_resurrect(&mddev->active_io);
>       wake_up(&mddev->sb_wait);
>   
>       if (recovery_needed)
>               set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
> @@ -547,13 +548,13 @@ static void __mddev_resume(struct mddev *mddev, bool 
> recovery_needed)
>       md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
>   
>       mutex_unlock(&mddev->suspend_mutex);
>   }
>   
> -void mddev_resume(struct mddev *mddev)
> +void mddev_resume(struct mddev *mddev, unsigned int noio_flags)
>   {
> -     return __mddev_resume(mddev, true);
> +     return __mddev_resume(mddev, true, noio_flags);
>   }
>   EXPORT_SYMBOL_GPL(mddev_resume);
>   
>   /* sync bdev before setting device to readonly or stopping raid*/
>   static int mddev_set_closing_and_sync_blockdev(struct mddev *mddev, int 
> opener_num)
> @@ -3737,10 +3738,11 @@ rdev_attr_store(struct kobject *kobj, struct 
> attribute *attr,
>   {
>       struct rdev_sysfs_entry *entry = container_of(attr, struct 
> rdev_sysfs_entry, attr);
>       struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
>       struct kernfs_node *kn = NULL;
>       bool suspend = false;
> +     unsigned int noio_flags = 0;
>       ssize_t rv;
>       struct mddev *mddev = READ_ONCE(rdev->mddev);
>   
>       if (!entry->store)
>               return -EIO;
> @@ -3756,17 +3758,17 @@ rdev_attr_store(struct kobject *kobj, struct 
> attribute *attr,
>                   cmd_match(page, "writemostly") ||
>                   cmd_match(page, "-writemostly"))
>                       suspend = true;
>       }
>   
> -     rv = suspend ? mddev_suspend_and_lock(mddev) : mddev_lock(mddev);
> +     rv = suspend ? mddev_suspend_and_lock(mddev, &noio_flags) : 
> mddev_lock(mddev);
>       if (!rv) {
>               if (rdev->mddev == NULL)
>                       rv = -ENODEV;
>               else
>                       rv = entry->store(rdev, page, length);
> -             suspend ? mddev_unlock_and_resume(mddev) : mddev_unlock(mddev);
> +             suspend ? mddev_unlock_and_resume(mddev, noio_flags) : 
> mddev_unlock(mddev);
>       }
>   
>       if (kn)
>               sysfs_unbreak_active_protection(kn);
>   
> @@ -4049,15 +4051,16 @@ level_store(struct mddev *mddev, const char *buf, 
> size_t len)
>       size_t slen = len;
>       struct md_personality *pers, *oldpers;
>       long level;
>       void *priv, *oldpriv;
>       struct md_rdev *rdev;
> +     unsigned int noio_flags;
>   
>       if (slen == 0 || slen >= sizeof(clevel))
>               return -EINVAL;
>   
> -     rv = mddev_suspend_and_lock(mddev);
> +     rv = mddev_suspend_and_lock(mddev, &noio_flags);
>       if (rv)
>               return rv;
>   
>       if (mddev->pers == NULL) {
>               memcpy(mddev->clevel, buf, slen);
> @@ -4231,11 +4234,11 @@ level_store(struct mddev *mddev, const char *buf, 
> size_t len)
>               md_update_sb(mddev, 1);
>       sysfs_notify_dirent_safe(mddev->sysfs_level);
>       md_new_event();
>       rv = len;
>   out_unlock:
> -     mddev_unlock_and_resume(mddev);
> +     mddev_unlock_and_resume(mddev, noio_flags);
>       return rv;
>   }
>   
>   static struct md_sysfs_entry md_level =
>   __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
> @@ -4410,17 +4413,18 @@ static int update_raid_disks(struct mddev *mddev, int 
> raid_disks);
>   
>   static ssize_t
>   raid_disks_store(struct mddev *mddev, const char *buf, size_t len)
>   {
>       unsigned int n;
> +     unsigned int noio_flags;
>       int err;
>   
>       err = kstrtouint(buf, 10, &n);
>       if (err < 0)
>               return err;
>   
> -     err = mddev_suspend_and_lock(mddev);
> +     err = mddev_suspend_and_lock(mddev, &noio_flags);
>       if (err)
>               return err;
>       if (mddev->pers) {
>               if (n != mddev->raid_disks)
>                       err = update_raid_disks(mddev, n);
> @@ -4442,11 +4446,11 @@ raid_disks_store(struct mddev *mddev, const char 
> *buf, size_t len)
>               mddev->raid_disks = n;
>               mddev->reshape_backwards = (mddev->delta_disks < 0);
>       } else
>               mddev->raid_disks = n;
>   out_unlock:
> -     mddev_unlock_and_resume(mddev);
> +     mddev_unlock_and_resume(mddev, noio_flags);
>       return err ? err : len;
>   }
>   static struct md_sysfs_entry md_raid_disks =
>   __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
>   
> @@ -4822,10 +4826,11 @@ new_dev_store(struct mddev *mddev, const char *buf, 
> size_t len)
>       char *e;
>       int major = simple_strtoul(buf, &e, 10);
>       int minor;
>       dev_t dev;
>       struct md_rdev *rdev;
> +     unsigned int noio_flags;
>       int err;
>   
>       if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
>               return -EINVAL;
>       minor = simple_strtoul(e+1, &e, 10);
> @@ -4834,11 +4839,11 @@ new_dev_store(struct mddev *mddev, const char *buf, 
> size_t len)
>       dev = MKDEV(major, minor);
>       if (major != MAJOR(dev) ||
>           minor != MINOR(dev))
>               return -EOVERFLOW;
>   
> -     err = mddev_suspend_and_lock(mddev);
> +     err = mddev_suspend_and_lock(mddev, &noio_flags);
>       if (err)
>               return err;
>       if (mddev->persistent) {
>               rdev = md_import_device(dev, mddev->major_version,
>                                       mddev->minor_version);
> @@ -4855,18 +4860,18 @@ new_dev_store(struct mddev *mddev, const char *buf, 
> size_t len)
>               rdev = md_import_device(dev, -2, -1);
>       else
>               rdev = md_import_device(dev, -1, -1);
>   
>       if (IS_ERR(rdev)) {
> -             mddev_unlock_and_resume(mddev);
> +             mddev_unlock_and_resume(mddev, noio_flags);
>               return PTR_ERR(rdev);
>       }
>       err = bind_rdev_to_array(rdev, mddev);
>    out:
>       if (err)
>               export_rdev(rdev);
> -     mddev_unlock_and_resume(mddev);
> +     mddev_unlock_and_resume(mddev, noio_flags);
>       if (!err)
>               md_new_event();
>       return err ? err : len;
>   }
>   
> @@ -5257,28 +5262,29 @@ static int mddev_start_reshape(struct mddev *mddev)
>   static ssize_t
>   action_store(struct mddev *mddev, const char *page, size_t len)
>   {
>       int ret;
>       enum sync_action action;
> +     unsigned int noio_flags = 0;
>   
>       if (!mddev->pers || !mddev->pers->sync_request)
>               return -EINVAL;
>   
>       action = md_sync_action_by_name(page);
>   retry:
>       if (work_busy(&mddev->sync_work))
>               flush_work(&mddev->sync_work);
>   
>       ret = (action == ACTION_RESHAPE) ?
> -             mddev_suspend_and_lock(mddev) :
> +             mddev_suspend_and_lock(mddev, &noio_flags) :
>               mddev_lock(mddev);
>       if (ret)
>               return ret;
>   
>       if (work_busy(&mddev->sync_work)) {
>               if (action == ACTION_RESHAPE)
> -                     mddev_unlock_and_resume(mddev);
> +                     mddev_unlock_and_resume(mddev, noio_flags);
>               else
>                       mddev_unlock(mddev);
>               goto retry;
>       }
>   
> @@ -5349,11 +5355,11 @@ action_store(struct mddev *mddev, const char *page, 
> size_t len)
>       sysfs_notify_dirent_safe(mddev->sysfs_action);
>       ret = len;
>   
>   out:
>       if (action == ACTION_RESHAPE)
> -             mddev_unlock_and_resume(mddev);
> +             mddev_unlock_and_resume(mddev, noio_flags);
>       else
>               mddev_unlock(mddev);
>       return ret;
>   }
>   
> @@ -5640,24 +5646,25 @@ suspend_lo_show(struct mddev *mddev, char *page)
>   
>   static ssize_t
>   suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
>   {
>       unsigned long long new;
> +     unsigned int noio_flags;
>       int err;
>   
>       err = kstrtoull(buf, 10, &new);
>       if (err < 0)
>               return err;
>       if (new != (sector_t)new)
>               return -EINVAL;
>   
> -     err = mddev_suspend(mddev, true);
> +     err = mddev_suspend(mddev, true, &noio_flags);
>       if (err)
>               return err;
>   
>       WRITE_ONCE(mddev->suspend_lo, new);
> -     mddev_resume(mddev);
> +     mddev_resume(mddev, noio_flags);
>   
>       return len;
>   }
>   static struct md_sysfs_entry md_suspend_lo =
>   __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
> @@ -5671,24 +5678,25 @@ suspend_hi_show(struct mddev *mddev, char *page)
>   
>   static ssize_t
>   suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
>   {
>       unsigned long long new;
> +     unsigned int noio_flags;
>       int err;
>   
>       err = kstrtoull(buf, 10, &new);
>       if (err < 0)
>               return err;
>       if (new != (sector_t)new)
>               return -EINVAL;
>   
> -     err = mddev_suspend(mddev, true);
> +     err = mddev_suspend(mddev, true, &noio_flags);
>       if (err)
>               return err;
>   
>       WRITE_ONCE(mddev->suspend_hi, new);
> -     mddev_resume(mddev);
> +     mddev_resume(mddev, noio_flags);
>   
>       return len;
>   }
>   static struct md_sysfs_entry md_suspend_hi =
>   __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
> @@ -5928,19 +5936,20 @@ static ssize_t serialize_policy_show(struct mddev 
> *mddev, char *page)
>   static ssize_t
>   serialize_policy_store(struct mddev *mddev, const char *buf, size_t len)
>   {
>       int err;
>       bool value;
> +     unsigned int noio_flags;
>   
>       err = kstrtobool(buf, &value);
>       if (err)
>               return err;
>   
>       if (value == test_bit(MD_SERIALIZE_POLICY, &mddev->flags))
>               return len;
>   
> -     err = mddev_suspend_and_lock(mddev);
> +     err = mddev_suspend_and_lock(mddev, &noio_flags);
>       if (err)
>               return err;
>       if (mddev->pers == NULL || (mddev->pers->head.id != ID_RAID1)) {
>               pr_err("md: serialize_policy is only effective for raid1\n");
>               err = -EINVAL;
> @@ -5953,11 +5962,11 @@ serialize_policy_store(struct mddev *mddev, const 
> char *buf, size_t len)
>       } else {
>               mddev_destroy_serial_pool(mddev, NULL);
>               clear_bit(MD_SERIALIZE_POLICY, &mddev->flags);
>       }
>   unlock:
> -     mddev_unlock_and_resume(mddev);
> +     mddev_unlock_and_resume(mddev, noio_flags);
>       return err ?: len;
>   }
>   
>   static struct md_sysfs_entry md_serialize_policy =
>   __ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show,
> @@ -6263,21 +6272,22 @@ EXPORT_SYMBOL_GPL(mddev_stack_new_rdev);
>   
>   /* update the optimal I/O size after a reshape */
>   void mddev_update_io_opt(struct mddev *mddev, unsigned int nr_stripes)
>   {
>       struct queue_limits lim;
> +     unsigned int noio_flags;
>   
>       if (mddev_is_dm(mddev))
>               return;
>   
>       /* don't bother updating io_opt if we can't suspend the array */
> -     if (mddev_suspend(mddev, false) < 0)
> +     if (mddev_suspend(mddev, false, &noio_flags) < 0)
>               return;
>       lim = queue_limits_start_update(mddev->gendisk->queue);
>       lim.io_opt = lim.io_min * nr_stripes;
>       queue_limits_commit_update(mddev->gendisk->queue, &lim);
> -     mddev_resume(mddev);
> +     mddev_resume(mddev, noio_flags);
>   }
>   EXPORT_SYMBOL_GPL(mddev_update_io_opt);
>   
>   static void mddev_delayed_delete(struct work_struct *ws)
>   {
> @@ -7255,10 +7265,11 @@ static void autorun_array(struct mddev *mddev)
>    */
>   static void autorun_devices(int part)
>   {
>       struct md_rdev *rdev0, *rdev, *tmp;
>       struct mddev *mddev;
> +     unsigned int noio_flags;
>   
>       pr_info("md: autorun ...\n");
>       while (!list_empty(&pending_raid_disks)) {
>               int unit;
>               dev_t dev;
> @@ -7295,27 +7306,27 @@ static void autorun_devices(int part)
>   
>               mddev = md_alloc(dev, NULL);
>               if (IS_ERR(mddev))
>                       break;
>   
> -             if (mddev_suspend_and_lock(mddev))
> +             if (mddev_suspend_and_lock(mddev, &noio_flags))
>                       pr_warn("md: %s locked, cannot run\n", mdname(mddev));
>               else if (mddev->raid_disks || mddev->major_version
>                        || !list_empty(&mddev->disks)) {
>                       pr_warn("md: %s already running, cannot run %pg\n",
>                               mdname(mddev), rdev0->bdev);
> -                     mddev_unlock_and_resume(mddev);
> +                     mddev_unlock_and_resume(mddev, noio_flags);
>               } else {
>                       pr_debug("md: created %s\n", mdname(mddev));
>                       mddev->persistent = 1;
>                       rdev_for_each_list(rdev, tmp, &candidates) {
>                               list_del_init(&rdev->same_set);
>                               if (bind_rdev_to_array(rdev, mddev))
>                                       export_rdev(rdev);
>                       }
>                       autorun_array(mddev);
> -                     mddev_unlock_and_resume(mddev);
> +                     mddev_unlock_and_resume(mddev, noio_flags);
>               }
>               /* on success, candidates will be empty, on error
>                * it won't...
>                */
>               rdev_for_each_list(rdev, tmp, &candidates) {
> @@ -8329,10 +8340,11 @@ static int __md_set_array_info(struct mddev *mddev, 
> void __user *argp)
>   
>   static int md_ioctl(struct block_device *bdev, blk_mode_t mode,
>                       unsigned int cmd, unsigned long arg)
>   {
>       int err = 0;
> +     unsigned int noio_flags = 0;
>       void __user *argp = (void __user *)arg;
>       struct mddev *mddev = NULL;
>   
>       err = md_ioctl_valid(cmd);
>       if (err)
> @@ -8380,11 +8392,11 @@ static int md_ioctl(struct block_device *bdev, 
> blk_mode_t mode,
>       }
>   
>       if (!md_is_rdwr(mddev))
>               flush_work(&mddev->sync_work);
>   
> -     err = md_ioctl_need_suspend(cmd) ? mddev_suspend_and_lock(mddev) :
> +     err = md_ioctl_need_suspend(cmd) ? mddev_suspend_and_lock(mddev, 
> &noio_flags) :
>                                          mddev_lock(mddev);
>       if (err) {
>               pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n",
>                        err, cmd);
>               goto out;
> @@ -8511,11 +8523,11 @@ static int md_ioctl(struct block_device *bdev, 
> blk_mode_t mode,
>   unlock:
>       if (mddev->hold_active == UNTIL_IOCTL &&
>           err != -EINVAL)
>               mddev->hold_active = 0;
>   
> -     md_ioctl_need_suspend(cmd) ? mddev_unlock_and_resume(mddev) :
> +     md_ioctl_need_suspend(cmd) ? mddev_unlock_and_resume(mddev, noio_flags) 
> :
>                                    mddev_unlock(mddev);
>   
>   out:
>       if (cmd == STOP_ARRAY_RO || (err && cmd == STOP_ARRAY))
>               clear_bit(MD_CLOSING, &mddev->flags);
> @@ -10180,20 +10192,21 @@ static bool md_choose_sync_action(struct mddev 
> *mddev, int *spares)
>   static void md_start_sync(struct work_struct *ws)
>   {
>       struct mddev *mddev = container_of(ws, struct mddev, sync_work);
>       int spares = 0;
>       bool suspend = false;
> +     unsigned int noio_flags = 0;
>       char *name;
>   
>       /*
>        * If reshape is still in progress, spares won't be added or removed
>        * from conf until reshape is done.
>        */
>       if (mddev->reshape_position == MaxSector &&
>           md_spares_need_change(mddev)) {
>               suspend = true;
> -             mddev_suspend(mddev, false);
> +             mddev_suspend(mddev, false, &noio_flags);
>       }
>   
>       mddev_lock_nointr(mddev);
>       if (!md_is_rdwr(mddev)) {
>               /*
> @@ -10237,11 +10250,11 @@ static void md_start_sync(struct work_struct *ws)
>        * not set it again. Otherwise, we may cause issue like this one:
>        *     https://bugzilla.kernel.org/show_bug.cgi?id=218200
>        * Therefore, use __mddev_resume(mddev, false).
>        */
>       if (suspend)
> -             __mddev_resume(mddev, false);
> +             __mddev_resume(mddev, false, noio_flags);
>       md_wakeup_thread(mddev->sync_thread);
>       sysfs_notify_dirent_safe(mddev->sysfs_action);
>       md_new_event();
>       return;
>   
> @@ -10257,11 +10270,11 @@ static void md_start_sync(struct work_struct *ws)
>        * not set it again. Otherwise, we may cause issue like this one:
>        *     https://bugzilla.kernel.org/show_bug.cgi?id=218200
>        * Therefore, use __mddev_resume(mddev, false).
>        */
>       if (suspend)
> -             __mddev_resume(mddev, false);
> +             __mddev_resume(mddev, false, noio_flags);
>   
>       wake_up(&resync_wait);
>       if (test_and_clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
>           mddev->sysfs_action)
>               sysfs_notify_dirent_safe(mddev->sysfs_action);
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index d8daf0f75cbb..3337cd21eb30 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -619,11 +619,10 @@ struct mddev {
>       mempool_t *serial_info_pool;
>       void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
>       struct md_cluster_info          *cluster_info;
>       struct md_cluster_operations *cluster_ops;
>       unsigned int                    good_device_nr; /* good device num 
> within cluster raid */
> -     unsigned int                    noio_flag; /* for memalloc scope API */
>   
>       /*
>        * Temporarily store rdev that will be finally removed when
>        * reconfig_mutex is unlocked, protected by reconfig_mutex.
>        */
> @@ -953,12 +952,13 @@ extern void md_stop(struct mddev *mddev);
>   extern void md_stop_writes(struct mddev *mddev);
>   extern int md_rdev_init(struct md_rdev *rdev);
>   extern void md_rdev_clear(struct md_rdev *rdev);
>   
>   extern bool md_handle_request(struct mddev *mddev, struct bio *bio);
> -extern int mddev_suspend(struct mddev *mddev, bool interruptible);
> -extern void mddev_resume(struct mddev *mddev);
> +extern int mddev_suspend(struct mddev *mddev, bool interruptible,
> +                      unsigned int *noio_flags);
> +extern void mddev_resume(struct mddev *mddev, unsigned int noio_flags);
>   extern void md_idle_sync_thread(struct mddev *mddev);
>   extern void md_frozen_sync_thread(struct mddev *mddev);
>   extern void md_unfrozen_sync_thread(struct mddev *mddev);
>   
>   extern void md_update_sb(struct mddev *mddev, int force);
> @@ -999,35 +999,38 @@ static inline void mddev_check_write_zeroes(struct 
> mddev *mddev, struct bio *bio
>       if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
>           !bio->bi_bdev->bd_disk->queue->limits.max_write_zeroes_sectors)
>               mddev->gendisk->queue->limits.max_write_zeroes_sectors = 0;
>   }
>   
> -static inline int mddev_suspend_and_lock(struct mddev *mddev)
> +static inline int mddev_suspend_and_lock(struct mddev *mddev,
> +                                      unsigned int *noio_flags)
>   {
>       int ret;
>   
> -     ret = mddev_suspend(mddev, true);
> +     ret = mddev_suspend(mddev, true, noio_flags);
>       if (ret)
>               return ret;
>   
>       ret = mddev_lock(mddev);
>       if (ret)
> -             mddev_resume(mddev);
> +             mddev_resume(mddev, *noio_flags);
>   
>       return ret;
>   }
>   
> -static inline void mddev_suspend_and_lock_nointr(struct mddev *mddev)
> +static inline void mddev_suspend_and_lock_nointr(struct mddev *mddev,
> +                                              unsigned int *noio_flags)
>   {
> -     mddev_suspend(mddev, false);
> +     mddev_suspend(mddev, false, noio_flags);
>       mddev_lock_nointr(mddev);
>   }
>   
> -static inline void mddev_unlock_and_resume(struct mddev *mddev)
> +static inline void mddev_unlock_and_resume(struct mddev *mddev,
> +                                        unsigned int noio_flags)
>   {
>       mddev_unlock(mddev);
> -     mddev_resume(mddev);
> +     mddev_resume(mddev, noio_flags);
>   }
>   
>   struct mdu_array_info_s;
>   struct mdu_disk_info_s;
>   
> diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
> index 7b7546bfa21f..6f8e3a624456 100644
> --- a/drivers/md/raid5-cache.c
> +++ b/drivers/md/raid5-cache.c
> @@ -693,13 +693,15 @@ static void r5c_disable_writeback_async(struct work_struct *work)
>                  !READ_ONCE(conf->log) ||
>                  !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
>   
>       log = READ_ONCE(conf->log);
>       if (log) {
> -             mddev_suspend(mddev, false);
> +             unsigned int noio_flags;
> +
> +             mddev_suspend(mddev, false, &noio_flags);
>               log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
> -             mddev_resume(mddev);
> +             mddev_resume(mddev, noio_flags);
>       }
>   }
>   
>   static void r5l_submit_current_io(struct r5l_log *log)
>   {
> @@ -2603,10 +2605,11 @@ EXPORT_SYMBOL(r5c_journal_mode_set);
>   static ssize_t r5c_journal_mode_store(struct mddev *mddev,
>                                     const char *page, size_t length)
>   {
>       int mode = ARRAY_SIZE(r5c_journal_mode_str);
>       size_t len = length;
> +     unsigned int noio_flags;
>       int ret;
>   
>       if (len < 2)
>               return -EINVAL;
>   
> @@ -2615,15 +2618,15 @@ static ssize_t r5c_journal_mode_store(struct mddev *mddev,
>   
>       while (mode--)
>               if (strlen(r5c_journal_mode_str[mode]) == len &&
>                   !strncmp(page, r5c_journal_mode_str[mode], len))
>                       break;
> -     ret = mddev_suspend_and_lock(mddev);
> +     ret = mddev_suspend_and_lock(mddev, &noio_flags);
>       if (ret)
>               return ret;
>       ret = r5c_journal_mode_set(mddev, mode);
> -     mddev_unlock_and_resume(mddev);
> +     mddev_unlock_and_resume(mddev, noio_flags);
>       return ret ?: length;
>   }
>   
>   struct md_sysfs_entry
>   r5c_journal_mode = __ATTR(journal_mode, 0644,
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index 65ae7d8930fc..6062c4b62cc8 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -6992,11 +6992,11 @@ raid5_show_stripe_size(struct mddev  *mddev, char *page)
>   #if PAGE_SIZE != DEFAULT_STRIPE_SIZE
>   static ssize_t
>   raid5_store_stripe_size(struct mddev  *mddev, const char *page, size_t len)
>   {
>       struct r5conf *conf;
> -     unsigned long new;
> +     unsigned long new, noio_flags;
>       int err;
>       int size;
>   
>       if (len >= PAGE_SIZE)
>               return -EINVAL;
> @@ -7011,11 +7011,11 @@ raid5_store_stripe_size(struct mddev  *mddev, const char *page, size_t len)
>       if (new % DEFAULT_STRIPE_SIZE != 0 ||
>                       new > PAGE_SIZE || new == 0 ||
>                       new != roundup_pow_of_two(new))
>               return -EINVAL;
>   
> -     err = mddev_suspend_and_lock(mddev);
> +     err = mddev_suspend_and_lock(mddev, &noio_flags);
>       if (err)
>               return err;
>   
>       conf = mddev->private;
>       if (!conf) {
> @@ -7049,11 +7049,11 @@ raid5_store_stripe_size(struct mddev  *mddev, const char *page, size_t len)
>               err = -ENOMEM;
>       }
>       mutex_unlock(&conf->cache_size_mutex);
>   
>   out_unlock:
> -     mddev_unlock_and_resume(mddev);
> +     mddev_unlock_and_resume(mddev, noio_flags);
>       return err ?: len;
>   }
>   
>   static struct md_sysfs_entry
>   raid5_stripe_size = __ATTR(stripe_size, 0644,
> @@ -7127,19 +7127,20 @@ raid5_show_skip_copy(struct mddev *mddev, char *page)
>   static ssize_t
>   raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
>   {
>       struct r5conf *conf;
>       unsigned long new;
> +     unsigned int noio_flags;
>       int err;
>   
>       if (len >= PAGE_SIZE)
>               return -EINVAL;
>       if (kstrtoul(page, 10, &new))
>               return -EINVAL;
>       new = !!new;
>   
> -     err = mddev_suspend_and_lock(mddev);
> +     err = mddev_suspend_and_lock(mddev, &noio_flags);
>       if (err)
>               return err;
>       conf = mddev->private;
>       if (!conf)
>               err = -ENODEV;
> @@ -7152,11 +7153,11 @@ raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
>                       lim.features |= BLK_FEAT_STABLE_WRITES;
>               else
>                       lim.features &= ~BLK_FEAT_STABLE_WRITES;
>               err = queue_limits_commit_update(q, &lim);
>       }
> -     mddev_unlock_and_resume(mddev);
> +     mddev_unlock_and_resume(mddev, noio_flags);
>       return err ?: len;
>   }
>   
>   static struct md_sysfs_entry
>   raid5_skip_copy = __ATTR(skip_copy, S_IRUGO | S_IWUSR,
> @@ -7195,10 +7196,11 @@ static int alloc_thread_groups(struct r5conf *conf, int cnt,
>   static ssize_t
>   raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
>   {
>       struct r5conf *conf;
>       unsigned int new;
> +     unsigned int noio_flags;
>       int err;
>       struct r5worker_group *new_groups, *old_groups;
>       int group_cnt;
>   
>       if (len >= PAGE_SIZE)
> @@ -7207,16 +7209,16 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
>               return -EINVAL;
>       /* 8192 should be big enough */
>       if (new > 8192)
>               return -EINVAL;
>   
> -     err = mddev_suspend_and_lock(mddev);
> +     err = mddev_suspend_and_lock(mddev, &noio_flags);
>       if (err)
>               return err;
>       conf = mddev->private;
>       if (!conf) {
> -             mddev_unlock_and_resume(mddev);
> +             mddev_unlock_and_resume(mddev, noio_flags);
>               return -ENODEV;
>       }
>       raid5_quiesce(mddev, true);
>   
>       if (new != conf->worker_cnt_per_group) {
> @@ -7237,11 +7239,11 @@ raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len)
>                       kfree(old_groups);
>               }
>       }
>   
>       raid5_quiesce(mddev, false);
> -     mddev_unlock_and_resume(mddev);
> +     mddev_unlock_and_resume(mddev, noio_flags);
>   
>       return err ?: len;
>   }
>   
>   static struct md_sysfs_entry
> @@ -8940,18 +8942,19 @@ static void *raid6_takeover(struct mddev *mddev)
>   }
>   
>   static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
>   {
>       struct r5conf *conf;
> +     unsigned int noio_flags;
>       int err;
>   
> -     err = mddev_suspend_and_lock(mddev);
> +     err = mddev_suspend_and_lock(mddev, &noio_flags);
>       if (err)
>               return err;
>       conf = mddev->private;
>       if (!conf) {
> -             mddev_unlock_and_resume(mddev);
> +             mddev_unlock_and_resume(mddev, noio_flags);
>               return -ENODEV;
>       }
>   
>       if (strncmp(buf, "ppl", 3) == 0) {
>               /* ppl only works with RAID 5 */
> @@ -8990,11 +8993,11 @@ static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
>       }
>   
>       if (!err)
>               md_update_sb(mddev, 1);
>   
> -     mddev_unlock_and_resume(mddev);
> +     mddev_unlock_and_resume(mddev, noio_flags);
>   
>       return err;
>   }
>   
>   static int raid5_start(struct mddev *mddev)

-- 
Thanks,
Kuai


Reply via email to