On Tue, Mar 24, 2026 at 6:58 AM Benjamin Marzinski <[email protected]> wrote:
>
> If make_stripe_request() returns STRIPE_WAIT_RESHAPE,
> raid5_make_request() will free the cloned bio. But raid5_make_request()
> can call make_stripe_request() multiple times, writing to the various
> stripes. If that bio got added to the toread or towrite lists of a
> stripe disk in an earlier call to make_stripe_request(), then it's not
> safe to just free the bio if a later part of it is found to cross the
> reshape position. Doing so can lead to a UAF error, when bio_endio()
> is called on the bio for the earlier stripes.
>
> Instead, raid5_make_request() needs to wait until all parts of the bio
> have called bio_endio(). To do this, bios that cross the reshape
> position while the reshape can't make progress are flagged as needing a
> retry, and mddev tracks the number of bios needing a retry which have
> not yet completed. When raid5_make_request() has a bio that failed
> make_stripe_request() with STRIPE_WAIT_RESHAPE, it waits for this
> counter to reach zero. When the bio_endio() is called for the last time
> on a bio needing a retry, it decrements mddev's count of outstanding
> bios needing a retry. This guarantees that raid5_make_request() doesn't
> return until the cloned bio needing a retry for io across the reshape
> boundary is safely cleaned up.
>
> There is a simple reproducer available at [1]. Compile the kernel with
> KASAN for more useful reporting when the error is triggered (this is not
> necessary to see the bug).
>
> [1] https://gist.github.com/bmarzins/e48598824305cf2171289e47d7241fa5
>
> Signed-off-by: Benjamin Marzinski <[email protected]>
> ---
>
> I've tested this for regressions with the lvm2-testsuite raid tests. I
> have not run any md-specific tests on it.
>
>
> drivers/md/md.c | 30 +++++++++---------------------
> drivers/md/md.h | 5 ++++-
> drivers/md/raid5.c | 8 +++++++-
> 3 files changed, 20 insertions(+), 23 deletions(-)
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 3ce6f9e9d38e..5ec116b9da32 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -776,9 +776,11 @@ int mddev_init(struct mddev *mddev)
> atomic_set(&mddev->active, 1);
> atomic_set(&mddev->openers, 0);
> atomic_set(&mddev->sync_seq, 0);
> + atomic_set(&mddev->pending_retry_bios, 0);
> spin_lock_init(&mddev->lock);
> init_waitqueue_head(&mddev->sb_wait);
> init_waitqueue_head(&mddev->recovery_wait);
> + init_waitqueue_head(&mddev->retry_bios_wait);
> mddev->reshape_position = MaxSector;
> mddev->reshape_backwards = 0;
> mddev->last_sync_action = ACTION_IDLE;
> @@ -9218,6 +9220,7 @@ static void md_end_clone_io(struct bio *bio)
> struct md_io_clone *md_io_clone = bio->bi_private;
> struct bio *orig_bio = md_io_clone->orig_bio;
> struct mddev *mddev = md_io_clone->mddev;
> + unsigned int must_retry = md_io_clone->must_retry;
>
> if (bio_data_dir(orig_bio) == WRITE && md_bitmap_enabled(mddev,
> false))
> md_bitmap_end(mddev, md_io_clone);
> @@ -9229,7 +9232,11 @@ static void md_end_clone_io(struct bio *bio)
> bio_end_io_acct(orig_bio, md_io_clone->start_time);
>
> bio_put(bio);
> - bio_endio(orig_bio);
> + if (unlikely(must_retry)) {
> + if (atomic_dec_and_test(&mddev->pending_retry_bios))
> + wake_up(&mddev->retry_bios_wait);
> + } else
> + bio_endio(orig_bio);
> percpu_ref_put(&mddev->active_io);
> }
>
> @@ -9243,6 +9250,7 @@ static void md_clone_bio(struct mddev *mddev, struct
> bio **bio)
> md_io_clone = container_of(clone, struct md_io_clone, bio_clone);
> md_io_clone->orig_bio = *bio;
> md_io_clone->mddev = mddev;
> + md_io_clone->must_retry = 0;
> if (blk_queue_io_stat(bdev->bd_disk->queue))
> md_io_clone->start_time = bio_start_io_acct(*bio);
>
> @@ -9265,26 +9273,6 @@ void md_account_bio(struct mddev *mddev, struct bio
> **bio)
> }
> EXPORT_SYMBOL_GPL(md_account_bio);
>
> -void md_free_cloned_bio(struct bio *bio)
> -{
> - struct md_io_clone *md_io_clone = bio->bi_private;
> - struct bio *orig_bio = md_io_clone->orig_bio;
> - struct mddev *mddev = md_io_clone->mddev;
> -
> - if (bio_data_dir(orig_bio) == WRITE && md_bitmap_enabled(mddev,
> false))
> - md_bitmap_end(mddev, md_io_clone);
> -
> - if (bio->bi_status && !orig_bio->bi_status)
> - orig_bio->bi_status = bio->bi_status;
> -
> - if (md_io_clone->start_time)
> - bio_end_io_acct(orig_bio, md_io_clone->start_time);
> -
> - bio_put(bio);
> - percpu_ref_put(&mddev->active_io);
> -}
> -EXPORT_SYMBOL_GPL(md_free_cloned_bio);
> -
> /* md_allow_write(mddev)
> * Calling this ensures that the array is marked 'active' so that writes
> * may proceed without blocking. It is important to call this before
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index ac84289664cd..49a231f11676 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -626,6 +626,9 @@ struct mddev {
>
> /* The sequence number for sync thread */
> atomic_t sync_seq;
> +
> + wait_queue_head_t retry_bios_wait;
> + atomic_t pending_retry_bios;
> };
>
> enum recovery_flags {
> @@ -877,6 +880,7 @@ struct md_io_clone {
> sector_t offset;
> unsigned long sectors;
> enum stat_group rw;
> + unsigned int must_retry;
> struct bio bio_clone;
> };
>
> @@ -917,7 +921,6 @@ extern void md_finish_reshape(struct mddev *mddev);
> void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
> struct bio *bio, sector_t start, sector_t size);
> void md_account_bio(struct mddev *mddev, struct bio **bio);
> -void md_free_cloned_bio(struct bio *bio);
>
> extern bool __must_check md_flush_request(struct mddev *mddev, struct bio
> *bio);
> void md_write_metadata(struct mddev *mddev, struct md_rdev *rdev,
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index a8e8d431071b..fb78a757f2fd 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -6217,7 +6217,13 @@ static bool raid5_make_request(struct mddev *mddev,
> struct bio * bi)
>
> mempool_free(ctx, conf->ctx_pool);
> if (res == STRIPE_WAIT_RESHAPE) {
> - md_free_cloned_bio(bi);
> + struct md_io_clone *md_io_clone = bi->bi_private;
> +
> + md_io_clone->must_retry = 1;
> + atomic_inc(&mddev->pending_retry_bios);
> + bio_endio(bi);
> + wait_event(mddev->retry_bios_wait,
> + atomic_read(&mddev->pending_retry_bios)==0);
Hi Ben
There is a problem here. The new counter pending_retry_bios above
doesn't represent the bios which have been added to stripes, so a UAF
can still happen. Do we need to add a new counter at all?
md_end_clone_io is only called once all bios have returned. How about
something like:
md_end_clone_io
+ if (unlikely(READ_ONCE(md_io_clone->waiting_reshape))) {
+ complete(md_io_clone->reshape_completion);
+ } else {
+ bio_endio(orig_bio);
+ }
raid5_make_request:
if (res == STRIPE_WAIT_RESHAPE) {
- md_free_cloned_bio(bi);
+ struct md_io_clone *md_io_clone = bi->bi_private;
+ struct completion done;
+
+ init_completion(&done);
+ md_io_clone->reshape_completion = &done;
+ WRITE_ONCE(md_io_clone->waiting_reshape, 1);
+
+ bio_endio(bi);
+
+ wait_for_completion(&done);
Best Regards
Xiao
> return false;
> }
>
> --
> 2.53.0
>