If make_stripe_request() returns STRIPE_WAIT_RESHAPE, raid5_make_request() will free the cloned bio. But raid5_make_request() can call make_stripe_request() multiple times, writing to the various stripes. If that bio was added to the toread or towrite list of a stripe disk in an earlier call to make_stripe_request(), then it is not safe to simply free the bio when a later part of it is found to cross the reshape position. Doing so can lead to a use-after-free (UAF) error when bio_endio() is called on the bio for the earlier stripes.
Instead, raid5_make_request() needs to wait until bio_endio() has been called for all parts of the bio. To do this, bios that cross the reshape position while the reshape can't make progress are flagged as needing a retry, and mddev tracks the number of bios needing a retry which have not yet completed. When raid5_make_request() has a bio that failed make_stripe_request() with STRIPE_WAIT_RESHAPE, it waits for this counter to reach zero. When bio_endio() is called for the last time on a bio needing a retry, the completion handler decrements mddev's count of outstanding bios needing a retry. This guarantees that raid5_make_request() doesn't return until the cloned bio needing a retry for I/O across the reshape boundary is safely cleaned up. There is a simple reproducer available at [1]. Compile the kernel with KASAN for more useful reporting when the error is triggered (this is not necessary to see the bug). [1] https://gist.github.com/bmarzins/e48598824305cf2171289e47d7241fa5 Signed-off-by: Benjamin Marzinski <[email protected]> --- I've tested this for regressions with the lvm2-testsuite raid tests. I have not run any md-specific tests on it. 
drivers/md/md.c | 30 +++++++++--------------------- drivers/md/md.h | 5 ++++- drivers/md/raid5.c | 8 +++++++- 3 files changed, 20 insertions(+), 23 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 3ce6f9e9d38e..5ec116b9da32 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -776,9 +776,11 @@ int mddev_init(struct mddev *mddev) atomic_set(&mddev->active, 1); atomic_set(&mddev->openers, 0); atomic_set(&mddev->sync_seq, 0); + atomic_set(&mddev->pending_retry_bios, 0); spin_lock_init(&mddev->lock); init_waitqueue_head(&mddev->sb_wait); init_waitqueue_head(&mddev->recovery_wait); + init_waitqueue_head(&mddev->retry_bios_wait); mddev->reshape_position = MaxSector; mddev->reshape_backwards = 0; mddev->last_sync_action = ACTION_IDLE; @@ -9218,6 +9220,7 @@ static void md_end_clone_io(struct bio *bio) struct md_io_clone *md_io_clone = bio->bi_private; struct bio *orig_bio = md_io_clone->orig_bio; struct mddev *mddev = md_io_clone->mddev; + unsigned int must_retry = md_io_clone->must_retry; if (bio_data_dir(orig_bio) == WRITE && md_bitmap_enabled(mddev, false)) md_bitmap_end(mddev, md_io_clone); @@ -9229,7 +9232,11 @@ static void md_end_clone_io(struct bio *bio) bio_end_io_acct(orig_bio, md_io_clone->start_time); bio_put(bio); - bio_endio(orig_bio); + if (unlikely(must_retry)) { + if (atomic_dec_and_test(&mddev->pending_retry_bios)) + wake_up(&mddev->retry_bios_wait); + } else + bio_endio(orig_bio); percpu_ref_put(&mddev->active_io); } @@ -9243,6 +9250,7 @@ static void md_clone_bio(struct mddev *mddev, struct bio **bio) md_io_clone = container_of(clone, struct md_io_clone, bio_clone); md_io_clone->orig_bio = *bio; md_io_clone->mddev = mddev; + md_io_clone->must_retry = 0; if (blk_queue_io_stat(bdev->bd_disk->queue)) md_io_clone->start_time = bio_start_io_acct(*bio); @@ -9265,26 +9273,6 @@ void md_account_bio(struct mddev *mddev, struct bio **bio) } EXPORT_SYMBOL_GPL(md_account_bio); -void md_free_cloned_bio(struct bio *bio) -{ - struct md_io_clone 
*md_io_clone = bio->bi_private; - struct bio *orig_bio = md_io_clone->orig_bio; - struct mddev *mddev = md_io_clone->mddev; - - if (bio_data_dir(orig_bio) == WRITE && md_bitmap_enabled(mddev, false)) - md_bitmap_end(mddev, md_io_clone); - - if (bio->bi_status && !orig_bio->bi_status) - orig_bio->bi_status = bio->bi_status; - - if (md_io_clone->start_time) - bio_end_io_acct(orig_bio, md_io_clone->start_time); - - bio_put(bio); - percpu_ref_put(&mddev->active_io); -} -EXPORT_SYMBOL_GPL(md_free_cloned_bio); - /* md_allow_write(mddev) * Calling this ensures that the array is marked 'active' so that writes * may proceed without blocking. It is important to call this before diff --git a/drivers/md/md.h b/drivers/md/md.h index ac84289664cd..49a231f11676 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -626,6 +626,9 @@ struct mddev { /* The sequence number for sync thread */ atomic_t sync_seq; + + wait_queue_head_t retry_bios_wait; + atomic_t pending_retry_bios; }; enum recovery_flags { @@ -877,6 +880,7 @@ struct md_io_clone { sector_t offset; unsigned long sectors; enum stat_group rw; + unsigned int must_retry; struct bio bio_clone; }; @@ -917,7 +921,6 @@ extern void md_finish_reshape(struct mddev *mddev); void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, struct bio *bio, sector_t start, sector_t size); void md_account_bio(struct mddev *mddev, struct bio **bio); -void md_free_cloned_bio(struct bio *bio); extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio); void md_write_metadata(struct mddev *mddev, struct md_rdev *rdev, diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index a8e8d431071b..fb78a757f2fd 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -6217,7 +6217,13 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) mempool_free(ctx, conf->ctx_pool); if (res == STRIPE_WAIT_RESHAPE) { - md_free_cloned_bio(bi); + struct md_io_clone *md_io_clone = bi->bi_private; + + 
md_io_clone->must_retry = 1; + atomic_inc(&mddev->pending_retry_bios); + bio_endio(bi); + wait_event(mddev->retry_bios_wait, + atomic_read(&mddev->pending_retry_bios)==0); return false; } -- 2.53.0
