If make_stripe_request() returns STRIPE_WAIT_RESHAPE,
raid5_make_request() will free the cloned bio. But raid5_make_request()
can call make_stripe_request() multiple times, writing to the various
stripes. If that bio got added to the toread or towrite lists of a
stripe disk in an earlier call to make_stripe_request(), then it's not
safe to just free the bio if a later part of it is found to cross the
reshape position. Doing so can lead to a UAF error, when bio_endio()
is called on the bio for the earlier stripes.
Instead, raid5_make_request() needs to wait until all parts of the bio
have called bio_endio(). To do this, bios that cross the reshape
position while the reshape can't make progress are flagged as needing to
wait for all parts to complete. When raid5_make_request() has a bio that
failed make_stripe_request() with STRIPE_WAIT_RESHAPE, it sets
bi->bi_private to a completion struct and waits for completion after
ending the bio. When the bio_endio() is called for the last time on a
clone bio with bi->bi_private set, it wakes up the waiter. This
guarantees that raid5_make_request() doesn't return until the cloned bio
needing a retry for io across the reshape boundary is safely cleaned up.
There is a simple reproducer available at [1]. Compile the kernel with
KASAN for more useful reporting when the error is triggered (this is not
necessary to see the bug).
[1] https://gist.github.com/bmarzins/e48598824305cf2171289e47d7241fa5
Signed-off-by: Benjamin Marzinski <[email protected]>
---
Changes from v1:
- Removed mddev->pending_retry_bios, mddev->retry_bios_wait, and
md_io_clone->must_retry. Instead, use a completion struct
pointed to by bi->bi_private, as suggested by Xiao Ni and Yu Kuai.
drivers/md/md.c | 31 ++++++++-----------------------
drivers/md/md.h | 1 -
drivers/md/raid5.c | 7 ++++++-
3 files changed, 14 insertions(+), 25 deletions(-)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 3ce6f9e9d38e..4318d875a5f6 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -9215,9 +9215,11 @@ static void md_bitmap_end(struct mddev *mddev, struct
md_io_clone *md_io_clone)
static void md_end_clone_io(struct bio *bio)
{
- struct md_io_clone *md_io_clone = bio->bi_private;
+ struct md_io_clone *md_io_clone = container_of(bio, struct md_io_clone,
+ bio_clone);
struct bio *orig_bio = md_io_clone->orig_bio;
struct mddev *mddev = md_io_clone->mddev;
+ struct completion *reshape_completion = bio->bi_private;
if (bio_data_dir(orig_bio) == WRITE && md_bitmap_enabled(mddev, false))
md_bitmap_end(mddev, md_io_clone);
@@ -9229,7 +9231,10 @@ static void md_end_clone_io(struct bio *bio)
bio_end_io_acct(orig_bio, md_io_clone->start_time);
bio_put(bio);
- bio_endio(orig_bio);
+ if (unlikely(reshape_completion))
+ complete(reshape_completion);
+ else
+ bio_endio(orig_bio);
percpu_ref_put(&mddev->active_io);
}
@@ -9254,7 +9259,7 @@ static void md_clone_bio(struct mddev *mddev, struct bio **bio)
}
clone->bi_end_io = md_end_clone_io;
- clone->bi_private = md_io_clone;
+ clone->bi_private = NULL;
*bio = clone;
}
@@ -9265,26 +9270,6 @@ void md_account_bio(struct mddev *mddev, struct bio **bio)
}
EXPORT_SYMBOL_GPL(md_account_bio);
-void md_free_cloned_bio(struct bio *bio)
-{
- struct md_io_clone *md_io_clone = bio->bi_private;
- struct bio *orig_bio = md_io_clone->orig_bio;
- struct mddev *mddev = md_io_clone->mddev;
-
- if (bio_data_dir(orig_bio) == WRITE && md_bitmap_enabled(mddev, false))
- md_bitmap_end(mddev, md_io_clone);
-
- if (bio->bi_status && !orig_bio->bi_status)
- orig_bio->bi_status = bio->bi_status;
-
- if (md_io_clone->start_time)
- bio_end_io_acct(orig_bio, md_io_clone->start_time);
-
- bio_put(bio);
- percpu_ref_put(&mddev->active_io);
-}
-EXPORT_SYMBOL_GPL(md_free_cloned_bio);
-
/* md_allow_write(mddev)
* Calling this ensures that the array is marked 'active' so that writes
* may proceed without blocking. It is important to call this before
diff --git a/drivers/md/md.h b/drivers/md/md.h
index ac84289664cd..5d57fee22901 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -917,7 +917,6 @@ extern void md_finish_reshape(struct mddev *mddev);
void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
struct bio *bio, sector_t start, sector_t size);
void md_account_bio(struct mddev *mddev, struct bio **bio);
-void md_free_cloned_bio(struct bio *bio);
extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio);
void md_write_metadata(struct mddev *mddev, struct md_rdev *rdev,
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index a8e8d431071b..dc0c680ca199 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -6217,7 +6217,12 @@ static bool raid5_make_request(struct mddev *mddev,
struct bio * bi)
mempool_free(ctx, conf->ctx_pool);
if (res == STRIPE_WAIT_RESHAPE) {
- md_free_cloned_bio(bi);
+ DECLARE_COMPLETION_ONSTACK(done);
+ WRITE_ONCE(bi->bi_private, &done);
+
+ bio_endio(bi);
+
+ wait_for_completion(&done);
return false;
}