On Thu, 2012-12-13 at 11:58 -0200, Herton Ronaldo Krzesinski wrote:
> 3.5.7.2 -stable review patch.  If anyone has any objections, please let me 
> know.
> 
> ------------------
> 
> From: NeilBrown <ne...@suse.de>
> 
> commit e7c0c3fa29280d62aa5e11101a674bb3064bd791 upstream.
> 
> When a replacement operation completes there is a small window
> when the original device is marked 'faulty' and the replacement
> still looks like a replacement.  The faulty should be removed and
> the replacement moved in place very quickly, bit it isn't instant.
> 
> So the code write out to the array must handle the possibility that
> the only working device for some slot in the replacement - but it
> doesn't.  If the primary device is faulty it just gives up.  This
> can lead to corruption.
> 
> So make the code more robust: if either  the primary or the
> replacement is present and working, write to them.  Only when
> neither are present do we give up.
> 
> This bug has been present since replacement was introduced in
> 3.3, so it is suitable for any -stable kernel since then.

This is missing from 3.4, so Greg will presumably want to apply this (if
the backport is correct).

Ben.

> Reported-by: "George Spelvin" <li...@horizon.com>
> Signed-off-by: NeilBrown <ne...@suse.de>
> [ herton: hairy code adjustment on 3rd hunk (conf->copies for loop) ]
> Signed-off-by: Herton Ronaldo Krzesinski <herton.krzesin...@canonical.com>
> ---
>  drivers/md/raid10.c |  113 
> +++++++++++++++++++++++++++------------------------
>  1 file changed, 59 insertions(+), 54 deletions(-)
> 
> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> index 17fae37..0920adf 100644
> --- a/drivers/md/raid10.c
> +++ b/drivers/md/raid10.c
> @@ -1267,18 +1267,21 @@ retry_write:
>                       blocked_rdev = rrdev;
>                       break;
>               }
> +             if (rdev && (test_bit(Faulty, &rdev->flags)
> +                          || test_bit(Unmerged, &rdev->flags)))
> +                     rdev = NULL;
>               if (rrdev && (test_bit(Faulty, &rrdev->flags)
>                             || test_bit(Unmerged, &rrdev->flags)))
>                       rrdev = NULL;
>  
>               r10_bio->devs[i].bio = NULL;
>               r10_bio->devs[i].repl_bio = NULL;
> -             if (!rdev || test_bit(Faulty, &rdev->flags) ||
> -                 test_bit(Unmerged, &rdev->flags)) {
> +
> +             if (!rdev && !rrdev) {
>                       set_bit(R10BIO_Degraded, &r10_bio->state);
>                       continue;
>               }
> -             if (test_bit(WriteErrorSeen, &rdev->flags)) {
> +             if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
>                       sector_t first_bad;
>                       sector_t dev_sector = r10_bio->devs[i].addr;
>                       int bad_sectors;
> @@ -1320,8 +1323,10 @@ retry_write:
>                                       max_sectors = good_sectors;
>                       }
>               }
> -             r10_bio->devs[i].bio = bio;
> -             atomic_inc(&rdev->nr_pending);
> +             if (rdev) {
> +                     r10_bio->devs[i].bio = bio;
> +                     atomic_inc(&rdev->nr_pending);
> +             }
>               if (rrdev) {
>                       r10_bio->devs[i].repl_bio = bio;
>                       atomic_inc(&rrdev->nr_pending);
> @@ -1377,58 +1382,58 @@ retry_write:
>       for (i = 0; i < conf->copies; i++) {
>               struct bio *mbio;
>               int d = r10_bio->devs[i].devnum;
> -             if (!r10_bio->devs[i].bio)
> -                     continue;
> -
> -             mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
> -             md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
> -                         max_sectors);
> -             r10_bio->devs[i].bio = mbio;
> -
> -             mbio->bi_sector = (r10_bio->devs[i].addr+
> -                                choose_data_offset(r10_bio,
> -                                                   conf->mirrors[d].rdev));
> -             mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
> -             mbio->bi_end_io = raid10_end_write_request;
> -             mbio->bi_rw = WRITE | do_sync | do_fua;
> -             mbio->bi_private = r10_bio;
> -
> -             atomic_inc(&r10_bio->remaining);
> -             spin_lock_irqsave(&conf->device_lock, flags);
> -             bio_list_add(&conf->pending_bio_list, mbio);
> -             conf->pending_count++;
> -             spin_unlock_irqrestore(&conf->device_lock, flags);
> -             if (!mddev_check_plugged(mddev))
> -                     md_wakeup_thread(mddev->thread);
> -
> -             if (!r10_bio->devs[i].repl_bio)
> -                     continue;
> +             if (r10_bio->devs[i].bio) {
> +                     struct md_rdev *rdev = conf->mirrors[d].rdev;
> +                     mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
> +                     md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
> +                                 max_sectors);
> +                     r10_bio->devs[i].bio = mbio;
> +
> +                     mbio->bi_sector = (r10_bio->devs[i].addr+
> +                                        choose_data_offset(r10_bio,
> +                                                           rdev));
> +                     mbio->bi_bdev = rdev->bdev;
> +                     mbio->bi_end_io = raid10_end_write_request;
> +                     mbio->bi_rw = WRITE | do_sync | do_fua;
> +                     mbio->bi_private = r10_bio;
>  
> -             mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
> -             md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
> -                         max_sectors);
> -             r10_bio->devs[i].repl_bio = mbio;
> +                     atomic_inc(&r10_bio->remaining);
> +                     spin_lock_irqsave(&conf->device_lock, flags);
> +                     bio_list_add(&conf->pending_bio_list, mbio);
> +                     conf->pending_count++;
> +                     spin_unlock_irqrestore(&conf->device_lock, flags);
> +                     if (!mddev_check_plugged(mddev))
> +                             md_wakeup_thread(mddev->thread);
> +             }
>  
> -             /* We are actively writing to the original device
> -              * so it cannot disappear, so the replacement cannot
> -              * become NULL here
> -              */
> -             mbio->bi_sector = (r10_bio->devs[i].addr +
> -                                choose_data_offset(
> -                                        r10_bio,
> -                                        conf->mirrors[d].replacement));
> -             mbio->bi_bdev = conf->mirrors[d].replacement->bdev;
> -             mbio->bi_end_io = raid10_end_write_request;
> -             mbio->bi_rw = WRITE | do_sync | do_fua;
> -             mbio->bi_private = r10_bio;
> +             if (r10_bio->devs[i].repl_bio) {
> +                     struct md_rdev *rdev = conf->mirrors[d].replacement;
> +                     if (rdev == NULL) {
> +                             /* Replacement just got moved to main 'rdev' */
> +                             smp_mb();
> +                             rdev = conf->mirrors[d].rdev;
> +                     }
> +                     mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
> +                     md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
> +                                 max_sectors);
> +                     r10_bio->devs[i].repl_bio = mbio;
> +
> +                     mbio->bi_sector = (r10_bio->devs[i].addr +
> +                                        choose_data_offset(
> +                                                r10_bio, rdev));
> +                     mbio->bi_bdev = rdev->bdev;
> +                     mbio->bi_end_io = raid10_end_write_request;
> +                     mbio->bi_rw = WRITE | do_sync | do_fua;
> +                     mbio->bi_private = r10_bio;
>  
> -             atomic_inc(&r10_bio->remaining);
> -             spin_lock_irqsave(&conf->device_lock, flags);
> -             bio_list_add(&conf->pending_bio_list, mbio);
> -             conf->pending_count++;
> -             spin_unlock_irqrestore(&conf->device_lock, flags);
> -             if (!mddev_check_plugged(mddev))
> -                     md_wakeup_thread(mddev->thread);
> +                     atomic_inc(&r10_bio->remaining);
> +                     spin_lock_irqsave(&conf->device_lock, flags);
> +                     bio_list_add(&conf->pending_bio_list, mbio);
> +                     conf->pending_count++;
> +                     spin_unlock_irqrestore(&conf->device_lock, flags);
> +                     if (!mddev_check_plugged(mddev))
> +                             md_wakeup_thread(mddev->thread);
> +             }
>       }
>  
>       /* Don't remove the bias on 'remaining' (one_write_done) until

-- 
Ben Hutchings
Theory and practice are closer in theory than in practice.
                                - John Levine, moderator of comp.compilers

Attachment: signature.asc
Description: This is a digitally signed message part

Reply via email to