> From: Yuri Tikhonov [mailto:[EMAIL PROTECTED] > Hello, > > I tested the h/w accelerated RAID-5 using the kernel with PAGE_SIZE set to > 64KB and found the bonnie++ application hangs up during the "Re-writing" > test. I did some investigation and discovered that the hang-up occurs > because one of the mpage_end_io_read() calls is missing (these are the > callbacks initiated from the ops_complete_biofill() function). > > The fact is that my low-level ADMA driver (the ppc440spe one) successfully > initiated the ops_complete_biofill() callback but the ops_complete_biofill() > function itself skipped calling the bi_end_io() handler of the completed bio > (current dev->read) because during processing of this (current dev->read) bio > some other request had come to the sh (current dev_q->toread). Thus > ops_complete_biofill() scheduled another biofill operation which, as a > result, overwrote the unacknowledged bio (dev->read in ops_run_biofill()), > and so we lost the previous dev->read bio completely. > > Here is a patch that solves this problem. Perhaps this might be implemented > in a more elegant and effective way. What are your thoughts regarding > this? > > Regards, Yuri > > diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c > index 08b4893..7abc96b 100644 > --- a/drivers/md/raid5.c > +++ b/drivers/md/raid5.c > @@ -838,11 +838,24 @@ static void ops_complete_biofill(void *stripe_head_ref) > /* acknowledge completion of a biofill operation */ > /* and check if we need to reply to a read request > */ > - if (test_bit(R5_Wantfill, &dev_q->flags) && !dev_q->toread) { > + if (test_bit(R5_Wantfill, &dev_q->flags)) { > struct bio *rbi, *rbi2; > struct r5dev *dev = &sh->dev[i]; > > - clear_bit(R5_Wantfill, &dev_q->flags); > + /* There is a chance that another fill operation > + * had been scheduled for this dev while we > + * processed sh. 
In this case do one of the following > + * alternatives: > + * - if there is no active completed biofill for the dev > + * then go to the next dev leaving Wantfill set; > + * - if there is active completed biofill for the dev > + * then ack it but leave Wantfill set. > + */ > + if (dev_q->toread && !dev->read) > + continue; > + > + if (!dev_q->toread) > + clear_bit(R5_Wantfill, &dev_q->flags); > > /* The access to dev->read is outside of the > * spin_lock_irq(&conf->device_lock), but is protected
This still looks racy... I think the complete fix is to make the
R5_Wantfill and dev_q->toread accesses atomic. Please test the
following patch (also attached) and let me know if it fixes what you are
seeing:
Applies on top of git://lost.foo-projects.org/~dwillia2/git/iop
md-for-linus
---
raid5: fix ops_complete_biofill race in the asynchronous offload case
Protect against dev_q->toread toggling while the R5_Wantfill bit is
tested and cleared: take sq->lock around the completion scan in
ops_complete_biofill(), and convert the other sq->lock sites to
spin_lock_bh() so that asynchronous completions (tasklets) cannot
run while handle_stripe holds the lock.
---
drivers/md/raid5.c | 16 +++++++++-------
1 files changed, 9 insertions(+), 7 deletions(-)
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2f9022d..91c14c6 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -824,6 +824,7 @@ static void ops_complete_biofill(void
*stripe_head_ref)
(unsigned long long)sh->sector);
/* clear completed biofills */
+ spin_lock(&sq->lock);
for (i = sh->disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
struct r5_queue_dev *dev_q = &sq->dev[i];
@@ -861,6 +862,7 @@ static void ops_complete_biofill(void
*stripe_head_ref)
}
clear_bit(STRIPE_OP_BIOFILL, &sh->ops.ack);
clear_bit(STRIPE_OP_BIOFILL, &sh->ops.pending);
+ spin_unlock(&sq->lock);
return_io(return_bi);
@@ -2279,7 +2281,7 @@ static int add_queue_bio(struct stripe_queue *sq,
struct bio *bi, int dd_idx,
(unsigned long long)bi->bi_sector,
(unsigned long long)sq->sector);
- spin_lock(&sq->lock);
+ spin_lock_bh(&sq->lock);
spin_lock_irq(&conf->device_lock);
sh = sq->sh;
if (forwrite) {
@@ -2306,7 +2308,7 @@ static int add_queue_bio(struct stripe_queue *sq,
struct bio *bi, int dd_idx,
*bip = bi;
bi->bi_phys_segments ++;
spin_unlock_irq(&conf->device_lock);
- spin_unlock(&sq->lock);
+ spin_unlock_bh(&sq->lock);
pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
(unsigned long long)bi->bi_sector,
@@ -2339,7 +2341,7 @@ static int add_queue_bio(struct stripe_queue *sq,
struct bio *bi, int dd_idx,
overlap:
set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
spin_unlock_irq(&conf->device_lock);
- spin_unlock(&sq->lock);
+ spin_unlock_bh(&sq->lock);
return 0;
}
@@ -3127,7 +3129,7 @@ static void handle_stripe5(struct stripe_head *sh)
atomic_read(&sh->count), sq->pd_idx,
sh->ops.pending, sh->ops.ack, sh->ops.complete);
- spin_lock(&sq->lock);
+ spin_lock_bh(&sq->lock);
clear_bit(STRIPE_HANDLE, &sh->state);
s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
@@ -3370,7 +3372,7 @@ static void handle_stripe5(struct stripe_head *sh)
if (sh->ops.count)
pending = get_stripe_work(sh);
- spin_unlock(&sq->lock);
+ spin_unlock_bh(&sq->lock);
if (pending)
raid5_run_ops(sh, pending);
@@ -3397,7 +3399,7 @@ static void handle_stripe6(struct stripe_head *sh,
struct page *tmp_page)
atomic_read(&sh->count), pd_idx, r6s.qd_idx);
memset(&s, 0, sizeof(s));
- spin_lock(&sq->lock);
+ spin_lock_bh(&sq->lock);
clear_bit(STRIPE_HANDLE, &sh->state);
s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
@@ -3576,7 +3578,7 @@ static void handle_stripe6(struct stripe_head *sh,
struct page *tmp_page)
if (s.expanding && s.locked == 0)
handle_stripe_expansion(conf, sh, &r6s);
- spin_unlock(&sq->lock);
+ spin_unlock_bh(&sq->lock);
return_io(return_bi);
md-accel-fix-ops-complete-biofill-race.patch
Description: md-accel-fix-ops-complete-biofill-race.patch
