On Wed, 2010-11-24 at 17:01 +0100, Wouter D'Haeseleer wrote:
> Hi Ben,
> 
> I have now successfully compiled the kernel with the patch, which
> this time applied without problems.
> However, the original bug is still present with the patch you grabbed
> from upstream.
> 
> For testing purposes, I have also tried the patch supplied by Red Hat,
> and I can confirm that it works without a problem.
> 
> So it looks like the patch from Neil Brown does not work for this bug.

The result of my conversation with Neil Brown is that his fix covers
only md devices at the top of a stack, whereas the Red Hat patch covers
only dm devices there.  We should really fix both in the same way.

Please can you test the attached patch, which covers both dm and md.

Ben.

-- 
Ben Hutchings
Once a job is fouled up, anything done to improve it makes it worse.
commit a451e30d044e5b29f31d965a32d17bf7e97f99b0
Author: Ben Hutchings <b...@decadent.org.uk>
Date:   Sun Nov 28 23:46:46 2010 +0000

    dm: Deal with merge_bvec_fn in component devices better
    
    This is analogous to commit 627a2d3c29427637f4c5d31ccc7fcbd8d312cd71,
    which does the same for md-devices at the top of the stack.  The
    following explanation is taken from that commit.  Thanks to Neil Brown
    <ne...@suse.de> for the advice.
    
    If a component device has a merge_bvec_fn then as we never call it
    we must ensure we never need to.  Currently this is done by setting
    max_sector to 1 PAGE, however this does not stop a bio being created
    with several sub-page iovecs that would violate the merge_bvec_fn.
    
    So instead set max_segments to 1 and set the segment boundary to the
    same as a page boundary to ensure there is only ever one single-page
    segment of IO requested at a time.
    
    This can particularly be an issue when 'xen' is used as it is
    known to submit multiple small buffers in a single bio.
    
    Signed-off-by: Ben Hutchings <b...@decadent.org.uk>
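
For reference, every md hunk in the attached patch reduces to the pattern
sketched below (written against the 2.6.26-era block API that the diff uses;
the helper name is only illustrative, not part of the patch).  The dm-table.c
hunk sets the equivalent fields directly on the target's restrictions
(rs->max_phys_segments and rs->seg_boundary_mask) rather than calling the
queue helpers.

#include <linux/blkdev.h>
#include <linux/pagemap.h>	/* PAGE_CACHE_SIZE */

/*
 * If a component device defines merge_bvec_fn, which the stacking
 * driver never calls, restrict the stacked queue so that any bio it
 * builds is trivially acceptable to the component: at most one
 * physical segment, and that segment never crosses a page boundary.
 */
static void limit_to_single_page_segments(struct request_queue *stacked,
					  struct request_queue *component)
{
	if (component->merge_bvec_fn) {
		blk_queue_max_phys_segments(stacked, 1);
		blk_queue_segment_boundary(stacked, PAGE_CACHE_SIZE - 1);
	}
}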

commit 71ff0067805fb917142a745246f7996f3ad86d5b
Author: NeilBrown <ne...@suse.de>
Date:   Mon Mar 8 16:44:38 2010 +1100

    md: deal with merge_bvec_fn in component devices better.
    
    commit 627a2d3c29427637f4c5d31ccc7fcbd8d312cd71 upstream.
    
    If a component device has a merge_bvec_fn then as we never call it
    we must ensure we never need to.  Currently this is done by setting
    max_sector to 1 PAGE, however this does not stop a bio being created
    with several sub-page iovecs that would violate the merge_bvec_fn.
    
    So instead set max_segments to 1 and set the segment boundary to the
    same as a page boundary to ensure there is only ever one single-page
    segment of IO requested at a time.
    
    This can particularly be an issue when 'xen' is used as it is
    known to submit multiple small buffers in a single bio.
    
    Signed-off-by: NeilBrown <ne...@suse.de>
    Cc: sta...@kernel.org
    [bwh: Backport to Linux 2.6.26]
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 94116ea..186445d0 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -506,17 +506,15 @@ void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev)
 	rs->max_sectors =
 		min_not_zero(rs->max_sectors, q->max_sectors);
 
-	/* FIXME: Device-Mapper on top of RAID-0 breaks because DM
-	 *        currently doesn't honor MD's merge_bvec_fn routine.
-	 *        In this case, we'll force DM to use PAGE_SIZE or
-	 *        smaller I/O, just to be safe. A better fix is in the
-	 *        works, but add this for the time being so it will at
-	 *        least operate correctly.
+	/*
+	 * Since we don't call merge_bvec_fn, we must never risk
+	 * violating it, so limit max_phys_segments to 1 lying within
+	 * a single page.
 	 */
-	if (q->merge_bvec_fn)
-		rs->max_sectors =
-			min_not_zero(rs->max_sectors,
-				     (unsigned int) (PAGE_SIZE >> 9));
+	if (q->merge_bvec_fn) {
+		rs->max_phys_segments = 1;
+		rs->seg_boundary_mask = PAGE_CACHE_SIZE - 1;
+	}
 
 	rs->max_phys_segments =
 		min_not_zero(rs->max_phys_segments,
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index ec921f5..fe8508a 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -136,12 +136,14 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 		blk_queue_stack_limits(mddev->queue,
 				       rdev->bdev->bd_disk->queue);
 		/* as we don't honour merge_bvec_fn, we must never risk
-		 * violating it, so limit ->max_sector to one PAGE, as
-		 * a one page request is never in violation.
+		 * violating it, so limit max_segments to 1 lying within
+		 * a single page.
 		 */
-		if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
-		    mddev->queue->max_sectors > (PAGE_SIZE>>9))
-			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
+		if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
+			blk_queue_max_phys_segments(mddev->queue, 1);
+			blk_queue_segment_boundary(mddev->queue,
+						   PAGE_CACHE_SIZE - 1);
+		}
 
 		disk->size = rdev->size;
 		conf->array_size += rdev->size;
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index e968116..67dd8a3 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -293,14 +293,16 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 			blk_queue_stack_limits(mddev->queue, q);
 
 		/* as we don't honour merge_bvec_fn, we must never risk
-		 * violating it, so limit ->max_sector to one PAGE, as
-		 * a one page request is never in violation.
+		 * violating it, so limit ->max_segments to one, lying
+		 * within a single page.
 		 * (Note: it is very unlikely that a device with
 		 * merge_bvec_fn will be involved in multipath.)
 		 */
-			if (q->merge_bvec_fn &&
-			    mddev->queue->max_sectors > (PAGE_SIZE>>9))
-				blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
+			if (q->merge_bvec_fn) {
+				blk_queue_max_phys_segments(mddev->queue, 1);
+				blk_queue_segment_boundary(mddev->queue,
+							   PAGE_CACHE_SIZE - 1);
+			}
 
 			conf->working_disks++;
 			mddev->degraded--;
@@ -453,9 +455,11 @@ static int multipath_run (mddev_t *mddev)
 		/* as we don't honour merge_bvec_fn, we must never risk
 		 * violating it, not that we ever expect a device with
 		 * a merge_bvec_fn to be involved in multipath */
-		if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
-		    mddev->queue->max_sectors > (PAGE_SIZE>>9))
-			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
+		if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
+			blk_queue_max_phys_segments(mddev->queue, 1);
+			blk_queue_segment_boundary(mddev->queue,
+						   PAGE_CACHE_SIZE - 1);
+		}
 
 		if (!test_bit(Faulty, &rdev->flags))
 			conf->working_disks++;
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 914c04d..b344e0e 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -141,14 +141,15 @@ static int create_strip_zones (mddev_t *mddev)
 		blk_queue_stack_limits(mddev->queue,
 				       rdev1->bdev->bd_disk->queue);
 		/* as we don't honour merge_bvec_fn, we must never risk
-		 * violating it, so limit ->max_sector to one PAGE, as
-		 * a one page request is never in violation.
+		 * violating it, so limit ->max_segments to 1, lying within
+		 * a single page.
 		 */
 
-		if (rdev1->bdev->bd_disk->queue->merge_bvec_fn &&
-		    mddev->queue->max_sectors > (PAGE_SIZE>>9))
-			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
-
+		if (rdev1->bdev->bd_disk->queue->merge_bvec_fn) {
+			blk_queue_max_phys_segments(mddev->queue, 1);
+			blk_queue_segment_boundary(mddev->queue,
+						   PAGE_CACHE_SIZE - 1);
+		}
 		if (!smallest || (rdev1->size <smallest->size))
 			smallest = rdev1;
 		cnt++;
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index c610b94..360c079 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1109,13 +1109,17 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 
 			blk_queue_stack_limits(mddev->queue,
 					       rdev->bdev->bd_disk->queue);
-			/* as we don't honour merge_bvec_fn, we must never risk
-			 * violating it, so limit ->max_sector to one PAGE, as
-			 * a one page request is never in violation.
+			/* as we don't honour merge_bvec_fn, we must
+			 * never risk violating it, so limit
+			 * ->max_segments to one lying with a single
+			 * page, as a one page request is never in
+			 * violation.
 			 */
-			if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
-			    mddev->queue->max_sectors > (PAGE_SIZE>>9))
-				blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
+			if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
+				blk_queue_max_phys_segments(mddev->queue, 1);
+				blk_queue_segment_boundary(mddev->queue,
+							   PAGE_CACHE_SIZE - 1);
+			}
 
 			p->head_position = 0;
 			rdev->raid_disk = mirror;
@@ -1971,12 +1975,14 @@ static int run(mddev_t *mddev)
 		blk_queue_stack_limits(mddev->queue,
 				       rdev->bdev->bd_disk->queue);
 		/* as we don't honour merge_bvec_fn, we must never risk
-		 * violating it, so limit ->max_sector to one PAGE, as
-		 * a one page request is never in violation.
+		 * violating it, so limit ->max_segments to 1 lying within
+		 * a single page, as a one page request is never in violation.
 		 */
-		if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
-		    mddev->queue->max_sectors > (PAGE_SIZE>>9))
-			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
+		if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
+			blk_queue_max_phys_segments(mddev->queue, 1);
+			blk_queue_segment_boundary(mddev->queue,
+						   PAGE_CACHE_SIZE - 1);
+		}
 
 		disk->head_position = 0;
 	}
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index a71277b..bf7654d 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1135,13 +1135,17 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 
 			blk_queue_stack_limits(mddev->queue,
 					       rdev->bdev->bd_disk->queue);
-			/* as we don't honour merge_bvec_fn, we must never risk
-			 * violating it, so limit ->max_sector to one PAGE, as
-			 * a one page request is never in violation.
+			/* as we don't honour merge_bvec_fn, we must
+			 * never risk violating it, so limit
+			 * ->max_segments to one lying with a single
+			 * page, as a one page request is never in
+			 * violation.
 			 */
-			if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
-			    mddev->queue->max_sectors > (PAGE_SIZE>>9))
-				mddev->queue->max_sectors = (PAGE_SIZE>>9);
+			if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
+				blk_queue_max_phys_segments(mddev->queue, 1);
+				blk_queue_segment_boundary(mddev->queue,
+							   PAGE_CACHE_SIZE - 1);
+			}
 
 			p->head_position = 0;
 			rdev->raid_disk = mirror;
@@ -2107,12 +2111,14 @@ static int run(mddev_t *mddev)
 		blk_queue_stack_limits(mddev->queue,
 				       rdev->bdev->bd_disk->queue);
 		/* as we don't honour merge_bvec_fn, we must never risk
-		 * violating it, so limit ->max_sector to one PAGE, as
-		 * a one page request is never in violation.
+		 * violating it, so limit max_segments to 1 lying
+		 * within a single page.
 		 */
-		if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
-		    mddev->queue->max_sectors > (PAGE_SIZE>>9))
-			mddev->queue->max_sectors = (PAGE_SIZE>>9);
+		if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
+			blk_queue_max_phys_segments(mddev->queue, 1);
+			blk_queue_segment_boundary(mddev->queue,
+						   PAGE_CACHE_SIZE - 1);
+		}
 
 		disk->head_position = 0;
 	}
