Re: [patch] optimize o_direct on block device - v3

2007-01-19 Thread Andrew Morton
> On Fri, 19 Jan 2007 12:14:05 -0600 Michael Reed <[EMAIL PROTECTED]> wrote:
> Thanks again for finding the fix to the problem I reported.
> Can you tell me when I might expect this fix to show up in
> 2.6.20-rc?

next week..


Re: [patch] optimize o_direct on block device - v3

2007-01-19 Thread Michael Reed
Hi Andrew,

Thanks again for finding the fix to the problem I reported.
Can you tell me when I might expect this fix to show up in
2.6.20-rc?

Thanks,
 Mike


Andrew Morton wrote:
> On Thu, 11 Jan 2007 13:21:57 -0600
> Michael Reed <[EMAIL PROTECTED]> wrote:
> 
>> Testing on my ia64 system reveals that this patch introduces a
>> data integrity error for direct i/o to a block device.  Device
>> errors which result in i/o failure do not propagate to the
>> process issuing direct i/o to the device.
>>
>> This can be reproduced by doing writes to a fibre channel block
>> device and then disabling the switch port connecting the host
>> adapter to the switch.
>>
> 
> Does this fix it?
> 
> <thwaps Ken>
> <thwaps compiler>
> <adds new entry to Documentation/SubmitChecklist>
> 
> diff -puN fs/block_dev.c~a fs/block_dev.c
> --- a/fs/block_dev.c~a
> +++ a/fs/block_dev.c
> @@ -146,7 +146,7 @@ static int blk_end_aio(struct bio *bio, 
>   iocb->ki_nbytes = -EIO;
>  
>   if (atomic_dec_and_test(bio_count)) {
> - if (iocb->ki_nbytes < 0)
> + if ((long)iocb->ki_nbytes < 0)
>   aio_complete(iocb, iocb->ki_nbytes, 0);
>   else
>   aio_complete(iocb, iocb->ki_left, 0);
> _
> 
> 



RE: [patch] optimize o_direct on block device - v3

2007-01-11 Thread Chen, Kenneth W
Randy Dunlap wrote on Thursday, January 11, 2007 1:45 PM
> > +/* return a pge back to pvec array */
> 
> is pge just a typo or some other tla that i don't know?
> (not portland general electric or pacific gas & electric)


Typo with fat fingers. Thanks for catching it. Full patch with typo fixed.



[patch] fix blk_direct_IO bio preparation.

For a large DIO request that needs multiple bios, one full page worth of
data was lost at the boundary of a bio's maximum sector or segment
limits.  After a bio fills up and is submitted, the outer
while (nbytes) { ... } loop allocates a new bio and just marches on to
index into the next page, forgetting the page that bio_add_page()
rejected when the previous bio was full.  Fix it by putting the rejected
page back into the pvec so we pick it up again for the next bio.


Signed-off-by: Ken Chen <[EMAIL PROTECTED]>

diff -Nurp linux-2.6.20-rc4/fs/block_dev.c linux-2.6.20.ken/fs/block_dev.c
--- linux-2.6.20-rc4/fs/block_dev.c 2007-01-06 21:45:51.0 -0800
+++ linux-2.6.20.ken/fs/block_dev.c 2007-01-10 19:54:53.0 -0800
@@ -190,6 +190,12 @@ static struct page *blk_get_page(unsigne
return pvec->page[pvec->idx++];
 }
 
+/* return a page back to pvec array */
+static void blk_unget_page(struct page *page, struct pvec *pvec)
+{
+   pvec->page[--pvec->idx] = page;
+}
+
 static ssize_t
 blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 loff_t pos, unsigned long nr_segs)
@@ -278,6 +284,8 @@ same_bio:
count = min(count, nbytes);
goto same_bio;
}
+   } else {
+   blk_unget_page(page, &pvec);
}
 
/* bio is ready, submit it */
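
The fix is easiest to see in miniature.  Below is a hedged userspace toy
model of the pattern the patch establishes, not kernel code: integer page
numbers stand in for struct page, a small fixed-capacity array stands in
for a bio, and idx plays the role of pvec->idx.  Comment out the idx--
pushback and the page rejected at each bio boundary is silently skipped,
which is exactly the one-page data loss described above.

#include <stdio.h>

#define NPAGES  8	/* pages in the whole request */
#define BIO_CAP 3	/* pages a "bio" holds before it is full */

static void submit(const int *bio, int n)
{
	printf("submit bio:");
	for (int i = 0; i < n; i++)
		printf(" page %d", bio[i]);
	printf("\n");
}

int main(void)
{
	int bio[BIO_CAP];
	int idx = 0;	/* plays the role of pvec->idx */
	int n = 0;	/* pages in the current bio */

	while (idx < NPAGES) {
		int page = idx++;	/* blk_get_page(): hand out next page */
		if (n < BIO_CAP) {
			bio[n++] = page;	/* bio_add_page() accepted it */
		} else {
			idx--;		/* blk_unget_page(): push the rejected
					 * page back for the next bio */
			submit(bio, n);	/* current bio is full: submit it */
			n = 0;
		}
	}
	if (n)
		submit(bio, n);		/* final partial bio */
	return 0;
}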


Re: [patch] optimize o_direct on block device - v3

2007-01-11 Thread Randy Dunlap
On Thu, 11 Jan 2007 13:36:28 -0800 Chen, Kenneth W wrote:

> Andrew Morton wrote on Thursday, January 11, 2007 11:29 AM
> > On Thu, 11 Jan 2007 13:21:57 -0600
> > Michael Reed <[EMAIL PROTECTED]> wrote:
> > > Testing on my ia64 system reveals that this patch introduces a
> > > data integrity error for direct i/o to a block device.  Device
> > > errors which result in i/o failure do not propagate to the
> > > process issuing direct i/o to the device.
> > > 
> > > This can be reproduced by doing writes to a fibre channel block
> > > device and then disabling the switch port connecting the host
> > > adapter to the switch.
> > > 
> > 
> > Does this fix it?
> > 
> > <thwaps Ken>
> 
> 
> Darn, kicking myself in the butt.  Thank you, Andrew, for fixing this.
> We've also been running DIO stress tests almost non-stop over the last
> 30 days or so, and we did uncover another bug in that patch.
> 
> Andrew, would you please take the following bug fix patch as well?  It
> is critical because it also affects data integrity.
> 
> 
> [patch] fix blk_direct_IO bio preparation.
> 
> For a large DIO request that needs multiple bios, one full page worth of
> data was lost at the boundary of a bio's maximum sector or segment
> limits.  After a bio fills up and is submitted, the outer
> while (nbytes) { ... } loop allocates a new bio and just marches on to
> index into the next page, forgetting the page that bio_add_page()
> rejected when the previous bio was full.  Fix it by putting the rejected
> page back into the pvec so we pick it up again for the next bio.
> 
> 
> Signed-off-by: Ken Chen <[EMAIL PROTECTED]>
> 
> diff -Nurp linux-2.6.20-rc4/fs/block_dev.c linux-2.6.20.ken/fs/block_dev.c
> --- linux-2.6.20-rc4/fs/block_dev.c   2007-01-06 21:45:51.0 -0800
> +++ linux-2.6.20.ken/fs/block_dev.c   2007-01-10 19:54:53.0 -0800
> @@ -190,6 +190,12 @@ static struct page *blk_get_page(unsigne
>   return pvec->page[pvec->idx++];
>  }
>  
> +/* return a pge back to pvec array */

is pge just a typo or some other tla that i don't know?
(not portland general electric or pacific gas & electric)

> +static void blk_unget_page(struct page *page, struct pvec *pvec)
> +{
> + pvec->page[--pvec->idx] = page;
> +}
> +
>  static ssize_t
>  blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
>loff_t pos, unsigned long nr_segs)
> @@ -278,6 +284,8 @@ same_bio:
>   count = min(count, nbytes);
>   goto same_bio;
>   }
> + } else {
> + blk_unget_page(page, &pvec);
>   }
>  
>   /* bio is ready, submit it */


---
~Randy


RE: [patch] optimize o_direct on block device - v3

2007-01-11 Thread Chen, Kenneth W
Andrew Morton wrote on Thursday, January 11, 2007 11:29 AM
> On Thu, 11 Jan 2007 13:21:57 -0600
> Michael Reed <[EMAIL PROTECTED]> wrote:
> > Testing on my ia64 system reveals that this patch introduces a
> > data integrity error for direct i/o to a block device.  Device
> > errors which result in i/o failure do not propagate to the
> > process issuing direct i/o to the device.
> > 
> > This can be reproduced by doing writes to a fibre channel block
> > device and then disabling the switch port connecting the host
> > adapter to the switch.
> > 
> 
> Does this fix it?
> 
> <thwaps Ken>


Darn, kicking myself in the butt.  Thank you, Andrew, for fixing this.
We've also been running DIO stress tests almost non-stop over the last
30 days or so, and we did uncover another bug in that patch.

Andrew, would you please take the following bug fix patch as well?  It
is critical because it also affects data integrity.


[patch] fix blk_direct_IO bio preparation.

For a large DIO request that needs multiple bios, one full page worth of
data was lost at the boundary of a bio's maximum sector or segment
limits.  After a bio fills up and is submitted, the outer
while (nbytes) { ... } loop allocates a new bio and just marches on to
index into the next page, forgetting the page that bio_add_page()
rejected when the previous bio was full.  Fix it by putting the rejected
page back into the pvec so we pick it up again for the next bio.


Signed-off-by: Ken Chen <[EMAIL PROTECTED]>

diff -Nurp linux-2.6.20-rc4/fs/block_dev.c linux-2.6.20.ken/fs/block_dev.c
--- linux-2.6.20-rc4/fs/block_dev.c 2007-01-06 21:45:51.0 -0800
+++ linux-2.6.20.ken/fs/block_dev.c 2007-01-10 19:54:53.0 -0800
@@ -190,6 +190,12 @@ static struct page *blk_get_page(unsigne
return pvec->page[pvec->idx++];
 }
 
+/* return a pge back to pvec array */
+static void blk_unget_page(struct page *page, struct pvec *pvec)
+{
+   pvec->page[--pvec->idx] = page;
+}
+
 static ssize_t
 blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 loff_t pos, unsigned long nr_segs)
@@ -278,6 +284,8 @@ same_bio:
count = min(count, nbytes);
goto same_bio;
}
+   } else {
+   blk_unget_page(page, &pvec);
}
 
/* bio is ready, submit it */


Re: [patch] optimize o_direct on block device - v3

2007-01-11 Thread Michael Reed

Andrew Morton wrote:
> On Thu, 11 Jan 2007 13:21:57 -0600
> Michael Reed <[EMAIL PROTECTED]> wrote:
> 
>> Testing on my ia64 system reveals that this patch introduces a
>> data integrity error for direct i/o to a block device.  Device
>> errors which result in i/o failure do not propagate to the
>> process issuing direct i/o to the device.
>>
>> This can be reproduced by doing writes to a fibre channel block
>> device and then disabling the switch port connecting the host
>> adapter to the switch.
>>
> 
> Does this fix it?
> 


Yes it does!  Thank you for finding this so quickly.

Mike


> <thwaps Ken>
> <thwaps compiler>
> <adds new entry to Documentation/SubmitChecklist>
> 
> diff -puN fs/block_dev.c~a fs/block_dev.c
> --- a/fs/block_dev.c~a
> +++ a/fs/block_dev.c
> @@ -146,7 +146,7 @@ static int blk_end_aio(struct bio *bio, 
>   iocb->ki_nbytes = -EIO;
>  
>   if (atomic_dec_and_test(bio_count)) {
> - if (iocb->ki_nbytes < 0)
> + if ((long)iocb->ki_nbytes < 0)
>   aio_complete(iocb, iocb->ki_nbytes, 0);
>   else
>   aio_complete(iocb, iocb->ki_left, 0);
> _
> 
> 


Re: [patch] optimize o_direct on block device - v3

2007-01-11 Thread Andrew Morton
On Thu, 11 Jan 2007 13:21:57 -0600
Michael Reed <[EMAIL PROTECTED]> wrote:

> Testing on my ia64 system reveals that this patch introduces a
> data integrity error for direct i/o to a block device.  Device
> errors which result in i/o failure do not propagate to the
> process issuing direct i/o to the device.
> 
> This can be reproduced by doing writes to a fibre channel block
> device and then disabling the switch port connecting the host
> adapter to the switch.
> 

Does this fix it?

<thwaps Ken>
<thwaps compiler>
<adds new entry to Documentation/SubmitChecklist>

diff -puN fs/block_dev.c~a fs/block_dev.c
--- a/fs/block_dev.c~a
+++ a/fs/block_dev.c
@@ -146,7 +146,7 @@ static int blk_end_aio(struct bio *bio, 
iocb->ki_nbytes = -EIO;
 
if (atomic_dec_and_test(bio_count)) {
-   if (iocb->ki_nbytes < 0)
+   if ((long)iocb->ki_nbytes < 0)
aio_complete(iocb, iocb->ki_nbytes, 0);
else
aio_complete(iocb, iocb->ki_left, 0);
_
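
For readers wondering why a one-character cast is a data integrity fix:
ki_nbytes is an unsigned field (size_t, as the cast implies), so once the
completion path stores -EIO into it, the plain test ki_nbytes < 0 can
never be true and the failed I/O completes as if it had succeeded.  A
minimal standalone illustration, using a hypothetical local variable in
place of the kiocb field:

#include <stdio.h>
#include <stddef.h>

#define EIO 5

int main(void)
{
	/* stand-in for iocb->ki_nbytes, assumed unsigned (size_t) */
	size_t ki_nbytes = 1024;

	ki_nbytes = -EIO;	/* an error code stored into an unsigned
				 * field wraps to a huge positive value */

	/* the broken test: an unsigned value is never below zero, so
	 * this branch is dead (compilers typically warn about it) */
	if (ki_nbytes < 0)
		printf("error seen\n");
	else
		printf("error missed: ki_nbytes = %zu\n", ki_nbytes);

	/* the fix: reinterpret the bits as signed before testing */
	if ((long)ki_nbytes < 0)
		printf("error seen after cast: %ld\n", (long)ki_nbytes);

	return 0;
}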




Re: [patch] optimize o_direct on block device - v3

2007-01-11 Thread Michael Reed
Testing on my ia64 system reveals that this patch introduces a
data integrity error for direct i/o to a block device.  Device
errors which result in i/o failure do not propagate to the
process issuing direct i/o to the device.

This can be reproduced by doing writes to a fibre channel block
device and then disabling the switch port connecting the host
adapter to the switch.

Can this patch be adjusted to take this into account?  Or pulled?
It was introduced with patch-2.6.19-git22.  I believe the commit
is e61c90188b9956edae1105eef361d8981a352fcd.

http://marc.theaimsgroup.com/?l=linux-scsi&m=116803117327546&w=2
http://marc.theaimsgroup.com/?l=linux-scsi&m=116838403700214&w=2

I'm happy to test any suggested fixes.

Thanks,
 Mike Reed
 SGI


Chen, Kenneth W wrote:
> This patch implements a block device specific .direct_IO method instead
> of going through the generic direct_io_worker for block devices.
> 
> direct_io_worker is fairly complex because it needs to handle O_DIRECT
> on filesystems, where it must perform block allocation, hole detection,
> extending the file on write, and tons of other corner cases.  The end
> result is that it takes a lot of CPU time to submit an I/O.
> 
> For a block device, the block allocation is much simpler, and a tight
> triple loop can be written to iterate over each iovec and each page
> within the iovec in order to construct/prepare a bio structure and then
> submit it to the block layer.  This significantly speeds up O_DIRECT on
> block devices.
> 
> 
> Signed-off-by: Ken Chen <[EMAIL PROTECTED]>
> 
> 
> ---
> This is v3 of the patch; I have addressed all the comments from Andrew,
> Christoph, and Zach (the v2->v3 changes are too many to list here).  I've
> created 34 test cases specifically for corner cases and tested this patch.
> (My monkey test code is on http://kernel-perf.sourceforge.net/diotest.)
> 
> 
> 
> diff -Nurp linux-2.6.19/fs/block_dev.c linux-2.6.19.ken/fs/block_dev.c
> --- linux-2.6.19/fs/block_dev.c   2006-11-29 13:57:37.0 -0800
> +++ linux-2.6.19.ken/fs/block_dev.c   2006-12-06 13:16:43.0 -0800
> @@ -129,43 +129,188 @@ blkdev_get_block(struct inode *inode, se
>   return 0;
>  }
>  
> -static int
> -blkdev_get_blocks(struct inode *inode, sector_t iblock,
> - struct buffer_head *bh, int create)
> +static int blk_end_aio(struct bio *bio, unsigned int bytes_done, int error)
>  {
> - sector_t end_block = max_block(I_BDEV(inode));
> - unsigned long max_blocks = bh->b_size >> inode->i_blkbits;
> + struct kiocb *iocb = bio->bi_private;
> + atomic_t *bio_count = &iocb->ki_bio_count;
>  
> - if ((iblock + max_blocks) > end_block) {
> - max_blocks = end_block - iblock;
> - if ((long)max_blocks <= 0) {
> - if (create)
> - return -EIO;/* write fully beyond EOF */
> - /*
> -  * It is a read which is fully beyond EOF.  We return
> -  * a !buffer_mapped buffer
> -  */
> - max_blocks = 0;
> - }
> + if (bio_data_dir(bio) == READ)
> + bio_check_pages_dirty(bio);
> + else {
> + bio_release_pages(bio);
> + bio_put(bio);
> + }
> +
> + /* iocb->ki_nbytes stores error code from LLDD */
> + if (error)
> + iocb->ki_nbytes = -EIO;
> +
> + if (atomic_dec_and_test(bio_count)) {
> + if (iocb->ki_nbytes < 0)
> + aio_complete(iocb, iocb->ki_nbytes, 0);
> + else
> + aio_complete(iocb, iocb->ki_left, 0);
>   }
>  
> - bh->b_bdev = I_BDEV(inode);
> - bh->b_blocknr = iblock;
> - bh->b_size = max_blocks << inode->i_blkbits;
> - if (max_blocks)
> - set_buffer_mapped(bh);
>   return 0;
>  }
>  
> +#define VEC_SIZE 16
> +struct pvec {
> + unsigned short nr;
> + unsigned short idx;
> + struct page *page[VEC_SIZE];
> +};
> +
> +#define PAGES_SPANNED(addr, len) \
> + (DIV_ROUND_UP((addr) + (len), PAGE_SIZE) - (addr) / PAGE_SIZE);
> +
> +/*
> + * get page pointer for user addr, we internally cache struct page array for
> + * (addr, count) range in pvec to avoid frequent call to get_user_pages.  If
> + * internal page list is exhausted, a batch count of up to VEC_SIZE is used
> + * to get next set of page struct.
> + */
> +static struct page *blk_get_page(unsigned long addr, size_t count, int rw,
> +  struct pvec *pvec)
> +{
> + int ret, nr_pages;
> + if (pvec->idx == pvec->nr) {
> + nr_pages = PAGES_SPANNED(addr, count);
> + nr_pages = min(nr_pages, VEC_SIZE);
> + down_read(&current->mm->mmap_sem);
> + ret = get_user_pages(current, current->mm, addr, nr_pages,
> +  rw == READ, 0, pvec->page, NULL);
> + up_read(&current->mm->mmap_sem);
> + if (ret < 0)
> +   return ERR_PTR(ret);

Re: [patch] optimize o_direct on block device - v3

2006-12-07 Thread Zach Brown


(my monkey test code is on http://kernel-perf.sourceforge.net/diotest).


Nice.

Do you have any interest in working with the autotest
( http://test.kernel.org/autotest ) guys to get your tests into their
rotation?


- z


[patch] optimize o_direct on block device - v3

2006-12-06 Thread Chen, Kenneth W
This patch implements a block device specific .direct_IO method instead
of going through the generic direct_io_worker for block devices.

direct_io_worker is fairly complex because it needs to handle O_DIRECT
on filesystems, where it must perform block allocation, hole detection,
extending the file on write, and tons of other corner cases.  The end
result is that it takes a lot of CPU time to submit an I/O.

For a block device, the block allocation is much simpler, and a tight
triple loop can be written to iterate over each iovec and each page
within the iovec in order to construct/prepare a bio structure and then
submit it to the block layer.  This significantly speeds up O_DIRECT on
block devices.


Signed-off-by: Ken Chen <[EMAIL PROTECTED]>


---
This is v3 of the patch; I have addressed all the comments from Andrew,
Christoph, and Zach (the v2->v3 changes are too many to list here).  I've
created 34 test cases specifically for corner cases and tested this patch.
(My monkey test code is on http://kernel-perf.sourceforge.net/diotest.)



diff -Nurp linux-2.6.19/fs/block_dev.c linux-2.6.19.ken/fs/block_dev.c
--- linux-2.6.19/fs/block_dev.c 2006-11-29 13:57:37.0 -0800
+++ linux-2.6.19.ken/fs/block_dev.c 2006-12-06 13:16:43.0 -0800
@@ -129,43 +129,188 @@ blkdev_get_block(struct inode *inode, se
return 0;
 }
 
-static int
-blkdev_get_blocks(struct inode *inode, sector_t iblock,
-   struct buffer_head *bh, int create)
+static int blk_end_aio(struct bio *bio, unsigned int bytes_done, int error)
 {
-   sector_t end_block = max_block(I_BDEV(inode));
-   unsigned long max_blocks = bh->b_size >> inode->i_blkbits;
+   struct kiocb *iocb = bio->bi_private;
+   atomic_t *bio_count = &iocb->ki_bio_count;
 
-   if ((iblock + max_blocks) > end_block) {
-   max_blocks = end_block - iblock;
-   if ((long)max_blocks <= 0) {
-   if (create)
-   return -EIO;/* write fully beyond EOF */
-   /*
-* It is a read which is fully beyond EOF.  We return
-* a !buffer_mapped buffer
-*/
-   max_blocks = 0;
-   }
+   if (bio_data_dir(bio) == READ)
+   bio_check_pages_dirty(bio);
+   else {
+   bio_release_pages(bio);
+   bio_put(bio);
+   }
+
+   /* iocb->ki_nbytes stores error code from LLDD */
+   if (error)
+   iocb->ki_nbytes = -EIO;
+
+   if (atomic_dec_and_test(bio_count)) {
+   if (iocb->ki_nbytes < 0)
+   aio_complete(iocb, iocb->ki_nbytes, 0);
+   else
+   aio_complete(iocb, iocb->ki_left, 0);
}
 
-   bh->b_bdev = I_BDEV(inode);
-   bh->b_blocknr = iblock;
-   bh->b_size = max_blocks << inode->i_blkbits;
-   if (max_blocks)
-   set_buffer_mapped(bh);
return 0;
 }
 
+#define VEC_SIZE   16
+struct pvec {
+   unsigned short nr;
+   unsigned short idx;
+   struct page *page[VEC_SIZE];
+};
+
+#define PAGES_SPANNED(addr, len)   \
+   (DIV_ROUND_UP((addr) + (len), PAGE_SIZE) - (addr) / PAGE_SIZE);
+
+/*
+ * get page pointer for user addr, we internally cache struct page array for
+ * (addr, count) range in pvec to avoid frequent call to get_user_pages.  If
+ * internal page list is exhausted, a batch count of up to VEC_SIZE is used
+ * to get next set of page struct.
+ */
+static struct page *blk_get_page(unsigned long addr, size_t count, int rw,
+struct pvec *pvec)
+{
+   int ret, nr_pages;
+   if (pvec->idx == pvec->nr) {
+   nr_pages = PAGES_SPANNED(addr, count);
+   nr_pages = min(nr_pages, VEC_SIZE);
+   down_read(&current->mm->mmap_sem);
+   ret = get_user_pages(current, current->mm, addr, nr_pages,
+rw == READ, 0, pvec->page, NULL);
+   up_read(&current->mm->mmap_sem);
+   if (ret < 0)
+   return ERR_PTR(ret);
+   pvec->nr = ret;
+   pvec->idx = 0;
+   }
+   return pvec->page[pvec->idx++];
+}
+
 static ssize_t
 blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
-   loff_t offset, unsigned long nr_segs)
+loff_t pos, unsigned long nr_segs)
 {
-   struct file *file = iocb->ki_filp;
-   struct inode *inode = file->f_mapping->host;
+   struct inode *inode = iocb->ki_filp->f_mapping->host;
+   unsigned blkbits = blksize_bits(bdev_hardsect_size(I_BDEV(inode)));
+   unsigned blocksize_mask = (1 << blkbits) - 1;
+   unsigned long seg = 0;  /* iov segment iterator */
+   unsigned long nvec; /* number of bio vec needed */
+   unsigned long cur_off;  /* offset into current page */
+   unsigned long cur_len;  /* I/O len of current page, up to PAGE_SIZE */
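
The pvec batching described in the comment above blk_get_page() is the
heart of the speedup, and it can be demonstrated in isolation.  The
following is a hedged userspace sketch, not the kernel code: fill_batch()
and next_pfn are hypothetical stand-ins for get_user_pages() and the user
mapping, plain integers stand in for struct page pointers, and
PAGES_SPANNED is reproduced without the stray trailing semicolon the
posted macro carries.  The pattern is the point: pages are handed out one
at a time from a cached batch of at most VEC_SIZE, and the expensive pin
call is re-issued only when idx catches up with nr, so a 40-page request
costs three batched calls instead of forty.

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define VEC_SIZE  16
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
#define PAGES_SPANNED(addr, len) \
	(DIV_ROUND_UP((addr) + (len), PAGE_SIZE) - (addr) / PAGE_SIZE)

struct pvec {
	unsigned short nr;	/* pages in the current batch */
	unsigned short idx;	/* next page to hand out */
	unsigned long page[VEC_SIZE];
};

static unsigned long next_pfn;	/* stand-in for the pinned user range */
static int gup_calls;		/* counts simulated get_user_pages() calls */

/* stand-in for get_user_pages(): "pin" up to VEC_SIZE pages at once */
static void fill_batch(struct pvec *pv, unsigned long want)
{
	if (want > VEC_SIZE)
		want = VEC_SIZE;
	for (unsigned long i = 0; i < want; i++)
		pv->page[i] = next_pfn++;
	pv->nr = (unsigned short)want;
	pv->idx = 0;
	gup_calls++;
}

static unsigned long get_page_cached(unsigned long addr, size_t count,
				     struct pvec *pv)
{
	if (pv->idx == pv->nr)	/* batch exhausted: refill */
		fill_batch(pv, PAGES_SPANNED(addr, count));
	return pv->page[pv->idx++];
}

int main(void)
{
	struct pvec pv = { 0, 0, { 0 } };
	unsigned long addr = 0;
	size_t count = 40 * PAGE_SIZE;	/* a 40-page request */

	while (count) {
		get_page_cached(addr, count, &pv);
		addr += PAGE_SIZE;
		count -= PAGE_SIZE;
	}
	printf("40 pages served with %d batched pin calls\n", gup_calls);
	return 0;
}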
