Re: [patch] optimize o_direct on block device - v2

2006-12-05 Thread Andrew Morton
On Mon, 4 Dec 2006 20:55:50 -0800
"Chen, Kenneth W" <[EMAIL PROTECTED]> wrote:

> This patch implements block device specific .direct_IO method instead
> of going through generic direct_io_worker for block device.
> 
> direct_io_worker is fairly complex because it needs to handle O_DIRECT
> on file system, where it needs to perform block allocation, hole detection,
> extents file on write, and tons of other corner cases. The end result is
> that it takes tons of CPU time to submit an I/O.
> 
> For block device, the block allocation is much simpler and a tight triple
> loop can be written to iterate each iovec and each page within the iovec
> in order to construct/prepare bio structure and then subsequently submit
> it to the block layer.  This significantly speeds up O_D on block device.
> 

Looks reasonable to me.  A few musings:

>  
> -static int
> -blkdev_get_blocks(struct inode *inode, sector_t iblock,
> - struct buffer_head *bh, int create)
> +int blk_end_aio(struct bio *bio, unsigned int bytes_done, int error)
>  {
> - sector_t end_block = max_block(I_BDEV(inode));
> - unsigned long max_blocks = bh->b_size >> inode->i_blkbits;
> + struct kiocb* iocb = bio->bi_private;
> + atomic_t* bio_count = (atomic_t*) &iocb->private;

This atomic_t-in-a-void* thing is rather unpleasing.

It could be a new member of `struct kiocb', perhaps.

Please use " *" rather than "* ".

> + long res;
> +
> + if ((bio->bi_rw & 1) == READ)

bio_data_dir()

> + bio_check_pages_dirty(bio);
> + else {
> + bio_release_pages(bio);
> + bio_put(bio);
> + }
>  
> - if ((iblock + max_blocks) > end_block) {
> - max_blocks = end_block - iblock;
> - if ((long)max_blocks <= 0) {
> - if (create)
> - return -EIO;/* write fully beyond EOF */
> - /*
> -  * It is a read which is fully beyond EOF.  We return
> -  * a !buffer_mapped buffer
> -  */
> - max_blocks = 0;
> - }
> + if (error)
> + iocb->ki_left = -EIO;
> +
> + if (atomic_dec_and_test(bio_count)) {
> + res = (iocb->ki_left < 0) ? iocb->ki_left : iocb->ki_nbytes;
> + aio_complete(iocb, res, 0);
>   }
>  
> - bh->b_bdev = I_BDEV(inode);
> - bh->b_blocknr = iblock;
> - bh->b_size = max_blocks << inode->i_blkbits;
> - if (max_blocks)
> - set_buffer_mapped(bh);
>   return 0;
>  }
>  
> +#define VEC_SIZE 16
> +struct pvec {
> + unsigned short nr;
> + unsigned short idx;
> + struct page *page[VEC_SIZE];
> +};

Do we actually need this?  afaict all it does is saves an additional
function arg in a couple of places.

I guess it doesn't hurt though.

> +
> +struct page *blk_get_page(unsigned long addr, size_t count, int rw,
> +   struct pvec *pvec)

Does this need kernel-wide scope?

A nice introductory comment explaining what it does would be, err, nice.

> +{
> + int ret, nr_pages;
> + if (pvec->idx == pvec->nr) {
> + nr_pages = (addr + count + PAGE_SIZE - 1) / PAGE_SIZE -
> + addr / PAGE_SIZE;
> + nr_pages = min(nr_pages, VEC_SIZE);
> + down_read(&current->mm->mmap_sem);
> + ret = get_user_pages(current, current->mm, addr, nr_pages,
> +  rw==READ, 0, pvec->page, NULL);
> + up_read(&current->mm->mmap_sem);
> + if (ret < 0)
> + return ERR_PTR(ret);
> + pvec->nr = ret;
> + pvec->idx = 0;
> + }
> + return pvec->page[pvec->idx++];
> +}

hm, if get_user_pages() returns zero, badness ensues.

It won't do that, unless perhaps we passed it a zero nr_pages in the first
place.  We shouldn't do that.

Has this code been tested with zero-length writes?  And with iovecs which
contain a zero-length segment?


>  static ssize_t
>  blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
> - loff_t offset, unsigned long nr_segs)
> +  loff_t pos, unsigned long nr_segs)
>  {
> - struct file *file = iocb->ki_filp;
> - struct inode *inode = file->f_mapping->host;
> + struct inode *inode = iocb->ki_filp->f_mapping->host;
> + unsigned blkbits = blksize_bits(bdev_hardsect_size(I_BDEV(inode)));
> + unsigned blocksize_mask = (1<< blkbits) - 1;
> + unsigned long seg, nvec, cur_off, cur_len;
> +
> + unsigned long addr;
> + size_t count, nbytes = iocb->ki_nbytes;
> + loff_t size;
> + struct bio * bio;
> + atomic_t *bio_count = (atomic_t *) &iocb->private;
> + struct page *page;
> + struct pvec pvec = {.nr = 0, .idx = 0, };

Please use one declaration per line (no commas).  That leaves you room for
a little comment alongside each local, explaining its operation.

This function needs little comments alongside each local, explaining their
operation.

Re: [patch] optimize o_direct on block device - v2

2006-12-05 Thread Andrew Morton
On Tue, 5 Dec 2006 11:02:30 +
Christoph Hellwig <[EMAIL PROTECTED]> wrote:

> > +   long res;
> > +
> > +   if ((bio->bi_rw & 1) == READ)
> 
> I just wanted to complain about not using a proper helper for this,
> but apparently we don't have one yet..

There's bio_data_dir().
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [patch] optimize o_direct on block device - v2

2006-12-05 Thread Christoph Hellwig
On Mon, Dec 04, 2006 at 08:55:50PM -0800, Chen, Kenneth W wrote:
> This patch implements block device specific .direct_IO method instead
> of going through generic direct_io_worker for block device.
> 
> direct_io_worker is fairly complex because it needs to handle O_DIRECT
> on file system, where it needs to perform block allocation, hole detection,
> extents file on write, and tons of other corner cases. The end result is
> that it takes tons of CPU time to submit an I/O.
> 
> For block device, the block allocation is much simpler and a tight triple
> loop can be written to iterate each iovec and each page within the iovec
> in order to construct/prepare bio structure and then subsequently submit
> it to the block layer.  This significantly speeds up O_D on block device.

I think this separation makes a lot of sense.  direct-io.c has to deal
with a lot of convoluted issues that only arise on regular files, and
the block mapping is the most trivial part of that :)  While you're
at it please send a followup patch that removes the DIO_NO_LOCKING
code from direct-io.c as it's now unused and only needlessly complicates
the code.  (Hopefully we'll get down to one single variant one day..)

> --- ./fs/block_dev.c.orig 2006-11-29 13:57:37.0 -0800
> +++ ./fs/block_dev.c  2006-12-04 18:38:53.0 -0800
> @@ -129,43 +129,164 @@ blkdev_get_block(struct inode *inode, se
>   return 0;
>  }
>  
> -static int
> -blkdev_get_blocks(struct inode *inode, sector_t iblock,
> - struct buffer_head *bh, int create)
> +int blk_end_aio(struct bio *bio, unsigned int bytes_done, int error)

This should be static.

> + struct kiocb* iocb = bio->bi_private;
> + atomic_t* bio_count = (atomic_t*) &iocb->private;

The * placement is wrong all over.  This should be:

struct kiocb *iocb = bio->bi_private;
atomic_t *bio_count = (atomic_t *)&iocb->private;

> + long res;
> +
> + if ((bio->bi_rw & 1) == READ)

I just wanted to complain about not using a proper helper for this,
but apparently we don't have one yet..

> + bio_check_pages_dirty(bio);
> + else {
> + bio_release_pages(bio);
> + bio_put(bio);
> + }

> + if (error)
> + iocb->ki_left = -EIO;
> +
> + if (atomic_dec_and_test(bio_count)) {
> + res = (iocb->ki_left < 0) ? iocb->ki_left : iocb->ki_nbytes;
> + aio_complete(iocb, res, 0);
>   }


I suspect this would be a lot more readable as:

if (atomic_dec_and_test(bio_count)) {
if (iocb->ki_left < 0)
aio_complete(iocb, iocb->ki_left, 0);
else
aio_complete(iocb, iocb->ki_nbytes, 0);
}

> +struct page *blk_get_page(unsigned long addr, size_t count, int rw,
> +   struct pvec *pvec)

static, again.

> +{
> + int ret, nr_pages;
> + if (pvec->idx == pvec->nr) {
> + nr_pages = (addr + count + PAGE_SIZE - 1) / PAGE_SIZE -
> + addr / PAGE_SIZE;
> + nr_pages = min(nr_pages, VEC_SIZE);

Didn't someone say you should add a macro for this in the last round
of reviews?

> + down_read(&current->mm->mmap_sem);
> + ret = get_user_pages(current, current->mm, addr, nr_pages,
> +  rw==READ, 0, pvec->page, NULL);

 rw == READ

>  static ssize_t
>  blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
> - loff_t offset, unsigned long nr_segs)
> +  loff_t pos, unsigned long nr_segs)
>  {
> - struct file *file = iocb->ki_filp;
> - struct inode *inode = file->f_mapping->host;
> + struct inode *inode = iocb->ki_filp->f_mapping->host;
> + unsigned blkbits = blksize_bits(bdev_hardsect_size(I_BDEV(inode)));
> + unsigned blocksize_mask = (1<< blkbits) - 1;
> + unsigned long seg, nvec, cur_off, cur_len;
> +
> + unsigned long addr;
> + size_t count, nbytes = iocb->ki_nbytes;
> + loff_t size;
> + struct bio * bio;
> + atomic_t *bio_count = (atomic_t *) &iocb->private;
> + struct page *page;
> + struct pvec pvec = {.nr = 0, .idx = 0, };
> +
> + BUILD_BUG_ON(sizeof(atomic_t) > sizeof(iocb->private));
> +
> + size = i_size_read(inode);
> + if (pos + nbytes > size)
> + nbytes = size - pos;
> +
> + seg = 0;
> + addr = (unsigned long) iov[0].iov_base;
> + count = iov[0].iov_len;
> + atomic_set(bio_count, 1);
> +
> + /* first check the alignment */
> + if (addr & blocksize_mask || count & blocksize_mask ||
> + pos & blocksize_mask)
> + return -EINVAL;

You only use one iovec and simply ignore all others, that's a huge
regression over the existing functionality (and actually a bug that
causes silent data loss)

>  static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
> --- ./fs/read_write.c.orig

Re: [patch] optimize o_direct on block device - v2

2006-12-05 Thread Andrew Morton
On Mon, 4 Dec 2006 20:55:50 -0800
Chen, Kenneth W [EMAIL PROTECTED] wrote:

 This patch implements block device specific .direct_IO method instead
 of going through generic direct_io_worker for block device.
 
 direct_io_worker is fairly complex because it needs to handle O_DIRECT
 on file system, where it needs to perform block allocation, hole detection,
 extents file on write, and tons of other corner cases. The end result is
 that it takes tons of CPU time to submit an I/O.
 
 For block device, the block allocation is much simpler and a tight triple
 loop can be written to iterate each iovec and each page within the iovec
 in order to construct/prepare bio structure and then subsequently submit
 it to the block layer.  This significantly speeds up O_D on block device.
 

Looks reasonable to me.  A few musings:

  
 -static int
 -blkdev_get_blocks(struct inode *inode, sector_t iblock,
 - struct buffer_head *bh, int create)
 +int blk_end_aio(struct bio *bio, unsigned int bytes_done, int error)
  {
 - sector_t end_block = max_block(I_BDEV(inode));
 - unsigned long max_blocks = bh-b_size  inode-i_blkbits;
 + struct kiocb* iocb = bio-bi_private;
 + atomic_t* bio_count = (atomic_t*) iocb-private;

This atomic_t-in-a-void* thing is rather unpleasing.

It could be a new member of `struct kiocb', perhaps.

Please use  * rather than * .

 + long res;
 +
 + if ((bio-bi_rw  1) == READ)

bio_data_dir()

 + bio_check_pages_dirty(bio);
 + else {
 + bio_release_pages(bio);
 + bio_put(bio);
 + }
  
 - if ((iblock + max_blocks)  end_block) {
 - max_blocks = end_block - iblock;
 - if ((long)max_blocks = 0) {
 - if (create)
 - return -EIO;/* write fully beyond EOF */
 - /*
 -  * It is a read which is fully beyond EOF.  We return
 -  * a !buffer_mapped buffer
 -  */
 - max_blocks = 0;
 - }
 + if (error)
 + iocb-ki_left = -EIO;
 +
 + if (atomic_dec_and_test(bio_count)) {
 + res = (iocb-ki_left  0) ? iocb-ki_left : iocb-ki_nbytes;
 + aio_complete(iocb, res, 0);
   }
  
 - bh-b_bdev = I_BDEV(inode);
 - bh-b_blocknr = iblock;
 - bh-b_size = max_blocks  inode-i_blkbits;
 - if (max_blocks)
 - set_buffer_mapped(bh);
   return 0;
  }
  
 +#define VEC_SIZE 16
 +struct pvec {
 + unsigned short nr;
 + unsigned short idx;
 + struct page *page[VEC_SIZE];
 +};

Do we actually need this?  afaict all it does is saves an additional
function arg in a couple of places.

I guess it doesn't hurt though.

 +
 +struct page *blk_get_page(unsigned long addr, size_t count, int rw,
 +   struct pvec *pvec)

Does this need kernel-wide scope?

A nice introductory comment explaining what it does would be, err, nice.

 +{
 + int ret, nr_pages;
 + if (pvec-idx == pvec-nr) {
 + nr_pages = (addr + count + PAGE_SIZE - 1) / PAGE_SIZE -
 + addr / PAGE_SIZE;
 + nr_pages = min(nr_pages, VEC_SIZE);
 + down_read(current-mm-mmap_sem);
 + ret = get_user_pages(current, current-mm, addr, nr_pages,
 +  rw==READ, 0, pvec-page, NULL);
 + up_read(current-mm-mmap_sem);
 + if (ret  0)
 + return ERR_PTR(ret);
 + pvec-nr = ret;
 + pvec-idx = 0;
 + }
 + return pvec-page[pvec-idx++];
 +}

hm, if get_user_pages() returns zero, badness ensues.

It won't do that, unless perhaps we passed it a zero nr_pages in the first
place.  We shouldn't do that.

Has this code been tested with zero-length writes?  And with iovecs which
contain a zero-length segment?


  static ssize_t
  blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 - loff_t offset, unsigned long nr_segs)
 +  loff_t pos, unsigned long nr_segs)
  {
 - struct file *file = iocb-ki_filp;
 - struct inode *inode = file-f_mapping-host;
 + struct inode *inode = iocb-ki_filp-f_mapping-host;
 + unsigned blkbits = blksize_bits(bdev_hardsect_size(I_BDEV(inode)));
 + unsigned blocksize_mask = (1 blkbits) - 1;
 + unsigned long seg, nvec, cur_off, cur_len;
 +
 + unsigned long addr;
 + size_t count, nbytes = iocb-ki_nbytes;
 + loff_t size;
 + struct bio * bio;
 + atomic_t *bio_count = (atomic_t *) iocb-private;
 + struct page *page;
 + struct pvec pvec = {.nr = 0, .idx = 0, };

Please use one declaration per line (no commas).  That leaves you room for
a little comment alongside each local, explaining its operation.

This function needs little comments alongside each local, explaining their
operation.

 + BUILD_BUG_ON(sizeof(atomic_t)  sizeof(iocb-private));

argh.  

Re: [patch] optimize o_direct on block device - v2

2006-12-05 Thread Christoph Hellwig
On Mon, Dec 04, 2006 at 08:55:50PM -0800, Chen, Kenneth W wrote:
 This patch implements block device specific .direct_IO method instead
 of going through generic direct_io_worker for block device.
 
 direct_io_worker is fairly complex because it needs to handle O_DIRECT
 on file system, where it needs to perform block allocation, hole detection,
 extents file on write, and tons of other corner cases. The end result is
 that it takes tons of CPU time to submit an I/O.
 
 For block device, the block allocation is much simpler and a tight triple
 loop can be written to iterate each iovec and each page within the iovec
 in order to construct/prepare bio structure and then subsequently submit
 it to the block layer.  This significantly speeds up O_D on block device.

I think this separation makes a lot of sense.  direct-io.c has to deal
with a lot of convoluted issues that only arise on regular files, and
the block mapping is the most trivial part of that :)  While you're
at it please send a followup patch that removes the DIO_NO_LOCKING
code from direct-io.c as it's now unused and only needlessly complicates
the code.  (Hopefully we'll get down to one single variant one day..)

 --- ./fs/block_dev.c.orig 2006-11-29 13:57:37.0 -0800
 +++ ./fs/block_dev.c  2006-12-04 18:38:53.0 -0800
 @@ -129,43 +129,164 @@ blkdev_get_block(struct inode *inode, se
   return 0;
  }
  
 -static int
 -blkdev_get_blocks(struct inode *inode, sector_t iblock,
 - struct buffer_head *bh, int create)
 +int blk_end_aio(struct bio *bio, unsigned int bytes_done, int error)

This should be static.

 + struct kiocb* iocb = bio-bi_private;
 + atomic_t* bio_count = (atomic_t*) iocb-private;

The * placement is wrong all over.  This should be:

struct kiocb *iocb = bio-bi_private;
atomic_t *bio_count = (atomic_t *)iocb-private;

 + long res;
 +
 + if ((bio-bi_rw  1) == READ)

I just wanted to complain about not using a proper helper for this,
but apparently we don't have one yet..

 + bio_check_pages_dirty(bio);
 + else {
 + bio_release_pages(bio);
 + bio_put(bio);
 + }

 + if (error)
 + iocb-ki_left = -EIO;
 +
 + if (atomic_dec_and_test(bio_count)) {
 + res = (iocb-ki_left  0) ? iocb-ki_left : iocb-ki_nbytes;
 + aio_complete(iocb, res, 0);
   }


I suspect this would be a lot more readable as:

if (atomic_dec_and_test(bio_count)) {
if (iocb-ki_left  0)
aio_complete(iocb, iocb-ki_left, 0);
else
aio_complete(iocb, iocb-ki_nbytes, 0);
}

 +struct page *blk_get_page(unsigned long addr, size_t count, int rw,
 +   struct pvec *pvec)

static, again.

 +{
 + int ret, nr_pages;
 + if (pvec-idx == pvec-nr) {
 + nr_pages = (addr + count + PAGE_SIZE - 1) / PAGE_SIZE -
 + addr / PAGE_SIZE;
 + nr_pages = min(nr_pages, VEC_SIZE);

Didn't someone say you should add a macro for this in the last round
of reviews?

 + down_read(current-mm-mmap_sem);
 + ret = get_user_pages(current, current-mm, addr, nr_pages,
 +  rw==READ, 0, pvec-page, NULL);

 rw == READ

  static ssize_t
  blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 - loff_t offset, unsigned long nr_segs)
 +  loff_t pos, unsigned long nr_segs)
  {
 - struct file *file = iocb-ki_filp;
 - struct inode *inode = file-f_mapping-host;
 + struct inode *inode = iocb-ki_filp-f_mapping-host;
 + unsigned blkbits = blksize_bits(bdev_hardsect_size(I_BDEV(inode)));
 + unsigned blocksize_mask = (1 blkbits) - 1;
 + unsigned long seg, nvec, cur_off, cur_len;
 +
 + unsigned long addr;
 + size_t count, nbytes = iocb-ki_nbytes;
 + loff_t size;
 + struct bio * bio;
 + atomic_t *bio_count = (atomic_t *) iocb-private;
 + struct page *page;
 + struct pvec pvec = {.nr = 0, .idx = 0, };
 +
 + BUILD_BUG_ON(sizeof(atomic_t)  sizeof(iocb-private));
 +
 + size = i_size_read(inode);
 + if (pos + nbytes  size)
 + nbytes = size - pos;
 +
 + seg = 0;
 + addr = (unsigned long) iov[0].iov_base;
 + count = iov[0].iov_len;
 + atomic_set(bio_count, 1);
 +
 + /* first check the alignment */
 + if (addr  blocksize_mask || count  blocksize_mask ||
 + pos  blocksize_mask)
 + return -EINVAL;

You only use one iovec and simply ignore all others, that's a huge
regression over the existing functionality (and actually a bug that
causes silent data loss)

  static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
 --- ./fs/read_write.c.orig2006-11-29 13:57:37.0 -0800
 +++ ./fs/read_write.c 2006-12-04 17:30:34.0 -0800
 @@ 

Re: [patch] optimize o_direct on block device - v2

2006-12-05 Thread Andrew Morton
On Tue, 5 Dec 2006 11:02:30 +
Christoph Hellwig [EMAIL PROTECTED] wrote:

  +   long res;
  +
  +   if ((bio-bi_rw  1) == READ)
 
 I just wanted to complain about not using a proper helper for this,
 but apparently we don't have one yet..

There's bio_data_dir().
-
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[patch] optimize o_direct on block device - v2

2006-12-04 Thread Chen, Kenneth W
This patch implements block device specific .direct_IO method instead
of going through generic direct_io_worker for block device.

direct_io_worker is fairly complex because it needs to handle O_DIRECT
on file system, where it needs to perform block allocation, hole detection,
extents file on write, and tons of other corner cases. The end result is
that it takes tons of CPU time to submit an I/O.

For block device, the block allocation is much simpler and a tight triple
loop can be written to iterate each iovec and each page within the iovec
in order to construct/prepare bio structure and then subsequently submit
it to the block layer.  This significantly speeds up O_D on block device.

Signed-off-by: Ken Chen <[EMAIL PROTECTED]>


---
Changes since v1->v2:

* add BUILD_BUG_ON to ensure bio_count fit inside iocb->private
* add comment that bio_alloc won't fail with GFP_KERNEL
* fix back out path if get_user_pages fails
* fix back out path if iov segment doesn't align properly

 fs/bio.c|2
 fs/block_dev.c  |  173 
 fs/read_write.c |2
 include/linux/bio.h |1
 4 files changed, 150 insertions(+), 28 deletions(-)


--- ./fs/block_dev.c.orig   2006-11-29 13:57:37.0 -0800
+++ ./fs/block_dev.c2006-12-04 18:38:53.0 -0800
@@ -129,43 +129,164 @@ blkdev_get_block(struct inode *inode, se
return 0;
 }
 
-static int
-blkdev_get_blocks(struct inode *inode, sector_t iblock,
-   struct buffer_head *bh, int create)
+int blk_end_aio(struct bio *bio, unsigned int bytes_done, int error)
 {
-   sector_t end_block = max_block(I_BDEV(inode));
-   unsigned long max_blocks = bh->b_size >> inode->i_blkbits;
+   struct kiocb* iocb = bio->bi_private;
+   atomic_t* bio_count = (atomic_t*) &iocb->private;
+   long res;
+
+   if ((bio->bi_rw & 1) == READ)
+   bio_check_pages_dirty(bio);
+   else {
+   bio_release_pages(bio);
+   bio_put(bio);
+   }
 
-   if ((iblock + max_blocks) > end_block) {
-   max_blocks = end_block - iblock;
-   if ((long)max_blocks <= 0) {
-   if (create)
-   return -EIO;/* write fully beyond EOF */
-   /*
-* It is a read which is fully beyond EOF.  We return
-* a !buffer_mapped buffer
-*/
-   max_blocks = 0;
-   }
+   if (error)
+   iocb->ki_left = -EIO;
+
+   if (atomic_dec_and_test(bio_count)) {
+   res = (iocb->ki_left < 0) ? iocb->ki_left : iocb->ki_nbytes;
+   aio_complete(iocb, res, 0);
}
 
-   bh->b_bdev = I_BDEV(inode);
-   bh->b_blocknr = iblock;
-   bh->b_size = max_blocks << inode->i_blkbits;
-   if (max_blocks)
-   set_buffer_mapped(bh);
return 0;
 }
 
+#define VEC_SIZE   16
+struct pvec {
+   unsigned short nr;
+   unsigned short idx;
+   struct page *page[VEC_SIZE];
+};
+
+
+struct page *blk_get_page(unsigned long addr, size_t count, int rw,
+ struct pvec *pvec)
+{
+   int ret, nr_pages;
+   if (pvec->idx == pvec->nr) {
+   nr_pages = (addr + count + PAGE_SIZE - 1) / PAGE_SIZE -
+   addr / PAGE_SIZE;
+   nr_pages = min(nr_pages, VEC_SIZE);
+   down_read(&current->mm->mmap_sem);
+   ret = get_user_pages(current, current->mm, addr, nr_pages,
+rw==READ, 0, pvec->page, NULL);
+   up_read(&current->mm->mmap_sem);
+   if (ret < 0)
+   return ERR_PTR(ret);
+   pvec->nr = ret;
+   pvec->idx = 0;
+   }
+   return pvec->page[pvec->idx++];
+}
+
 static ssize_t
 blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
-   loff_t offset, unsigned long nr_segs)
+loff_t pos, unsigned long nr_segs)
 {
-   struct file *file = iocb->ki_filp;
-   struct inode *inode = file->f_mapping->host;
+   struct inode *inode = iocb->ki_filp->f_mapping->host;
+   unsigned blkbits = blksize_bits(bdev_hardsect_size(I_BDEV(inode)));
+   unsigned blocksize_mask = (1<< blkbits) - 1;
+   unsigned long seg, nvec, cur_off, cur_len;
+
+   unsigned long addr;
+   size_t count, nbytes = iocb->ki_nbytes;
+   loff_t size;
+   struct bio * bio;
+   atomic_t *bio_count = (atomic_t *) &iocb->private;
+   struct page *page;
+   struct pvec pvec = {.nr = 0, .idx = 0, };
+
+   BUILD_BUG_ON(sizeof(atomic_t) > sizeof(iocb->private));
+
+   size = i_size_read(inode);
+   if (pos + nbytes > size)
+   nbytes = size - pos;
+
+   seg = 0;
+   addr = (unsigned long) iov[0].iov_base;
+   count = iov[0].iov_len;
+   atomic_set(bio_count, 1);
+
+ 

[patch] optimize o_direct on block device - v2

2006-12-04 Thread Chen, Kenneth W
This patch implements block device specific .direct_IO method instead
of going through generic direct_io_worker for block device.

direct_io_worker is fairly complex because it needs to handle O_DIRECT
on file system, where it needs to perform block allocation, hole detection,
extents file on write, and tons of other corner cases. The end result is
that it takes tons of CPU time to submit an I/O.

For block device, the block allocation is much simpler and a tight triple
loop can be written to iterate each iovec and each page within the iovec
in order to construct/prepare bio structure and then subsequently submit
it to the block layer.  This significantly speeds up O_D on block device.

Signed-off-by: Ken Chen [EMAIL PROTECTED]


---
Changes since v1-v2:

* add BUILD_BUG_ON to ensure bio_count fit inside iocb-private
* add comment that bio_alloc won't fail with GFP_KERNEL
* fix back out path if get_user_pages fails
* fix back out path if iov segment doesn't align properly

 fs/bio.c|2
 fs/block_dev.c  |  173 
 fs/read_write.c |2
 include/linux/bio.h |1
 4 files changed, 150 insertions(+), 28 deletions(-)


--- ./fs/block_dev.c.orig   2006-11-29 13:57:37.0 -0800
+++ ./fs/block_dev.c2006-12-04 18:38:53.0 -0800
@@ -129,43 +129,164 @@ blkdev_get_block(struct inode *inode, se
return 0;
 }
 
-static int
-blkdev_get_blocks(struct inode *inode, sector_t iblock,
-   struct buffer_head *bh, int create)
+int blk_end_aio(struct bio *bio, unsigned int bytes_done, int error)
 {
-   sector_t end_block = max_block(I_BDEV(inode));
-   unsigned long max_blocks = bh-b_size  inode-i_blkbits;
+   struct kiocb* iocb = bio-bi_private;
+   atomic_t* bio_count = (atomic_t*) iocb-private;
+   long res;
+
+   if ((bio-bi_rw  1) == READ)
+   bio_check_pages_dirty(bio);
+   else {
+   bio_release_pages(bio);
+   bio_put(bio);
+   }
 
-   if ((iblock + max_blocks)  end_block) {
-   max_blocks = end_block - iblock;
-   if ((long)max_blocks = 0) {
-   if (create)
-   return -EIO;/* write fully beyond EOF */
-   /*
-* It is a read which is fully beyond EOF.  We return
-* a !buffer_mapped buffer
-*/
-   max_blocks = 0;
-   }
+   if (error)
+   iocb-ki_left = -EIO;
+
+   if (atomic_dec_and_test(bio_count)) {
+   res = (iocb-ki_left  0) ? iocb-ki_left : iocb-ki_nbytes;
+   aio_complete(iocb, res, 0);
}
 
-   bh-b_bdev = I_BDEV(inode);
-   bh-b_blocknr = iblock;
-   bh-b_size = max_blocks  inode-i_blkbits;
-   if (max_blocks)
-   set_buffer_mapped(bh);
return 0;
 }
 
+#define VEC_SIZE   16
+struct pvec {
+   unsigned short nr;
+   unsigned short idx;
+   struct page *page[VEC_SIZE];
+};
+
+
+struct page *blk_get_page(unsigned long addr, size_t count, int rw,
+ struct pvec *pvec)
+{
+   int ret, nr_pages;
+   if (pvec-idx == pvec-nr) {
+   nr_pages = (addr + count + PAGE_SIZE - 1) / PAGE_SIZE -
+   addr / PAGE_SIZE;
+   nr_pages = min(nr_pages, VEC_SIZE);
+   down_read(current-mm-mmap_sem);
+   ret = get_user_pages(current, current-mm, addr, nr_pages,
+rw==READ, 0, pvec-page, NULL);
+   up_read(current-mm-mmap_sem);
+   if (ret  0)
+   return ERR_PTR(ret);
+   pvec-nr = ret;
+   pvec-idx = 0;
+   }
+   return pvec-page[pvec-idx++];
+}
+
 static ssize_t
 blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
-   loff_t offset, unsigned long nr_segs)
+loff_t pos, unsigned long nr_segs)
 {
-   struct file *file = iocb-ki_filp;
-   struct inode *inode = file-f_mapping-host;
+   struct inode *inode = iocb-ki_filp-f_mapping-host;
+   unsigned blkbits = blksize_bits(bdev_hardsect_size(I_BDEV(inode)));
+   unsigned blocksize_mask = (1 blkbits) - 1;
+   unsigned long seg, nvec, cur_off, cur_len;
+
+   unsigned long addr;
+   size_t count, nbytes = iocb-ki_nbytes;
+   loff_t size;
+   struct bio * bio;
+   atomic_t *bio_count = (atomic_t *) iocb-private;
+   struct page *page;
+   struct pvec pvec = {.nr = 0, .idx = 0, };
+
+   BUILD_BUG_ON(sizeof(atomic_t)  sizeof(iocb-private));
+
+   size = i_size_read(inode);
+   if (pos + nbytes  size)
+   nbytes = size - pos;
+
+   seg = 0;
+   addr = (unsigned long) iov[0].iov_base;
+   count = iov[0].iov_len;
+   atomic_set(bio_count, 1);
+
+   /* first check the