On Wednesday February 7, [EMAIL PROTECTED] wrote:
>
>
> On Wed, 7 Feb 2001, Christoph Hellwig wrote:
>
> > On Tue, Feb 06, 2001 at 12:59:02PM -0800, Linus Torvalds wrote:
> > >
> > > Actually, they really aren't.
> > >
> > > They kind of _used_ to be, but more and more they've moved away from that
> > > historical use. Check in particular the page cache, and as a really
> > > extreme case the swap cache version of the page cache.
> >
> > Yes. And that's exactly why I think it's ugly to have the left-over
> > caching stuff in the same data structure as the IO buffer.
>
> I do agree.
>
> I would not be opposed to factoring out the "pure block IO" part from the
> bh struct. It should not even be very hard. You'd do something like
>
> struct block_io {
> .. here is the stuff needed for block IO ..
> };
>
> struct buffer_head {
> struct block_io io;
> .. here is the stuff needed for hashing etc ..
> }
>
> and then you make "generic_make_request()" and everything lower down take
> just the "struct block_io".
>
I was just thinking the same, or a similar thing.
I wanted to do
struct io_head {
	stuff
};
struct buffer_head {
	struct io_head;
	more stuff;
};
so that, as an unnamed substructure, the content of the struct io_head
would automagically be promoted to appear to be content of
buffer_head.
However, I then remembered (when it didn't work) that unnamed
substructures are a feature of the Plan-9 C compiler, not of the GNU
Compiler Collection. (Do any gcc coders out there think this would be
a good thing to add?
http://plan9.bell-labs.com/sys/doc/compiler.html
)
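To make it concrete, this is roughly what I was hoping to be able to
write (a sketch only - the field names here are just examples):

	struct io_head {
		unsigned long b_rsector;	/* ... plus the rest of the IO fields */
	};

	struct buffer_head {
		struct io_head;			/* unnamed substructure - gcc rejects this */
		unsigned long b_blocknr;	/* ... plus the rest of the cache fields */
	};

With the Plan-9 promotion rules drivers could keep saying

	bh->b_rsector = sector;

rather than

	bh->io.b_rsector = sector;

as they would have to with an explicitly named member.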
Anyway, I produced the same result in a rather ugly way with #defines
and modified raid5 to use 32-byte io_head structures instead of the
80+ byte buffer_heads, and it ... doesn't quite work :-(  It boots
fine, but raid5 dies and the Oops message is a few kilometers away.
Still, I think the concept is fine.
Patch is below for your inspection.
It occurs to me that Stephen's desire to pass lots of requests through
make_request all at once isn't a bad idea and could be done by simply
linking the io_heads together with b_reqnext.
This would require:
1/ all callers of generic_make_request (there are 3) to initialise
b_reqnext
2/ all registered make_request_fn functions (there are again 3 I
think) to cope with following b_reqnext
It shouldn't be too hard to make the elevator code take advantage of
any ordering that it finds in the list.
I don't have a patch which does this.
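But a rough sketch of the idea would look something like this
(illustrative only - submit_chain is a made-up name, and none of this
is in the patch below):

	/* 1/ a caller builds a chain through b_reqnext and submits it whole */
	static void submit_chain(int rw, struct buffer_head *bhs[], int n)
	{
		struct buffer_head *first = NULL;
		int i;

		for (i = n - 1; i >= 0; i--) {
			bhs[i]->b_reqnext = first;
			first = bhs[i];
		}
		generic_make_request(rw, first);
	}

	/* 2/ each make_request_fn then follows b_reqnext:
	 *
	 *	while (bh) {
	 *		struct buffer_head *next = bh->b_reqnext;
	 *		bh->b_reqnext = NULL;
	 *		... queue this one, letting the elevator exploit any
	 *		    ordering already present in the chain ...
	 *		bh = next;
	 *	}
	 */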
NeilBrown
--- ./include/linux/fs.h 2001/02/07 22:45:37 1.1
+++ ./include/linux/fs.h 2001/02/07 23:09:05
@@ -207,6 +207,7 @@
#define BH_Protected 6 /* 1 if the buffer is protected */
/*
+ * THIS COMMENT IS NO LONGER CORRECT.
* Try to keep the most commonly used fields in single cache lines (16
* bytes) to improve performance. This ordering should be
* particularly beneficial on 32-bit processors.
@@ -217,31 +218,43 @@
* The second 16 bytes we use for lru buffer scans, as used by
* sync_buffers() and refill_freelist(). -- sct
*/
+
+/*
+ * io_head is all that is needed by device drivers.
+ */
+#define io_head_fields \
+ unsigned long b_state; /* buffer state bitmap (see above) */ \
+ struct buffer_head *b_reqnext; /* request queue */ \
+ unsigned short b_size; /* block size */ \
+ kdev_t b_rdev; /* Real device */ \
+ unsigned long b_rsector; /* Real buffer location on disk */ \
+ char * b_data; /* pointer to data block (512 byte) */ \
+ void (*b_end_io)(struct buffer_head *bh, int uptodate); /* I/O completion */ \
+ void *b_private; /* reserved for b_end_io */ \
+ struct page *b_page; /* the page this bh is mapped to */ \
+ /* this line intentionally left blank */
+struct io_head {
+ io_head_fields
+};
+
+/* buffer_head adds all the stuff needed by the buffer cache */
struct buffer_head {
- /* First cache line: */
+ io_head_fields
+
struct buffer_head *b_next; /* Hash queue list */
unsigned long b_blocknr; /* block number */
- unsigned short b_size; /* block size */
unsigned short b_list; /* List that this buffer appears */
kdev_t b_dev; /* device (B_FREE = free) */
atomic_t b_count; /* users using this block */
- kdev_t b_rdev; /* Real device */
- unsigned long b_state; /* buffer state bitmap (see above) */
unsigned long b_flushtime; /* Time when (dirty) buffer should be written
*/
struct buffer_head *b_next_free;/* lru/free list linkage */
struct buffer_head *b_prev_free;/* doubly linked list of buffers */
struct buffer_head *b_this_page;/* circular list of buffers in one page */
- struct buffer_head *b_reqnext; /* request queue */
struct buffer_head **b_pprev; /* doubly linked list of hash-queue */
- char * b_data; /* pointer to data block (512 byte) */
- struct page *b_page; /* the page this bh is mapped to */
- void (*b_end_io)(struct buffer_head *bh, int uptodate); /* I/O completion */
- void *b_private; /* reserved for b_end_io */
- unsigned long b_rsector; /* Real buffer location on disk */
wait_queue_head_t b_wait;
struct inode * b_inode;
--- ./drivers/md/raid5.c 2001/02/06 05:43:31 1.2
+++ ./drivers/md/raid5.c 2001/02/07 23:15:36
@@ -151,18 +151,16 @@
for (i=0; i<num; i++) {
struct page *page;
- bh = kmalloc(sizeof(struct buffer_head), priority);
+ bh = kmalloc(sizeof(struct io_head), priority);
if (!bh)
return 1;
- memset(bh, 0, sizeof (struct buffer_head));
- init_waitqueue_head(&bh->b_wait);
+ memset(bh, 0, sizeof (struct io_head));
page = alloc_page(priority);
bh->b_data = page_address(page);
if (!bh->b_data) {
kfree(bh);
return 1;
}
- atomic_set(&bh->b_count, 0);
bh->b_page = page;
sh->bh_cache[i] = bh;
@@ -412,7 +410,7 @@
spin_lock_irqsave(&conf->device_lock, flags);
}
} else {
- md_error(mddev_to_kdev(conf->mddev), bh->b_dev);
+ md_error(mddev_to_kdev(conf->mddev), conf->disks[i].dev);
clear_bit(BH_Uptodate, &bh->b_state);
}
clear_bit(BH_Lock, &bh->b_state);
@@ -440,7 +438,7 @@
md_spin_lock_irqsave(&conf->device_lock, flags);
if (!uptodate)
- md_error(mddev_to_kdev(conf->mddev), bh->b_dev);
+ md_error(mddev_to_kdev(conf->mddev), conf->disks[i].dev);
clear_bit(BH_Lock, &bh->b_state);
set_bit(STRIPE_HANDLE, &sh->state);
__release_stripe(conf, sh);
@@ -456,12 +454,10 @@
unsigned long block = sh->sector / (sh->size >> 9);
init_buffer(bh, raid5_end_read_request, sh);
- bh->b_dev = conf->disks[i].dev;
bh->b_blocknr = block;
bh->b_state = (1 << BH_Req) | (1 << BH_Mapped);
bh->b_size = sh->size;
- bh->b_list = BUF_LOCKED;
return bh;
}
@@ -1085,15 +1081,14 @@
else
bh->b_end_io = raid5_end_write_request;
if (conf->disks[i].operational)
- bh->b_dev = conf->disks[i].dev;
+ bh->b_rdev = conf->disks[i].dev;
else if (conf->spare && action[i] == WRITE+1)
- bh->b_dev = conf->spare->dev;
+ bh->b_rdev = conf->spare->dev;
else skip=1;
if (!skip) {
PRINTK("for %ld schedule op %d on disc %d\n",
sh->sector, action[i]-1, i);
atomic_inc(&sh->count);
- bh->b_rdev = bh->b_dev;
- bh->b_rsector = bh->b_blocknr * (bh->b_size>>9);
+ bh->b_rsector = sh->sector;
generic_make_request(action[i]-1, bh);
} else {
PRINTK("skip op %d on disc %d for sector %ld\n",
action[i]-1, i, sh->sector);
@@ -1502,7 +1497,7 @@
}
memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
- conf->raid_disks * ((sizeof(struct buffer_head) + PAGE_SIZE))) / 1024;
+ conf->raid_disks * ((sizeof(struct io_head) + PAGE_SIZE))) / 1024;
if (grow_stripes(conf, conf->max_nr_stripes, GFP_KERNEL)) {
printk(KERN_ERR "raid5: couldn't allocate %dkB for buffers\n", memory);
shrink_stripes(conf, conf->max_nr_stripes);
-