Re: [PATCH 2/5] /dev/vring: simple userspace-kernel ringbuffer interface.

2008-04-21 Thread Evgeniy Polyakov
On Sun, Apr 20, 2008 at 02:05:31AM +1000, Rusty Russell ([EMAIL PROTECTED]) 
wrote:
> > Should this whole function and vring_used_buffer() be protected with
> > vr->lock mutex?
> 
> No; it's up to the caller to make sure that they are serialized.  In the case 
> of tun that happens naturally.
> 
> There are two reasons not to grab the lock.  It turns out that if we tried to 
> lock here, we'd deadlock, since the callbacks are called under the lock.  
> Secondly, it's possible to implement an atomic vring_used_buffer variant, 
> which could fail: this would avoid using the thread most of the time.

Yep, I decided that too. But it limits its usage to tun only or any
other system where only single thread picks up results, so no generic
userspace ring buffers?

-- 
Evgeniy Polyakov
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [PATCH 2/5] /dev/vring: simple userspace-kernel ringbuffer interface.

2008-04-21 Thread Evgeniy Polyakov
Hi.

On Fri, Apr 18, 2008 at 02:39:48PM +1000, Rusty Russell ([EMAIL PROTECTED]) 
wrote:

> +int vring_get_buffer(struct vring_info *vr,
> +  struct iovec *in_iov,
> +  unsigned int *num_in, unsigned long *in_len,
> +  struct iovec *out_iov,
> +  unsigned int *num_out, unsigned long *out_len)
> +{
> + unsigned int i, in = 0, out = 0;
> + unsigned long dummy;
> + u16 avail, last_avail, head;
> + struct vring_desc d;

Should this whole function and vring_used_buffer() be protected with
vr->lock mutex?

> + if (unlikely(get_user(avail, &vr->ring.avail->idx)))
> + return -EFAULT;
> + if (unlikely(get_user(last_avail, &vring_last_avail(&vr->ring
> + return -EFAULT;
> +
> + if (last_avail == avail)
> + return 0;
> +
> + if (!in_len)
> + in_len = &dummy;
> + if (!out_len)
> + out_len = &dummy;
> +
> + *in_len = *out_len = 0;
> +
> + if (unlikely(get_user(head, &vr->ring.avail->ring[last_avail
> +   & vr->mask])))
> + return -EFAULT;
> +
> + i = head;
> + do {
> + if (unlikely(i >= vr->ring.num)) {
> + pr_debug("vring: bad index: %u\n", i);
> + return -EINVAL;
> + }
> +
> + if (copy_from_user(&d, &vr->ring.desc[i], sizeof(d)) != 0)
> + return -EFAULT;
> +
> + if (d.flags & VRING_DESC_F_WRITE) {
> + /* Check for length and iovec overflows */
> + if (!num_in) {
> + pr_debug("vring: writable desc %u in ring %p\n",
> +  i, vr->ring.desc);
> + return -EINVAL;
> + }
> + if (in == *num_in || *in_len + d.len < *in_len)
> + return -E2BIG;
> + in_iov[in].iov_len = d.len;
> + *in_len += d.len;
> + in_iov[in].iov_base = (void __user *)(long)d.addr;
> + in++;
> + } else {
> + if (!num_out) {
> + pr_debug("vring: readable desc %u in ring %p\n",
> +  i, vr->ring.desc);
> + return -EINVAL;
> + }
> + if (out == *num_out || *out_len + d.len < *out_len)
> + return -E2BIG;
> + out_iov[out].iov_len = d.len;
> + *out_len += d.len;
> + out_iov[out].iov_base = (void __user *)(long)d.addr;
> + out++;
> + }
> +
> + i = d.next;
> + } while (d.flags & VRING_DESC_F_NEXT);
> +
> + if (num_in)
> + *num_in = in;
> + if (num_out)
> + *num_out = out;
> +
> + last_avail++;
> + put_user(last_avail, &vring_last_avail(&vr->ring));
> +
> + /* 0 is a valid head, so add one. */
> + return head + 1;
> +}
> +EXPORT_SYMBOL_GPL(vring_get_buffer);
> +
> +/**
> + * vring_used_buffer - return a used buffer to the vring
> + * @vr: the vring
> + * @id: the id returned from vring_get_buffer
> + * @len: the total bytes *written* to the buffer
> + */
> +void vring_used_buffer(struct vring_info *vr, int id, u32 len)
> +{
> + struct vring_used_elem used;
> + u16 used_idx;
> +
> + BUG_ON(id <= 0 || id > vr->ring.num);
> +
> + used.id = id - 1;
> + used.len = len;
> + if (get_user(used_idx, &vr->ring.used->idx) != 0)
> + return;
> +
> + if (copy_to_user(&vr->ring.used->ring[used_idx & vr->mask], &used,
> +  sizeof(used)))
> + return;
> +
> + wmb();
> + used_idx++;
> + put_user(used_idx, &vr->ring.used->idx);
> +}
> +EXPORT_SYMBOL_GPL(vring_used_buffer);

-- 
Evgeniy Polyakov
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [PATCH 2/5] /dev/vring: simple userspace-kernel ringbuffer interface.

2008-04-19 Thread David Miller
From: Rusty Russell <[EMAIL PROTECTED]>
Date: Sun, 20 Apr 2008 02:41:14 +1000

> If only there were some kind of, I don't know... summit... for kernel 
> people... 

I'm starting to disbelieve the myth that because we can discuss
technical issues on mailing lists, we should talk primarily about
process issues during the kernel summit.

There is a distinct advantage to discussing and hashing things out in
person.  You can't say "screw you, your idea sucks" when you're face
to face with the other person, whereas online it's way too easy.
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [PATCH 2/5] /dev/vring: simple userspace-kernel ringbuffer interface.

2008-04-19 Thread Rusty Russell
On Saturday 19 April 2008 05:38:50 Michael Kerrisk wrote:
> On 4/18/08, Andrew Morton <[EMAIL PROTECTED]> wrote:
> > This is may be our third high-bandwidth user/kernel interface to
> > transport bulk data ("hbukittbd") which was implemented because its
> > predecessors weren't quite right.  In a year or two's time someone else
> > will need a hbukittbd and will find that the existing three aren't quite
> > right and will give us another one.  One day we need to stop doing this
> > ;)

If only there were some kind of, I don't know... summit... for kernel 
people... 

> >  It could be that this person will look at Rusty's hbukittbd and find
> > that it _could_ be tweaked to do what he wants, but it's already shipping
> > and it's part of the kernel API and hence can't be made to do what he
> > wants.

Indeed.  I marked it experimental because of these questions (ie. it's not yet 
kernel ABI).  Getting everyone's attention is hard tho, so I figured we put 
it in as a device and move to a syscall if and when we feel it's ready.

> >  So I think it would be good to plonk the proposed interface on the table
> >  and have a poke at it.  Is it compat-safe?  Is it extensible in a
> >  backward-compatible fashion?  Are there future-safe changes we should
> > make to it?  Can Michael Kerrisk understand, review and document it? 
> > etc.
>
> Well, it helps if he's CCed

It is compat safe, and we've already extended it once, so I'm reasonably happy 
so far.  If it were a syscall I'd add a flags arg, for the device it'd be an 
ioctl.  Starting with the virtio ABI seemed a reasonable first step, because 
*we* can use this today even if no one else does.

> I'm happy to work *with someone* on the documentation (pointless to do
> it on my own -- how do I know what Rusty's *intended* behavior for the
> interface is), and review, and testing.

Document coming up...
Rusty.
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [PATCH 2/5] /dev/vring: simple userspace-kernel ringbuffer interface.

2008-04-19 Thread Rusty Russell
On Sunday 20 April 2008 02:33:22 Evgeniy Polyakov wrote:
> On Sun, Apr 20, 2008 at 02:05:31AM +1000, Rusty Russell 
([EMAIL PROTECTED]) wrote:
> > There are two reasons not to grab the lock.  It turns out that if we
> > tried to lock here, we'd deadlock, since the callbacks are called under
> > the lock. Secondly, it's possible to implement an atomic
> > vring_used_buffer variant, which could fail: this would avoid using the
> > thread most of the time.
>
> Yep, I decided that too. But it limits its usage to tun only or any
> other system where only single thread picks up results, so no generic
> userspace ring buffers?

I don't think so, it just externalizes the locking.  The mutex protects the 
attaching and detaching of the ops structure, some other lock or code 
protects simultaneous kernel ring accesses.

Cheers,
Rusty.
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [PATCH 2/5] /dev/vring: simple userspace-kernel ringbuffer interface.

2008-04-19 Thread Rusty Russell
On Saturday 19 April 2008 20:22:15 Evgeniy Polyakov wrote:
> Hi.
>
> On Fri, Apr 18, 2008 at 02:39:48PM +1000, Rusty Russell 
([EMAIL PROTECTED]) wrote:
> > +int vring_get_buffer(struct vring_info *vr,
> > +struct iovec *in_iov,
> > +unsigned int *num_in, unsigned long *in_len,
> > +struct iovec *out_iov,
> > +unsigned int *num_out, unsigned long *out_len)
> > +{
> > +   unsigned int i, in = 0, out = 0;
> > +   unsigned long dummy;
> > +   u16 avail, last_avail, head;
> > +   struct vring_desc d;
>
> Should this whole function and vring_used_buffer() be protected with
> vr->lock mutex?

No; it's up to the caller to make sure that they are serialized.  In the case 
of tun that happens naturally.

There are two reasons not to grab the lock.  It turns out that if we tried to 
lock here, we'd deadlock, since the callbacks are called under the lock.  
Secondly, it's possible to implement an atomic vring_used_buffer variant, 
which could fail: this would avoid using the thread most of the time.

Hope that helps,
Rusty.
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [PATCH 2/5] /dev/vring: simple userspace-kernel ringbuffer interface.

2008-04-19 Thread Jonathan Corbet
> So I think it would be good to plonk the proposed interface on the table
> and have a poke at it.  Is it compat-safe?  Is it extensible in a
> backward-compatible fashion?  Are there future-safe changes we should make
> to it?  Can Michael Kerrisk understand, review and document it?  etc.
> 
> You know what I'm saying ;)  What is the proposed interface?

So, I'm not Michael, but I *did* make an attempt to document this
interface - user and kernel sides - so that it could be more easily
understood:

http://lwn.net/Articles/276856/

That was the previous posting, but a quick look suggests it hasn't
changed *that* much in this round.

jon
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [PATCH 2/5] /dev/vring: simple userspace-kernel ringbuffer interface.

2008-04-18 Thread Michael Kerrisk
On 4/18/08, Andrew Morton <[EMAIL PROTECTED]> wrote:
> On Sat, 19 Apr 2008 00:32:39 +1000 Rusty Russell <[EMAIL PROTECTED]> wrote:
>
>  > > Isn't this kinda-sorta like what a relayfs file does?  The oprofile
>  > > buffers?  etc?  Nothing in common at all, no hope?
>  >
>  > An excellent question, but I thought the modern kernel etiquette was to 
> only
>  > comment on whitespace and formatting, and call it "review"? :)
>  >
>  > Yes, kinda-sorta in that it's a ring buffer.  No, in that it's bidir and
>  > consumption can be out-of-order (kind of important for I/O buffers).
>  >
>  > But the reason I'm not proposing it as a syscall is that I'm not convinced
>  > it's the One True Solution which everyone should be using.  Time will tell:
>  > it's clearly not tied to tun and it's been generically useful for virtual
>  > I/O, but history has not been kind to new userspace interfaces.
>
>
> This is may be our third high-bandwidth user/kernel interface to transport
>  bulk data ("hbukittbd") which was implemented because its predecessors
>  weren't quite right.  In a year or two's time someone else will need a
>  hbukittbd and will find that the existing three aren't quite right and will
>  give us another one.  One day we need to stop doing this ;)
>
>  It could be that this person will look at Rusty's hbukittbd and find that
>  it _could_ be tweaked to do what he wants, but it's already shipping and
>  it's part of the kernel API and hence can't be made to do what he wants.
>
>  So I think it would be good to plonk the proposed interface on the table
>  and have a poke at it.  Is it compat-safe?  Is it extensible in a
>  backward-compatible fashion?  Are there future-safe changes we should make
>  to it?  Can Michael Kerrisk understand, review and document it?  etc.

Well, it helps if he's CCed

I'm happy to work *with someone* on the documentation (pointless to do
it on my own -- how do I know what Rusty's *intended* behavior for the
interface is), and review, and testing.
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [PATCH 2/5] /dev/vring: simple userspace-kernel ringbuffer interface.

2008-04-18 Thread Andrew Morton
On Sat, 19 Apr 2008 00:32:39 +1000 Rusty Russell <[EMAIL PROTECTED]> wrote:

> > Isn't this kinda-sorta like what a relayfs file does?  The oprofile
> > buffers?  etc?  Nothing in common at all, no hope?
> 
> An excellent question, but I thought the modern kernel etiquette was to only 
> comment on whitespace and formatting, and call it "review"? :)
> 
> Yes, kinda-sorta in that it's a ring buffer.  No, in that it's bidir and 
> consumption can be out-of-order (kind of important for I/O buffers).
> 
> But the reason I'm not proposing it as a syscall is that I'm not convinced 
> it's the One True Solution which everyone should be using.  Time will tell: 
> it's clearly not tied to tun and it's been generically useful for virtual 
> I/O, but history has not been kind to new userspace interfaces.

This may be our third high-bandwidth user/kernel interface to transport
bulk data ("hbukittbd") which was implemented because its predecessors
weren't quite right.  In a year or two's time someone else will need a
hbukittbd and will find that the existing three aren't quite right and will
give us another one.  One day we need to stop doing this ;)

It could be that this person will look at Rusty's hbukittbd and find that
it _could_ be tweaked to do what he wants, but it's already shipping and
it's part of the kernel API and hence can't be made to do what he wants.

So I think it would be good to plonk the proposed interface on the table
and have a poke at it.  Is it compat-safe?  Is it extensible in a
backward-compatible fashion?  Are there future-safe changes we should make
to it?  Can Michael Kerrisk understand, review and document it?  etc.

You know what I'm saying ;)  What is the proposed interface?
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [PATCH 2/5] /dev/vring: simple userspace-kernel ringbuffer interface.

2008-04-18 Thread Rusty Russell
On Friday 18 April 2008 21:18:46 Andrew Morton wrote:
> > +   /* Must be a power of two, and limit indices to a u16. */
> > +   if (!num_descs || (num_descs & (num_descs-1)) || num_descs > 65536)
>
> We have an is_power_of_2().

Thanks, fixed.

> > + * vring_get - check out a vring file descriptor
> > + * @filp: the file structure to attach to (eg. from fget()).
> > + *
> > + * Userspace opens /dev/vring and mmaps it, then hands that fd to the
> > + * kernel subsystem it wants to communicate with.  That subsystem uses
> > + * this routine and vring_set_ops() to attach to it.
> > + *
> > + * This simply checks that it really is a vring fd (otherwise it
> > + * returns NULL), the other routine checks that it's not already
> > + * attached.
> > + */
>
> hm, I don't understand the big picture here yet.
>
> Isn't this kinda-sorta like what a relayfs file does?  The oprofile
> buffers?  etc?  Nothing in common at all, no hope?

An excellent question, but I thought the modern kernel etiquette was to only 
comment on whitespace and formatting, and call it "review"? :)

Yes, kinda-sorta in that it's a ring buffer.  No, in that it's bidir and 
consumption can be out-of-order (kind of important for I/O buffers).

But the reason I'm not proposing it as a syscall is that I'm not convinced 
it's the One True Solution which everyone should be using.  Time will tell: 
it's clearly not tied to tun and it's been generically useful for virtual 
I/O, but history has not been kind to new userspace interfaces.

> > +   mutex_unlock(&vr->lock);
> > +   local_irq_enable();
>
> what's this doing here?

Snot from previous version.  Removed.

> > +void vring_unset_ops(struct vring_info *vr)
> > +{
> > +   BUG_ON(!vr->ops);
> > +   mutex_lock(&vr->lock);
> > +   vr->ops = NULL;
> > +   mutex_unlock(&vr->lock);
> > +}
> > +EXPORT_SYMBOL_GPL(vring_unset_ops);
>
> Isn't this just vring_set_ops(vr, NULL, NULL)?

Yes, except I like the clarity and the BUG_ON.

> ponders #include 

"#include " for me, just to add more inclement weather to 
that teacup...

Thanks,
Rusty.
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [PATCH 2/5] /dev/vring: simple userspace-kernel ringbuffer interface.

2008-04-18 Thread Andrew Morton
On Fri, 18 Apr 2008 14:39:48 +1000 Rusty Russell <[EMAIL PROTECTED]> wrote:

> virtio introduced a ring structure ABI for guest-host communications
> (currently used by lguest and kvm).  Using this same ABI, we can
> create a nice fd version.
> 
> This is useful for efficiently passing packets to and from the tun,
> for example.
> 
> ...
>
> +static int vring_mmap(struct file *filp, struct vm_area_struct *vma)
> +{
> + unsigned long size, num_descs;
> + struct vring_info *vr = filp->private_data;
> + int err;
> +
> + /* We overload mmap's offset to hold the ring number. */
> + num_descs = vma->vm_pgoff;
> +
> + /* Must be a power of two, and limit indices to a u16. */
> + if (!num_descs || (num_descs & (num_descs-1)) || num_descs > 65536)

We have an is_power_of_2().

> + return -EINVAL;
> +
> + /* mmap size must be what we expect for such a ring. */
> + size = vma->vm_end - vma->vm_start;
> + if (size != ALIGN(vring_size(num_descs, PAGE_SIZE), PAGE_SIZE))
> + return -EINVAL;
> +
> + /* We only let them map this in one place. */
> + mutex_lock(&vr->lock);
> + if (vr->ring.num != 0) {
> + err = -EBUSY;
> + goto unlock;
> + }
> +
> + vring_init(&vr->ring, num_descs, (void *)vma->vm_start, PAGE_SIZE);
> +
> + vr->mask = num_descs - 1;
> + err = 0;
> +
> +unlock:
> + mutex_unlock(&vr->lock);
> + return err;
> +}
>
> ...
>
> +/**
> + * vring_get - check out a vring file descriptor
> + * @filp: the file structure to attach to (eg. from fget()).
> + *
> + * Userspace opens /dev/vring and mmaps it, then hands that fd to the
> + * kernel subsystem it wants to communicate with.  That subsystem uses
> + * this routine and vring_set_ops() to attach to it.
> + *
> + * This simply checks that it really is a vring fd (otherwise it
> + * returns NULL), the other routine checks that it's not already
> + * attached.
> + */

hm, I don't understand the big picture here yet.

Isn't this kinda-sorta like what a relayfs file does?  The oprofile
buffers?  etc?  Nothing in common at all, no hope?

> +struct vring_info *vring_get(struct file *filp)
> +{
> + /* Must be one of ours. */
> + if (filp->f_op != &vring_fops)
> + return NULL;
> +
> + return filp->private_data;
> +}
> +EXPORT_SYMBOL_GPL(vring_get);
> +
> +/**
> + * vring_set_ops - attach operations to a vring file descriptor.
> + * @vr: the vring_info returned from vring_get.
> + * @ops: the operations to attach.
> + * @ops_data: the argument to the ops callbacks.
> + *
> + * This is called after vring_get(): the reason for the two-part
> + * process is that the ops can be called before vring_set_ops returns
> + * (we don't do locking), so you really need to set things up before
> + * this call.
> + *
> + * This simply checks that the ring is not already attached to something,
> + * then sets the ops.
> + */
> +int vring_set_ops(struct vring_info *vr,
> +   const struct vring_ops *ops, void *ops_data)
> +{
> + int err;
> +
> + mutex_lock(&vr->lock);
> + if (vr->ops) {
> + err = -EBUSY;
> + goto unlock;
> + }
> +
> + /* We don't lock, so make sure we get this in the right order. */
> + vr->ops_data = ops_data;
> + wmb();
> + vr->ops = ops;
> +
> + err = 0;
> +unlock:
> + mutex_unlock(&vr->lock);
> + local_irq_enable();

what's this doing here?

> + return err;
> +}
> +EXPORT_SYMBOL_GPL(vring_set_ops);
> +
> +/**
> + * vring_unset_ops - remove operations to a vring file descriptor.
> + * @vr: the vring_info previously successfully vring_set_ops'd
> + */
> +void vring_unset_ops(struct vring_info *vr)
> +{
> + BUG_ON(!vr->ops);
> + mutex_lock(&vr->lock);
> + vr->ops = NULL;
> + mutex_unlock(&vr->lock);
> +}
> +EXPORT_SYMBOL_GPL(vring_unset_ops);

Isn't this just vring_set_ops(vr, NULL, NULL)?

> +static struct miscdevice vring_dev = {
> + .minor = MISC_DYNAMIC_MINOR,
> + .name = KBUILD_MODNAME,
> + .fops = &vring_fops,
> +};
> +
> +static int __init init(void)
> +{
> + return misc_register(&vring_dev);
> +}
> +
> +static void __exit fini(void)
> +{
> + misc_deregister(&vring_dev);
> +}
> +
> +module_init(init);
> +module_exit(fini);
> diff -r b2d9869d338f include/linux/vring.h
> --- /dev/null Thu Jan 01 00:00:00 1970 +
> +++ b/include/linux/vring.h   Fri Apr 18 13:35:16 2008 +1000
> @@ -0,0 +1,58 @@
> +/* Ring-buffer file descriptor implementation.
> + *
> + *  Copyright 2008 Rusty Russell IBM Corporation
> + *
> + *  This program is free software; you can redistribute it and/or modify
> + *  it under the terms of the GNU General Public License as published by
> + *  the Free Software Foundation; either version 2 of the License, or
> + *  (at your option) any later version.
> + *
> + *  This program is distributed in the hope that it will be useful,
> + *  but WITHOUT ANY WARRANTY; without even the implied warr

[PATCH 2/5] /dev/vring: simple userspace-kernel ringbuffer interface.

2008-04-17 Thread Rusty Russell
virtio introduced a ring structure ABI for guest-host communications
(currently used by lguest and kvm).  Using this same ABI, we can
create a nice fd version.

This is useful for efficiently passing packets to and from the tun,
for example.

Signed-off-by: Rusty Russell <[EMAIL PROTECTED]>
---
 drivers/char/Kconfig  |9 +
 drivers/char/Makefile |2 
 drivers/char/vring.c  |  400 ++
 include/linux/vring.h |   58 +++
 4 files changed, 469 insertions(+)

diff -r b2d9869d338f drivers/char/Kconfig
--- a/drivers/char/Kconfig  Fri Apr 18 10:33:58 2008 +1000
+++ b/drivers/char/Kconfig  Fri Apr 18 13:35:16 2008 +1000
@@ -1049,5 +1049,14 @@ config DEVPORT
 
 source "drivers/s390/char/Kconfig"
 
+config VRING
+   tristate "/dev/vring support (EXPERIMENTAL)"
+   depends on EXPERIMENTAL
+   help
+ vring is a ringbuffer implementation for efficient I/O.  It is
+currently used by virtualization hosts (lguest, kvm) for efficient
+networking using the tun driver.
+
+If unsure, say N, but there's a part of you that wants to say M.
 endmenu
 
diff -r b2d9869d338f drivers/char/Makefile
--- a/drivers/char/Makefile Fri Apr 18 10:33:58 2008 +1000
+++ b/drivers/char/Makefile Fri Apr 18 13:35:16 2008 +1000
@@ -112,6 +112,8 @@ obj-$(CONFIG_JS_RTC)+= js-rtc.o
 obj-$(CONFIG_JS_RTC)   += js-rtc.o
 js-rtc-y = rtc.o
 
+obj-$(CONFIG_VRING)+= vring.o
+
 # Files generated that shall be removed upon make clean
 clean-files := consolemap_deftbl.c defkeymap.c
 
diff -r b2d9869d338f drivers/char/vring.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +
+++ b/drivers/char/vring.c  Fri Apr 18 13:35:16 2008 +1000
@@ -0,0 +1,400 @@
+/* Ring-buffer device implementation.
+ *
+ *  Copyright 2008 Rusty Russell IBM Corporation
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+struct vring_info {
+   struct mutex lock;
+
+   struct vring ring;
+   u16 mask;
+   u16 last_used;
+
+   const struct vring_ops *ops;
+   void *ops_data;
+
+   /* Waitqueue for poll() */
+   wait_queue_head_t poll_wait;
+};
+
+static unsigned int vring_poll(struct file *filp,
+  struct poll_table_struct *poll)
+{
+   struct vring_info *vr = filp->private_data;
+   unsigned int mask;
+   u16 used = 0;
+
+   /* Poll can't error, so let's not go silly here. */
+   get_user(used, &vr->ring.used->idx);
+
+   /* More buffers have been used?  It's 'readable'. */
+   if (used != vr->last_used)
+   mask = POLLIN | POLLRDNORM;
+   else {
+   mask = 0;
+   /* If we need to pull, it's also readable. */
+   mutex_lock(&vr->lock);
+   if (vr->ops && vr->ops->needs_pull) {
+   if (vr->ops->needs_pull(vr->ops_data))
+   mask = POLLIN | POLLRDNORM;
+   }
+   mutex_unlock(&vr->lock);
+   }
+
+   poll_wait(filp, &vr->poll_wait, poll);
+
+   return mask;
+}
+
+/* Read may not be necessary for all use cases, in fact. */
+static ssize_t vring_read(struct file *filp, char __user *buf,
+ size_t size, loff_t *off)
+{
+   struct vring_info *vr = filp->private_data;
+   int err;
+
+   /* Some uses of vrings require updating in user context.  This
+* is best done close to the caller, ie. here. */
+   mutex_lock(&vr->lock);
+   if (vr->ops && vr->ops->pull)
+   err = vr->ops->pull(vr->ops_data);
+   else
+   err = 0;
+   mutex_unlock(&vr->lock);
+
+   /* Update our last_used value to clear the poll. */
+   if (!err)
+   err = get_user(vr->last_used, &vr->ring.used->idx);
+
+   return err;
+}
+
+/* Write kicks the other end to say we have buffers. */
+static ssize_t vring_write(struct file *filp, const char __user *buf,
+  size_t size, loff_t *off)
+{
+   struct vring_info *vr = filp->private_data;
+   int err;
+
+   mutex_lock(&vr->lock);
+   if (vr->ops && vr->ops->push)
+   err =