On Thu, 17 May 2007, Avi Kivity wrote:
> Davide Libenzi wrote:
> > On Wed, 16 May 2007, Avi Kivity wrote:
> >
> >
> > > IMO doing eventfd_fget() asap is best. I much prefer refcounted pointers
> > > to
> > > handles in the kernel: it's easier to see what things point to, and there
> > > is
> > > to context needed for dereferencing.
> > >
> >
> > There are concerns (from Al and Christoph) about file lifetime tracking of
> > an fd passed down to the kernel.
> >
>
> What concerns are these?
>
> I have my own concerns about bare fds, which can change their backing file,
> and which are dependent on the context. The kernel appears to be moving away
> from handles to pointers, as seen by the pid -> task_struck/struct pid
> conversion.
The concern is that by keeping a reference to the file* you may end up
with a file that has no more userspace links. Although the file_count()==1
will tell you that you've the only reference left, this relies on you
doing the check. I really don't know how this will be used in KVM, so I
can't really say how well these concerns applies.
> > Avi, how about you go the other way around? I expose you something like:
> >
> > long eventfd_create(unsigned int count, void (*release)(void *),
> > void *priv);
> >
> > This returns you an eventfd (or error), and lets you install a callback to
> > be used when the file_operations->release is called (basically, userspace
> > closed the last file instance).
> > Can KVM pass back an fd to userspace instead of the other way around?
> >
>
> That was my original thought, but when I saw your proposal I preferred that as
> more flexible.
>
> If we go this way, I prefer to have a struct file * and do away with the
> callback, but that brings us back to the file lifetime concerns.
Doing the above requires some care too. Access to your copy of the file*
must be protected by a lock (you can sleep from inside the callback, so a
mutex is fine too). I'll sketch you some code:
/* Defined in linux/eventfd.h */
struct eventfd_relcb {
struct list_head lnk;
void (*proc)(struct eventfd_relcb *);
};
/* Your context data */
struct your_ctx {
whatever_lock your_lock;
...
struct eventfd_relcb rcb;
struct file *evfile;
};
/* Your eventfd release callback */
void rcb_callback(struct eventfd_relcb *rcb) {
struct your_ctx *c = container_of(rcb, struct your_ctx, rcb);
whatever_lock_lock(&c->your_lock);
...
c->evfile = NULL;
whatever_lock_unlock(&c->your_lock);
}
/* Your notify userspace function */
void notify_userspace(struct your_ctx *c) {
whatever_lock_lock(&c->your_lock);
if (c->evfile != NULL)
eventfd_signal(c->evfile, 1);
whatever_lock_unlock(&c->your_lock);
}
/* Your eventfd create/setup function (modulo error checks) */
void setup_eventfd(struct your_ctx *c) {
int fd;
c->rcb.proc = rcb_callback;
fd = eventfd_create(0, &c->rcb);
c->evfile = eventfd_fget(fd);
/* Then return fd to userspace in some way */
...
}
If, by any chance, you need to detach yourself from the eventfd:
void detach_eventfd(struct your_ctx *c) {
whatever_lock_lock(&c->your_lock);
if (c->evfile != NULL) {
eventfd_del_release_cb(c->evfile, &c->rcb);
/* And, optionally ... */
eventfd_set_shutdown(c->evfile);
}
whatever_lock_unlock(&c->your_lock);
}
The eventfd_set_shutdown() function will make a POLLIN signaled to
userspace, and a read(2) on the eventfd will return 0 (like peer
disconnected sockets). Then userspace will know that no more eventfs will
show up in there.
Tentative patch below (builds, not tested yet).
- Davide
Index: linux-2.6.22-rc1-git1.mod/fs/eventfd.c
===================================================================
--- linux-2.6.22-rc1-git1.mod.orig/fs/eventfd.c 2007-05-16 15:10:26.000000000
-0700
+++ linux-2.6.22-rc1-git1.mod/fs/eventfd.c 2007-05-17 10:53:38.000000000
-0700
@@ -16,9 +16,15 @@
#include <linux/anon_inodes.h>
#include <linux/eventfd.h>
+#define EVENTFD_FLG_SHUTDOWN (1 << 0)
+
+#define EVENTFD_FILE(f) ((f)->f_op == &eventfd_fops)
+
struct eventfd_ctx {
spinlock_t lock;
wait_queue_head_t wqh;
+ unsigned long flags;
+
/*
* Every time that a write(2) is performed on an eventfd, the
* value of the __u64 being written is added to "count" and a
@@ -28,37 +34,72 @@
* issue a wakeup.
*/
__u64 count;
+
+ /*
+ * List of callbacks to be invoked when the eventfd file
+ * in going away.
+ */
+ struct list_head rcb_list;
};
-/*
- * Adds "n" to the eventfd counter "count". Returns "n" in case of
- * success, or a value lower then "n" in case of coutner overflow.
- * This function is supposed to be called by the kernel in paths
- * that do not allow sleeping. In this function we allow the counter
- * to reach the ULLONG_MAX value, and we signal this as overflow
- * condition by returining a POLLERR to poll(2).
+/**
+ * eventfd_signal - Adds @count to the eventfd counter "count"
+ *
+ * @file: [in] Pointer to the eventfd file
+ * @count: [in] Value to be added to the eventfd counter
+ *
+ * Returns @count in case of success, or a value lower then @count in case of
+ * coutner overflow. In this function we allow the counter to reach the
+ * ULLONG_MAX value, and we signal this as overflow condition by returining
+ * a POLLERR to poll(2). This function can be called from any context.
*/
-int eventfd_signal(struct file *file, int n)
+int eventfd_signal(struct file *file, int count)
{
struct eventfd_ctx *ctx = file->private_data;
unsigned long flags;
- if (n < 0)
+ if (count < 0)
return -EINVAL;
spin_lock_irqsave(&ctx->lock, flags);
- if (ULLONG_MAX - ctx->count < n)
- n = (int) (ULLONG_MAX - ctx->count);
- ctx->count += n;
+ if (ULLONG_MAX - ctx->count < count)
+ count = (int) (ULLONG_MAX - ctx->count);
+ ctx->count += count;
if (waitqueue_active(&ctx->wqh))
wake_up_locked(&ctx->wqh);
spin_unlock_irqrestore(&ctx->lock, flags);
- return n;
+ return count;
}
static int eventfd_release(struct inode *inode, struct file *file)
{
- kfree(file->private_data);
+ struct eventfd_ctx *ctx = file->private_data;
+ unsigned long flags;
+ struct eventfd_relcb *rcb;
+
+next:
+ spin_lock_irqsave(&ctx->lock, flags);
+ if (!list_empty(&ctx->rcb_list)) {
+ rcb = list_first_entry(&ctx->rcb_list, struct eventfd_relcb,
lnk);
+ list_del_init(&rcb->lnk);
+ spin_unlock_irqrestore(&ctx->lock, flags);
+
+ /*
+ * We need to call the callback w/out the lock being held.
+ * When the eventfd kernel user calls us, it is probably holding
+ * its own lock, and then we lock ctx->lock to operate with our
+ * data. Here we would be calling the callback with our lock
held,
+ * and the kernel user might be trying to grab its own lock from
+ * inside the callback. So we would end up with a nice AB-BA
+ * deadlock.
+ */
+ (*rcb->proc)(rcb);
+
+ goto next;
+ }
+ spin_unlock_irqrestore(&ctx->lock, flags);
+
+ kfree(ctx);
return 0;
}
@@ -73,7 +114,9 @@
spin_lock_irqsave(&ctx->lock, flags);
if (ctx->count > 0)
events |= POLLIN;
- if (ctx->count == ULLONG_MAX)
+ if (unlikely(ctx->flags & EVENTFD_FLG_SHUTDOWN))
+ events |= POLLIN;
+ if (unlikely(ctx->count == ULLONG_MAX))
events |= POLLERR;
if (ULLONG_MAX - 1 > ctx->count)
events |= POLLOUT;
@@ -86,13 +129,15 @@
loff_t *ppos)
{
struct eventfd_ctx *ctx = file->private_data;
- ssize_t res;
+ ssize_t res = 0;
__u64 ucnt;
DECLARE_WAITQUEUE(wait, current);
if (count < sizeof(ucnt))
return -EINVAL;
spin_lock_irq(&ctx->lock);
+ if (unlikely(ctx->flags & EVENTFD_FLG_SHUTDOWN))
+ goto out_unlock;
res = -EAGAIN;
ucnt = ctx->count;
if (ucnt > 0)
@@ -122,6 +167,7 @@
if (waitqueue_active(&ctx->wqh))
wake_up_locked(&ctx->wqh);
}
+out_unlock:
spin_unlock_irq(&ctx->lock);
if (res > 0 && put_user(ucnt, (__u64 __user *) buf))
return -EFAULT;
@@ -183,6 +229,15 @@
.write = eventfd_write,
};
+/**
+ * eventfd_fget - Gets a reference of an eventfd file from an eventfd
+ * file descriptor
+ *
+ * @fd: [in] Eventfd file descriptor
+ *
+ * Returns the eventfd file pointer, or error in case the passed file
+ * descriptor is not a valid eventfd file.
+ */
struct file *eventfd_fget(int fd)
{
struct file *file;
@@ -190,7 +245,7 @@
file = fget(fd);
if (!file)
return ERR_PTR(-EBADF);
- if (file->f_op != &eventfd_fops) {
+ if (!EVENTFD_FILE(file)) {
fput(file);
return ERR_PTR(-EINVAL);
}
@@ -198,7 +253,95 @@
return file;
}
-asmlinkage long sys_eventfd(unsigned int count)
+/**
+ * eventfd_set_shutdown - Sets the shutdown flag in the eventfd, so that the
+ * userspace can notice it with a POLLIN from poll(2)
+ * and a zero bytes read(2)
+ *
+ * @file: [in] Pointer to the eventfd file
+ *
+ * Returns zero in case of success, or -EINVAL if the @file is not an eventfd
+ * file pointer. This function can be called from any context.
+ */
+int eventfd_set_shutdown(struct file *file)
+{
+ struct eventfd_ctx *ctx = file->private_data;
+ unsigned long flags;
+
+ if (unlikely(!EVENTFD_FILE(file)))
+ return -EINVAL;
+ spin_lock_irqsave(&ctx->lock, flags);
+ ctx->flags |= EVENTFD_FLG_SHUTDOWN;
+ if (waitqueue_active(&ctx->wqh))
+ wake_up_locked(&ctx->wqh);
+ spin_unlock_irqrestore(&ctx->lock, flags);
+
+ return 0;
+}
+
+/**
+ * eventfd_add_release_cb - Install a callback function to be invoked when
+ * the file_operations->release function is called
+ * (that is, right before the eventfd file is going
away)
+ *
+ * @file: [in] Pointer to the eventfd file
+ * @rcb: [in] Pointer to the callabck structure
+ *
+ * Returns zero in case of success, or -EINVAL if the @file is not an eventfd
+ * file pointer. This function can be called from any context.
+ */
+int eventfd_add_release_cb(struct file *file, struct eventfd_relcb *rcb)
+{
+ struct eventfd_ctx *ctx = file->private_data;
+ unsigned long flags;
+
+ if (unlikely(!EVENTFD_FILE(file)))
+ return -EINVAL;
+ spin_lock_irqsave(&ctx->lock, flags);
+ list_add_tail(&rcb->lnk, &ctx->rcb_list);
+ spin_unlock_irqrestore(&ctx->lock, flags);
+
+ return 0;
+}
+
+/**
+ * eventfd_del_release_cb - Deletes a previously installed callback function
+ * from the eventfd release callback list
+ *
+ * @file: [in] Pointer to the eventfd file
+ * @rcb: [in] Pointer to the callabck structure
+ *
+ * Returns zero in case of success, or -EINVAL if the @file is not an eventfd
+ * file pointer. This function can be called from any context.
+ */
+int eventfd_del_release_cb(struct file *file, struct eventfd_relcb *rcb)
+{
+ struct eventfd_ctx *ctx = file->private_data;
+ unsigned long flags;
+
+ if (unlikely(!EVENTFD_FILE(file)))
+ return -EINVAL;
+ spin_lock_irqsave(&ctx->lock, flags);
+ list_del_init(&rcb->lnk);
+ spin_unlock_irqrestore(&ctx->lock, flags);
+
+ return 0;
+}
+
+/**
+ * eventfd_create - Creates an eventfd file and returns its descriptor.
+ * It also allows the caller to specify a callback
+ * (struct eventfd_relcb) to be invoked when the last
+ * instance of this eventfd file goes away. If @rcb is
+ * NULL, no callback will be installed
+ *
+ * @count: [in] Initial count for the eventfd
+ * @rcb: [in] Pointer to the callabck structure, or NULL
+ *
+ * Returns the newly created eventfd file descriptor, or a negative value
+ * in case of error.
+ */
+int eventfd_create(unsigned int count, struct eventfd_relcb *rcb)
{
int error, fd;
struct eventfd_ctx *ctx;
@@ -212,6 +355,10 @@
init_waitqueue_head(&ctx->wqh);
spin_lock_init(&ctx->lock);
ctx->count = count;
+ ctx->flags = 0;
+ INIT_LIST_HEAD(&ctx->rcb_list);
+ if (rcb)
+ list_add_tail(&rcb->lnk, &ctx->rcb_list);
/*
* When we call this, the initialization must be complete, since
@@ -226,3 +373,8 @@
return error;
}
+asmlinkage long sys_eventfd(unsigned int count)
+{
+ return eventfd_create(count, NULL);
+}
+
Index: linux-2.6.22-rc1-git1.mod/include/linux/eventfd.h
===================================================================
--- linux-2.6.22-rc1-git1.mod.orig/include/linux/eventfd.h 2007-05-16
15:10:34.000000000 -0700
+++ linux-2.6.22-rc1-git1.mod/include/linux/eventfd.h 2007-05-17
10:48:11.000000000 -0700
@@ -11,15 +11,50 @@
#ifdef __KERNEL__
+#include <linux/list.h>
+
+struct eventfd_relcb {
+ struct list_head lnk;
+ void (*proc)(struct eventfd_relcb *);
+};
+
#ifdef CONFIG_EVENTFD
struct file *eventfd_fget(int fd);
-int eventfd_signal(struct file *file, int n);
+int eventfd_signal(struct file *file, int count);
+int eventfd_add_release_cb(struct file *file, struct eventfd_relcb *rcb);
+int eventfd_del_release_cb(struct file *file, struct eventfd_relcb *rcb);
+int eventfd_set_shutdown(struct file *file);
+int eventfd_create(unsigned int count, struct eventfd_relcb *rcb);
#else /* CONFIG_EVENTFD */
-#define eventfd_fget(fd) ERR_PTR(-ENOSYS)
-#define eventfd_signal(f, n) 0
+static inline struct file *eventfd_fget(int fd)
+{
+ return ERR_PTR(-ENOSYS);
+}
+static inline int eventfd_signal(struct file *file, int count)
+{
+ return 0;
+}
+static inline int eventfd_add_release_cb(struct file *file,
+ struct eventfd_relcb *rcb)
+{
+ return -ENOSYS;
+}
+static inline int eventfd_del_release_cb(struct file *file,
+ struct eventfd_relcb *rcb)
+{
+ return -ENOSYS;
+}
+static inline int eventfd_set_shutdown(struct file *file)
+{
+ return -ENOSYS;
+}
+static inline int eventfd_create(unsigned int count, struct eventfd_relcb *rcb)
+{
+ return -ENOSYS;
+}
#endif /* CONFIG_EVENTFD */
-------------------------------------------------------------------------
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
_______________________________________________
kvm-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/kvm-devel