On Thu, 17 May 2007, Avi Kivity wrote:

> Davide Libenzi wrote:
> > On Wed, 16 May 2007, Avi Kivity wrote:
> > 
> >   
> > > IMO doing eventfd_fget() asap is best.  I much prefer refcounted pointers
> > > to
> > > handles in the kernel: it's easier to see what things point to, and there
> > > is
> > > to context needed for dereferencing.
> > >     
> > 
> > There are concerns (from Al and Christoph) about file lifetime tracking of
> > an fd passed down to the kernel.
> >   
> 
> What concerns are these?
> 
> I have my own concerns about bare fds, which can change their backing file,
> and which are dependent on the context.  The kernel appears to be moving away
> from handles to pointers, as seen by the pid -> task_struck/struct pid
> conversion.

The concern is that by keeping a reference to the file* you may end up 
with a file that has no more userspace links. Although the file_count()==1 
will tell you that you've the only reference left, this relies on you 
doing the check. I really don't know how this will be used in KVM, so I 
can't really say how well these concerns applies.




> > Avi, how about you go the other way around? I expose you something like:
> > 
> > long eventfd_create(unsigned int count, void (*release)(void *),
> >                     void *priv);
> > 
> > This returns you an eventfd (or error), and lets you install a callback to
> > be used when the file_operations->release is called (basically, userspace
> > closed the last file instance).
> > Can KVM pass back an fd to userspace instead of the other way around?
> >   
> 
> That was my original thought, but when I saw your proposal I preferred that as
> more flexible.
> 
> If we go this way, I prefer to have a struct file * and do away with the
> callback, but that brings us back to the file lifetime concerns.

Doing the above requires some care too. Access to your copy of the file* 
must be protected by a lock (you can sleep from inside the callback, so a 
mutex is fine too). I'll sketch you some code:


/* Defined in linux/eventfd.h */
struct eventfd_relcb {
        struct list_head lnk;
        void (*proc)(struct eventfd_relcb *);
};

/* Your context data */
struct your_ctx {
        whatever_lock your_lock;
        ...
        struct eventfd_relcb rcb;
        struct file *evfile;
};

/* Your eventfd release callback */
void rcb_callback(struct eventfd_relcb *rcb) {
        struct your_ctx *c = container_of(rcb, struct your_ctx, rcb);

        whatever_lock_lock(&c->your_lock);
        ...
        c->evfile = NULL;
        whatever_lock_unlock(&c->your_lock);
}

/* Your notify userspace function */
void notify_userspace(struct your_ctx *c) {

        whatever_lock_lock(&c->your_lock);
        if (c->evfile != NULL)
                eventfd_signal(c->evfile, 1);
        whatever_lock_unlock(&c->your_lock);
}

/* Your eventfd create/setup function (modulo error checks) */
void setup_eventfd(struct your_ctx *c) {
        int fd;

        c->rcb.proc = rcb_callback;
        fd = eventfd_create(0, &c->rcb);
        c->evfile = eventfd_fget(fd);
        /* Then return fd to userspace in some way */
        ...
}



If, by any chance, you need to detach yourself from the eventfd:

void detach_eventfd(struct your_ctx *c) {

        whatever_lock_lock(&c->your_lock);
        if (c->evfile != NULL) {
                eventfd_del_release_cb(c->evfile, &c->rcb);
                /* And, optionally ... */
                eventfd_set_shutdown(c->evfile);
        }
        whatever_lock_unlock(&c->your_lock);
}


The eventfd_set_shutdown() function will make a POLLIN signaled to 
userspace, and a read(2) on the eventfd will return 0 (like peer 
disconnected sockets). Then userspace will know that no more eventfs will 
show up in there.
Tentative patch below (builds, not tested yet).




- Davide



Index: linux-2.6.22-rc1-git1.mod/fs/eventfd.c
===================================================================
--- linux-2.6.22-rc1-git1.mod.orig/fs/eventfd.c 2007-05-16 15:10:26.000000000 
-0700
+++ linux-2.6.22-rc1-git1.mod/fs/eventfd.c      2007-05-17 10:53:38.000000000 
-0700
@@ -16,9 +16,15 @@
 #include <linux/anon_inodes.h>
 #include <linux/eventfd.h>
 
+#define EVENTFD_FLG_SHUTDOWN   (1 << 0)
+
+#define EVENTFD_FILE(f)                ((f)->f_op == &eventfd_fops)
+
 struct eventfd_ctx {
        spinlock_t lock;
        wait_queue_head_t wqh;
+       unsigned long flags;
+
        /*
         * Every time that a write(2) is performed on an eventfd, the
         * value of the __u64 being written is added to "count" and a
@@ -28,37 +34,72 @@
         * issue a wakeup.
         */
        __u64 count;
+
+       /*
+        * List of callbacks to be invoked when the eventfd file
+        * in going away.
+        */
+       struct list_head rcb_list;
 };
 
-/*
- * Adds "n" to the eventfd counter "count". Returns "n" in case of
- * success, or a value lower then "n" in case of coutner overflow.
- * This function is supposed to be called by the kernel in paths
- * that do not allow sleeping. In this function we allow the counter
- * to reach the ULLONG_MAX value, and we signal this as overflow
- * condition by returining a POLLERR to poll(2).
+/**
+ * eventfd_signal - Adds @count to the eventfd counter "count"
+ *
+ * @file:    [in]  Pointer to the eventfd file
+ * @count:   [in]  Value to be added to the eventfd counter
+ *
+ * Returns @count in case of success, or a value lower then @count in case of
+ * coutner overflow. In this function we allow the counter to reach the
+ * ULLONG_MAX value, and we signal this as overflow condition by returining
+ * a POLLERR to poll(2). This function can be called from any context.
  */
-int eventfd_signal(struct file *file, int n)
+int eventfd_signal(struct file *file, int count)
 {
        struct eventfd_ctx *ctx = file->private_data;
        unsigned long flags;
 
-       if (n < 0)
+       if (count < 0)
                return -EINVAL;
        spin_lock_irqsave(&ctx->lock, flags);
-       if (ULLONG_MAX - ctx->count < n)
-               n = (int) (ULLONG_MAX - ctx->count);
-       ctx->count += n;
+       if (ULLONG_MAX - ctx->count < count)
+               count = (int) (ULLONG_MAX - ctx->count);
+       ctx->count += count;
        if (waitqueue_active(&ctx->wqh))
                wake_up_locked(&ctx->wqh);
        spin_unlock_irqrestore(&ctx->lock, flags);
 
-       return n;
+       return count;
 }
 
 static int eventfd_release(struct inode *inode, struct file *file)
 {
-       kfree(file->private_data);
+       struct eventfd_ctx *ctx = file->private_data;
+       unsigned long flags;
+       struct eventfd_relcb *rcb;
+
+next:
+       spin_lock_irqsave(&ctx->lock, flags);
+       if (!list_empty(&ctx->rcb_list)) {
+               rcb = list_first_entry(&ctx->rcb_list, struct eventfd_relcb, 
lnk);
+               list_del_init(&rcb->lnk);
+               spin_unlock_irqrestore(&ctx->lock, flags);
+
+               /*
+                * We need to call the callback w/out the lock being held.
+                * When the eventfd kernel user calls us, it is probably holding
+                * its own lock, and then we lock ctx->lock to operate with our
+                * data. Here we would be calling the callback with our lock 
held,
+                * and the kernel user might be trying to grab its own lock from
+                * inside the callback. So we would end up with a nice AB-BA
+                * deadlock.
+                */
+               (*rcb->proc)(rcb);
+
+               goto next;
+       }
+       spin_unlock_irqrestore(&ctx->lock, flags);
+
+       kfree(ctx);
        return 0;
 }
 
@@ -73,7 +114,9 @@
        spin_lock_irqsave(&ctx->lock, flags);
        if (ctx->count > 0)
                events |= POLLIN;
-       if (ctx->count == ULLONG_MAX)
+       if (unlikely(ctx->flags & EVENTFD_FLG_SHUTDOWN))
+               events |= POLLIN;
+       if (unlikely(ctx->count == ULLONG_MAX))
                events |= POLLERR;
        if (ULLONG_MAX - 1 > ctx->count)
                events |= POLLOUT;
@@ -86,13 +129,15 @@
                            loff_t *ppos)
 {
        struct eventfd_ctx *ctx = file->private_data;
-       ssize_t res;
+       ssize_t res = 0;
        __u64 ucnt;
        DECLARE_WAITQUEUE(wait, current);
 
        if (count < sizeof(ucnt))
                return -EINVAL;
        spin_lock_irq(&ctx->lock);
+       if (unlikely(ctx->flags & EVENTFD_FLG_SHUTDOWN))
+               goto out_unlock;
        res = -EAGAIN;
        ucnt = ctx->count;
        if (ucnt > 0)
@@ -122,6 +167,7 @@
                if (waitqueue_active(&ctx->wqh))
                        wake_up_locked(&ctx->wqh);
        }
+out_unlock:
        spin_unlock_irq(&ctx->lock);
        if (res > 0 && put_user(ucnt, (__u64 __user *) buf))
                return -EFAULT;
@@ -183,6 +229,15 @@
        .write          = eventfd_write,
 };
 
+/**
+ * eventfd_fget - Gets a reference of an eventfd file from an eventfd
+ *                file descriptor
+ *
+ * @fd:   [in]   Eventfd file descriptor
+ *
+ * Returns the eventfd file pointer, or error in case the passed file
+ * descriptor is not a valid eventfd file.
+ */
 struct file *eventfd_fget(int fd)
 {
        struct file *file;
@@ -190,7 +245,7 @@
        file = fget(fd);
        if (!file)
                return ERR_PTR(-EBADF);
-       if (file->f_op != &eventfd_fops) {
+       if (!EVENTFD_FILE(file)) {
                fput(file);
                return ERR_PTR(-EINVAL);
        }
@@ -198,7 +253,95 @@
        return file;
 }
 
-asmlinkage long sys_eventfd(unsigned int count)
+/**
+ * eventfd_set_shutdown - Sets the shutdown flag in the eventfd, so that the
+ *                        userspace can notice it with a POLLIN from poll(2)
+ *                        and a zero bytes read(2)
+ *
+ * @file:   [in]   Pointer to the eventfd file
+ *
+ * Returns zero in case of success, or -EINVAL if the @file is not an eventfd
+ * file pointer. This function can be called from any context.
+ */
+int eventfd_set_shutdown(struct file *file)
+{
+       struct eventfd_ctx *ctx = file->private_data;
+       unsigned long flags;
+
+       if (unlikely(!EVENTFD_FILE(file)))
+               return -EINVAL;
+       spin_lock_irqsave(&ctx->lock, flags);
+       ctx->flags |= EVENTFD_FLG_SHUTDOWN;
+       if (waitqueue_active(&ctx->wqh))
+               wake_up_locked(&ctx->wqh);
+       spin_unlock_irqrestore(&ctx->lock, flags);
+
+       return 0;
+}
+
+/**
+ * eventfd_add_release_cb - Install a callback function to be invoked when
+ *                          the file_operations->release function is called
+ *                          (that is, right before the eventfd file is going 
away)
+ *
+ * @file:   [in]   Pointer to the eventfd file
+ * @rcb:    [in]   Pointer to the callabck structure
+ *
+ * Returns zero in case of success, or -EINVAL if the @file is not an eventfd
+ * file pointer. This function can be called from any context.
+ */
+int eventfd_add_release_cb(struct file *file, struct eventfd_relcb *rcb)
+{
+       struct eventfd_ctx *ctx = file->private_data;
+       unsigned long flags;
+
+       if (unlikely(!EVENTFD_FILE(file)))
+               return -EINVAL;
+       spin_lock_irqsave(&ctx->lock, flags);
+       list_add_tail(&rcb->lnk, &ctx->rcb_list);
+       spin_unlock_irqrestore(&ctx->lock, flags);
+
+       return 0;
+}
+
+/**
+ * eventfd_del_release_cb - Deletes a previously installed callback function
+ *                          from the eventfd release callback list
+ *
+ * @file:   [in]   Pointer to the eventfd file
+ * @rcb:    [in]   Pointer to the callabck structure
+ *
+ * Returns zero in case of success, or -EINVAL if the @file is not an eventfd
+ * file pointer. This function can be called from any context.
+ */
+int eventfd_del_release_cb(struct file *file, struct eventfd_relcb *rcb)
+{
+       struct eventfd_ctx *ctx = file->private_data;
+       unsigned long flags;
+
+       if (unlikely(!EVENTFD_FILE(file)))
+               return -EINVAL;
+       spin_lock_irqsave(&ctx->lock, flags);
+       list_del_init(&rcb->lnk);
+       spin_unlock_irqrestore(&ctx->lock, flags);
+
+       return 0;
+}
+
+/**
+ * eventfd_create - Creates an eventfd file and returns its descriptor.
+ *                  It also allows the caller to specify a callback
+ *                  (struct eventfd_relcb) to be invoked when the last
+ *                  instance of this eventfd file goes away. If @rcb is
+ *                  NULL, no callback will be installed
+ *
+ * @count:  [in]   Initial count for the eventfd
+ * @rcb:    [in]   Pointer to the callabck structure, or NULL
+ *
+ * Returns the newly created eventfd file descriptor, or a negative value
+ * in case of error.
+ */
+int eventfd_create(unsigned int count, struct eventfd_relcb *rcb)
 {
        int error, fd;
        struct eventfd_ctx *ctx;
@@ -212,6 +355,10 @@
        init_waitqueue_head(&ctx->wqh);
        spin_lock_init(&ctx->lock);
        ctx->count = count;
+       ctx->flags = 0;
+       INIT_LIST_HEAD(&ctx->rcb_list);
+       if (rcb)
+               list_add_tail(&rcb->lnk, &ctx->rcb_list);
 
        /*
         * When we call this, the initialization must be complete, since
@@ -226,3 +373,8 @@
        return error;
 }
 
+asmlinkage long sys_eventfd(unsigned int count)
+{
+       return eventfd_create(count, NULL);
+}
+
Index: linux-2.6.22-rc1-git1.mod/include/linux/eventfd.h
===================================================================
--- linux-2.6.22-rc1-git1.mod.orig/include/linux/eventfd.h      2007-05-16 
15:10:34.000000000 -0700
+++ linux-2.6.22-rc1-git1.mod/include/linux/eventfd.h   2007-05-17 
10:48:11.000000000 -0700
@@ -11,15 +11,50 @@
 
 #ifdef __KERNEL__
 
+#include <linux/list.h>
+
+struct eventfd_relcb {
+       struct list_head lnk;
+       void (*proc)(struct eventfd_relcb *);
+};
+
 #ifdef CONFIG_EVENTFD
 
 struct file *eventfd_fget(int fd);
-int eventfd_signal(struct file *file, int n);
+int eventfd_signal(struct file *file, int count);
+int eventfd_add_release_cb(struct file *file, struct eventfd_relcb *rcb);
+int eventfd_del_release_cb(struct file *file, struct eventfd_relcb *rcb);
+int eventfd_set_shutdown(struct file *file);
+int eventfd_create(unsigned int count, struct eventfd_relcb *rcb);
 
 #else /* CONFIG_EVENTFD */
 
-#define eventfd_fget(fd) ERR_PTR(-ENOSYS)
-#define eventfd_signal(f, n) 0
+static inline struct file *eventfd_fget(int fd)
+{
+       return ERR_PTR(-ENOSYS);
+}
+static inline int eventfd_signal(struct file *file, int count)
+{
+       return 0;
+}
+static inline int eventfd_add_release_cb(struct file *file,
+                                        struct eventfd_relcb *rcb)
+{
+       return -ENOSYS;
+}
+static inline int eventfd_del_release_cb(struct file *file,
+                                        struct eventfd_relcb *rcb)
+{
+       return -ENOSYS;
+}
+static inline int eventfd_set_shutdown(struct file *file)
+{
+       return -ENOSYS;
+}
+static inline int eventfd_create(unsigned int count, struct eventfd_relcb *rcb)
+{
+       return -ENOSYS;
+}
 
 #endif /* CONFIG_EVENTFD */
 

-------------------------------------------------------------------------
This SF.net email is sponsored by DB2 Express
Download DB2 Express C - the FREE version of DB2 express and take
control of your XML. No limits. Just data. Click to get it now.
http://sourceforge.net/powerbar/db2/
_______________________________________________
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel

Reply via email to