On Tue, Jun 02, 2009 at 09:53:29PM -0400, Gregory Haskins wrote:
> Paul E. McKenney wrote:
> > On Tue, Jun 02, 2009 at 02:23:14PM -0400, Gregory Haskins wrote:
> >   
> >> Paul E. McKenney wrote:
> >>     
> >>> On Tue, Jun 02, 2009 at 11:15:38AM -0400, Gregory Haskins wrote:
> >>>   
> >>>       
> >>>> Assigning an irqfd object to a kvm object creates a relationship that we
> >>>> currently manage by having the kvm object acquire/hold a file* reference
> >>>> to the underlying eventfd.  The lifetime of these objects is properly
> >>>> maintained by decoupling the two objects whenever the irqfd is closed or
> >>>> kvm is closed, whichever comes first.
> >>>>
> >>>> However, the irqfd "close" method is less than ideal since it requires
> >>>> two system calls to complete (one for ioctl(kvmfd, IRQFD_DEASSIGN), the
> >>>> other for close(eventfd)).  This dual-call approach was utilized because
> >>>> there was no notification mechanism on the eventfd side at the time irqfd
> >>>> was implemented.
> >>>>
> >>>> Recently, Davide proposed a patch to send a POLLHUP wakeup whenever an
> >>>> eventfd is about to close.  So we eliminate the IRQFD_DEASSIGN ioctl (*)
> >>>> vector in favor of sensing the deassign automatically when the fd is
> >>>> closed.  The resulting code is slightly more complex since we need to
> >>>> allow either side to sever the relationship independently.  We utilize
> >>>> SRCU to guarantee stable concurrent access to the KVM pointer without
> >>>> adding additional atomic operations in the fast path.
> >>>>
> >>>> At minimum, this design should be acked by both Davide and Paul (cc'd).
> >>>>
> >>>> (*) The irqfd patch does not exist in any released tree, so the
> >>>> understanding is that we can alter the irqfd specific ABI without taking
> >>>> the normal precautions, such as CAP bits.
> >>>>     
> >>>>         
> >>> A few questions and suggestions interspersed below.
> >>>
> >>>                                                   Thanx, Paul
> >>>   
> >>>       
> >> Thanks for the review, Paul.
> >>     
> >
> > Some questions, clarifications, and mea culpas below.
> >
> >                                                     Thanx, Paul
> >
> >   
> >> (FYI: This isn't quite what I was asking you about on IRC yesterday, but
> >> it's related...and the SRCU portion of the conversation *did* inspire me
> >> here.  Just note that the stuff I was asking about is still forthcoming)
> >>     
> >
> > ;-)
> >
> >   
> >>>> Signed-off-by: Gregory Haskins <ghask...@novell.com>
> >>>> CC: Davide Libenzi <davi...@xmailserver.org>
> >>>> CC: Michael S. Tsirkin <m...@redhat.com>
> >>>> CC: Paul E. McKenney <paul...@linux.vnet.ibm.com>
> >>>> ---
> >>>>
> >>>>  include/linux/kvm.h |    2 -
> >>>>  virt/kvm/eventfd.c  |  177 +++++++++++++++++++++++----------------------
> >>>>  virt/kvm/kvm_main.c |    3 +
> >>>>  3 files changed, 81 insertions(+), 101 deletions(-)
> >>>>
> >>>> diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> >>>> index 632a856..29b62cc 100644
> >>>> --- a/include/linux/kvm.h
> >>>> +++ b/include/linux/kvm.h
> >>>> @@ -482,8 +482,6 @@ struct kvm_x86_mce {
> >>>>  };
> >>>>  #endif
> >>>>
> >>>> -#define KVM_IRQFD_FLAG_DEASSIGN (1 << 0)
> >>>> -
> >>>>  struct kvm_irqfd {
> >>>>          __u32 fd;
> >>>>          __u32 gsi;
> >>>> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> >>>> index f3f2ea1..6ed62e2 100644
> >>>> --- a/virt/kvm/eventfd.c
> >>>> +++ b/virt/kvm/eventfd.c
> >>>> @@ -37,26 +37,63 @@
> >>>>   * --------------------------------------------------------------------
> >>>>   */
> >>>>  struct _irqfd {
> >>>> +        struct mutex              lock;
> >>>> +        struct srcu_struct        srcu;
> >>>>          struct kvm               *kvm;
> >>>>          int                       gsi;
> >>>> -        struct file              *file;
> >>>>          struct list_head          list;
> >>>>          poll_table                pt;
> >>>>          wait_queue_head_t        *wqh;
> >>>>          wait_queue_t              wait;
> >>>> -        struct work_struct        work;
> >>>> +        struct work_struct        inject;
> >>>>  };
> >>>>
> >>>>  static void
> >>>>  irqfd_inject(struct work_struct *work)
> >>>>  {
> >>>> -        struct _irqfd *irqfd = container_of(work, struct _irqfd, work);
> >>>> -        struct kvm *kvm = irqfd->kvm;
> >>>> +        struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
> >>>> +        struct kvm *kvm;
> >>>> +        int idx;
> >>>> +
> >>>> +        idx = srcu_read_lock(&irqfd->srcu);
> >>>> +
> >>>> +        kvm = rcu_dereference(irqfd->kvm);
> >>>>         
> 
> [4]
> 
> >>>> +        if (kvm) {
> >>>> +                mutex_lock(&kvm->lock);
> >>>> +                kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
> >>>> +                kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
> >>>> +                mutex_unlock(&kvm->lock);
> >>>> +        }
> >>>> +
> >>>> +        srcu_read_unlock(&irqfd->srcu, idx);
> >>>> +}
> >>>>     
> >>>>         
> >> [1]
> >>
> >>     
> >>>> +
> >>>> +static void
> >>>> +irqfd_disconnect(struct _irqfd *irqfd)
> >>>> +{
> >>>> +        struct kvm *kvm;
> >>>> +
> >>>> +        mutex_lock(&irqfd->lock);
> >>>> +
> >>>> +        kvm = rcu_dereference(irqfd->kvm);
> >>>> +        rcu_assign_pointer(irqfd->kvm, NULL);
> >>>> +
> >>>> +        mutex_unlock(&irqfd->lock);
> >>>>     
> >>>>         
> >> [2]
> >>
> >>     
> >>>> +
> >>>> +        if (!kvm)
> >>>> +                return;
> >>>>
> >>>>          mutex_lock(&kvm->lock);
> >>>> -        kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
> >>>> -        kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
> >>>> +        list_del(&irqfd->list);
> >>>>          mutex_unlock(&kvm->lock);
> >>>> +
> >>>> +        /*
> >>>> +         * It is important to not drop the kvm reference until the
> >>>> +         * next grace period because there might be lockless references
> >>>> +         * in flight up until then
> >>>> +         */
> >>>>     
> >>>>         
> >>> The lockless references are all harmless even if carried out after the
> >>> kvm structure has been removed?
> >>>       
> >> No, but all ([1]) references to my knowledge occur within an srcu
> >> read-side CS, and we only drop the object reference ([3]) outside of
> >> that CS by virtue of the synchronize_srcu() barrier below.  The one
> >> notable exception is [2], which I don't declare as a read-side CS since
> >> I hold the mutex during the swap.
> >>
> >> OTOH, this is really my _intention_, not _reality_ per se. ;)  E.g. I
> >> may have completely flubbed up the design, so I'm glad you are looking
> >> at it.
> >>     
> >
> > Looks good in general -- my question is about the window of time
> > between when the object is removed from the list (and might still have
> > readers referencing it) and when the object is freed (and, courtesy of
> > synchronize_srcu(), can no longer be referenced by readers).
> >
> > In other words, the following sequence of events:
> >
> > o   CPU 0 picks up a pointer to the object.
> >
> > o   CPU 1 removes that same object from the list, and invokes
> >     synchronize_srcu(), which cannot return yet due to CPU 0
> >     being in an SRCU read-side critical section.
> >
> > o   CPU 0 acquires the lock and invokes the pair of kvm_set_irq()
> >     calls, releases the lock and exits the SRCU read-side critical
> >     section.
> >
> > o   CPU 1's synchronize_srcu() can now return, and does.
> >
> > o   CPU 1 frees the object.
> >
> > I honestly don't know enough about KVM to say whether or not this is a
> > problem, but thought I should ask.  ;-)
> 
> Right, ok.  What you outline is consistent with my expectations.  That
> is, I need to make sure that it is not possible for any concurrent code
> to call kvm_put_kvm() between [4] and [1] against the same pointer.  It
> is, of course, ok if some other code path enters irqfd_inject() _after_
> [2] because I would have already swapped the pointer with NULL and it
> will simply bail out on the conditional right after [4].

Yep, that is what the combination of srcu_read_lock(), srcu_read_unlock(),
and synchronize_srcu() will do.
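
To make the ordering concrete for anyone reading along, here is the
pattern boiled down from the patch above (an illustrative sketch, not a
literal excerpt -- the function names are mine):

	/* Reader side, as in irqfd_inject(): */
	static void sketch_reader(struct _irqfd *irqfd)
	{
		struct kvm *kvm;
		int idx;

		idx = srcu_read_lock(&irqfd->srcu);

		kvm = rcu_dereference(irqfd->kvm);	/* NULL once disconnected */
		if (kvm) {
			/*
			 * kvm_put_kvm() cannot run on this kvm until the
			 * srcu_read_unlock() below, so the reference stays
			 * valid for the whole block.
			 */
			mutex_lock(&kvm->lock);
			/* inject the interrupt, etc. */
			mutex_unlock(&kvm->lock);
		}

		srcu_read_unlock(&irqfd->srcu, idx);
	}

	/* Updater side, as in irqfd_disconnect(): */
	static void sketch_updater(struct _irqfd *irqfd)
	{
		struct kvm *kvm;

		mutex_lock(&irqfd->lock);
		kvm = rcu_dereference(irqfd->kvm);
		rcu_assign_pointer(irqfd->kvm, NULL);	/* later readers bail out */
		mutex_unlock(&irqfd->lock);

		if (!kvm)
			return;

		synchronize_srcu(&irqfd->srcu);	/* wait out in-flight readers */
		kvm_put_kvm(kvm);		/* earliest safe point to drop the ref */
	}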

> >>>   Or does there need to be a ->deleted
> >>> field that allows the lockless references to ignore a kvm structure that
> >>> has already been deleted?
> >>>       
> >> I guess you could say that the "rcu_assign_pointer(NULL)" is my
> >> "deleted" field.  The reader-side code in question should check for that
> >> condition before proceeding.
> >>     
> >
> > Fair enough!  But please keep in mind that the pointer could be assigned
> > to NULL just after we dereference it, especially if we are interrupted
> > or preempted or something.
> 
> Right, and that should be ok IIUC as long as I can be guaranteed to
> never call kvm_put_kvm() before the dereferencer calls
> srcu_read_unlock().  I currently believe this guarantee is provided by
> the synchronize_srcu() at [3], but please straighten me out if that is a
> naive assumption.

You are correct, that is indeed the guarantee provided by
synchronize_srcu().

> >   Or is the idea to re-check the pointer under some lock?
> >   
> I do not currently believe I need to worry about that case, but as
> always, straighten me out if that is wrong. ;)

Given what you say below, you should be OK.

> >>> On the other hand, if it really is somehow OK for kvm_set_irq() to be
> >>> called on an already-deleted kvm structure, then this code would be OK
> >>> as is.
> >>>       
> >> Definitely not, so if you think that can happen please raise the flag.
> >
> > Apologies, I was being a bit sloppy.  Instead of "already-deleted",
> > I should have said something like "already removed from the list but
> > not yet freed".
> 
> Ah, ok.  The answer in that case would be "yes".  It's ok to call
> kvm_set_irq() while the irqfd->kvm pointer is NULL, but it is not ok to
> call it during or after the invocation of kvm_put_kvm().  Technically the
> first safe point is right after the last mutex_unlock(&kvm->lock)
> completes (right before [1]), and is officially annotated with the
> subsequent srcu_read_unlock().

Good enough!

> >>>> +        synchronize_srcu(&irqfd->srcu);
> >>>> +        kvm_put_kvm(kvm);
> >>>>     
> >>>>         
> >> [3]
> >>
> >>     
> >>>>  }
> >>>>
> >>>>  static int
> >>>> @@ -64,12 +101,28 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
> >>>>  {
> >>>>          struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait);
> >>>>
> >>>> -        /*
> >>>> -         * The wake_up is called with interrupts disabled.  Therefore
> >>>> -         * we need to defer the IRQ injection until later since we need
> >>>> -         * to acquire the kvm->lock to do so.
> >>>> -         */
> >>>> -        schedule_work(&irqfd->work);
> >>>> +        switch ((unsigned long)key) {
> >>>> +        case POLLIN:
> >>>> +                /*
> >>>> +                 * The POLLIN wake_up is called with interrupts disabled.
> >>>> +                 * Therefore we need to defer the IRQ injection until later
> >>>> +                 * since we need to acquire the kvm->lock to do so.
> >>>> +                 */
> >>>> +                schedule_work(&irqfd->inject);
> >>>> +                break;
> >>>> +        case POLLHUP:
> >>>> +                /*
> >>>> +                 * The POLLHUP is called unlocked, so it theoretically
> >>>> +                 * should be safe to remove ourselves from the wqh
> >>>> +                 */
> >>>> +                remove_wait_queue(irqfd->wqh, &irqfd->wait);
> >>>> +                flush_work(&irqfd->inject);
> >>>> +                irqfd_disconnect(irqfd);
> >>>>     
> >>>>         
> >>> Good.  The fact that irqfd_disconnect() does a synchronize_srcu()
> >>> prevents any readers from trying to do an SRCU operation on an already
> >>> cleaned-up srcu_struct, so this does look safe to me.
> >>>       
> >> As an additional data point, we can guarantee that we will never be
> >> called again since we synchronously unhook from the wqh and kvm->irqfds
> >> list, and the POLLHUP is called from f_ops->release().
> >
> > So the window is short, but still exists, correct?
> 
> Can you elaborate?

This is the same window we discussed above.  It is the window during
which the object can be removed from the list, between the time that a
reader obtains a reference to the object and the time that the reader
actually uses that reference.
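
Mapping that onto the two functions in the patch, the sequence I have in
mind looks roughly like this (CPU 0 stays inside the SRCU read-side
critical section the whole time):

	CPU 0 (irqfd_inject)                   CPU 1 (irqfd_disconnect)
	--------------------                   ------------------------
	srcu_read_lock()
	kvm = rcu_dereference(irqfd->kvm)
	                                       rcu_assign_pointer(irqfd->kvm, NULL)
	                                       list_del() under kvm->lock
	                                       synchronize_srcu()  <-- blocks
	kvm_set_irq() x2 under kvm->lock
	srcu_read_unlock()
	                                       synchronize_srcu() returns
	                                       kvm_put_kvm()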

> >>>> +                cleanup_srcu_struct(&irqfd->srcu);
> >>>> +                kfree(irqfd);
> >>>> +                break;
> >>>> +        }
> >>>>
> >>>>          return 0;
> >>>>  }
> >>>> @@ -84,8 +137,8 @@ irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
> >>>>          add_wait_queue(wqh, &irqfd->wait);
> >>>>  }
> >>>>
> >>>> -static int
> >>>> -kvm_assign_irqfd(struct kvm *kvm, int fd, int gsi)
> >>>> +int
> >>>> +kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
> >>>>  {
> >>>>          struct _irqfd *irqfd;
> >>>>          struct file *file = NULL;
> >>>> @@ -95,10 +148,12 @@ kvm_assign_irqfd(struct kvm *kvm, int fd, int gsi)
> >>>>          if (!irqfd)
> >>>>                  return -ENOMEM;
> >>>>
> >>>> +        mutex_init(&irqfd->lock);
> >>>> +        init_srcu_struct(&irqfd->srcu);
> >>>>          irqfd->kvm = kvm;
> >>>>          irqfd->gsi = gsi;
> >>>>          INIT_LIST_HEAD(&irqfd->list);
> >>>> -        INIT_WORK(&irqfd->work, irqfd_inject);
> >>>> +        INIT_WORK(&irqfd->inject, irqfd_inject);
> >>>>
> >>>>          /*
> >>>>           * Embed the file* lifetime in the irqfd.
> >>>> @@ -120,12 +175,18 @@ kvm_assign_irqfd(struct kvm *kvm, int fd, int gsi)
> >>>>          if (ret < 0)
> >>>>                  goto fail;
> >>>>
> >>>> -        irqfd->file = file;
> >>>> +        kvm_get_kvm(kvm);
> >>>>
> >>>>          mutex_lock(&kvm->lock);
> >>>>          list_add_tail(&irqfd->list, &kvm->irqfds);
> >>>>     
> >>>>         
> >>> Doesn't the above need to be list_add_tail_rcu()?  Unless I am confused,
> >>> this is the point at which the new SRCU-protected structure is first
> >>> made public.  If so, you really do need list_add_tail_rcu() to make sure
> >>> that concurrent readers don't see pre-initialized values in the structure.
> >>>       
> >> I *think* this is ok as a traditional list, because the only paths that
> >> touch this list are guarded by the kvm->lock mutex.  Let me know if you
> >> see otherwise or if that is not enough.
> >
> > My mistake, you are using rcu_assign_pointer() and rcu_dereference()
> > instead of the list primitives.  Never mind!!!
> 
> Yeah, and note that we actually have two types of objects and their
> references floating around:
> 
> *) We have "struct irqfd" which can be thought of as an extension of
> eventfd.  It holds exactly one (or zero) references to kvm via the
> irqfd->kvm pointer, and as you note above I use rcu_XX() macros and srcu
> to manage it.
> 
> *) Conversely, we have "struct kvm" which may have a 1:N relationship
> with many irqfds, which I manage with a standard list at kvm->irqfds
> protected by kvm->lock.

This combination should work fine.  You do acquire the lock within the
SRCU read-side critical section, as required.  It looks to me as though
you avoid freeing the underlying "struct kvm" until a grace period has
elapsed after its removal from the list, again as required.
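
For the archive, here is how I read the two protection domains (the
struct is from your patch; the field comments are my summary, so take
them as such):

	struct _irqfd {
		struct mutex              lock;   /* serializes the irqfd->kvm swap */
		struct srcu_struct        srcu;   /* read side for users of irqfd->kvm */
		struct kvm               *kvm;    /* rcu_assign_pointer()/rcu_dereference() */
		int                       gsi;
		struct list_head          list;   /* on kvm->irqfds, under kvm->lock only */
		poll_table                pt;
		wait_queue_head_t        *wqh;
		wait_queue_t              wait;
		struct work_struct        inject;
	};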

                                                        Thanx, Paul

> So the code that uses rcu_dereference()/rcu_assign_pointer() is actually
> different from the code mentioned above that manipulates the kvm->irqfds
> list with list_add_tail().  The latter isn't directly RCU-related, which
> is why you see the non-RCU variants of the list functions in use.
> 
> That said, if you still see a hole in that approach, do not be shy in
> pointing it out ;)
> 
> Thanks again for taking the time to go over all this, Paul.  I know you
> are very busy, and it's very much appreciated!
> 
> -Greg
> 
> >   
> >>>>          mutex_unlock(&kvm->lock);
> >>>>
> >>>> +        /*
> >>>> +         * do not drop the file until the irqfd is fully initialized,
> >>>> +         * otherwise we might race against the POLLHUP
> >>>> +         */
> >>>> +        fput(file);
> >>>> +
> >>>>          return 0;
> >>>>
> >>>>  fail:
> >>>> @@ -139,97 +200,17 @@ fail:
> >>>>          return ret;
> >>>>  }
> >>>>
> >>>> -static void
> >>>> -irqfd_release(struct _irqfd *irqfd)
> >>>> -{
> >>>> -        /*
> >>>> -         * The ordering is important.  We must remove ourselves from the
> >>>> -         * wqh first to ensure no more event callbacks are issued, and then
> >>>> -         * flush any previously scheduled work prior to freeing the memory
> >>>> -         */
> >>>> -        remove_wait_queue(irqfd->wqh, &irqfd->wait);
> >>>> -
> >>>> -        flush_work(&irqfd->work);
> >>>> -
> >>>> -        fput(irqfd->file);
> >>>> -        kfree(irqfd);
> >>>> -}
> >>>> -
> >>>> -static struct _irqfd *
> >>>> -irqfd_remove(struct kvm *kvm, struct file *file, int gsi)
> >>>> -{
> >>>> -        struct _irqfd *irqfd;
> >>>> -
> >>>> -        mutex_lock(&kvm->lock);
> >>>> -
> >>>> -        /*
> >>>> -         * linear search isn't brilliant, but this should be an infrequent
> >>>> -         * slow-path operation, and the list should not grow very large
> >>>> -         */
> >>>> -        list_for_each_entry(irqfd, &kvm->irqfds, list) {
> >>>> -                if (irqfd->file != file || irqfd->gsi != gsi)
> >>>> -                        continue;
> >>>> -
> >>>> -                list_del(&irqfd->list);
> >>>> -                mutex_unlock(&kvm->lock);
> >>>> -
> >>>> -                return irqfd;
> >>>> -        }
> >>>> -
> >>>> -        mutex_unlock(&kvm->lock);
> >>>> -
> >>>> -        return NULL;
> >>>> -}
> >>>> -
> >>>> -static int
> >>>> -kvm_deassign_irqfd(struct kvm *kvm, int fd, int gsi)
> >>>> -{
> >>>> -        struct _irqfd *irqfd;
> >>>> -        struct file *file;
> >>>> -        int count = 0;
> >>>> -
> >>>> -        file = fget(fd);
> >>>> -        if (IS_ERR(file))
> >>>> -                return PTR_ERR(file);
> >>>> -
> >>>> -        while ((irqfd = irqfd_remove(kvm, file, gsi))) {
> >>>> -                /*
> >>>> -                 * We remove the item from the list under the lock, but we
> >>>> -                 * free it outside the lock to avoid deadlocking with the
> >>>> -                 * flush_work and the work_item taking the lock
> >>>> -                 */
> >>>> -                irqfd_release(irqfd);
> >>>> -                count++;
> >>>> -        }
> >>>> -
> >>>> -        fput(file);
> >>>> -
> >>>> -        return count ? count : -ENOENT;
> >>>> -}
> >>>> -
> >>>>  void
> >>>>  kvm_irqfd_init(struct kvm *kvm)
> >>>>  {
> >>>>          INIT_LIST_HEAD(&kvm->irqfds);
> >>>>  }
> >>>>
> >>>> -int
> >>>> -kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
> >>>> -{
> >>>> -        if (flags & KVM_IRQFD_FLAG_DEASSIGN)
> >>>> -                return kvm_deassign_irqfd(kvm, fd, gsi);
> >>>> -
> >>>> -        return kvm_assign_irqfd(kvm, fd, gsi);
> >>>> -}
> >>>> -
> >>>>  void
> >>>>  kvm_irqfd_release(struct kvm *kvm)
> >>>>  {
> >>>>          struct _irqfd *irqfd, *tmp;
> >>>>
> >>>> -        /* don't bother with the lock..we are shutting down */
> >>>> -        list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds, list) {
> >>>> -                list_del(&irqfd->list);
> >>>> -                irqfd_release(irqfd);
> >>>> -        }
> >>>> +        list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds, list)
> >>>> +                irqfd_disconnect(irqfd);
> >>>>  }
> >>>> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> >>>> index 902fed9..a9f62bb 100644
> >>>> --- a/virt/kvm/kvm_main.c
> >>>> +++ b/virt/kvm/kvm_main.c
> >>>> @@ -1029,7 +1029,6 @@ static void kvm_destroy_vm(struct kvm *kvm)
> >>>>          spin_lock(&kvm_lock);
> >>>>          list_del(&kvm->vm_list);
> >>>>          spin_unlock(&kvm_lock);
> >>>> -        kvm_irqfd_release(kvm);
> >>>>          kvm_free_irq_routing(kvm);
> >>>>          kvm_io_bus_destroy(&kvm->pio_bus);
> >>>>          kvm_io_bus_destroy(&kvm->mmio_bus);
> >>>> @@ -1064,6 +1063,8 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
> >>>>  {
> >>>>          struct kvm *kvm = filp->private_data;
> >>>>
> >>>> +        kvm_irqfd_release(kvm);
> >>>> +
> >>>>          kvm_put_kvm(kvm);
> >>>>          return 0;
> >>>>  }
> >>>>
> >>>>     
> >>>>         
> >>     
> >
> >
> >   
> 
> 
> 


