On Fri, 30 Aug 2019 16:42:06 +0800
Ben Luo <luo...@linux.alibaba.com> wrote:

> When userspace (e.g. qemu) triggers a switch between KVM
> irqfd and userspace eventfd, only dev_id of irqaction
> (i.e. the "trigger" in this patch's context) will be
> changed, but a free-then-request-irq action is taken in
> current code. And, irq affinity setting in VM will also
> trigger a free-then-request-irq action, which actually
> changes nothing, but only needs to bounce the irqbypass
> registration in case that posted-interrupt is in use.
> 
> This patch makes use of irq_update_devid() and optimizes
> both cases above, which reduces the risk of losing an interrupt
> and also cuts some overhead.
> 
> Signed-off-by: Ben Luo <luo...@linux.alibaba.com>
> ---
>  drivers/vfio/pci/vfio_pci_intrs.c | 124 ++++++++++++++++++++++++++------------
>  1 file changed, 87 insertions(+), 37 deletions(-)
> 
> diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
> index 3fa3f72..d3a93d7 100644
> --- a/drivers/vfio/pci/vfio_pci_intrs.c
> +++ b/drivers/vfio/pci/vfio_pci_intrs.c
> @@ -284,70 +284,120 @@ static int vfio_msi_enable(struct vfio_pci_device *vdev, int nvec, bool msix)
>  static int vfio_msi_set_vector_signal(struct vfio_pci_device *vdev,
>                                     int vector, int fd, bool msix)
>  {
> +     struct eventfd_ctx *trigger = NULL;
>       struct pci_dev *pdev = vdev->pdev;
> -     struct eventfd_ctx *trigger;
>       int irq, ret;
>  
>       if (vector < 0 || vector >= vdev->num_ctx)
>               return -EINVAL;
>  
> +     if (fd >= 0) {
> +             trigger = eventfd_ctx_fdget(fd);
> +             if (IS_ERR(trigger)) {
> +                     /* oops, going to disable this interrupt */
> +                     dev_info(&pdev->dev,
> +                              "get ctx error on bad fd: %d for vector:%d\n",
> +                              fd, vector);

I think a user could trigger this maliciously as a denial of service by
simply providing a bogus file descriptor.  The user is already informed of
the error by the return value, so why do we need to spam the logs?

> +             }
> +     }
> +
>       irq = pci_irq_vector(pdev, vector);
>  
> +     /*
> +      * 'trigger' is NULL or invalid, disable the interrupt
> +      * 'trigger' is same as before, only bounce the bypass registration
> +      * 'trigger' is a new invalid one, update it to irqaction and other

s/invalid/valid/

> +      * data structures referencing to the old one; fallback to disable
> +      * the interrupt on error
> +      */
>       if (vdev->ctx[vector].trigger) {
> -             free_irq(irq, vdev->ctx[vector].trigger);
> +             /*
> +              * even if the trigger is unchanged we need to bounce the
> +              * interrupt bypass connection to allow affinity changes in
> +              * the guest to be realized.
> +              */
>               irq_bypass_unregister_producer(&vdev->ctx[vector].producer);
> -             kfree(vdev->ctx[vector].name);
> -             eventfd_ctx_put(vdev->ctx[vector].trigger);
> -             vdev->ctx[vector].trigger = NULL;
> +
> +             if (vdev->ctx[vector].trigger == trigger) {
> +                     /* avoid duplicated referencing to the same trigger */
> +                     eventfd_ctx_put(trigger);
> +
> +             } else if (trigger && !IS_ERR(trigger)) {
> +                     ret = irq_update_devid(irq,
> +                                            vdev->ctx[vector].trigger, trigger);
> +                     if (unlikely(ret)) {
> +                             dev_info(&pdev->dev,
> +                                      "update devid of %d (token %p) failed: %d\n",
> +                                      irq, vdev->ctx[vector].trigger, ret);
> +                             eventfd_ctx_put(trigger);
> +                             free_irq(irq, vdev->ctx[vector].trigger);
> +                             kfree(vdev->ctx[vector].name);
> +                             eventfd_ctx_put(vdev->ctx[vector].trigger);
> +                             vdev->ctx[vector].trigger = NULL;
> +                             return ret;
> +                     }
> +                     eventfd_ctx_put(vdev->ctx[vector].trigger);
> +                     vdev->ctx[vector].producer.token = trigger;
> +                     vdev->ctx[vector].trigger = trigger;
> +
> +             } else {
> +                     free_irq(irq, vdev->ctx[vector].trigger);
> +                     kfree(vdev->ctx[vector].name);
> +                     eventfd_ctx_put(vdev->ctx[vector].trigger);
> +                     vdev->ctx[vector].trigger = NULL;
> +             }
>       }
>  
>       if (fd < 0)
>               return 0;
> +     else if (IS_ERR(trigger))
> +             return PTR_ERR(trigger);
>  
> -     vdev->ctx[vector].name = kasprintf(GFP_KERNEL, "vfio-msi%s[%d](%s)",
> -                                        msix ? "x" : "", vector,
> -                                        pci_name(pdev));
> -     if (!vdev->ctx[vector].name)
> -             return -ENOMEM;
> +     if (!vdev->ctx[vector].trigger) {
> +             vdev->ctx[vector].name = kasprintf(GFP_KERNEL,
> +                                                "vfio-msi%s[%d](%s)",
> +                                                msix ? "x" : "", vector,
> +                                                pci_name(pdev));
> +             if (!vdev->ctx[vector].name) {
> +                     eventfd_ctx_put(trigger);
> +                     return -ENOMEM;
> +             }
>  
> -     trigger = eventfd_ctx_fdget(fd);
> -     if (IS_ERR(trigger)) {
> -             kfree(vdev->ctx[vector].name);
> -             return PTR_ERR(trigger);
> -     }
> +             /*
> +              * The MSIx vector table resides in device memory which may be
> +              * cleared via backdoor resets. We don't allow direct access to
> +              * the vector table so even if a userspace driver attempts to
> +              * save/restore around such a reset it would be unsuccessful.
> +              * To avoid this, restore the cached value of the message prior
> +              * to enabling.
> +              */
> +             if (msix) {
> +                     struct msi_msg msg;
>  
> -     /*
> -      * The MSIx vector table resides in device memory which may be cleared
> -      * via backdoor resets. We don't allow direct access to the vector
> -      * table so even if a userspace driver attempts to save/restore around
> -      * such a reset it would be unsuccessful. To avoid this, restore the
> -      * cached value of the message prior to enabling.
> -      */
> -     if (msix) {
> -             struct msi_msg msg;
> +                     get_cached_msi_msg(irq, &msg);
> +                     pci_write_msi_msg(irq, &msg);
> +             }
>  
> -             get_cached_msi_msg(irq, &msg);
> -             pci_write_msi_msg(irq, &msg);
> -     }
> +             ret = request_irq(irq, vfio_msihandler, 0,
> +                               vdev->ctx[vector].name, trigger);
> +             if (ret) {
> +                     kfree(vdev->ctx[vector].name);
> +                     eventfd_ctx_put(trigger);
> +                     return ret;
> +             }
>  
> -     ret = request_irq(irq, vfio_msihandler, 0,
> -                       vdev->ctx[vector].name, trigger);
> -     if (ret) {
> -             kfree(vdev->ctx[vector].name);
> -             eventfd_ctx_put(trigger);
> -             return ret;
> +             vdev->ctx[vector].producer.token = trigger;
> +             vdev->ctx[vector].producer.irq = irq;
> +             vdev->ctx[vector].trigger = trigger;
>       }
>  
> -     vdev->ctx[vector].producer.token = trigger;
> -     vdev->ctx[vector].producer.irq = irq;
> +     /* setup bypass connection and make irte updated */
>       ret = irq_bypass_register_producer(&vdev->ctx[vector].producer);
>       if (unlikely(ret))
>               dev_info(&pdev->dev,
>               "irq bypass producer (token %p) registration fails: %d\n",
>               vdev->ctx[vector].producer.token, ret);
>  
> -     vdev->ctx[vector].trigger = trigger;
> -
>       return 0;
>  }
>  

Reply via email to