Re: [PATCH v5 3/3] vfio/pci: make use of irq_update_devid and optimize irq ops

2019-08-30 Thread Ben Luo



在 2019/8/31 上午4:06, Alex Williamson 写道:

On Fri, 30 Aug 2019 16:42:06 +0800
Ben Luo  wrote:


When userspace (e.g. qemu) triggers a switch between KVM
irqfd and userspace eventfd, only dev_id of irqaction
(i.e. the "trigger" in this patch's context) will be
changed, but a free-then-request-irq action is taken in
current code. And, irq affinity setting in VM will also
trigger a free-then-request-irq action, which actually
changes nothing, but only need to bounce the irqbypass
registraion in case that posted-interrupt is in use.

This patch makes use of irq_update_devid() and optimize
both cases above, which reduces the risk of losing interrupt
and also cuts some overhead.

Signed-off-by: Ben Luo 
---
  drivers/vfio/pci/vfio_pci_intrs.c | 124 ++
  1 file changed, 87 insertions(+), 37 deletions(-)

diff --git a/drivers/vfio/pci/vfio_pci_intrs.c 
b/drivers/vfio/pci/vfio_pci_intrs.c
index 3fa3f72..d3a93d7 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -284,70 +284,120 @@ static int vfio_msi_enable(struct vfio_pci_device *vdev, 
int nvec, bool msix)
  static int vfio_msi_set_vector_signal(struct vfio_pci_device *vdev,
  int vector, int fd, bool msix)
  {
+   struct eventfd_ctx *trigger = NULL;
struct pci_dev *pdev = vdev->pdev;
-   struct eventfd_ctx *trigger;
int irq, ret;
  
  	if (vector < 0 || vector >= vdev->num_ctx)

return -EINVAL;
  
+	if (fd >= 0) {

+   trigger = eventfd_ctx_fdget(fd);
+   if (IS_ERR(trigger)) {
+   /* oops, going to disable this interrupt */
+   dev_info(>dev,
+"get ctx error on bad fd: %d for vector:%d\n",
+fd, vector);

I think a user could trigger this maliciously as a denial of service by
simply providing a bogus file descriptor.  The user is informed of the
error by the return value, why do we need to spam the logs?

Ah, you are right, I will remove this log in next version

+   }
+   }
+
irq = pci_irq_vector(pdev, vector);
  
+	/*

+* 'trigger' is NULL or invalid, disable the interrupt
+* 'trigger' is same as before, only bounce the bypass registration
+* 'trigger' is a new invalid one, update it to irqaction and other

s/invalid/valid/

sorry for typo :)

+* data structures referencing to the old one; fallback to disable
+* the interrupt on error
+*/
if (vdev->ctx[vector].trigger) {
-   free_irq(irq, vdev->ctx[vector].trigger);
+   /*
+* even if the trigger is unchanged we need to bounce the
+* interrupt bypass connection to allow affinity changes in
+* the guest to be realized.
+*/
irq_bypass_unregister_producer(>ctx[vector].producer);
-   kfree(vdev->ctx[vector].name);
-   eventfd_ctx_put(vdev->ctx[vector].trigger);
-   vdev->ctx[vector].trigger = NULL;
+
+   if (vdev->ctx[vector].trigger == trigger) {
+   /* avoid duplicated referencing to the same trigger */
+   eventfd_ctx_put(trigger);
+
+   } else if (trigger && !IS_ERR(trigger)) {
+   ret = irq_update_devid(irq,
+  vdev->ctx[vector].trigger, 
trigger);
+   if (unlikely(ret)) {
+   dev_info(>dev,
+"update devid of %d (token %p) failed: 
%d\n",
+irq, vdev->ctx[vector].trigger, ret);
+   eventfd_ctx_put(trigger);
+   free_irq(irq, vdev->ctx[vector].trigger);
+   kfree(vdev->ctx[vector].name);
+   eventfd_ctx_put(vdev->ctx[vector].trigger);
+   vdev->ctx[vector].trigger = NULL;
+   return ret;
+   }
+   eventfd_ctx_put(vdev->ctx[vector].trigger);
+   vdev->ctx[vector].producer.token = trigger;
+   vdev->ctx[vector].trigger = trigger;
+
+   } else {
+   free_irq(irq, vdev->ctx[vector].trigger);
+   kfree(vdev->ctx[vector].name);
+   eventfd_ctx_put(vdev->ctx[vector].trigger);
+   vdev->ctx[vector].trigger = NULL;
+   }
}
  
  	if (fd < 0)

return 0;
+   else if (IS_ERR(trigger))
+   return PTR_ERR(trigger);
  
-	vdev->ctx[vector].name = kasprintf(GFP_KERNEL, "vfio-msi%s[%d](%s)",

-  msix ? "x" : "", vector,
-  

Re: [PATCH v5 3/3] vfio/pci: make use of irq_update_devid and optimize irq ops

2019-08-30 Thread Alex Williamson
On Fri, 30 Aug 2019 16:42:06 +0800
Ben Luo  wrote:

> When userspace (e.g. qemu) triggers a switch between KVM
> irqfd and userspace eventfd, only dev_id of irqaction
> (i.e. the "trigger" in this patch's context) will be
> changed, but a free-then-request-irq action is taken in
> current code. And, irq affinity setting in VM will also
> trigger a free-then-request-irq action, which actually
> changes nothing, but only need to bounce the irqbypass
> registraion in case that posted-interrupt is in use.
> 
> This patch makes use of irq_update_devid() and optimize
> both cases above, which reduces the risk of losing interrupt
> and also cuts some overhead.
> 
> Signed-off-by: Ben Luo 
> ---
>  drivers/vfio/pci/vfio_pci_intrs.c | 124 
> ++
>  1 file changed, 87 insertions(+), 37 deletions(-)
> 
> diff --git a/drivers/vfio/pci/vfio_pci_intrs.c 
> b/drivers/vfio/pci/vfio_pci_intrs.c
> index 3fa3f72..d3a93d7 100644
> --- a/drivers/vfio/pci/vfio_pci_intrs.c
> +++ b/drivers/vfio/pci/vfio_pci_intrs.c
> @@ -284,70 +284,120 @@ static int vfio_msi_enable(struct vfio_pci_device 
> *vdev, int nvec, bool msix)
>  static int vfio_msi_set_vector_signal(struct vfio_pci_device *vdev,
> int vector, int fd, bool msix)
>  {
> + struct eventfd_ctx *trigger = NULL;
>   struct pci_dev *pdev = vdev->pdev;
> - struct eventfd_ctx *trigger;
>   int irq, ret;
>  
>   if (vector < 0 || vector >= vdev->num_ctx)
>   return -EINVAL;
>  
> + if (fd >= 0) {
> + trigger = eventfd_ctx_fdget(fd);
> + if (IS_ERR(trigger)) {
> + /* oops, going to disable this interrupt */
> + dev_info(>dev,
> +  "get ctx error on bad fd: %d for vector:%d\n",
> +  fd, vector);

I think a user could trigger this maliciously as a denial of service by
simply providing a bogus file descriptor.  The user is informed of the
error by the return value, why do we need to spam the logs?

> + }
> + }
> +
>   irq = pci_irq_vector(pdev, vector);
>  
> + /*
> +  * 'trigger' is NULL or invalid, disable the interrupt
> +  * 'trigger' is same as before, only bounce the bypass registration
> +  * 'trigger' is a new invalid one, update it to irqaction and other

s/invalid/valid/

> +  * data structures referencing to the old one; fallback to disable
> +  * the interrupt on error
> +  */
>   if (vdev->ctx[vector].trigger) {
> - free_irq(irq, vdev->ctx[vector].trigger);
> + /*
> +  * even if the trigger is unchanged we need to bounce the
> +  * interrupt bypass connection to allow affinity changes in
> +  * the guest to be realized.
> +  */
>   irq_bypass_unregister_producer(>ctx[vector].producer);
> - kfree(vdev->ctx[vector].name);
> - eventfd_ctx_put(vdev->ctx[vector].trigger);
> - vdev->ctx[vector].trigger = NULL;
> +
> + if (vdev->ctx[vector].trigger == trigger) {
> + /* avoid duplicated referencing to the same trigger */
> + eventfd_ctx_put(trigger);
> +
> + } else if (trigger && !IS_ERR(trigger)) {
> + ret = irq_update_devid(irq,
> +vdev->ctx[vector].trigger, 
> trigger);
> + if (unlikely(ret)) {
> + dev_info(>dev,
> +  "update devid of %d (token %p) failed: 
> %d\n",
> +  irq, vdev->ctx[vector].trigger, ret);
> + eventfd_ctx_put(trigger);
> + free_irq(irq, vdev->ctx[vector].trigger);
> + kfree(vdev->ctx[vector].name);
> + eventfd_ctx_put(vdev->ctx[vector].trigger);
> + vdev->ctx[vector].trigger = NULL;
> + return ret;
> + }
> + eventfd_ctx_put(vdev->ctx[vector].trigger);
> + vdev->ctx[vector].producer.token = trigger;
> + vdev->ctx[vector].trigger = trigger;
> +
> + } else {
> + free_irq(irq, vdev->ctx[vector].trigger);
> + kfree(vdev->ctx[vector].name);
> + eventfd_ctx_put(vdev->ctx[vector].trigger);
> + vdev->ctx[vector].trigger = NULL;
> + }
>   }
>  
>   if (fd < 0)
>   return 0;
> + else if (IS_ERR(trigger))
> + return PTR_ERR(trigger);
>  
> - vdev->ctx[vector].name = kasprintf(GFP_KERNEL, "vfio-msi%s[%d](%s)",
> -msix ? "x" : "", vector,
> -pci_name(pdev));
> - if 

[PATCH v5 3/3] vfio/pci: make use of irq_update_devid and optimize irq ops

2019-08-30 Thread Ben Luo
When userspace (e.g. qemu) triggers a switch between KVM
irqfd and userspace eventfd, only dev_id of irqaction
(i.e. the "trigger" in this patch's context) will be
changed, but a free-then-request-irq action is taken in
current code. And, irq affinity setting in VM will also
trigger a free-then-request-irq action, which actually
changes nothing, but only need to bounce the irqbypass
registraion in case that posted-interrupt is in use.

This patch makes use of irq_update_devid() and optimize
both cases above, which reduces the risk of losing interrupt
and also cuts some overhead.

Signed-off-by: Ben Luo 
---
 drivers/vfio/pci/vfio_pci_intrs.c | 124 ++
 1 file changed, 87 insertions(+), 37 deletions(-)

diff --git a/drivers/vfio/pci/vfio_pci_intrs.c 
b/drivers/vfio/pci/vfio_pci_intrs.c
index 3fa3f72..d3a93d7 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -284,70 +284,120 @@ static int vfio_msi_enable(struct vfio_pci_device *vdev, 
int nvec, bool msix)
 static int vfio_msi_set_vector_signal(struct vfio_pci_device *vdev,
  int vector, int fd, bool msix)
 {
+   struct eventfd_ctx *trigger = NULL;
struct pci_dev *pdev = vdev->pdev;
-   struct eventfd_ctx *trigger;
int irq, ret;
 
if (vector < 0 || vector >= vdev->num_ctx)
return -EINVAL;
 
+   if (fd >= 0) {
+   trigger = eventfd_ctx_fdget(fd);
+   if (IS_ERR(trigger)) {
+   /* oops, going to disable this interrupt */
+   dev_info(>dev,
+"get ctx error on bad fd: %d for vector:%d\n",
+fd, vector);
+   }
+   }
+
irq = pci_irq_vector(pdev, vector);
 
+   /*
+* 'trigger' is NULL or invalid, disable the interrupt
+* 'trigger' is same as before, only bounce the bypass registration
+* 'trigger' is a new invalid one, update it to irqaction and other
+* data structures referencing to the old one; fallback to disable
+* the interrupt on error
+*/
if (vdev->ctx[vector].trigger) {
-   free_irq(irq, vdev->ctx[vector].trigger);
+   /*
+* even if the trigger is unchanged we need to bounce the
+* interrupt bypass connection to allow affinity changes in
+* the guest to be realized.
+*/
irq_bypass_unregister_producer(>ctx[vector].producer);
-   kfree(vdev->ctx[vector].name);
-   eventfd_ctx_put(vdev->ctx[vector].trigger);
-   vdev->ctx[vector].trigger = NULL;
+
+   if (vdev->ctx[vector].trigger == trigger) {
+   /* avoid duplicated referencing to the same trigger */
+   eventfd_ctx_put(trigger);
+
+   } else if (trigger && !IS_ERR(trigger)) {
+   ret = irq_update_devid(irq,
+  vdev->ctx[vector].trigger, 
trigger);
+   if (unlikely(ret)) {
+   dev_info(>dev,
+"update devid of %d (token %p) failed: 
%d\n",
+irq, vdev->ctx[vector].trigger, ret);
+   eventfd_ctx_put(trigger);
+   free_irq(irq, vdev->ctx[vector].trigger);
+   kfree(vdev->ctx[vector].name);
+   eventfd_ctx_put(vdev->ctx[vector].trigger);
+   vdev->ctx[vector].trigger = NULL;
+   return ret;
+   }
+   eventfd_ctx_put(vdev->ctx[vector].trigger);
+   vdev->ctx[vector].producer.token = trigger;
+   vdev->ctx[vector].trigger = trigger;
+
+   } else {
+   free_irq(irq, vdev->ctx[vector].trigger);
+   kfree(vdev->ctx[vector].name);
+   eventfd_ctx_put(vdev->ctx[vector].trigger);
+   vdev->ctx[vector].trigger = NULL;
+   }
}
 
if (fd < 0)
return 0;
+   else if (IS_ERR(trigger))
+   return PTR_ERR(trigger);
 
-   vdev->ctx[vector].name = kasprintf(GFP_KERNEL, "vfio-msi%s[%d](%s)",
-  msix ? "x" : "", vector,
-  pci_name(pdev));
-   if (!vdev->ctx[vector].name)
-   return -ENOMEM;
+   if (!vdev->ctx[vector].trigger) {
+   vdev->ctx[vector].name = kasprintf(GFP_KERNEL,
+  "vfio-msi%s[%d](%s)",
+  msix ? "x" : "", vector,
+