Re: [PATCH v2 1/6] vfio: implement iommu driver capabilities with an enum

2014-10-31 Thread Alex Williamson
On Mon, 2014-10-27 at 19:05 +0100, Antonios Motakis wrote:
> Currently a VFIO driver's IOMMU capabilities are encoded as a series of
> numerical defines. Replace this with an enum for future maintainability.
> 
> Signed-off-by: Antonios Motakis 
> ---
>  include/uapi/linux/vfio.h | 21 ++---
>  1 file changed, 10 insertions(+), 11 deletions(-)
> 
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 6612974..1e39842 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -19,19 +19,18 @@
>  
>  /* Kernel & User level defines for VFIO IOCTLs. */
>  
> -/* Extensions */
> -
> -#define VFIO_TYPE1_IOMMU 1
> -#define VFIO_SPAPR_TCE_IOMMU 2
> -#define VFIO_TYPE1v2_IOMMU   3
>  /*
> - * IOMMU enforces DMA cache coherence (ex. PCIe NoSnoop stripping).  This
> - * capability is subject to change as groups are added or removed.
> + * Capabilities exposed by the VFIO IOMMU driver. Some capabilities are 
> subject
> + * to change as groups are added or removed.
>   */
> -#define VFIO_DMA_CC_IOMMU4
> -
> -/* Check if EEH is supported */
> -#define VFIO_EEH 5
> +enum vfio_iommu_cap {
> + VFIO_TYPE1_IOMMU= 1,
> + VFIO_SPAPR_TCE_IOMMU= 2,
> + VFIO_TYPE1v2_IOMMU  = 3,
> + VFIO_DMA_CC_IOMMU   = 4, /* IOMMU enforces DMA cache coherence
> + (ex. PCIe NoSnoop stripping) */
> + VFIO_EEH= 5, /* Check if EEH is supported */
> +};

Your code base is a little out of date, you're missing:

http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/include/uapi/linux/vfio.h?id=f5c9ecebaf2a2c9381973798e389cc019dd983e0

I think the logic looks ok in the rest, but you'll need to use index 7
and the above commit touched the type1 c file as well so you may need to
adjustment there too.  Thanks,

Alex
>  
>  /*
>   * The IOCTL interface is designed for extensibility by embedding the



___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH v9 10/19] vfio/platform: return IRQ info

2014-10-31 Thread Alex Williamson
On Mon, 2014-10-27 at 19:07 +0100, Antonios Motakis wrote:
> Return information for the interrupts exposed by the device.
> This patch extends VFIO_DEVICE_GET_INFO with the number of IRQs
> and enables VFIO_DEVICE_GET_IRQ_INFO.
> 
> Signed-off-by: Antonios Motakis 
> ---
>  drivers/vfio/platform/Makefile|  2 +-
>  drivers/vfio/platform/vfio_platform_common.c  | 31 --
>  drivers/vfio/platform/vfio_platform_irq.c | 59 
> +++
>  drivers/vfio/platform/vfio_platform_private.h | 10 +
>  4 files changed, 97 insertions(+), 5 deletions(-)
>  create mode 100644 drivers/vfio/platform/vfio_platform_irq.c
> 
> diff --git a/drivers/vfio/platform/Makefile b/drivers/vfio/platform/Makefile
> index 1957170..81de144 100644
> --- a/drivers/vfio/platform/Makefile
> +++ b/drivers/vfio/platform/Makefile
> @@ -1,5 +1,5 @@
>  
> -vfio-platform-y := vfio_platform.o vfio_platform_common.o
> +vfio-platform-y := vfio_platform.o vfio_platform_common.o vfio_platform_irq.o
>  
>  obj-$(CONFIG_VFIO_PLATFORM) += vfio-platform.o
>  
> diff --git a/drivers/vfio/platform/vfio_platform_common.c 
> b/drivers/vfio/platform/vfio_platform_common.c
> index aeaaec9..5f2c205 100644
> --- a/drivers/vfio/platform/vfio_platform_common.c
> +++ b/drivers/vfio/platform/vfio_platform_common.c
> @@ -105,6 +105,7 @@ static void vfio_platform_release(void *device_data)
>  
>   if (!(--vdev->refcnt)) {
>   vfio_platform_regions_cleanup(vdev);
> + vfio_platform_irq_cleanup(vdev);
>   }
>  
>   mutex_unlock(&driver_lock);
> @@ -126,6 +127,10 @@ static int vfio_platform_open(void *device_data)
>   ret = vfio_platform_regions_init(vdev);
>   if (ret)
>   goto err_reg;
> +
> + ret = vfio_platform_irq_init(vdev);
> + if (ret)
> + goto err_irq;
>   }
>  
>   vdev->refcnt++;
> @@ -133,6 +138,8 @@ static int vfio_platform_open(void *device_data)
>   mutex_unlock(&driver_lock);
>   return 0;
>  
> +err_irq:
> + vfio_platform_regions_cleanup(vdev);
>  err_reg:
>   mutex_unlock(&driver_lock);
>   module_put(THIS_MODULE);
> @@ -158,7 +165,7 @@ static long vfio_platform_ioctl(void *device_data,
>  
>   info.flags = vdev->flags;
>   info.num_regions = vdev->num_regions;
> - info.num_irqs = 0;
> + info.num_irqs = vdev->num_irqs;
>  
>   return copy_to_user((void __user *)arg, &info, minsz);
>  
> @@ -183,10 +190,26 @@ static long vfio_platform_ioctl(void *device_data,
>  
>   return copy_to_user((void __user *)arg, &info, minsz);
>  
> - } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO)
> - return -EINVAL;
> + } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
> + struct vfio_irq_info info;
> +
> + minsz = offsetofend(struct vfio_irq_info, count);
> +
> + if (copy_from_user(&info, (void __user *)arg, minsz))
> + return -EFAULT;
> +
> + if (info.argsz < minsz)
> + return -EINVAL;
> +
> + if (info.index >= vdev->num_irqs)
> + return -EINVAL;
> +
> + info.flags = vdev->irqs[info.index].flags;
> + info.count = vdev->irqs[info.index].count;
> +
> + return copy_to_user((void __user *)arg, &info, minsz);
>  
> - else if (cmd == VFIO_DEVICE_SET_IRQS)
> + } else if (cmd == VFIO_DEVICE_SET_IRQS)
>   return -EINVAL;
>  
>   else if (cmd == VFIO_DEVICE_RESET)
> diff --git a/drivers/vfio/platform/vfio_platform_irq.c 
> b/drivers/vfio/platform/vfio_platform_irq.c
> new file mode 100644
> index 000..d99c71c
> --- /dev/null
> +++ b/drivers/vfio/platform/vfio_platform_irq.c
> @@ -0,0 +1,59 @@
> +/*
> + * VFIO platform devices interrupt handling
> + *
> + * Copyright (C) 2013 - Virtual Open Systems
> + * Author: Antonios Motakis 
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License, version 2, as
> + * published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + */
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +#include "vfio_platform_private.h"
> +
> +int vfio_platform_irq_init(struct vfio_platform_device *vdev)
> +{
> + int cnt = 0, i;
> +
> + while (vdev->get_irq(vdev, cnt) > 0)

For platform devices this is just a wrapper around platform_get_irq(),
which sort of seems like it can return 0 as a valid irq.  Am I wrong
about that?  If 0 can be 

Re: [PATCH v9 15/19] vfio: add local lock in virqfd instead of depending on VFIO PCI

2014-10-31 Thread Alex Williamson
On Mon, 2014-10-27 at 19:08 +0100, Antonios Motakis wrote:
> Virqfd just needs to keep accesses to any struct *virqfd safe, but this
> comes into play only when creating or destroying eventfds, so sharing
> the same spinlock with the VFIO bus driver is not necessary.
> 
> Signed-off-by: Antonios Motakis 
> ---
>  drivers/vfio/pci/vfio_pci_intrs.c | 10 +-
>  drivers/vfio/virqfd.c | 24 +---
>  include/linux/vfio.h  |  3 +--
>  3 files changed, 19 insertions(+), 18 deletions(-)
> 
> diff --git a/drivers/vfio/pci/vfio_pci_intrs.c 
> b/drivers/vfio/pci/vfio_pci_intrs.c
> index 3f909bb..e56c814 100644
> --- a/drivers/vfio/pci/vfio_pci_intrs.c
> +++ b/drivers/vfio/pci/vfio_pci_intrs.c
> @@ -226,8 +226,8 @@ static int vfio_intx_set_signal(struct vfio_pci_device 
> *vdev, int fd)
>  static void vfio_intx_disable(struct vfio_pci_device *vdev)
>  {
>   vfio_intx_set_signal(vdev, -1);
> - virqfd_disable(vdev, &vdev->ctx[0].unmask);
> - virqfd_disable(vdev, &vdev->ctx[0].mask);
> + virqfd_disable(&vdev->ctx[0].unmask);
> + virqfd_disable(&vdev->ctx[0].mask);
>   vdev->irq_type = VFIO_PCI_NUM_IRQS;
>   vdev->num_ctx = 0;
>   kfree(vdev->ctx);
> @@ -377,8 +377,8 @@ static void vfio_msi_disable(struct vfio_pci_device 
> *vdev, bool msix)
>   vfio_msi_set_block(vdev, 0, vdev->num_ctx, NULL, msix);
>  
>   for (i = 0; i < vdev->num_ctx; i++) {
> - virqfd_disable(vdev, &vdev->ctx[i].unmask);
> - virqfd_disable(vdev, &vdev->ctx[i].mask);
> + virqfd_disable(&vdev->ctx[i].unmask);
> + virqfd_disable(&vdev->ctx[i].mask);
>   }
>  
>   if (msix) {
> @@ -415,7 +415,7 @@ static int vfio_pci_set_intx_unmask(struct 
> vfio_pci_device *vdev,
>vfio_send_intx_eventfd, NULL,
>&vdev->ctx[0].unmask, fd);
>  
> - virqfd_disable(vdev, &vdev->ctx[0].unmask);
> + virqfd_disable(&vdev->ctx[0].unmask);
>   }
>  
>   return 0;
> diff --git a/drivers/vfio/virqfd.c b/drivers/vfio/virqfd.c
> index 243eb61..27fa2f0 100644
> --- a/drivers/vfio/virqfd.c
> +++ b/drivers/vfio/virqfd.c
> @@ -17,6 +17,7 @@
>  #include "pci/vfio_pci_private.h"
>  
>  static struct workqueue_struct *vfio_irqfd_cleanup_wq;
> +static spinlock_t lock;

Define this as:

DEFINE_SPINLOCK(lock);

and we can avoid needing the init.
 
>  int __init vfio_pci_virqfd_init(void)
>  {
> @@ -25,6 +26,8 @@ int __init vfio_pci_virqfd_init(void)
>   if (!vfio_irqfd_cleanup_wq)
>   return -ENOMEM;
>  
> + spin_lock_init(&lock);
> +
>   return 0;
>  }
>  
> @@ -53,21 +56,21 @@ static int virqfd_wakeup(wait_queue_t *wait, unsigned 
> mode, int sync, void *key)
>  
>   if (flags & POLLHUP) {
>   unsigned long flags;
> - spin_lock_irqsave(&virqfd->vdev->irqlock, flags);
> + spin_lock_irqsave(&lock, flags);
>  
>   /*
>* The eventfd is closing, if the virqfd has not yet been
>* queued for release, as determined by testing whether the
> -  * vdev pointer to it is still valid, queue it now.  As
> +  * virqfd pointer to it is still valid, queue it now.  As
>* with kvm irqfds, we know we won't race against the virqfd
> -  * going away because we hold wqh->lock to get here.
> +  * going away because we hold the lock to get here.
>*/
>   if (*(virqfd->pvirqfd) == virqfd) {
>   *(virqfd->pvirqfd) = NULL;
>   virqfd_deactivate(virqfd);
>   }
>  
> - spin_unlock_irqrestore(&virqfd->vdev->irqlock, flags);
> + spin_unlock_irqrestore(&lock, flags);
>   }
>  
>   return 0;
> @@ -143,16 +146,16 @@ int virqfd_enable(struct vfio_pci_device *vdev,
>* we update the pointer to the virqfd under lock to avoid
>* pushing multiple jobs to release the same virqfd.
>*/
> - spin_lock_irq(&vdev->irqlock);
> + spin_lock_irq(&lock);
>  
>   if (*pvirqfd) {
> - spin_unlock_irq(&vdev->irqlock);
> + spin_unlock_irq(&lock);
>   ret = -EBUSY;
>   goto err_busy;
>   }
>   *pvirqfd = virqfd;
>  
> - spin_unlock_irq(&vdev->irqlock);
> + spin_unlock_irq(&lock);
>  
>   /*
>* Install our own custom wake-up handling so we are notified via
> @@ -190,19 +193,18 @@ err_fd:
>  }
>  EXPORT_SYMBOL_GPL(virqfd_enable);
>  
> -void virqfd_disable(struct vfio_pci_device *vdev,
> -struct virqfd **pvirqfd)
> +void virqfd_disable(struct virqfd **pvirqfd)
>  {
>   unsigned long flags;
>  
> - spin_lock_irqsave(&vdev->irqlock, flags);
> + spin_lock_irqsave(&lock, flags);
>  
>   if (*pvirqfd) {
>   virqfd_deactivate(*pvirqfd);
>   *pvirqf

Re: [PATCH v9 12/19] vfio/platform: trigger an interrupt via eventfd

2014-10-31 Thread Alex Williamson
On Mon, 2014-10-27 at 19:07 +0100, Antonios Motakis wrote:
> This patch allows to set an eventfd for a patform device's interrupt,
> and also to trigger the interrupt eventfd from userspace for testing.
> Level sensitive interrupts are marked as maskable and are handled in
> a later patch. Edge triggered interrupts are not advertised as maskable
> and are implemented here using a simple and efficient IRQ handler.
> 
> Signed-off-by: Antonios Motakis 
> ---
>  drivers/vfio/platform/vfio_platform_irq.c | 93 
> ++-
>  drivers/vfio/platform/vfio_platform_private.h |  2 +
>  2 files changed, 93 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/vfio/platform/vfio_platform_irq.c 
> b/drivers/vfio/platform/vfio_platform_irq.c
> index 007b386..2ac8ed7 100644
> --- a/drivers/vfio/platform/vfio_platform_irq.c
> +++ b/drivers/vfio/platform/vfio_platform_irq.c
> @@ -45,11 +45,91 @@ static int vfio_platform_set_irq_unmask(struct 
> vfio_platform_device *vdev,
>   return -EINVAL;
>  }
>  
> +static irqreturn_t vfio_irq_handler(int irq, void *dev_id)
> +{
> + struct vfio_platform_irq *irq_ctx = dev_id;
> +
> + eventfd_signal(irq_ctx->trigger, 1);
> +
> + return IRQ_HANDLED;
> +}
> +
> +static int vfio_set_trigger(struct vfio_platform_device *vdev, int index,
> + int fd, irq_handler_t handler)
> +{
> + struct vfio_platform_irq *irq = &vdev->irqs[index];
> + struct eventfd_ctx *trigger;
> + int ret;
> +
> + if (irq->trigger) {
> + free_irq(irq->hwirq, irq);
> + kfree(irq->name);
> + eventfd_ctx_put(irq->trigger);
> + irq->trigger = NULL;
> + }
> +
> + if (fd < 0) /* Disable only */
> + return 0;
> +
> + irq->name = kasprintf(GFP_KERNEL, "vfio-irq[%d](%s)",
> + irq->hwirq, vdev->name);
> + if (!irq->name)
> + return -ENOMEM;
> +
> + trigger = eventfd_ctx_fdget(fd);
> + if (IS_ERR(trigger)) {
> + kfree(irq->name);
> + return PTR_ERR(trigger);
> + }
> +
> + irq->trigger = trigger;
> +
> + ret = request_irq(irq->hwirq, handler, 0, irq->name, irq);
> + if (ret) {
> + kfree(irq->name);
> + eventfd_ctx_put(trigger);
> + irq->trigger = NULL;
> + return ret;
> + }
> +
> + return 0;
> +}
> +
>  static int vfio_platform_set_irq_trigger(struct vfio_platform_device *vdev,
>unsigned index, unsigned start,
>unsigned count, uint32_t flags, void *data)
>  {
> - return -EINVAL;
> + struct vfio_platform_irq *irq = &vdev->irqs[index];
> + irq_handler_t handler;
> +
> + if (vdev->irqs[index].flags & VFIO_IRQ_INFO_MASKABLE)
> + return -EINVAL; /* not implemented */
> + else
> + handler = vfio_irq_handler;
> +
> + if (!count && (flags & VFIO_IRQ_SET_DATA_NONE))
> + return vfio_set_trigger(vdev, index, -1, handler);
> +
> + if (start != 0 || count != 1)
> + return -EINVAL;
> +
> + if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
> + int32_t fd = *(int32_t *)data;
> +
> + return vfio_set_trigger(vdev, index, fd, handler);
> + }
> +
> + if (flags & VFIO_IRQ_SET_DATA_NONE) {
> + handler(irq->hwirq, irq);
> +
> + } else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
> + uint8_t trigger = *(uint8_t *)data;
> +
> + if (trigger)
> + handler(irq->hwirq, irq);
> + }
> +
> + return 0;
>  }
>  
>  int vfio_platform_set_irqs_ioctl(struct vfio_platform_device *vdev,
> @@ -95,7 +175,11 @@ int vfio_platform_irq_init(struct vfio_platform_device 
> *vdev)
>   if (hwirq < 0)
>   goto err;
>  
> - vdev->irqs[i].flags = 0;
> + vdev->irqs[i].flags = VFIO_IRQ_INFO_EVENTFD;
> +
> + if (irq_get_trigger_type(hwirq) & IRQ_TYPE_LEVEL_MASK)
> + vdev->irqs[i].flags |= VFIO_IRQ_INFO_MASKABLE;

This is a bit confusing because edge interrupts can support masking, but
they don't require it.  Level interrupts really must support masking
because we need to mask them on the host and therefore the user needs to
be able to unmask them (ignoring the irq prioritization thing you guys
can do on arm).  So this works, but I would really have expected
VFIO_IRQ_INFO_AUTOMASKED here and in the above function.

> +
>   vdev->irqs[i].count = 1;
>   vdev->irqs[i].hwirq = hwirq;
>   }
> @@ -110,6 +194,11 @@ err:
>  
>  void vfio_platform_irq_cleanup(struct vfio_platform_device *vdev)
>  {
> + int i;
> +
> + for (i = 0; i < vdev->num_irqs; i++)
> + vfio_set_trigger(vdev, i, -1, NULL);
> +
>   vdev->num_irqs = 0;
>   kfree(vdev->irqs);
>  }
> diff --git a/drivers/vfio/platform/vfio_platform_private.h 
> b/d

Re: [PATCH v9 13/19] vfio/platform: support for level sensitive interrupts

2014-10-31 Thread Alex Williamson
On Mon, 2014-10-27 at 19:07 +0100, Antonios Motakis wrote:
> Level sensitive interrupts are exposed as maskable and automasked
> interrupts and are masked and disabled automatically when they fire.
> 
> Signed-off-by: Antonios Motakis 
> ---
>  drivers/vfio/platform/vfio_platform_irq.c | 102 
> +-
>  drivers/vfio/platform/vfio_platform_private.h |   2 +
>  2 files changed, 100 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/vfio/platform/vfio_platform_irq.c 
> b/drivers/vfio/platform/vfio_platform_irq.c
> index 2ac8ed7..563abf6 100644
> --- a/drivers/vfio/platform/vfio_platform_irq.c
> +++ b/drivers/vfio/platform/vfio_platform_irq.c
> @@ -31,18 +31,108 @@
>  
>  #include "vfio_platform_private.h"
>  
> +static void vfio_platform_mask(struct vfio_platform_irq *irq_ctx)
> +{
> + unsigned long flags;
> +
> + spin_lock_irqsave(&irq_ctx->lock, flags);
> +
> + if (!irq_ctx->masked) {
> + disable_irq(irq_ctx->hwirq);
> + irq_ctx->masked = true;
> + }
> +
> + spin_unlock_irqrestore(&irq_ctx->lock, flags);
> +}
> +
>  static int vfio_platform_set_irq_mask(struct vfio_platform_device *vdev,
>   unsigned index, unsigned start,
>   unsigned count, uint32_t flags, void *data)
>  {
> - return -EINVAL;
> + if (start != 0 || count != 1)
> + return -EINVAL;
> +
> + if (!(vdev->irqs[index].flags & VFIO_IRQ_INFO_MASKABLE))
> + return -EINVAL;
> +
> + if (flags & VFIO_IRQ_SET_DATA_EVENTFD)
> + return -EINVAL; /* not implemented yet */
> +
> + if (flags & VFIO_IRQ_SET_DATA_NONE) {
> + vfio_platform_mask(&vdev->irqs[index]);
> +
> + } else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
> + uint8_t mask = *(uint8_t *)data;
> +
> + if (mask)
> + vfio_platform_mask(&vdev->irqs[index]);
> + }
> +
> + return 0;
> +}
> +
> +static void vfio_platform_unmask(struct vfio_platform_irq *irq_ctx)
> +{
> + unsigned long flags;
> +
> + spin_lock_irqsave(&irq_ctx->lock, flags);
> +
> + if (irq_ctx->masked) {
> + enable_irq(irq_ctx->hwirq);
> + irq_ctx->masked = false;
> + }
> +
> + spin_unlock_irqrestore(&irq_ctx->lock, flags);
>  }
>  
>  static int vfio_platform_set_irq_unmask(struct vfio_platform_device *vdev,
>   unsigned index, unsigned start,
>   unsigned count, uint32_t flags, void *data)
>  {
> - return -EINVAL;
> + if (start != 0 || count != 1)
> + return -EINVAL;
> +
> + if (!(vdev->irqs[index].flags & VFIO_IRQ_INFO_MASKABLE))
> + return -EINVAL;
> +
> + if (flags & VFIO_IRQ_SET_DATA_EVENTFD)
> + return -EINVAL; /* not implemented yet */
> +
> + if (flags & VFIO_IRQ_SET_DATA_NONE) {
> + vfio_platform_unmask(&vdev->irqs[index]);
> +
> + } else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
> + uint8_t unmask = *(uint8_t *)data;
> +
> + if (unmask)
> + vfio_platform_unmask(&vdev->irqs[index]);
> + }
> +
> + return 0;
> +}
> +
> +static irqreturn_t vfio_maskable_irq_handler(int irq, void *dev_id)
> +{
> + struct vfio_platform_irq *irq_ctx = dev_id;
> + unsigned long flags;
> + int ret = IRQ_NONE;
> +
> + spin_lock_irqsave(&irq_ctx->lock, flags);
> +
> + if (!irq_ctx->masked) {
> + ret = IRQ_HANDLED;
> +
> + /* automask maskable interrupts */
> + disable_irq_nosync(irq_ctx->hwirq);
> + irq_ctx->masked = true;
> + }
> +
> + spin_unlock_irqrestore(&irq_ctx->lock, flags);
> +
> + if (ret == IRQ_HANDLED)
> + eventfd_signal(irq_ctx->trigger, 1);
> +
> + return ret;

This is not just a maskable irq handler, but specifically a level (aka
automasked) handler.  So this should only be used for AUTOMASKED irqs.

>  }
>  
>  static irqreturn_t vfio_irq_handler(int irq, void *dev_id)
> @@ -103,7 +193,7 @@ static int vfio_platform_set_irq_trigger(struct 
> vfio_platform_device *vdev,
>   irq_handler_t handler;
>  
>   if (vdev->irqs[index].flags & VFIO_IRQ_INFO_MASKABLE)
> - return -EINVAL; /* not implemented */
> + handler = vfio_maskable_irq_handler;

As noted in the previous patch, this should be a test for AUTOMASKED,
not just MASKABLE.

>   else
>   handler = vfio_irq_handler;
>  
> @@ -175,13 +265,17 @@ int vfio_platform_irq_init(struct vfio_platform_device 
> *vdev)
>   if (hwirq < 0)
>   goto err;
>  
> + spin_lock_init(&vdev->irqs[i].lock);
> +
>   vdev->irqs[i].flags = VFIO_IRQ_INFO_EVENTFD;
>  
>   if (irq_get_trigger_type(hwirq) & IRQ_TYPE_LEVEL_MASK)
> - vdev->irqs[i].flags |= VFIO_IRQ_INFO_MASKABLE;
> + vdev->irqs[i]

Re: [PATCH v9 07/19] vfio/platform: return info for device memory mapped IO regions

2014-10-31 Thread Alex Williamson
On Mon, 2014-10-27 at 19:07 +0100, Antonios Motakis wrote:
> This patch enables the IOCTLs VFIO_DEVICE_GET_REGION_INFO ioctl call,
> which allows the user to learn about the available MMIO resources of
> a device.
> 
> Signed-off-by: Antonios Motakis 
> ---
>  drivers/vfio/platform/vfio_platform_common.c  | 110 
> +-
>  drivers/vfio/platform/vfio_platform_private.h |  22 ++
>  2 files changed, 128 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/vfio/platform/vfio_platform_common.c 
> b/drivers/vfio/platform/vfio_platform_common.c
> index cb20526..82de752 100644
> --- a/drivers/vfio/platform/vfio_platform_common.c
> +++ b/drivers/vfio/platform/vfio_platform_common.c
> @@ -27,17 +27,97 @@
>  
>  #include "vfio_platform_private.h"
>  
> +static DEFINE_MUTEX(driver_lock);
> +
> +static int vfio_platform_regions_init(struct vfio_platform_device *vdev)
> +{
> + int cnt = 0, i;
> +
> + while (vdev->get_resource(vdev, cnt))
> + cnt++;
> +
> + vdev->regions = kcalloc(cnt, sizeof(struct vfio_platform_region),
> + GFP_KERNEL);
> + if (!vdev->regions)
> + return -ENOMEM;
> +
> + for (i = 0; i < cnt;  i++) {
> + struct resource *res =
> + vdev->get_resource(vdev, i);
> +
> + if (!res)
> + goto err;
> +
> + vdev->regions[i].addr = res->start;
> + vdev->regions[i].size = resource_size(res);
> + vdev->regions[i].flags = 0;
> +
> + switch (resource_type(res)) {
> + case IORESOURCE_MEM:
> + vdev->regions[i].type = VFIO_PLATFORM_REGION_TYPE_MMIO;
> + break;
> + case IORESOURCE_IO:
> + vdev->regions[i].type = VFIO_PLATFORM_REGION_TYPE_PIO;
> + break;
> + default:
> + goto err;
> + }
> + }
> +
> + vdev->num_regions = cnt;
> +
> + return 0;
> +err:
> + kfree(vdev->regions);
> + return -EINVAL;
> +}
> +
> +static void vfio_platform_regions_cleanup(struct vfio_platform_device *vdev)
> +{
> + vdev->num_regions = 0;
> + kfree(vdev->regions);
> +}
> +
>  static void vfio_platform_release(void *device_data)
>  {
> + struct vfio_platform_device *vdev = device_data;
> +
> + mutex_lock(&driver_lock);
> +
> + if (!(--vdev->refcnt)) {
> + vfio_platform_regions_cleanup(vdev);
> + }
> +
> + mutex_unlock(&driver_lock);
> +
>   module_put(THIS_MODULE);
>  }
>  
>  static int vfio_platform_open(void *device_data)
>  {
> + struct vfio_platform_device *vdev = device_data;
> + int ret;
> +
>   if (!try_module_get(THIS_MODULE))
>   return -ENODEV;
>  
> + mutex_lock(&driver_lock);
> +
> + if (!vdev->refcnt) {
> + ret = vfio_platform_regions_init(vdev);
> + if (ret)
> + goto err_reg;
> + }
> +
> + vdev->refcnt++;
> +
> + mutex_unlock(&driver_lock);
>   return 0;
> +
> +err_reg:
> + mutex_unlock(&driver_lock);
> + module_put(THIS_MODULE);
> + return ret;
>  }
>  
>  static long vfio_platform_ioctl(void *device_data,
> @@ -58,15 +138,33 @@ static long vfio_platform_ioctl(void *device_data,
>   return -EINVAL;
>  
>   info.flags = vdev->flags;
> - info.num_regions = 0;
> + info.num_regions = vdev->num_regions;
>   info.num_irqs = 0;
>  
>   return copy_to_user((void __user *)arg, &info, minsz);
>  
> - } else if (cmd == VFIO_DEVICE_GET_REGION_INFO)
> - return -EINVAL;
> + } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
> + struct vfio_region_info info;
> +
> + minsz = offsetofend(struct vfio_region_info, offset);
> +
> + if (copy_from_user(&info, (void __user *)arg, minsz))
> + return -EFAULT;
>  
> - else if (cmd == VFIO_DEVICE_GET_IRQ_INFO)
> + if (info.argsz < minsz)
> + return -EINVAL;
> +
> + if (info.index >= vdev->num_regions)
> + return -EINVAL;
> +
> + /* map offset to the physical address  */
> + info.offset = VFIO_PLATFORM_INDEX_TO_OFFSET(info.index);
> + info.size = vdev->regions[info.index].size;
> + info.flags = vdev->regions[info.index].flags;
> +
> + return copy_to_user((void __user *)arg, &info, minsz);
> +
> + } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO)
>   return -EINVAL;
>  
>   else if (cmd == VFIO_DEVICE_SET_IRQS)
> @@ -134,10 +232,14 @@ struct vfio_platform_device 
> *vfio_platform_remove_common(struct device *dev)
>  {
>   struct vfio_platform_device *vdev;
>  
> + mutex_lock(&driver_lock);
> +
>   vdev = vfio_del_group_dev(dev);
>   if (vdev)
>   iommu_group_put(dev->iommu_g

Re: [PATCH v9 04/19] vfio: amba: VFIO support for AMBA devices

2014-10-31 Thread Alex Williamson
On Mon, 2014-10-27 at 19:07 +0100, Antonios Motakis wrote:
> Add support for discovering AMBA devices with VFIO and handle them
> similarly to Linux platform devices.
> 
> Signed-off-by: Antonios Motakis 
> ---
>  drivers/vfio/platform/vfio_amba.c | 116 
> ++
>  include/uapi/linux/vfio.h |   1 +
>  2 files changed, 117 insertions(+)
>  create mode 100644 drivers/vfio/platform/vfio_amba.c
> 
> diff --git a/drivers/vfio/platform/vfio_amba.c 
> b/drivers/vfio/platform/vfio_amba.c
> new file mode 100644
> index 000..cf61324
> --- /dev/null
> +++ b/drivers/vfio/platform/vfio_amba.c
> @@ -0,0 +1,116 @@
> +/*
> + * Copyright (C) 2013 - Virtual Open Systems
> + * Author: Antonios Motakis 
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License, version 2, as
> + * published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + */
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +#include "vfio_platform_private.h"
> +
> +#define DRIVER_VERSION  "0.9"
> +#define DRIVER_AUTHOR   "Antonios Motakis "
> +#define DRIVER_DESC "VFIO for AMBA devices - User Level meta-driver"
> +
> +/* probing devices from the AMBA bus */
> +
> +static struct resource *get_amba_resource(struct vfio_platform_device *vdev,
> + int i)
> +{
> + struct amba_device *adev = (struct amba_device *) vdev->opaque;
> +
> + if (i == 0)
> + return &adev->res;
> +
> + return NULL;
> +}
> +
> +static int get_amba_irq(struct vfio_platform_device *vdev, int i)
> +{
> + struct amba_device *adev = (struct amba_device *) vdev->opaque;
> +
> + if (i < AMBA_NR_IRQS)
> + return adev->irq[i];
> +
> + return 0;
> +}
> +
> +static int vfio_amba_probe(struct amba_device *adev, const struct amba_id 
> *id)
> +{
> +
> + struct vfio_platform_device *vdev;
> + int ret;
> +
> + vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
> + if (!vdev)
> + return -ENOMEM;
> +
> + vdev->opaque = (void *) adev;
> + vdev->name = "vfio-amba-dev";

You need to actually allocate some memory here with something like
kasprintf.  Don't forget to free it on the error and remove path.

> + vdev->flags = VFIO_DEVICE_FLAGS_AMBA;
> + vdev->get_resource = get_amba_resource;
> + vdev->get_irq = get_amba_irq;
> +
> + ret = vfio_platform_probe_common(vdev, &adev->dev);
> + if (ret)
> + kfree(vdev);
> +
> + return ret;
> +}
> +
> +static int vfio_amba_remove(struct amba_device *adev)
> +{
> + struct vfio_platform_device *vdev;
> +
> + vdev = vfio_platform_remove_common(&adev->dev);
> + if(vdev) {
> + kfree(vdev);
> + return 0;
> + }
> +
> + return -EINVAL;
> +}
> +
> +static struct amba_id pl330_ids[] = {
> + { 0, 0 },
> +};
> +
> +MODULE_DEVICE_TABLE(amba, pl330_ids);
> +
> +static struct amba_driver vfio_amba_driver = {
> + .probe = vfio_amba_probe,
> + .remove = vfio_amba_remove,
> + .id_table = pl330_ids,
> + .drv = {
> + .name = "vfio-amba",
> + .owner = THIS_MODULE,
> + },
> +};
> +
> +module_amba_driver(vfio_amba_driver);
> +
> +MODULE_VERSION(DRIVER_VERSION);
> +MODULE_LICENSE("GPL v2");
> +MODULE_AUTHOR(DRIVER_AUTHOR);
> +MODULE_DESCRIPTION(DRIVER_DESC);
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index 9db1056..92469e0 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -158,6 +158,7 @@ struct vfio_device_info {
>  #define VFIO_DEVICE_FLAGS_RESET  (1 << 0)/* Device supports 
> reset */
>  #define VFIO_DEVICE_FLAGS_PCI(1 << 1)/* vfio-pci device */
>  #define VFIO_DEVICE_FLAGS_PLATFORM (1 << 2)  /* vfio-platform device */
> +#define VFIO_DEVICE_FLAGS_AMBA  (1 << 3) /* vfio-amba device */
>   __u32   num_regions;/* Max region index + 1 */
>   __u32   num_irqs;   /* Max IRQ index + 1 */
>  };



___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH v9 17/19] vfio: virqfd: add vfio_ prefix to virqfd_enable and virqfd_disable

2014-10-31 Thread Antonios Motakis
On Mon, Oct 27, 2014 at 9:12 PM, Bjorn Helgaas  wrote:
> On Mon, Oct 27, 2014 at 12:08 PM, Antonios Motakis
>  wrote:
>> The virqfd_enable and virqfd_disable functions are now global. Add the
>> vfio_ prefix to those functions.
>
> Wouldn't it be better to change the name *before* making them global?

I will add a separate patch for renaming & exporting at the same time.

>
> Bjorn
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH v9 14/19] vfio: move eventfd support code for VFIO_PCI to a separate file

2014-10-31 Thread Antonios Motakis
On Mon, Oct 27, 2014 at 8:16 PM, Bjorn Helgaas  wrote:
> Hi Antonios,
>
> On Mon, Oct 27, 2014 at 12:07 PM, Antonios Motakis
>  wrote:
>> The virqfd functionality that is used by VFIO_PCI to implement interrupt
>> masking and unmasking via an eventfd, is generic enough and can be reused
>> by another driver. Move it to a separate file in order to allow the code
>> to be shared.
>>
>> Also properly export virqfd_enable and virqfd_disable in the process.
>
> Alex will handle this, not me, but my personal preference is to avoid
> doing things "in the process" because the small changes get lost in
> the big patch.
>
> I'd rather see a strict move that changes no code at all (except
> things like necessary Makefile changes), followed by a smaller patch
> that does the additional stuff.
>
> Does "properly export" mean that those functions were previously
> *improperly* exported and the way they used to be exported caused a
> problem?  Or does it just mean "export"?

It just means "export", but you are right there is no reason why to
export it in this patch. I will move it to one of the next patches.

>
> Bjorn
>
>> Signed-off-by: Antonios Motakis 
>> ---
>>  drivers/vfio/Makefile   |   4 +-
>>  drivers/vfio/pci/vfio_pci_intrs.c   | 213 
>> ---
>>  drivers/vfio/pci/vfio_pci_private.h |   3 -
>>  drivers/vfio/virqfd.c   | 214 
>> 
>>  include/linux/vfio.h|  28 +
>>  5 files changed, 245 insertions(+), 217 deletions(-)
>>  create mode 100644 drivers/vfio/virqfd.c
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [RFC PATCH v2 0/4] vfio: platform: return device tree info for a platform device node

2014-10-31 Thread Antonios Motakis
On Wed, Oct 22, 2014 at 11:56 PM, Alex Williamson
 wrote:
> On Thu, 2014-10-16 at 17:54 +0200, Antonios Motakis wrote:
>> This RFC's intention is to show what an interface to access device node
>> properties for the VFIO platform driver can look like.
>>
>> If a device tree node corresponding to a platform device bound by VFIO 
>> PLATFORM
>> or VFIO AMBA is available, this patch series will allow the user to query the
>> properties associated with this device node. This can be useful for userspace
>> drivers to automatically query parameters related to the device.
>>
>> An API to return data from a device's device tree has been proposed before on
>> these lists. The API proposed here is slightly different.
>>
>> Properties to parse from the device tree are not indexed by a numerical id.
>> The host system doesn't guarantee any specific ordering for the available
>> properties, or that those will remain the same; while this does not happen in
>> practice, there is nothing from the host changing the device nodes during
>> operation. For this reason in this RFC properties are accessed by property 
>> name.
>>
>> The type of the property accessed must also be known by the user. Properties
>> types implemented in this RFC:
>> - VFIO_DEVTREE_TYPE_STRINGS (strings sepparated by the null character)
>> - VFIO_DEVTREE_TYPE_U32
>> - VFIO_DEVTREE_TYPE_U16
>> - VFIO_DEVTREE_TYPE_U8
>>
>> These can all be access by the ioctl VFIO_DEVICE_GET_DEVTREE_INFO. A new 
>> ioctl
>> was preferred instead of shoehorning the functionality in 
>> VFIO_DEVICE_GET_INFO.
>> The structure exchanged with userspace looks like this:
>>
>> /**
>>  * VFIO_DEVICE_GET_DEVTREE_INFO - _IOR(VFIO_TYPE, VFIO_BASE + 16,
>>  *struct vfio_devtree_info)
>>  *
>>  * Retrieve information from the device's device tree, if available.
>>  * Caller will initialize data[] with a single string with the requested
>>  * devicetree property name, and type depending on whether a array of strings
>>  * or an array of u32 values is expected. On success, data[] will be extended
>>  * with the requested information, either as an array of u32, or with a list
>>  * of strings sepparated by the NULL terminating character.
>>  * Return: 0 on success, -errno on failure.
>>  */
>> struct vfio_devtree_info {
>>   __u32   argsz;
>>   __u32   type;
>> #define VFIO_DEVTREE_PROP_LIST0
>> #define VFIO_DEVTREE_TYPE_STRINGS 1
>> #define VFIO_DEVTREE_TYPE_U8  2
>> #define VFIO_DEVTREE_TYPE_U16 3
>> #define VFIO_DEVTREE_TYPE_U32 4
>>   __u32   length;
>>   __u8data[];
>> };
>> #define VFIO_DEVICE_GET_DEVTREE_INFO  _IO(VFIO_TYPE, VFIO_BASE + 17)
>
> Seems pretty reasonable.  We should add a flags variable to
> vfio_devtree_info as well.  General question, in order to get a VFIO

Hm, any idea what kind of flags we might want to add in the future?
Generally what we have is key-value pairs of the above types.

> device file descriptor, the user already needs to know the name of the
> device.  Presumably they'll get this from sysfs or otherwise just know
> it in advance.  Is that redundant information to what we're providing
> here?  What other information already exists in sysfs for device tree
> devices that we're duplicating here for the sake of creating a VFIO
> mechanism?  If it's not already exposed in sysfs, should it be?  Thanks,

Yeah, it is kinda redundant information, but maybe it is not necessary
to explicitly mask the device name just because it is also available
otherwise. The full_name though is not available, which includes the
path on the device tree.

The information for a device is used by a driver for that device. For
example for the PL330 DMA controller, the driver finds out from the
device tree the number of DMA channels available and other
configuration time parameters. This is information that might not be
otherwise available from the device itself, so it should be exposed to
the driver.

This information is also available in /proc/device-tree, but a VFIO
driver shouldn't have to go through that in my view.

>
> Alex
>
>> The length of the property will be reported in length, so the user can 
>> reallocate
>> the structure if the data does not fit the first time the call is used.
>>
>> Specifically for QEMU, reading the "compatible" property of the device tree
>> node could be of use to find out what device is being assigned to the guest 
>> and
>> handle appropriately a wider range of devices in the future, and to generate 
>> an
>> appropriate device tree for the guest.
>>
>> TODOs:
>>  - 64 bit values support
>>  - We can consider to rebase this on the patch series by Rafael J. Wysocki:
>>"[PATCH v4 00/13] Add ACPI _DSD and unified device properties support"
>>This would make 64 bit support more straightforward as it already includes
>>the APIs we need for 64 bit OF values.
>>
>> Changes since v1:
>>  - Updated for VFIO platform patch series v

Re: [Patch Part2 v3 15/24] x86, MSI: Use hierarchy irqdomain to manage MSI interrupts

2014-10-31 Thread Thomas Gleixner
On Fri, 31 Oct 2014, Jiang Liu wrote:
> On 2014/10/29 5:37, Thomas Gleixner wrote:
> > Then it calls down the domain allocation chain. x86_msi_domain would
> > simply hand down to the parent domain. That would either be the remap
> > domain or the vector domain.
> The issue here is that, the hierarchy irqdomain maintains a tree
> topology and every irqdomain only supports one parent.
> 
> In case of irq remapping, we need to build one irqdomain for each IOMMU
> unit to support hotplug and simplify the implementation. So we need to
> build one MSI irqdomain for each IOMMU unit too instead of using a
> common MSI irqdomain.

That makes indeed a difference.
 
> Current design is that, a common MSI irqdomain to support all MSI when
> irq remapping is disabled, and one MSI irqdomain for each IOMMU unit
> when irq remapping is enabled.

> So we have the code below to choose the correct irqdomain for MSI.
> domain = irq_remapping_get_irq_domain(&info);
> if (domain == NULL)
> domain = msi_default_domain;
> if (domain == NULL)
> return -ENOSYS;

Right. I guess we need to keep it that way for now.

But looking at the code makes me wonder why we actually need to call
into the remap code and do a list walk to figure the domain out. The
association of device and iommu should be known at startup/hotplug
time already. That's out of the scope of this work, but should be
fixed eventually.

Thanks,

tglx




___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH v4 05/12] memory: Add NVIDIA Tegra memory controller support

2014-10-31 Thread Thierry Reding
On Thu, Oct 30, 2014 at 04:08:41PM +0100, Thierry Reding wrote:
> On Wed, Oct 15, 2014 at 03:09:30PM -0700, Olof Johansson wrote:
> > Hi,
> > 
> > Oh, a few more comments:
> > 
> > On Mon, Oct 13, 2014 at 3:33 AM, Thierry Reding
> >  wrote:
> > 
> > > diff --git a/drivers/memory/Makefile b/drivers/memory/Makefile
> > > index c32d31981be3..1c932e7e7b8d 100644
> > > --- a/drivers/memory/Makefile
> > > +++ b/drivers/memory/Makefile
> > > @@ -12,4 +12,5 @@ obj-$(CONFIG_FSL_CORENET_CF)  += fsl-corenet-cf.o
> > >  obj-$(CONFIG_FSL_IFC)  += fsl_ifc.o
> > >  obj-$(CONFIG_MVEBU_DEVBUS) += mvebu-devbus.o
> > >  obj-$(CONFIG_TEGRA20_MC)   += tegra20-mc.o
> > > -obj-$(CONFIG_TEGRA30_MC)   += tegra30-mc.o
> > > +
> > > +obj-$(CONFIG_ARCH_TEGRA)   += tegra/
> > > diff --git a/drivers/memory/tegra/Makefile b/drivers/memory/tegra/Makefile
> > > new file mode 100644
> > > index ..51b9e8fcde1b
> > > --- /dev/null
> > > +++ b/drivers/memory/tegra/Makefile
> > > @@ -0,0 +1,5 @@
> > > +obj-y   = tegra-mc.o
> > > +obj-$(CONFIG_ARCH_TEGRA_3x_SOC)+= tegra30-mc.o
> > > +obj-$(CONFIG_ARCH_TEGRA_114_SOC)   += tegra114-mc.o
> > > +obj-$(CONFIG_ARCH_TEGRA_124_SOC)   += tegra124-mc.o
> > > +obj-$(CONFIG_ARCH_TEGRA_132_SOC)   += tegra124-mc.o
> > 
> > You'll need a Kconfig and not just a makefile -- there are definitely
> > dependencies on this driver (IOMMU in particular).
> 
> This is handled within the tegra-mc driver by only setting up the IOMMU
> when TEGRA_IOMMU_SMMU is enabled. That config option remains in place.
> 
> > Also, the problem of having a global enable bit that is only under
> > control of TrustZone FW is a big problem -- if the bit is not set, the
> > driver will not work (and the machine will crash).
> > 
> > I think you'll need to come up with a way to detect that in the
> > driver. I don't have a good idea of how it can be done though.
> 
> I don't think I ever got back to you on this. We discussed this
> internally and it seems like there's no way to detect this properly, so
> the best suggestion so far was to make it a requirement on the secure
> firmware to enable IOMMU or not. Since there's no way for the kernel to
> detect whether IOMMU was enabled or not, I think the firmware would
> equally have to adjust the SMMU's device tree node's status property
> appropriately.

The other option would be for the firmware not to touch the SMMU device
tree node and the kernel simply assuming that if it's running in non-
secure mode then there must be secure firmware and it has enabled the
SMMU. Enabling the SMMU would become part of the contract between
firmware and kernel, much like locking the VPR is required to get the
GPU to work.

Those are really the only two choices we have.

Thierry


pgpOp1ho8erQa.pgp
Description: PGP signature
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu

Re: [Patch Part2 v3 15/24] x86, MSI: Use hierarchy irqdomain to manage MSI interrupts

2014-10-31 Thread Jiang Liu


On 2014/10/29 5:37, Thomas Gleixner wrote:
> On Tue, 28 Oct 2014, Jiang Liu wrote:
>> @@ -166,25 +264,59 @@ int setup_msi_irq(struct pci_dev *dev, struct msi_desc 
>> *msidesc,
>>  
>>  int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
>>  {
>> -struct msi_desc *msidesc;
>> -int irq, ret;
>> +int irq, cnt, nvec_pow2;
>> +struct irq_domain *domain;
>> +struct msi_desc *msidesc, *iter;
>> +struct irq_alloc_info info;
>> +int node = dev_to_node(&dev->dev);
>>  
>> -/* Multiple MSI vectors only supported with interrupt remapping */
>> -if (type == PCI_CAP_ID_MSI && nvec > 1)
>> -return 1;
>> +if (disable_apic)
>> +return -ENOSYS;
>>  
>> -list_for_each_entry(msidesc, &dev->msi_list, list) {
>> -irq = irq_domain_alloc_irqs(NULL, 1, NUMA_NO_NODE, NULL);
>> +init_irq_alloc_info(&info, NULL);
>> +info.msi_dev = dev;
>> +if (type == PCI_CAP_ID_MSI) {
>> +msidesc = list_first_entry(&dev->msi_list,
>> +   struct msi_desc, list);
>> +WARN_ON(!list_is_singular(&dev->msi_list));
>> +WARN_ON(msidesc->irq);
>> +WARN_ON(msidesc->msi_attrib.multiple);
>> +WARN_ON(msidesc->nvec_used);
>> +info.type = X86_IRQ_ALLOC_TYPE_MSI;
>> +cnt = nvec;
>> +} else {
>> +info.type = X86_IRQ_ALLOC_TYPE_MSIX;
>> +cnt = 1;
>> +}
> 
> We have a similar issue here.
> 
>> +domain = irq_remapping_get_irq_domain(&info);
> 
> We add domain specific knowledge to the MSI implementation. Not
> necessary at all.
> 
> Again MSI is not an x86 problem and we really can move most of that to
> the core code. The above sanity checks and the distinction between MSI
> and MSIX can be handled in the core code. And every domain involved in
> the MSI chain would need a alloc_msi() callback.
Add alloc_msi() callback to irq_domain_ops seems a little overaction.
I think the main idea is to make MSI code public as much as possible,
so I have found another solution to move MSI irqdomain code into
drivers/pci/msi.c and only following code are platform dependent now.
How about this solution?

int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
{
struct irq_domain *domain;
struct irq_alloc_info info;

init_irq_alloc_info(&info, NULL);
info.msi_dev = dev;
if (type == PCI_CAP_ID_MSI) {
info.type = X86_IRQ_ALLOC_TYPE_MSI;
info.flags |= X86_IRQ_ALLOC_CONTIGOUS_VECTORS;
} else {
info.type = X86_IRQ_ALLOC_TYPE_MSIX;
}

domain = irq_remapping_get_irq_domain(&info);
if (domain == NULL)
domain = msi_default_domain;
if (domain == NULL)
return -ENOSYS;

return msi_irq_domain_alloc_irqs(domain, type, dev, &info);
}

void native_teardown_msi_irq(unsigned int irq)
{
irq_domain_free_irqs(irq, 1);
}

irq_hw_number_t arch_msi_irq_domain_get_hwirq(void *arg)
{
struct irq_alloc_info *info = arg;

return info->msi_hwirq;
}

void arch_msi_irq_domain_set_hwirq(void *arg, irq_hw_number_t hwirq)
{
struct irq_alloc_info *info = arg;

info->msi_hwirq = hwirq;
}

void arch_init_msi_domain(struct irq_domain *parent)
{
if (disable_apic)
return;

msi_default_domain = msi_create_irq_domain(parent);
if (!msi_default_domain)
pr_warn("failed to initialize irqdomain for MSI/MSI-x.\n");
}

#ifdef CONFIG_IRQ_REMAP
struct irq_domain *arch_create_msi_irq_domain(struct irq_domain *parent)
{
return msi_create_irq_domain(parent);
}
#endif
-

> 
> So native_setup_msi_irqs() would boil down to:
> + {
> + if (disable_apic)
> + return -ENOSYS;
> + 
> + return irq_domain_alloc_msi(msi_domain, dev, nvec, type);   
> + }
> 
> Now that core function performs the sanity checks for the MSI case. In
> fact it should not proceed when a warning condition is detected. Not a
> x86 issue at all, its true for every MSI implementation.
> 
> Then it calls down the domain allocation chain. x86_msi_domain would
> simply hand down to the parent domain. That would either be the remap
> domain or the vector domain.
The issue here is that, the hierarchy irqdomain maintains a tree
topology and every irqdomain only supports one parent.

In case of irq remapping, we need to build one irqdomain for each IOMMU
unit to support hotplug and simplify the implementation. So we need to
build one MSI irqdomain for each IOMMU unit too instead of using a
common MSI irqdomain.

Current design is that, a common MSI irqdomain to support all MSI when
irq remapping is disabled, and one MSI irqdomain for each IOMMU unit
when irq remapping is enabled.

So we ha