On Thu, 2015-06-11 at 18:51 +0800, Feng Wu wrote:
> This patch adds the kvm-vfio interface for VT-d Posted-Interrupts.
> When guests update MSI/MSI-x information for an assigned-device,
> QEMU will use KVM_DEV_VFIO_DEVICE_POST_IRQ attribute to setup
> IRTE for VT-d PI. Userspace program can also use
> KVM_DEV_VFIO_DEVICE_UNPOST_IRQ to change back to irq remapping mode.
> This patch implements these IRQ attributes.
> 
> Signed-off-by: Feng Wu <feng...@intel.com>
> ---
>  include/linux/kvm_host.h |  22 +++++++++
>  virt/kvm/vfio.c          | 126 
> +++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 148 insertions(+)
> 
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index f591f7c..69f8711 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -1073,6 +1073,28 @@ extern struct kvm_device_ops kvm_xics_ops;
>  extern struct kvm_device_ops kvm_arm_vgic_v2_ops;
>  extern struct kvm_device_ops kvm_arm_vgic_v3_ops;
>  
> +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_POST
> +/*
> + * kvm_arch_vfio_update_pi_irte - set IRTE for Posted-Interrupts
> + *
> + * @kvm: kvm
> + * @host_irq: host irq of the interrupt
> + * @guest_irq: gsi of the interrupt
> + * @set: set or unset PI
> + * returns 0 on success, < 0 on failure
> + */
> +int kvm_arch_vfio_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
> +                              uint32_t guest_irq, bool set);
> +#else
> +static inline int kvm_arch_vfio_update_pi_irte(struct kvm *kvm,
> +                                            unsigned int host_irq,
> +                                            uint32_t guest_irq,
> +                                            bool set)
> +{
> +     return 0;
> +}

The code below can't get to this function without
__KVM_HAVE_ARCH_KVM_VFIO_POST, but this seems like it should return an
error if not implemented.

> +#endif
> +
>  #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
>  
>  static inline void kvm_vcpu_set_in_spin_loop(struct kvm_vcpu *vcpu, bool val)
> diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c
> index 80a45e4..547fc51 100644
> --- a/virt/kvm/vfio.c
> +++ b/virt/kvm/vfio.c
> @@ -18,6 +18,7 @@
>  #include <linux/slab.h>
>  #include <linux/uaccess.h>
>  #include <linux/vfio.h>
> +#include <asm/irq_remapping.h>

This only exists on x86.   Are we also getting lucky with some of the
include chains that give us the PCI related defines?  It looks like
we're implicitly assuming CONFIG_PCI

>  #include "vfio.h"
>  
>  struct kvm_vfio_group {
> @@ -276,12 +277,128 @@ static int kvm_vfio_set_group(struct kvm_device *dev, 
> long attr, u64 arg)
>       return -ENXIO;
>  }
>  
> +static int kvm_vfio_pci_get_irq_count(struct pci_dev *pdev, int irq_type)
> +{
> +     if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) {
> +             u8 pin;
> +
> +             pci_read_config_byte(pdev, PCI_INTERRUPT_PIN, &pin);
> +             if (pin)
> +                     return 1;
> +     } else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) {
> +             return pci_msi_vec_count(pdev);
> +     } else if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) {
> +             return pci_msix_vec_count(pdev);
> +     }
> +
> +     return 0;
> +}
> +
> +static int kvm_vfio_control_pi(struct kvm_device *kdev,
> +                            int32_t __user *argp, bool set)
> +{
> +     struct kvm_vfio_dev_irq pi_info;
> +     uint32_t *gsi;
> +     unsigned long minsz;
> +     struct vfio_device *vdev;
> +     struct msi_desc *entry;
> +     struct device *dev;
> +     struct pci_dev *pdev;
> +     int i, max, ret;
> +
> +     minsz = offsetofend(struct kvm_vfio_dev_irq, count);
> +
> +     if (copy_from_user(&pi_info, (void __user *)argp, minsz))
> +             return -EFAULT;
> +
> +     if (pi_info.argsz < minsz || pi_info.index >= VFIO_PCI_NUM_IRQS)
> +             return -EINVAL;

Could we also abort on pi_info.count == 0?

> +
> +     vdev = kvm_vfio_get_vfio_device(pi_info.fd);
> +     if (IS_ERR(vdev))
> +             return PTR_ERR(vdev);
> +
> +     dev = kvm_vfio_external_base_device(vdev);
> +     if (!dev || !dev_is_pci(dev)) {
> +             ret = -EFAULT;
> +             goto put_vfio_device;
> +     }
> +
> +     pdev = to_pci_dev(dev);
> +
> +     max = kvm_vfio_pci_get_irq_count(pdev, pi_info.index);
> +     if (max <= 0) {
> +             ret = -EFAULT;
> +             goto put_vfio_device;
> +     }
> +
> +     if (pi_info.argsz - minsz < pi_info.count * sizeof(u32) ||
> +         pi_info.start >= max || pi_info.start + pi_info.count > max) {
> +             ret = -EINVAL;
> +             goto put_vfio_device;
> +     }
> +
> +     gsi = memdup_user((void __user *)((unsigned long)argp + minsz),
> +                        pi_info.count * sizeof(u32));
> +     if (IS_ERR(gsi)) {
> +             ret = PTR_ERR(gsi);
> +             goto put_vfio_device;
> +     }
> +
> +#ifdef CONFIG_PCI_MSI
> +     for (i = 0; i < pi_info.count; i++) {
> +             list_for_each_entry(entry, &pdev->msi_list, list) {

Should we be able to get here for INTx?

> +                     if (entry->msi_attrib.entry_nr != pi_info.start+i)
> +                             continue;
> +
> +                     ret = kvm_arch_vfio_update_pi_irte(kdev->kvm,
> +                                                        entry->irq,
> +                                                        gsi[i],
> +                                                        set);
> +                     if (ret)
> +                             goto free_gsi;
> +             }
> +     }
> +#endif
> +
> +     ret = 0;

So if we didn't do anything, return success?  That seems strange.
Should we also be doing some unwind on failure?  Thanks,

Alex

> +
> +free_gsi:
> +     kfree(gsi);
> +
> +put_vfio_device:
> +     kvm_vfio_put_vfio_device(vdev);
> +     return ret;
> +}
> +
> +static int kvm_vfio_set_device(struct kvm_device *kdev, long attr, u64 arg)
> +{
> +     int32_t __user *argp = (int32_t __user *)(unsigned long)arg;
> +     int ret;
> +
> +     switch (attr) {
> +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_POST
> +     case KVM_DEV_VFIO_DEVICE_POST_IRQ:
> +             ret = kvm_vfio_control_pi(kdev, argp, 1);
> +             break;
> +     case KVM_DEV_VFIO_DEVICE_UNPOST_IRQ:
> +             ret = kvm_vfio_control_pi(kdev, argp, 0);
> +             break;
> +#endif
> +     default:
> +             ret = -ENXIO;
> +     }
> +     return ret;
> +}
> +
>  static int kvm_vfio_set_attr(struct kvm_device *dev,
>                            struct kvm_device_attr *attr)
>  {
>       switch (attr->group) {
>       case KVM_DEV_VFIO_GROUP:
>               return kvm_vfio_set_group(dev, attr->attr, attr->addr);
> +     case KVM_DEV_VFIO_DEVICE:
> +             return kvm_vfio_set_device(dev, attr->attr, attr->addr);
>       }
>  
>       return -ENXIO;
> @@ -299,6 +416,15 @@ static int kvm_vfio_has_attr(struct kvm_device *dev,
>               }
>  
>               break;
> +     case KVM_DEV_VFIO_DEVICE:
> +             switch (attr->attr) {
> +#ifdef __KVM_HAVE_ARCH_KVM_VFIO_POST
> +             case KVM_DEV_VFIO_DEVICE_POST_IRQ:
> +             case KVM_DEV_VFIO_DEVICE_UNPOST_IRQ:
> +                     return irq_remapping_cap(IRQ_POSTING_CAP) ? 0 : -ENXIO;
> +#endif
> +             }
> +             break;
>       }
>  
>       return -ENXIO;



--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to