On Fri, Jun 06, 2014 at 12:20:04PM -0600, Alex Williamson wrote:
>On Fri, 2014-06-06 at 15:00 +1000, Gavin Shan wrote:
>> The patch adds new IOCTL commands for sPAPR VFIO container device
>> to support EEH functionality for PCI devices, which have been passed
>> through from host to somebody else via VFIO.
>> 
>> Signed-off-by: Gavin Shan <gws...@linux.vnet.ibm.com>
>> Acked-by: Alexander Graf <ag...@suse.de>
>> ---
>>  Documentation/vfio.txt              | 87 
>> +++++++++++++++++++++++++++++++++++--
>>  drivers/vfio/Makefile               |  1 +
>>  drivers/vfio/pci/vfio_pci.c         | 18 ++++++--
>>  drivers/vfio/vfio_iommu_spapr_tce.c | 17 +++++++-
>>  drivers/vfio/vfio_spapr_eeh.c       | 87 
>> +++++++++++++++++++++++++++++++++++++
>>  include/linux/vfio.h                | 23 ++++++++++
>>  include/uapi/linux/vfio.h           | 34 +++++++++++++++
>>  7 files changed, 259 insertions(+), 8 deletions(-)
>>  create mode 100644 drivers/vfio/vfio_spapr_eeh.c
>> 
>> diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
>> index b9ca023..3fa4538 100644
>> --- a/Documentation/vfio.txt
>> +++ b/Documentation/vfio.txt
>> @@ -305,7 +305,15 @@ faster, the map/unmap handling has been implemented in 
>> real mode which provides
>>  an excellent performance which has limitations such as inability to do
>>  locked pages accounting in real time.
>>  
>> -So 3 additional ioctls have been added:
>> +4) According to sPAPR specification, A Partitionable Endpoint (PE) is an I/O
>> +subtree that can be treated as a unit for the purposes of partitioning and
>> +error recovery. A PE may be a single or multi-function IOA (IO Adapter), a
>> +function of a multi-function IOA, or multiple IOAs (possibly including 
>> switch
>> +and bridge structures above the multiple IOAs). PPC64 guests detect PCI 
>> errors
>> +and recover from them via EEH RTAS services, which works on the basis of
>> +additional ioctl commands.
>> +
>> +So 4 additional ioctls have been added:
>>  
>>      VFIO_IOMMU_SPAPR_TCE_GET_INFO - returns the size and the start
>>              of the DMA window on the PCI bus.
>> @@ -316,9 +324,12 @@ So 3 additional ioctls have been added:
>>  
>>      VFIO_IOMMU_DISABLE - disables the container.
>>  
>> +    VFIO_EEH_PE_OP - provides an API for EEH setup, error detection and 
>> recovery.
>>  
>>  The code flow from the example above should be slightly changed:
>>  
>> +    struct vfio_eeh_pe_op pe_op = { .argsz = sizeof(pe_op) };
>> +
>>      .....
>>      /* Add the group to the container */
>>      ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
>> @@ -342,9 +353,79 @@ The code flow from the example above should be slightly 
>> changed:
>>      dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
>>  
>>      /* Check here is .iova/.size are within DMA window from 
>> spapr_iommu_info */
>> -
>>      ioctl(container, VFIO_IOMMU_MAP_DMA, &dma_map);
>> -    .....
>> +
>> +    /* Get a file descriptor for the device */
>> +    device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
>> +
>> +    ....
>> +
>> +    /* Gratuitous device reset and go... */
>> +    ioctl(device, VFIO_DEVICE_RESET);
>> +
>> +    /* Make sure EEH is supported */
>> +    ioctl(container, VFIO_CHECK_EXTENSION, VFIO_EEH);
>> +
>> +    /* Enable the EEH functionality on the device */
>> +    pe_op.op = VFIO_EEH_PE_ENABLE;
>> +    ioctl(container, VFIO_EEH_PE_OP, &pe_op);
>> +
>> +    /* You're suggested to create additional data struct to represent
>> +     * PE, and put child devices belonging to same IOMMU group to the
>> +     * PE instance for later reference.
>> +     */
>> +
>> +    /* Check the PE's state and make sure it's in functional state */
>> +    pe_op.op = VFIO_EEH_PE_GET_STATE;
>> +    ioctl(container, VFIO_EEH_PE_OP, &pe_op);
>> +
>> +    /* Save device state using pci_save_state().
>> +     * EEH should be enabled on the specified device.
>> +     */
>> +
>> +    ....
>> +
>> +    /* When 0xFF's returned from reading PCI config space or IO BARs
>> +     * of the PCI device. Check the PE's state to see if that has been
>> +     * frozen.
>> +     */
>> +    ioctl(container, VFIO_EEH_PE_OP, &pe_op);
>> +
>> +    /* Waiting for pending PCI transactions to be completed and don't
>> +     * produce any more PCI traffic from/to the affected PE until
>> +     * recovery is finished.
>> +     */
>> +
>> +    /* Enable IO for the affected PE and collect logs. Usually, the
>> +     * standard part of PCI config space, AER registers are dumped
>> +     * as logs for further analysis.
>> +     */
>> +    pe_op.op = VFIO_EEH_PE_UNFREEZE_IO;
>> +    ioctl(container, VFIO_EEH_PE_OP, &pe_op);
>> +
>> +    /*
>> +     * Issue PE reset: hot or fundamental reset. Usually, hot reset
>> +     * is enough. However, the firmware of some PCI adapters would
>> +     * require fundamental reset.
>> +     */
>> +    pe_op.op = VFIO_EEH_PE_RESET_HOT;
>> +    ioctl(container, VFIO_EEH_PE_OP, &pe_op);
>> +    pe_op.op = VFIO_EEH_PE_RESET_DEACTIVATE;
>> +    ioctl(container, VFIO_EEH_PE_OP, &pe_op);
>> +
>> +    /* Configure the PCI bridges for the affected PE */
>> +    pe_op.op = VFIO_EEH_PE_CONFIGURE;
>> +    ioctl(container, VFIO_EEH_PE_OP, &pe_op);
>> +
>> +    /* Restored state we saved at initialization time. pci_restore_state()
>> +     * is good enough as an example.
>> +     */
>> +
>> +    /* Hopefully, error is recovered successfully. Now, you can resume to
>> +     * start PCI traffic to/from the affected PE.
>> +     */
>> +
>> +    ....
>>  
>>  
>> -------------------------------------------------------------------------------
>>  
>> diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
>> index 72bfabc..50e30bc 100644
>> --- a/drivers/vfio/Makefile
>> +++ b/drivers/vfio/Makefile
>> @@ -1,4 +1,5 @@
>>  obj-$(CONFIG_VFIO) += vfio.o
>>  obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
>>  obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
>> +obj-$(CONFIG_EEH) += vfio_spapr_eeh.o
>>  obj-$(CONFIG_VFIO_PCI) += pci/
>> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
>> index 7ba0424..0122665 100644
>> --- a/drivers/vfio/pci/vfio_pci.c
>> +++ b/drivers/vfio/pci/vfio_pci.c
>> @@ -156,8 +156,10 @@ static void vfio_pci_release(void *device_data)
>>  {
>>      struct vfio_pci_device *vdev = device_data;
>>  
>> -    if (atomic_dec_and_test(&vdev->refcnt))
>> +    if (atomic_dec_and_test(&vdev->refcnt)) {
>> +            vfio_spapr_pci_eeh_release(vdev->pdev);
>>              vfio_pci_disable(vdev);
>> +    }
>>  
>>      module_put(THIS_MODULE);
>>  }
>> @@ -165,19 +167,27 @@ static void vfio_pci_release(void *device_data)
>>  static int vfio_pci_open(void *device_data)
>>  {
>>      struct vfio_pci_device *vdev = device_data;
>> +    int ret;
>>  
>>      if (!try_module_get(THIS_MODULE))
>>              return -ENODEV;
>>  
>>      if (atomic_inc_return(&vdev->refcnt) == 1) {
>> -            int ret = vfio_pci_enable(vdev);
>> +            ret = vfio_pci_enable(vdev);
>> +            if (ret)
>> +                    goto error;
>> +
>> +            ret = vfio_spapr_pci_eeh_open(vdev->pdev);
>>              if (ret) {
>> -                    module_put(THIS_MODULE);
>> -                    return ret;
>> +                    vfio_pci_disable(vdev);
>> +                    goto error;
>>              }
>>      }
>>  
>>      return 0;
>> +error:
>> +    module_put(THIS_MODULE);
>> +    return ret;
>>  }
>>  
>>  static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int 
>> irq_type)
>> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
>> b/drivers/vfio/vfio_iommu_spapr_tce.c
>> index a84788b..730b4ef 100644
>> --- a/drivers/vfio/vfio_iommu_spapr_tce.c
>> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
>> @@ -156,7 +156,16 @@ static long tce_iommu_ioctl(void *iommu_data,
>>  
>>      switch (cmd) {
>>      case VFIO_CHECK_EXTENSION:
>> -            return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0;
>> +            switch (arg) {
>> +            case VFIO_SPAPR_TCE_IOMMU:
>> +                    ret = 1;
>> +                    break;
>> +            default:
>> +                    ret = vfio_spapr_iommu_eeh_ioctl(NULL, cmd, arg);
>> +                    break;
>> +            }
>> +
>> +            return (ret < 0) ? 0 : ret;
>>  
>>      case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
>>              struct vfio_iommu_spapr_tce_info info;
>> @@ -283,6 +292,12 @@ static long tce_iommu_ioctl(void *iommu_data,
>>              tce_iommu_disable(container);
>>              mutex_unlock(&container->lock);
>>              return 0;
>> +    case VFIO_EEH_PE_OP:
>> +            if (!container->tbl || !container->tbl->it_group)
>> +                    return -ENODEV;
>> +
>> +            return vfio_spapr_iommu_eeh_ioctl(container->tbl->it_group,
>> +                                              cmd, arg);
>>      }
>>  
>>      return -ENOTTY;
>> diff --git a/drivers/vfio/vfio_spapr_eeh.c b/drivers/vfio/vfio_spapr_eeh.c
>> new file mode 100644
>> index 0000000..d438394
>> --- /dev/null
>> +++ b/drivers/vfio/vfio_spapr_eeh.c
>> @@ -0,0 +1,87 @@
>> +/*
>> + * EEH functionality support for VFIO devices. The feature is only
>> + * available on sPAPR compatible platforms.
>> + *
>> + * Copyright Gavin Shan, IBM Corporation 2014.
>> + *
>> + * This program is free software; you can redistribute it and/or modify
>> + * it under the terms of the GNU General Public License version 2 as
>> + * published by the Free Software Foundation.
>> + */
>> +
>> +#include <linux/uaccess.h>
>> +#include <linux/vfio.h>
>> +#include <asm/eeh.h>
>> +
>> +/* We might build address mapping here for "fast" path later */
>> +int vfio_spapr_pci_eeh_open(struct pci_dev *pdev)
>> +{
>> +    return eeh_dev_open(pdev);
>> +}
>> +
>> +void vfio_spapr_pci_eeh_release(struct pci_dev *pdev)
>> +{
>> +    eeh_dev_release(pdev);
>> +}
>> +
>> +long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group,
>> +                            unsigned int cmd, unsigned long arg)
>> +{
>> +    struct eeh_pe *pe;
>> +    struct vfio_eeh_pe_op op;
>> +    unsigned long minsz;
>> +    long ret = -EINVAL;
>> +
>> +    switch (cmd) {
>> +    case VFIO_CHECK_EXTENSION:
>> +            if (arg == VFIO_EEH)
>> +                    ret = eeh_enabled() ? 1 : 0;
>> +            else
>> +                    ret = 0;
>> +            break;
>> +    case VFIO_EEH_PE_OP:
>> +            pe = eeh_iommu_group_to_pe(group);
>> +            if (!pe)
>> +                    return -ENODEV;
>> +
>> +            minsz = offsetofend(struct vfio_eeh_pe_op, op);
>> +            if (copy_from_user(&op, (void __user *)arg, minsz))
>> +                    return -EFAULT;
>> +            if (op.argsz < minsz)
>> +                    return -EINVAL;
>
>I'm running low on comments, but I found one more.  flags should be
>tested as zero here or else we'll run into problems with adding new
>functionality later.  The caller could leave it uninitialized and pass
>junk which would then break if we start using those flags.  New
>userspace on an old kernel could also intentionally pass flag bits that
>we ignore and let through here.
>

Thanks, Alex. I'll fix in next revision. The QEMU part needs corresponding
changes (set flags to zero), which will be updated and sent separately.

>
>How are you expecting this to go in, an ack from me then pushed through
>AlexG's tree?  Thanks,
>

I don't think it can catch 3.16 merge window. Perhaps, Patch[1/3] and
Patch[2/3] goes to Ben's tree and Patch[3/3] could be picked by AlexG
or you, but I'm not sure.

Thanks,
Gavin

>> +
>> +            switch (op.op) {
>> +            case VFIO_EEH_PE_DISABLE:
>> +                    ret = eeh_pe_set_option(pe, EEH_OPT_DISABLE);
>> +                    break;
>> +            case VFIO_EEH_PE_ENABLE:
>> +                    ret = eeh_pe_set_option(pe, EEH_OPT_ENABLE);
>> +                    break;
>> +            case VFIO_EEH_PE_UNFREEZE_IO:
>> +                    ret = eeh_pe_set_option(pe, EEH_OPT_THAW_MMIO);
>> +                    break;
>> +            case VFIO_EEH_PE_UNFREEZE_DMA:
>> +                    ret = eeh_pe_set_option(pe, EEH_OPT_THAW_DMA);
>> +                    break;
>> +            case VFIO_EEH_PE_GET_STATE:
>> +                    ret = eeh_pe_get_state(pe);
>> +                    break;
>> +            case VFIO_EEH_PE_RESET_DEACTIVATE:
>> +                    ret = eeh_pe_reset(pe, EEH_RESET_DEACTIVATE);
>> +                    break;
>> +            case VFIO_EEH_PE_RESET_HOT:
>> +                    ret = eeh_pe_reset(pe, EEH_RESET_HOT);
>> +                    break;
>> +            case VFIO_EEH_PE_RESET_FUNDAMENTAL:
>> +                    ret = eeh_pe_reset(pe, EEH_RESET_FUNDAMENTAL);
>> +                    break;
>> +            case VFIO_EEH_PE_CONFIGURE:
>> +                    ret = eeh_pe_configure(pe);
>> +                    break;
>> +            default:
>> +                    ret = -EINVAL;
>> +            }
>> +    }
>> +
>> +    return ret;
>> +}
>> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
>> index 81022a52..0d3bb8f 100644
>> --- a/include/linux/vfio.h
>> +++ b/include/linux/vfio.h
>> @@ -99,4 +99,27 @@ extern int vfio_external_user_iommu_id(struct vfio_group 
>> *group);
>>  extern long vfio_external_check_extension(struct vfio_group *group,
>>                                        unsigned long arg);
>>  
>> +#ifdef CONFIG_EEH
>> +extern int vfio_spapr_pci_eeh_open(struct pci_dev *pdev);
>> +extern void vfio_spapr_pci_eeh_release(struct pci_dev *pdev);
>> +extern long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group,
>> +                                   unsigned int cmd,
>> +                                   unsigned long arg);
>> +#else
>> +static inline int vfio_spapr_pci_eeh_open(struct pci_dev *pdev)
>> +{
>> +    return 0;
>> +}
>> +
>> +static inline void vfio_spapr_pci_eeh_release(struct pci_dev *pdev)
>> +{
>> +}
>> +
>> +static inline long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group,
>> +                                          unsigned int cmd,
>> +                                          unsigned long arg)
>> +{
>> +    return -ENOTTY;
>> +}
>> +#endif /* CONFIG_EEH */
>>  #endif /* VFIO_H */
>> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
>> index cb9023d..6612974 100644
>> --- a/include/uapi/linux/vfio.h
>> +++ b/include/uapi/linux/vfio.h
>> @@ -30,6 +30,9 @@
>>   */
>>  #define VFIO_DMA_CC_IOMMU           4
>>  
>> +/* Check if EEH is supported */
>> +#define VFIO_EEH                    5
>> +
>>  /*
>>   * The IOCTL interface is designed for extensibility by embedding the
>>   * structure length (argsz) and flags into structures passed between
>> @@ -455,6 +458,37 @@ struct vfio_iommu_spapr_tce_info {
>>  
>>  #define VFIO_IOMMU_SPAPR_TCE_GET_INFO       _IO(VFIO_TYPE, VFIO_BASE + 12)
>>  
>> +/*
>> + * EEH PE operation struct provides ways to:
>> + * - enable/disable EEH functionality;
>> + * - unfreeze IO/DMA for frozen PE;
>> + * - read PE state;
>> + * - reset PE;
>> + * - configure PE.
>> + */
>> +struct vfio_eeh_pe_op {
>> +    __u32 argsz;
>> +    __u32 flags;
>> +    __u32 op;
>> +};
>> +
>> +#define VFIO_EEH_PE_DISABLE         0       /* Disable EEH functionality */
>> +#define VFIO_EEH_PE_ENABLE          1       /* Enable EEH functionality  */
>> +#define VFIO_EEH_PE_UNFREEZE_IO             2       /* Enable IO for frozen 
>> PE   */
>> +#define VFIO_EEH_PE_UNFREEZE_DMA    3       /* Enable DMA for frozen PE  */
>> +#define VFIO_EEH_PE_GET_STATE               4       /* PE state retrieval   
>>      */
>> +#define  VFIO_EEH_PE_STATE_NORMAL   0       /* PE in functional state    */
>> +#define  VFIO_EEH_PE_STATE_RESET    1       /* PE reset in progress      */
>> +#define  VFIO_EEH_PE_STATE_STOPPED  2       /* Stopped DMA and IO        */
>> +#define  VFIO_EEH_PE_STATE_STOPPED_DMA      4       /* Stopped DMA only     
>>      */
>> +#define  VFIO_EEH_PE_STATE_UNAVAIL  5       /* State unavailable         */
>> +#define VFIO_EEH_PE_RESET_DEACTIVATE        5       /* Deassert PE reset    
>>      */
>> +#define VFIO_EEH_PE_RESET_HOT               6       /* Assert hot reset     
>>      */
>> +#define VFIO_EEH_PE_RESET_FUNDAMENTAL       7       /* Assert fundamental 
>> reset  */
>> +#define VFIO_EEH_PE_CONFIGURE               8       /* PE configuration     
>>      */
>> +
>> +#define VFIO_EEH_PE_OP                      _IO(VFIO_TYPE, VFIO_BASE + 21)
>> +
>>  /* ***************************************************************** */
>>  
>>  #endif /* _UAPIVFIO_H */
>
>
>

--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to