On Tue, 2014-06-10 at 11:41 +1000, Gavin Shan wrote:
> The patch adds new IOCTL commands for sPAPR VFIO container device
> to support EEH functionality for PCI devices, which have been passed
> through from host to somebody else via VFIO.
> 
> Signed-off-by: Gavin Shan <gws...@linux.vnet.ibm.com>
> Acked-by: Alexander Graf <ag...@suse.de>

Acked-by: Alex Williamson <alex.william...@redhat.com>

> ---
>  Documentation/vfio.txt              | 87 
> +++++++++++++++++++++++++++++++++++--
>  drivers/vfio/Makefile               |  1 +
>  drivers/vfio/pci/vfio_pci.c         | 18 ++++++--
>  drivers/vfio/vfio_iommu_spapr_tce.c | 17 +++++++-
>  drivers/vfio/vfio_spapr_eeh.c       | 87 
> +++++++++++++++++++++++++++++++++++++
>  include/linux/vfio.h                | 23 ++++++++++
>  include/uapi/linux/vfio.h           | 34 +++++++++++++++
>  7 files changed, 259 insertions(+), 8 deletions(-)
>  create mode 100644 drivers/vfio/vfio_spapr_eeh.c
> 
> diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
> index b9ca023..96978ec 100644
> --- a/Documentation/vfio.txt
> +++ b/Documentation/vfio.txt
> @@ -305,7 +305,15 @@ faster, the map/unmap handling has been implemented in 
> real mode which provides
>  an excellent performance which has limitations such as inability to do
>  locked pages accounting in real time.
>  
> -So 3 additional ioctls have been added:
> +4) According to sPAPR specification, A Partitionable Endpoint (PE) is an I/O
> +subtree that can be treated as a unit for the purposes of partitioning and
> +error recovery. A PE may be a single or multi-function IOA (IO Adapter), a
> +function of a multi-function IOA, or multiple IOAs (possibly including switch
> +and bridge structures above the multiple IOAs). PPC64 guests detect PCI 
> errors
> +and recover from them via EEH RTAS services, which works on the basis of
> +additional ioctl commands.
> +
> +So 4 additional ioctls have been added:
>  
>       VFIO_IOMMU_SPAPR_TCE_GET_INFO - returns the size and the start
>               of the DMA window on the PCI bus.
> @@ -316,9 +324,12 @@ So 3 additional ioctls have been added:
>  
>       VFIO_IOMMU_DISABLE - disables the container.
>  
> +     VFIO_EEH_PE_OP - provides an API for EEH setup, error detection and 
> recovery.
>  
>  The code flow from the example above should be slightly changed:
>  
> +     struct vfio_eeh_pe_op pe_op = { .argsz = sizeof(pe_op), .flags = 0 };
> +
>       .....
>       /* Add the group to the container */
>       ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
> @@ -342,9 +353,79 @@ The code flow from the example above should be slightly 
> changed:
>       dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
>  
>       /* Check here is .iova/.size are within DMA window from 
> spapr_iommu_info */
> -
>       ioctl(container, VFIO_IOMMU_MAP_DMA, &dma_map);
> -     .....
> +
> +     /* Get a file descriptor for the device */
> +     device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
> +
> +     ....
> +
> +     /* Gratuitous device reset and go... */
> +     ioctl(device, VFIO_DEVICE_RESET);
> +
> +     /* Make sure EEH is supported */
> +     ioctl(container, VFIO_CHECK_EXTENSION, VFIO_EEH);
> +
> +     /* Enable the EEH functionality on the device */
> +     pe_op.op = VFIO_EEH_PE_ENABLE;
> +     ioctl(container, VFIO_EEH_PE_OP, &pe_op);
> +
> +     /* You're suggested to create additional data struct to represent
> +      * PE, and put child devices belonging to same IOMMU group to the
> +      * PE instance for later reference.
> +      */
> +
> +     /* Check the PE's state and make sure it's in functional state */
> +     pe_op.op = VFIO_EEH_PE_GET_STATE;
> +     ioctl(container, VFIO_EEH_PE_OP, &pe_op);
> +
> +     /* Save device state using pci_save_state().
> +      * EEH should be enabled on the specified device.
> +      */
> +
> +     ....
> +
> +     /* When 0xFF's returned from reading PCI config space or IO BARs
> +      * of the PCI device. Check the PE's state to see if that has been
> +      * frozen.
> +      */
> +     ioctl(container, VFIO_EEH_PE_OP, &pe_op);
> +
> +     /* Waiting for pending PCI transactions to be completed and don't
> +      * produce any more PCI traffic from/to the affected PE until
> +      * recovery is finished.
> +      */
> +
> +     /* Enable IO for the affected PE and collect logs. Usually, the
> +      * standard part of PCI config space, AER registers are dumped
> +      * as logs for further analysis.
> +      */
> +     pe_op.op = VFIO_EEH_PE_UNFREEZE_IO;
> +     ioctl(container, VFIO_EEH_PE_OP, &pe_op);
> +
> +     /*
> +      * Issue PE reset: hot or fundamental reset. Usually, hot reset
> +      * is enough. However, the firmware of some PCI adapters would
> +      * require fundamental reset.
> +      */
> +     pe_op.op = VFIO_EEH_PE_RESET_HOT;
> +     ioctl(container, VFIO_EEH_PE_OP, &pe_op);
> +     pe_op.op = VFIO_EEH_PE_RESET_DEACTIVATE;
> +     ioctl(container, VFIO_EEH_PE_OP, &pe_op);
> +
> +     /* Configure the PCI bridges for the affected PE */
> +     pe_op.op = VFIO_EEH_PE_CONFIGURE;
> +     ioctl(container, VFIO_EEH_PE_OP, &pe_op);
> +
> +     /* Restored state we saved at initialization time. pci_restore_state()
> +      * is good enough as an example.
> +      */
> +
> +     /* Hopefully, error is recovered successfully. Now, you can resume to
> +      * start PCI traffic to/from the affected PE.
> +      */
> +
> +     ....
>  
>  
> -------------------------------------------------------------------------------
>  
> diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
> index 72bfabc..50e30bc 100644
> --- a/drivers/vfio/Makefile
> +++ b/drivers/vfio/Makefile
> @@ -1,4 +1,5 @@
>  obj-$(CONFIG_VFIO) += vfio.o
>  obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
>  obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
> +obj-$(CONFIG_EEH) += vfio_spapr_eeh.o
>  obj-$(CONFIG_VFIO_PCI) += pci/
> diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
> index 7ba0424..0122665 100644
> --- a/drivers/vfio/pci/vfio_pci.c
> +++ b/drivers/vfio/pci/vfio_pci.c
> @@ -156,8 +156,10 @@ static void vfio_pci_release(void *device_data)
>  {
>       struct vfio_pci_device *vdev = device_data;
>  
> -     if (atomic_dec_and_test(&vdev->refcnt))
> +     if (atomic_dec_and_test(&vdev->refcnt)) {
> +             vfio_spapr_pci_eeh_release(vdev->pdev);
>               vfio_pci_disable(vdev);
> +     }
>  
>       module_put(THIS_MODULE);
>  }
> @@ -165,19 +167,27 @@ static void vfio_pci_release(void *device_data)
>  static int vfio_pci_open(void *device_data)
>  {
>       struct vfio_pci_device *vdev = device_data;
> +     int ret;
>  
>       if (!try_module_get(THIS_MODULE))
>               return -ENODEV;
>  
>       if (atomic_inc_return(&vdev->refcnt) == 1) {
> -             int ret = vfio_pci_enable(vdev);
> +             ret = vfio_pci_enable(vdev);
> +             if (ret)
> +                     goto error;
> +
> +             ret = vfio_spapr_pci_eeh_open(vdev->pdev);
>               if (ret) {
> -                     module_put(THIS_MODULE);
> -                     return ret;
> +                     vfio_pci_disable(vdev);
> +                     goto error;
>               }
>       }
>  
>       return 0;
> +error:
> +     module_put(THIS_MODULE);
> +     return ret;
>  }
>  
>  static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
> diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
> b/drivers/vfio/vfio_iommu_spapr_tce.c
> index a84788b..730b4ef 100644
> --- a/drivers/vfio/vfio_iommu_spapr_tce.c
> +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
> @@ -156,7 +156,16 @@ static long tce_iommu_ioctl(void *iommu_data,
>  
>       switch (cmd) {
>       case VFIO_CHECK_EXTENSION:
> -             return (arg == VFIO_SPAPR_TCE_IOMMU) ? 1 : 0;
> +             switch (arg) {
> +             case VFIO_SPAPR_TCE_IOMMU:
> +                     ret = 1;
> +                     break;
> +             default:
> +                     ret = vfio_spapr_iommu_eeh_ioctl(NULL, cmd, arg);
> +                     break;
> +             }
> +
> +             return (ret < 0) ? 0 : ret;
>  
>       case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
>               struct vfio_iommu_spapr_tce_info info;
> @@ -283,6 +292,12 @@ static long tce_iommu_ioctl(void *iommu_data,
>               tce_iommu_disable(container);
>               mutex_unlock(&container->lock);
>               return 0;
> +     case VFIO_EEH_PE_OP:
> +             if (!container->tbl || !container->tbl->it_group)
> +                     return -ENODEV;
> +
> +             return vfio_spapr_iommu_eeh_ioctl(container->tbl->it_group,
> +                                               cmd, arg);
>       }
>  
>       return -ENOTTY;
> diff --git a/drivers/vfio/vfio_spapr_eeh.c b/drivers/vfio/vfio_spapr_eeh.c
> new file mode 100644
> index 0000000..f834b4c
> --- /dev/null
> +++ b/drivers/vfio/vfio_spapr_eeh.c
> @@ -0,0 +1,87 @@
> +/*
> + * EEH functionality support for VFIO devices. The feature is only
> + * available on sPAPR compatible platforms.
> + *
> + * Copyright Gavin Shan, IBM Corporation 2014.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#include <linux/uaccess.h>
> +#include <linux/vfio.h>
> +#include <asm/eeh.h>
> +
> +/* We might build address mapping here for "fast" path later */
> +int vfio_spapr_pci_eeh_open(struct pci_dev *pdev)
> +{
> +     return eeh_dev_open(pdev);
> +}
> +
> +void vfio_spapr_pci_eeh_release(struct pci_dev *pdev)
> +{
> +     eeh_dev_release(pdev);
> +}
> +
> +long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group,
> +                             unsigned int cmd, unsigned long arg)
> +{
> +     struct eeh_pe *pe;
> +     struct vfio_eeh_pe_op op;
> +     unsigned long minsz;
> +     long ret = -EINVAL;
> +
> +     switch (cmd) {
> +     case VFIO_CHECK_EXTENSION:
> +             if (arg == VFIO_EEH)
> +                     ret = eeh_enabled() ? 1 : 0;
> +             else
> +                     ret = 0;
> +             break;
> +     case VFIO_EEH_PE_OP:
> +             pe = eeh_iommu_group_to_pe(group);
> +             if (!pe)
> +                     return -ENODEV;
> +
> +             minsz = offsetofend(struct vfio_eeh_pe_op, op);
> +             if (copy_from_user(&op, (void __user *)arg, minsz))
> +                     return -EFAULT;
> +             if (op.argsz < minsz || op.flags)
> +                     return -EINVAL;
> +
> +             switch (op.op) {
> +             case VFIO_EEH_PE_DISABLE:
> +                     ret = eeh_pe_set_option(pe, EEH_OPT_DISABLE);
> +                     break;
> +             case VFIO_EEH_PE_ENABLE:
> +                     ret = eeh_pe_set_option(pe, EEH_OPT_ENABLE);
> +                     break;
> +             case VFIO_EEH_PE_UNFREEZE_IO:
> +                     ret = eeh_pe_set_option(pe, EEH_OPT_THAW_MMIO);
> +                     break;
> +             case VFIO_EEH_PE_UNFREEZE_DMA:
> +                     ret = eeh_pe_set_option(pe, EEH_OPT_THAW_DMA);
> +                     break;
> +             case VFIO_EEH_PE_GET_STATE:
> +                     ret = eeh_pe_get_state(pe);
> +                     break;
> +             case VFIO_EEH_PE_RESET_DEACTIVATE:
> +                     ret = eeh_pe_reset(pe, EEH_RESET_DEACTIVATE);
> +                     break;
> +             case VFIO_EEH_PE_RESET_HOT:
> +                     ret = eeh_pe_reset(pe, EEH_RESET_HOT);
> +                     break;
> +             case VFIO_EEH_PE_RESET_FUNDAMENTAL:
> +                     ret = eeh_pe_reset(pe, EEH_RESET_FUNDAMENTAL);
> +                     break;
> +             case VFIO_EEH_PE_CONFIGURE:
> +                     ret = eeh_pe_configure(pe);
> +                     break;
> +             default:
> +                     ret = -EINVAL;
> +             }
> +     }
> +
> +     return ret;
> +}
> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> index 81022a52..0d3bb8f 100644
> --- a/include/linux/vfio.h
> +++ b/include/linux/vfio.h
> @@ -99,4 +99,27 @@ extern int vfio_external_user_iommu_id(struct vfio_group 
> *group);
>  extern long vfio_external_check_extension(struct vfio_group *group,
>                                         unsigned long arg);
>  
> +#ifdef CONFIG_EEH
> +extern int vfio_spapr_pci_eeh_open(struct pci_dev *pdev);
> +extern void vfio_spapr_pci_eeh_release(struct pci_dev *pdev);
> +extern long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group,
> +                                    unsigned int cmd,
> +                                    unsigned long arg);
> +#else
> +static inline int vfio_spapr_pci_eeh_open(struct pci_dev *pdev)
> +{
> +     return 0;
> +}
> +
> +static inline void vfio_spapr_pci_eeh_release(struct pci_dev *pdev)
> +{
> +}
> +
> +static inline long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group,
> +                                           unsigned int cmd,
> +                                           unsigned long arg)
> +{
> +     return -ENOTTY;
> +}
> +#endif /* CONFIG_EEH */
>  #endif /* VFIO_H */
> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
> index cb9023d..6612974 100644
> --- a/include/uapi/linux/vfio.h
> +++ b/include/uapi/linux/vfio.h
> @@ -30,6 +30,9 @@
>   */
>  #define VFIO_DMA_CC_IOMMU            4
>  
> +/* Check if EEH is supported */
> +#define VFIO_EEH                     5
> +
>  /*
>   * The IOCTL interface is designed for extensibility by embedding the
>   * structure length (argsz) and flags into structures passed between
> @@ -455,6 +458,37 @@ struct vfio_iommu_spapr_tce_info {
>  
>  #define VFIO_IOMMU_SPAPR_TCE_GET_INFO        _IO(VFIO_TYPE, VFIO_BASE + 12)
>  
> +/*
> + * EEH PE operation struct provides ways to:
> + * - enable/disable EEH functionality;
> + * - unfreeze IO/DMA for frozen PE;
> + * - read PE state;
> + * - reset PE;
> + * - configure PE.
> + */
> +struct vfio_eeh_pe_op {
> +     __u32 argsz;
> +     __u32 flags;
> +     __u32 op;
> +};
> +
> +#define VFIO_EEH_PE_DISABLE          0       /* Disable EEH functionality */
> +#define VFIO_EEH_PE_ENABLE           1       /* Enable EEH functionality  */
> +#define VFIO_EEH_PE_UNFREEZE_IO              2       /* Enable IO for frozen 
> PE   */
> +#define VFIO_EEH_PE_UNFREEZE_DMA     3       /* Enable DMA for frozen PE  */
> +#define VFIO_EEH_PE_GET_STATE                4       /* PE state retrieval   
>      */
> +#define  VFIO_EEH_PE_STATE_NORMAL    0       /* PE in functional state    */
> +#define  VFIO_EEH_PE_STATE_RESET     1       /* PE reset in progress      */
> +#define  VFIO_EEH_PE_STATE_STOPPED   2       /* Stopped DMA and IO        */
> +#define  VFIO_EEH_PE_STATE_STOPPED_DMA       4       /* Stopped DMA only     
>      */
> +#define  VFIO_EEH_PE_STATE_UNAVAIL   5       /* State unavailable         */
> +#define VFIO_EEH_PE_RESET_DEACTIVATE 5       /* Deassert PE reset         */
> +#define VFIO_EEH_PE_RESET_HOT                6       /* Assert hot reset     
>      */
> +#define VFIO_EEH_PE_RESET_FUNDAMENTAL        7       /* Assert fundamental 
> reset  */
> +#define VFIO_EEH_PE_CONFIGURE                8       /* PE configuration     
>      */
> +
> +#define VFIO_EEH_PE_OP                       _IO(VFIO_TYPE, VFIO_BASE + 21)
> +
>  /* ***************************************************************** */
>  
>  #endif /* _UAPIVFIO_H */



--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to