This patch adds a new ioctl command, VFIO_EEH_OP, to the VFIO PCI device
driver to support EEH functionality for PCI devices that have been
passed through from host to guest via VFIO.

Signed-off-by: Gavin Shan <gws...@linux.vnet.ibm.com>
---
 Documentation/vfio.txt         |   6 +-
 arch/powerpc/include/asm/eeh.h |  10 ++
 arch/powerpc/kernel/eeh.c      | 323 +++++++++++++++++++++++++++++++++++++++++
 drivers/vfio/pci/vfio_pci.c    |  99 ++++++++++++-
 include/uapi/linux/vfio.h      |  43 ++++++
 5 files changed, 474 insertions(+), 7 deletions(-)

diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
index b9ca023..bb17ec7 100644
--- a/Documentation/vfio.txt
+++ b/Documentation/vfio.txt
@@ -305,7 +305,10 @@ faster, the map/unmap handling has been implemented in real mode which provides
 an excellent performance which has limitations such as inability to do
 locked pages accounting in real time.
 
-So 3 additional ioctls have been added:
+4) PPC64 guests detect PCI errors and recover from them via EEH RTAS services,
+which are implemented on top of the additional ioctl command VFIO_EEH_OP.
+
+So 4 additional ioctls have been added:
 
        VFIO_IOMMU_SPAPR_TCE_GET_INFO - returns the size and the start
                of the DMA window on the PCI bus.
@@ -316,6 +319,7 @@ So 3 additional ioctls have been added:
 
        VFIO_IOMMU_DISABLE - disables the container.
 
+       VFIO_EEH_OP - EEH dependent operations
 
 The code flow from the example above should be slightly changed:
 
diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 34a2d83..93922ef 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -305,6 +305,16 @@ void eeh_add_device_late(struct pci_dev *);
 void eeh_add_device_tree_late(struct pci_bus *);
 void eeh_add_sysfs_files(struct pci_bus *);
 void eeh_remove_device(struct pci_dev *);
+#ifdef CONFIG_VFIO_PCI_EEH
+/*
+ * EEH helpers for PCI devices handed to VFIO. eeh_vfio_open() and
+ * eeh_vfio_release() set and clear the "passed" flag on the device and
+ * its PE. The remaining helpers return a kernel error code and also
+ * write a status value through @retval.
+ */
+int eeh_vfio_open(struct pci_dev *pdev);
+void eeh_vfio_release(struct pci_dev *pdev);
+int eeh_vfio_set_pe_option(struct pci_dev *pdev, int option, int *retval);
+int eeh_vfio_get_pe_addr(struct pci_dev *pdev, int option,
+                        int *retval, int *info);
+int eeh_vfio_get_pe_state(struct pci_dev *pdev, int *retval, int *state);
+int eeh_vfio_reset_pe(struct pci_dev *pdev, int option, int *retval);
+int eeh_vfio_configure_pe(struct pci_dev *pdev, int *retval);
+#endif
 
 /**
  * EEH_POSSIBLE_ERROR() -- test for possible MMIO failure.
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 9c6b899..2aaf90e 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -1098,6 +1098,329 @@ void eeh_remove_device(struct pci_dev *dev)
        edev->mode &= ~EEH_DEV_SYSFS;
 }
 
+#ifdef CONFIG_VFIO_PCI_EEH
+/**
+ * eeh_vfio_open - flag a PCI device and its PE as passed
+ * @pdev: PCI device being opened through VFIO
+ *
+ * Returns 0 on success, or -ENODEV when @pdev is NULL or has no EEH
+ * device or PE behind it.
+ */
+int eeh_vfio_open(struct pci_dev *pdev)
+{
+       struct eeh_dev *dev = pdev ? pci_dev_to_eeh_dev(pdev) : NULL;
+
+       /* Both the EEH device and its PE must exist */
+       if (!dev || !dev->pe)
+               return -ENODEV;
+
+       eeh_dev_set_passed(dev, true);
+       eeh_pe_set_passed(dev->pe, true);
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(eeh_vfio_open);
+
+/**
+ * eeh_vfio_release - clear the "passed" flag when VFIO releases a device
+ * @pdev: PCI device being released
+ *
+ * Clears the passed flag on the EEH device, and on its PE once no device
+ * in that PE is still flagged. Does nothing when @pdev is NULL, has no
+ * EEH device/PE, or the device/PE were not flagged as passed.
+ */
+void eeh_vfio_release(struct pci_dev *pdev)
+{
+       bool release_pe = true;
+       struct eeh_pe *pe;
+       struct eeh_dev *tmp, *edev;
+
+       /* No PCI device ? */
+       if (!pdev)
+               return;
+
+       /*
+        * No EEH device, or device/PE not passed ? The PE check must use
+        * edev->pe: the local 'pe' has not been assigned yet at this point.
+        */
+       edev = pci_dev_to_eeh_dev(pdev);
+       if (!edev || !eeh_dev_passed(edev) ||
+           !edev->pe || !eeh_pe_passed(edev->pe))
+               return;
+
+       /* Release device */
+       pe = edev->pe;
+       eeh_dev_set_passed(edev, false);
+
+       /* Keep the PE flagged while any of its devices is still passed */
+       eeh_pe_for_each_dev(pe, edev, tmp) {
+               if (eeh_dev_passed(edev)) {
+                       release_pe = false;
+                       break;
+               }
+       }
+
+       if (release_pe)
+               eeh_pe_set_passed(pe, false);
+}
+EXPORT_SYMBOL_GPL(eeh_vfio_release);
+
+/*
+ * Look up the EEH device and PE behind @pdev and check that both carry
+ * the "passed" flag. On success the optional @pedev / @ppe out-pointers
+ * are filled in and 0 is returned; otherwise -ENODEV is returned and
+ * the out-pointers are left untouched.
+ */
+static int eeh_vfio_check_dev(struct pci_dev *pdev,
+                             struct eeh_dev **pedev,
+                             struct eeh_pe **ppe)
+{
+       struct eeh_dev *dev;
+       struct eeh_pe *owner;
+
+       if (!pdev)
+               return -ENODEV;
+
+       dev = pci_dev_to_eeh_dev(pdev);
+       if (!dev || !eeh_dev_passed(dev))
+               return -ENODEV;
+
+       owner = dev->pe;
+       if (!owner || !eeh_pe_passed(owner))
+               return -ENODEV;
+
+       if (pedev)
+               *pedev = dev;
+       if (ppe)
+               *ppe = owner;
+
+       return 0;
+}
+
+/**
+ * eeh_vfio_set_pe_option - apply an EEH option to a passed device's PE
+ * @pdev: PCI device owned by VFIO
+ * @option: value in the range EEH_OPT_DISABLE..EEH_OPT_THAW_DMA
+ * @retval: receives 0 on success, -7 when the device is not found or
+ *          the backend has no set_option callback, -3 for an
+ *          out-of-range option, -1 when the backend call fails
+ *
+ * EEH_OPT_DISABLE and EEH_OPT_ENABLE succeed without calling the
+ * backend; any other valid option is forwarded to eeh_ops->set_option().
+ *
+ * Returns 0 on success, otherwise the error from the device check,
+ * -EINVAL, -ENOENT, or the backend's error code.
+ */
+int eeh_vfio_set_pe_option(struct pci_dev *pdev, int option, int *retval)
+{
+       struct eeh_pe *pe;
+       int rc;
+
+       rc = eeh_vfio_check_dev(pdev, NULL, &pe);
+       if (rc) {
+               pr_debug("%s: Cannot find device %s\n",
+                       __func__, pdev ? pci_name(pdev) : "NULL");
+               *retval = -7;
+               return rc;
+       }
+
+       if (option < EEH_OPT_DISABLE || option > EEH_OPT_THAW_DMA) {
+               pr_debug("%s: Option %d out of range (%d, %d)\n",
+                       __func__, option, EEH_OPT_DISABLE, EEH_OPT_THAW_DMA);
+               *retval = -3;
+               return -EINVAL;
+       }
+
+       /* Enable/disable requests are acknowledged without a backend call */
+       if (option == EEH_OPT_DISABLE || option == EEH_OPT_ENABLE) {
+               *retval = 0;
+               return 0;
+       }
+
+       if (!eeh_ops || !eeh_ops->set_option) {
+               *retval = -7;
+               return -ENOENT;
+       }
+
+       rc = eeh_ops->set_option(pe, option);
+       if (rc) {
+               pr_debug("%s: Failure %d from backend\n",
+                       __func__, rc);
+               *retval = -1;
+               return rc;
+       }
+
+       *retval = 0;
+       return 0;
+}
+EXPORT_SYMBOL_GPL(eeh_vfio_set_pe_option);
+
+/**
+ * eeh_vfio_get_pe_addr - report PE address information for a passed device
+ * @pdev: PCI device owned by VFIO
+ * @option: 0 or 1; selects what is written to @info
+ * @retval: receives 0 on success, -3 on any failure
+ * @info: for option 0, the PE's bus number shifted left by 16 (PE
+ *        address format "00BBSS00"); for option 1, the constant 1
+ *
+ * PCI bus and device dependent PEs are not differentiated here, so all
+ * PEs are built in "shared" mode.
+ *
+ * Returns 0 on success, the error from the device check, -EINVAL for an
+ * unknown option, or -ENODEV when the PE has no bus.
+ */
+int eeh_vfio_get_pe_addr(struct pci_dev *pdev, int option,
+                        int *retval, int *info)
+{
+       struct pci_bus *bus;
+       struct eeh_pe *pe;
+       int rc;
+
+       rc = eeh_vfio_check_dev(pdev, NULL, &pe);
+       if (rc) {
+               *retval = -3;
+               return rc;
+       }
+
+       switch (option) {
+       case 0:
+               bus = eeh_pe_bus_get(pe);
+               if (!bus) {
+                       *retval = -3;
+                       return -ENODEV;
+               }
+               *info = bus->number << 16;
+               break;
+       case 1:
+               *info = 1;
+               break;
+       default:
+               pr_debug("%s: option %d out of range (0, 1)\n",
+                       __func__, option);
+               *retval = -3;
+               return -EINVAL;
+       }
+
+       *retval = 0;
+       return 0;
+}
+EXPORT_SYMBOL_GPL(eeh_vfio_get_pe_addr);
+
+/**
+ * eeh_vfio_get_pe_state - translate the backend PE state into a code
+ * @pdev: PCI device owned by VFIO
+ * @retval: receives 0 on success, -3 on failure
+ * @state: receives the translated state:
+ *         1 - reset active
+ *         0 - no reset, DMA and MMIO both enabled
+ *         2 - no reset, DMA and MMIO both disabled
+ *         4 - no reset, DMA enabled, MMIO disabled
+ *         5 - anything else
+ *
+ * Returns 0 on success, the error from the device check, or -ENOENT
+ * when the backend has no get_state callback.
+ */
+int eeh_vfio_get_pe_state(struct pci_dev *pdev, int *retval, int *state)
+{
+       struct eeh_pe *pe;
+       int flags, rc;
+       bool dma_on, mmio_on;
+
+       rc = eeh_vfio_check_dev(pdev, NULL, &pe);
+       if (rc) {
+               *retval = -3;
+               return rc;
+       }
+
+       if (!eeh_ops || !eeh_ops->get_state) {
+               pr_debug("%s: Unsupported request\n",
+                       __func__);
+               *retval = -3;
+               return -ENOENT;
+       }
+
+       flags = eeh_ops->get_state(pe, NULL);
+       dma_on = !!(flags & EEH_STATE_DMA_ENABLED);
+       mmio_on = !!(flags & EEH_STATE_MMIO_ENABLED);
+
+       /* An active reset takes precedence over the DMA/MMIO flags */
+       if (flags & EEH_STATE_RESET_ACTIVE) {
+               *state = 1;
+       } else if (dma_on) {
+               *state = mmio_on ? 0 : 4;
+       } else {
+               *state = mmio_on ? 5 : 2;
+       }
+
+       *retval = 0;
+       return 0;
+}
+EXPORT_SYMBOL_GPL(eeh_vfio_get_pe_state);
+
+/**
+ * eeh_vfio_reset_pe - assert or deassert reset on a passed device's PE
+ * @pdev: PCI device owned by VFIO
+ * @option: EEH_RESET_DEACTIVATE, EEH_RESET_HOT or EEH_RESET_FUNDAMENTAL
+ * @retval: receives 0 on success, -3 for a bad device or option, -7
+ *          when the backend lacks set_option/reset, -1 when a backend
+ *          call fails
+ *
+ * After a successful EEH_RESET_DEACTIVATE, MMIO and DMA are thawed and
+ * the EEH_PE_ISOLATED state is cleared on the PE.
+ *
+ * Returns 0 on success, otherwise the error from the device check,
+ * -EINVAL, -ENOENT, or the backend's error code.
+ */
+int eeh_vfio_reset_pe(struct pci_dev *pdev, int option, int *retval)
+{
+       struct eeh_dev *edev;
+       struct eeh_pe *pe;
+       int ret = 0;
+
+       /* Device existing ? */
+       ret = eeh_vfio_check_dev(pdev, &edev, &pe);
+       if (ret) {
+               *retval = -3;
+               goto out;
+       }
+
+       /* Invalid option ? */
+       if (option != EEH_RESET_DEACTIVATE &&
+           option != EEH_RESET_HOT &&
+           option != EEH_RESET_FUNDAMENTAL) {
+               pr_debug("%s: Unsupported option %d\n",
+                       __func__, option);
+               ret = -EINVAL;
+               *retval = -3;
+               goto out;
+       }
+
+       if (!eeh_ops || !eeh_ops->set_option || !eeh_ops->reset) {
+               pr_debug("%s: Unsupported request\n",
+                       __func__);
+               ret = -ENOENT;
+               *retval = -7;
+               goto out;
+       }
+
+       ret = eeh_ops->reset(pe, option);
+       if (ret) {
+               pr_debug("%s: Failure %d from backend\n",
+                        __func__, ret);
+               *retval = -1;
+               goto out;
+       }
+
+       /*
+        * The PE is still in frozen state and we need to clear that.
+        * It's good to clear the frozen state after deassert to avoid
+        * messy IO access during reset, which might cause a recursive
+        * frozen PE.
+        */
+       if (option == EEH_RESET_DEACTIVATE) {
+               ret = eeh_ops->set_option(pe, EEH_OPT_THAW_MMIO);
+               if (ret) {
+                       pr_debug("%s: Cannot enable IO for PHB#%d-PE#%d (%d)\n",
+                               __func__, pe->phb->global_number,
+                               pe->addr, ret);
+                       *retval = -1;
+                       goto out;
+               }
+
+               ret = eeh_ops->set_option(pe, EEH_OPT_THAW_DMA);
+               if (ret) {
+                       pr_debug("%s: Cannot enable DMA for PHB#%d-PE#%d (%d)\n",
+                               __func__, pe->phb->global_number,
+                               pe->addr, ret);
+                       *retval = -1;
+                       goto out;
+               }
+
+               eeh_pe_state_clear(pe, EEH_PE_ISOLATED);
+       }
+
+       *retval = 0;
+out:
+       return ret;
+}
+EXPORT_SYMBOL_GPL(eeh_vfio_reset_pe);
+
+/**
+ * eeh_vfio_configure_pe - restore BARs for a passed device's PE
+ * @pdev: PCI device owned by VFIO
+ * @retval: receives 0 on success, -3 when the device check fails
+ *
+ * Returns 0 on success or the error from the device check.
+ */
+int eeh_vfio_configure_pe(struct pci_dev *pdev, int *retval)
+{
+       struct eeh_pe *pe;
+       int rc = eeh_vfio_check_dev(pdev, NULL, &pe);
+
+       if (rc) {
+               *retval = -3;
+               return rc;
+       }
+
+       /*
+        * Access to PCI config space on a VFIO device is restricted:
+        * parts of it, including the BAR registers, can be neither read
+        * nor written. The guest may therefore hold stale values for
+        * those registers, so they are restored on the host side.
+        */
+       eeh_pe_restore_bars(pe);
+       *retval = 0;
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(eeh_vfio_configure_pe);
+
+#endif /* CONFIG_VFIO_PCI_EEH */
+
 static int proc_eeh_show(struct seq_file *m, void *v)
 {
        if (!eeh_enabled()) {
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 7ba0424..05c3dde 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -25,6 +25,9 @@
 #include <linux/types.h>
 #include <linux/uaccess.h>
 #include <linux/vfio.h>
+#ifdef CONFIG_VFIO_PCI_EEH
+#include <asm/eeh.h>
+#endif
 
 #include "vfio_pci_private.h"
 
@@ -152,32 +155,57 @@ static void vfio_pci_disable(struct vfio_pci_device *vdev)
        pci_restore_state(pdev);
 }
 
+/*
+ * Forward a device release to the EEH layer. Compiles to an empty stub
+ * when CONFIG_VFIO_PCI_EEH is not set.
+ */
+#ifdef CONFIG_VFIO_PCI_EEH
+static void vfio_eeh_pci_release(struct pci_dev *pdev)
+{
+       eeh_vfio_release(pdev);
+}
+#else
+static void vfio_eeh_pci_release(struct pci_dev *pdev)
+{
+}
+#endif
+
+/* Drop one reference on the device and the module reference taken at open. */
 static void vfio_pci_release(void *device_data)
 {
        struct vfio_pci_device *vdev = device_data;
 
-       if (atomic_dec_and_test(&vdev->refcnt))
+       if (atomic_dec_and_test(&vdev->refcnt)) {
+               /* Count reached zero: notify EEH first, then disable */
+               vfio_eeh_pci_release(vdev->pdev);
                vfio_pci_disable(vdev);
+       }
 
        module_put(THIS_MODULE);
 }
 
+/*
+ * Forward a device open to the EEH layer and return its result. Always
+ * returns 0 when CONFIG_VFIO_PCI_EEH is not set.
+ */
+#ifdef CONFIG_VFIO_PCI_EEH
+static int vfio_eeh_pci_open(struct pci_dev *pdev)
+{
+       return eeh_vfio_open(pdev);
+}
+#else
+static int vfio_eeh_pci_open(struct pci_dev *pdev)
+{
+       return 0;
+}
+#endif
+
 static int vfio_pci_open(void *device_data)
 {
        struct vfio_pci_device *vdev = device_data;
+       int ret;
 
        if (!try_module_get(THIS_MODULE))
                return -ENODEV;
 
        if (atomic_inc_return(&vdev->refcnt) == 1) {
-               int ret = vfio_pci_enable(vdev);
-               if (ret) {
-                       module_put(THIS_MODULE);
-                       return ret;
-               }
+               ret = vfio_pci_enable(vdev);
+               if (ret)
+                       goto error;
+
+               ret = vfio_eeh_pci_open(vdev->pdev);
+               if (ret)
+                       goto error;
        }
 
        return 0;
+error:
+       module_put(THIS_MODULE);
+       return ret;
 }
 
 static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
@@ -321,6 +349,51 @@ static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev,
        return walk.ret;
 }
 
+/*
+ * Dispatch a VFIO_EEH_OP request to the matching EEH helper.
+ *
+ * @pdev: PCI device the request applies to
+ * @info: request block; info->op selects the operation and the matching
+ *        union member carries its inputs and receives its results
+ *
+ * Returns the helper's result, -EINVAL for an unknown op, or -ENOENT
+ * when CONFIG_VFIO_PCI_EEH is not set.
+ */
+static int vfio_eeh_pci_ioctl(struct pci_dev *pdev, struct vfio_eeh_op *info)
+{
+       int ret = 0;
+
+#ifdef CONFIG_VFIO_PCI_EEH
+       switch (info->op) {
+       case VFIO_EEH_OP_SET_OPTION:
+               ret = eeh_vfio_set_pe_option(pdev,
+                                            info->option.option,
+                                            &info->option.ret);
+               break;
+       case VFIO_EEH_OP_GET_ADDR:
+               ret = eeh_vfio_get_pe_addr(pdev,
+                                          info->addr.option,
+                                          &info->addr.ret,
+                                          &info->addr.info);
+               break;
+       case VFIO_EEH_OP_GET_STATE:
+               ret = eeh_vfio_get_pe_state(pdev,
+                                           &info->state.ret,
+                                           &info->state.reset_state);
+               /* These fields are fixed values, filled in on every call */
+               info->state.cfg_cap = 1;
+               info->state.pe_unavail_info = 1000;
+               info->state.pe_recovery_info = 0;
+               break;
+       case VFIO_EEH_OP_PE_RESET:
+               ret = eeh_vfio_reset_pe(pdev,
+                                       info->reset.option,
+                                       &info->reset.ret);
+               break;
+       case VFIO_EEH_OP_PE_CONFIG:
+               ret = eeh_vfio_configure_pe(pdev,
+                                           &info->config.ret);
+               /* Must not fall through: default would overwrite ret */
+               break;
+       default:
+               ret = -EINVAL;
+               pr_debug("%s: Cannot handle op#%d\n",
+                       __func__, info->op);
+       }
+#else
+       ret = -ENOENT;
+#endif
+
+       return ret;
+}
+
 static long vfio_pci_ioctl(void *device_data,
                           unsigned int cmd, unsigned long arg)
 {
@@ -682,6 +755,20 @@ hot_reset_release:
 
                kfree(groups);
                return ret;
+       } else if (cmd == VFIO_EEH_OP) {
+               struct vfio_eeh_op info;
+               int ret = 0;
+
+               minsz = sizeof(info);
+               if (copy_from_user(&info, (void __user *)arg, minsz))
+                       return -EFAULT;
+               if (info.argsz < minsz)
+                       return -EINVAL;
+
+               ret = vfio_eeh_pci_ioctl(vdev->pdev, &info);
+               if (copy_to_user((void __user *)arg, &info, minsz))
+                       ret = -EFAULT;
+               return ret;
        }
 
        return -ENOTTY;
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index cb9023d..518961d 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -455,6 +455,49 @@ struct vfio_iommu_spapr_tce_info {
 
 #define VFIO_IOMMU_SPAPR_TCE_GET_INFO  _IO(VFIO_TYPE, VFIO_BASE + 12)
 
+/*
+ * struct vfio_eeh_op carries EEH requests for a PCI device that has been
+ * passed from host to guest via VFIO. 'op' is one of VFIO_EEH_OP_*.
+ */
+#define VFIO_EEH_OP_SET_OPTION 0
+#define VFIO_EEH_OP_GET_ADDR   1
+#define VFIO_EEH_OP_GET_STATE  2
+#define VFIO_EEH_OP_PE_RESET   3
+#define VFIO_EEH_OP_PE_CONFIG  4
+
+struct vfio_eeh_op {
+       __u32 argsz;    /* must be at least sizeof(struct vfio_eeh_op) */
+       __u32 op;       /* one of VFIO_EEH_OP_*; selects the union member */
+
+       union {
+               /* VFIO_EEH_OP_SET_OPTION */
+               struct vfio_eeh_set_option {
+                       __u32 option;   /* in: option to apply */
+                       __s32 ret;      /* out: status of the operation */
+               } option;
+               /* VFIO_EEH_OP_GET_ADDR */
+               struct vfio_eeh_pe_addr {
+                       __u32 option;   /* in: 0 or 1 */
+                       __s32 ret;      /* out: status of the operation */
+                       __u32 info;     /* out: value selected by option */
+               } addr;
+               /* VFIO_EEH_OP_GET_STATE */
+               struct vfio_eeh_pe_state {
+                       __s32 ret;              /* out: status of the operation */
+                       __u32 reset_state;      /* out: translated PE state */
+                       __u32 cfg_cap;          /* out: currently always 1 */
+                       __u32 pe_unavail_info;  /* out: currently always 1000 */
+                       __u32 pe_recovery_info; /* out: currently always 0 */
+               } state;
+               /* VFIO_EEH_OP_PE_RESET */
+               struct vfio_eeh_reset {
+                       __u32 option;   /* in: reset type */
+                       __s32 ret;      /* out: status of the operation */
+               } reset;
+               /* VFIO_EEH_OP_PE_CONFIG */
+               struct vfio_eeh_config {
+                       __s32 ret;      /* out: status of the operation */
+               } config;
+       };
+};
+
+#define VFIO_EEH_OP    _IO(VFIO_TYPE, VFIO_BASE + 21)
+
 /* ***************************************************************** */
 
 #endif /* _UAPIVFIO_H */
-- 
1.8.3.2

--
To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to