From: Chen Fan <chen.fan.f...@cn.fujitsu.com> for supporting aer recovery, host and guest would run the same aer recovery code, that would do the secondary bus reset if the error is fatal, the aer recovery process: 1. error_detected 2. reset_link (if fatal) 3. slot_reset/mmio_enabled 4. resume
it indicates that host will do secondary bus reset to reset the physical devices under bus in step 2, that would cause devices in D3 status in a short time. but in qemu, we register an error detected handler, that would be invoked as host broadcasts the error-detected event in step 1, in order to avoid guest do reset_link when host do reset_link simultaneously. it may cause fatal error. we introduce a resmue notifier to assure host reset completely. then do guest aer injection. Signed-off-by: Chen Fan <chen.fan.f...@cn.fujitsu.com> --- hw/vfio/pci.c | 157 +++++++++++++++++++++++++++++++++++---------- hw/vfio/pci.h | 2 + linux-headers/linux/vfio.h | 1 + 3 files changed, 126 insertions(+), 34 deletions(-) diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 691ff5e..d79fb3d 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -2610,12 +2610,7 @@ static void vfio_put_device(VFIOPCIDevice *vdev) static void vfio_err_notifier_handler(void *opaque) { VFIOPCIDevice *vdev = opaque; - PCIDevice *dev = &vdev->pdev; Error *local_err = NULL; - PCIEAERMsg msg = { - .severity = 0, - .source_id = (pci_bus_num(dev->bus) << 8) | dev->devfn, - }; if (!event_notifier_test_and_clear(&vdev->err_notifier)) { return; @@ -2636,35 +2631,7 @@ static void vfio_err_notifier_handler(void *opaque) goto stop; } - /* - * we should read the error details from the real hardware - * configuration spaces, here we only need to do is signaling - * to guest an uncorrectable error has occurred. - */ - if (dev->exp.aer_cap) { - uint8_t *aer_cap = dev->config + dev->exp.aer_cap; - uint32_t uncor_status; - bool isfatal; - - uncor_status = vfio_pci_read_config(dev, - dev->exp.aer_cap + PCI_ERR_UNCOR_STATUS, 4); - - /* - * if the error is not emitted by this device, we can - * just ignore it. - */ - if (!(uncor_status & ~0UL)) { - return; - } - - isfatal = uncor_status & pci_get_long(aer_cap + PCI_ERR_UNCOR_SEVER); - - msg.severity = isfatal ? PCI_ERR_ROOT_CMD_FATAL_EN : - PCI_ERR_ROOT_CMD_NONFATAL_EN; - - pcie_aer_msg(dev, &msg); - return; - } + return; stop: /* @@ -2757,6 +2724,126 @@ static void vfio_unregister_err_notifier(VFIOPCIDevice *vdev) event_notifier_cleanup(&vdev->err_notifier); } +static void vfio_resume_notifier_handler(void *opaque) +{ + VFIOPCIDevice *vdev = opaque; + PCIDevice *dev = &vdev->pdev; + PCIEAERMsg msg = { + .severity = 0, + .source_id = (pci_bus_num(dev->bus) << 8) | dev->devfn, + }; + + if (!event_notifier_test_and_clear(&vdev->resume_notifier)) { + return; + } + + /* + * we should read the error details from the real hardware + * configuration spaces, here we only need to do is signaling + * to guest an uncorrectable error has occurred. + */ + if (dev->exp.aer_cap) { + uint8_t *aer_cap = dev->config + dev->exp.aer_cap; + uint32_t uncor_status; + bool isfatal; + + uncor_status = vfio_pci_read_config(dev, + dev->exp.aer_cap + PCI_ERR_UNCOR_STATUS, 4); + + /* + * if the error is not emitted by this device, we can + * just ignore it. + */ + if (!(uncor_status & ~0UL)) { + return; + } + + isfatal = uncor_status & pci_get_long(aer_cap + PCI_ERR_UNCOR_SEVER); + + msg.severity = isfatal ? PCI_ERR_ROOT_CMD_FATAL_EN : + PCI_ERR_ROOT_CMD_NONFATAL_EN; + + pcie_aer_msg(dev, &msg); + } +} + +static void vfio_register_aer_resume_notifier(VFIOPCIDevice *vdev) +{ + int ret; + int argsz; + struct vfio_irq_set *irq_set; + int32_t *pfd; + + if (!(vdev->features & VFIO_FEATURE_ENABLE_AER)) { + return; + } + + if (event_notifier_init(&vdev->resume_notifier, 0)) { + error_report("vfio: Unable to init event notifier for" + " resume notification"); + return; + } + + argsz = sizeof(*irq_set) + sizeof(*pfd); + + irq_set = g_malloc0(argsz); + irq_set->argsz = argsz; + irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | + VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_RESUME_IRQ_INDEX; + irq_set->start = 0; + irq_set->count = 1; + pfd = (int32_t *)&irq_set->data; + + *pfd = event_notifier_get_fd(&vdev->resume_notifier); + qemu_set_fd_handler(*pfd, vfio_resume_notifier_handler, NULL, vdev); + + ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set); + if (ret) { + error_report("vfio: Failed to set up resume notification"); + qemu_set_fd_handler(*pfd, NULL, NULL, vdev); + event_notifier_cleanup(&vdev->resume_notifier); + } else { + vdev->resume_enabled = true; + } + g_free(irq_set); +} + +static void vfio_unregister_aer_resume_notifier(VFIOPCIDevice *vdev) +{ + int argsz; + struct vfio_irq_set *irq_set; + int32_t *pfd; + int ret; + + if (!vdev->resume_enabled) { + return; + } + + argsz = sizeof(*irq_set) + sizeof(*pfd); + + irq_set = g_malloc0(argsz); + irq_set->argsz = argsz; + irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | + VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_RESUME_IRQ_INDEX; + irq_set->start = 0; + irq_set->count = 1; + pfd = (int32_t *)&irq_set->data; + *pfd = -1; + + ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set); + if (ret) { + error_report("vfio: Failed to de-assign error fd: %m"); + } + g_free(irq_set); + qemu_set_fd_handler(event_notifier_get_fd(&vdev->resume_notifier), + NULL, NULL, vdev); + event_notifier_cleanup(&vdev->resume_notifier); + + vdev->resume_enabled = false; +} + static void vfio_req_notifier_handler(void *opaque) { VFIOPCIDevice *vdev = opaque; @@ -3062,6 +3149,7 @@ static int vfio_initfn(PCIDevice *pdev) } vfio_register_err_notifier(vdev); + vfio_register_aer_resume_notifier(vdev); vfio_register_req_notifier(vdev); vfio_setup_resetfn_quirk(vdev); @@ -3092,6 +3180,7 @@ static void vfio_exitfn(PCIDevice *pdev) VFIOPCIDevice *vdev = DO_UPCAST(VFIOPCIDevice, pdev, pdev); vfio_unregister_req_notifier(vdev); + vfio_unregister_aer_resume_notifier(vdev); vfio_unregister_err_notifier(vdev); pci_device_set_intx_routing_notifier(&vdev->pdev, NULL); vfio_disable_interrupts(vdev); diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h index 9fb0206..3ebc58f 100644 --- a/hw/vfio/pci.h +++ b/hw/vfio/pci.h @@ -119,6 +119,7 @@ typedef struct VFIOPCIDevice { VFIOVGA *vga; /* 0xa0000, 0x3b0, 0x3c0 */ PCIHostDeviceAddress host; EventNotifier err_notifier; + EventNotifier resume_notifier; EventNotifier req_notifier; int (*resetfn)(struct VFIOPCIDevice *); uint32_t vendor_id; @@ -144,6 +145,7 @@ typedef struct VFIOPCIDevice { bool no_kvm_msi; bool no_kvm_msix; bool single_depend_dev; + bool resume_enabled; } VFIOPCIDevice; uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len); diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h index 15e096c..6d1826d 100644 --- a/linux-headers/linux/vfio.h +++ b/linux-headers/linux/vfio.h @@ -345,6 +345,7 @@ enum { VFIO_PCI_MSIX_IRQ_INDEX, VFIO_PCI_ERR_IRQ_INDEX, VFIO_PCI_REQ_IRQ_INDEX, + VFIO_PCI_RESUME_IRQ_INDEX, VFIO_PCI_NUM_IRQS }; -- 1.9.3