On Wed, Mar 22, 2017 at 06:36:52PM +0800, Cao jin wrote: > Make use of the non fatal error eventfd that the kernel module provide > to process the AER non fatal error. Fatal error still goes into the > legacy way which results in VM stop. > > Register the handler, wait for notification. Construct aer message and > pass it to root port on notification. Root port will trigger an interrupt > to signal guest, then guest driver will do the recovery. > > Signed-off-by: Dou Liyang <douly.f...@cn.fujitsu.com> > Signed-off-by: Cao jin <caoj.f...@cn.fujitsu.com> > --- > hw/vfio/pci.c | 247 > +++++++++++++++++++++++++++++++++++++++++++++ > hw/vfio/pci.h | 4 + > linux-headers/linux/vfio.h | 2 + > 3 files changed, 253 insertions(+) > > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c > index 3d0d005..4912bc6 100644 > --- a/hw/vfio/pci.c > +++ b/hw/vfio/pci.c > @@ -2422,6 +2422,34 @@ static void vfio_populate_device(VFIOPCIDevice *vdev, > Error **errp) > "Could not enable error recovery for the device", > vbasedev->name); > } > + > + irq_info.index = VFIO_PCI_NON_FATAL_ERR_IRQ_INDEX; > + irq_info.count = 0; /* clear */ > + ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info); > + if (ret) { > + /* This can fail for an old kernel or legacy PCI dev */ > + trace_vfio_populate_device_get_irq_info_failure(); > + } else if (irq_info.count == 1) { > + vdev->pci_aer_non_fatal = true; > + } else { > + error_report(WARN_PREFIX > + "Couldn't enable non fatal error recovery for the > device", > + vbasedev->name);
when does this trigger? > + } > + > + irq_info.index = VFIO_PCI_PASSIVE_RESET_IRQ_INDEX; > + irq_info.count = 0; /* clear */ > + ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info); > + if (ret) { > + /* This can fail for an old kernel or legacy PCI dev */ > + trace_vfio_populate_device_get_irq_info_failure(); > + } else if (irq_info.count == 1) { > + vdev->passive_reset = true; > + } else { > + error_report(WARN_PREFIX > + "Don't support passive reset notification", > + vbasedev->name); when does this happen? what does this message mean? > + } > } > > static void vfio_put_device(VFIOPCIDevice *vdev) > @@ -2432,6 +2460,221 @@ static void vfio_put_device(VFIOPCIDevice *vdev) > vfio_put_base_device(&vdev->vbasedev); > } > > +static void vfio_non_fatal_err_notifier_handler(void *opaque) > +{ > + VFIOPCIDevice *vdev = opaque; > + PCIDevice *dev = &vdev->pdev; > + PCIEAERMsg msg = { > + .severity = PCI_ERR_ROOT_CMD_NONFATAL_EN, > + .source_id = (pci_bus_num(dev->bus) << 8) | dev->devfn, > + }; > + Should this just use pci_requester_id? At least Peter thought so. > + if (!event_notifier_test_and_clear(&vdev->non_fatal_err_notifier)) { > + return; > + } > + > + /* Populate the aer msg and send it to root port */ > + if (dev->exp.aer_cap) { > + uint8_t *aer_cap = dev->config + dev->exp.aer_cap; > + uint32_t uncor_status; > + bool isfatal; > + > + uncor_status = vfio_pci_read_config(dev, > + dev->exp.aer_cap + PCI_ERR_UNCOR_STATUS, 4); > + if (!uncor_status) { > + return; > + } > + > + isfatal = uncor_status & pci_get_long(aer_cap + PCI_ERR_UNCOR_SEVER); > + if (isfatal) { > + goto stop; > + } > + > + error_report("%s sending non fatal event to root port. uncor status > = " > + "0x%"PRIx32, vdev->vbasedev.name, uncor_status); > + pcie_aer_msg(dev, &msg); > + return; > + } > + > +stop: > + /* Terminate the guest in case of fatal error */ > + error_report("%s(%s) fatal error detected. Please collect any data" > + " possible and then kill the guest", __func__, > vdev->vbasedev.name); "Device detected a fatal error. VM stopped". would be better IMO. > + vm_stop(RUN_STATE_INTERNAL_ERROR); > +} > + > +/* > + * Register non fatal error notifier for devices supporting error recovery. > + * If we encounter a failure in this function, we report an error > + * and continue after disabling error recovery support for the device. > + */ > +static void vfio_register_non_fatal_err_notifier(VFIOPCIDevice *vdev) > +{ > + int ret; > + int argsz; > + struct vfio_irq_set *irq_set; > + int32_t *pfd; > + > + if (!vdev->pci_aer_non_fatal) { > + return; > + } > + > + if (event_notifier_init(&vdev->non_fatal_err_notifier, 0)) { > + error_report("vfio: Unable to init event notifier for non-fatal > error detection"); > + vdev->pci_aer_non_fatal = false; > + return; > + } > + > + argsz = sizeof(*irq_set) + sizeof(*pfd); > + > + irq_set = g_malloc0(argsz); > + irq_set->argsz = argsz; > + irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | > + VFIO_IRQ_SET_ACTION_TRIGGER; > + irq_set->index = VFIO_PCI_NON_FATAL_ERR_IRQ_INDEX; > + irq_set->start = 0; > + irq_set->count = 1; > + pfd = (int32_t *)&irq_set->data; > + > + *pfd = event_notifier_get_fd(&vdev->non_fatal_err_notifier); > + qemu_set_fd_handler(*pfd, vfio_non_fatal_err_notifier_handler, NULL, > vdev); > + > + ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set); > + if (ret) { > + error_report("vfio: Failed to set up non-fatal error notification"); > + qemu_set_fd_handler(*pfd, NULL, NULL, vdev); > + event_notifier_cleanup(&vdev->non_fatal_err_notifier); > + vdev->pci_aer_non_fatal = false; > + } > + g_free(irq_set); > +} > + > +static void vfio_unregister_non_fatal_err_notifier(VFIOPCIDevice *vdev) > +{ > + int argsz; > + struct vfio_irq_set *irq_set; > + int32_t *pfd; > + int ret; > + > + if (!vdev->pci_aer_non_fatal) { > + return; > + } > + > + argsz = sizeof(*irq_set) + sizeof(*pfd); > + > + irq_set = g_malloc0(argsz); > + irq_set->argsz = argsz; > + irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | > + VFIO_IRQ_SET_ACTION_TRIGGER; > + irq_set->index = VFIO_PCI_NON_FATAL_ERR_IRQ_INDEX; > + irq_set->start = 0; > + irq_set->count = 1; > + pfd = (int32_t *)&irq_set->data; > + *pfd = -1; > + > + ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set); > + if (ret) { > + error_report("vfio: Failed to de-assign error fd: %m"); > + } > + g_free(irq_set); > + qemu_set_fd_handler(event_notifier_get_fd(&vdev->non_fatal_err_notifier), > + NULL, NULL, vdev); > + event_notifier_cleanup(&vdev->non_fatal_err_notifier); > +} > + > +static void vfio_passive_reset_notifier_handler(void *opaque) > +{ > + VFIOPCIDevice *vdev = opaque; > + > + if (!event_notifier_test_and_clear(&vdev->passive_reset_notifier)) { > + return; > + } > + > + error_report("device %s is forced to be reset. Please collect any data" > + " possible and then kill the guest", vdev->vbasedev.name); I think we should just tell what happened in detail. Don't tell user what to do: "Device lost state due to host device reset. VM stopped". > + vm_stop(RUN_STATE_INTERNAL_ERROR); > +} > + > +/* > + * Register passive reset notifier, in case of certain function of a > + * multifunction device is passthroughed, while other functions are still > + * controlled by device driver. > + */ > +static void vfio_register_passive_reset_notifier(VFIOPCIDevice *vdev) > +{ > + int ret; > + int argsz; > + struct vfio_irq_set *irq_set; > + int32_t *pfd; > + > + error_report("Registering event notifier for passive reset"); > + if (!vdev->passive_reset) { > + return; > + } > + > + if (event_notifier_init(&vdev->passive_reset_notifier, 0)) { > + error_report("vfio: Unable to init event notifier for passive > reset"); > + vdev->passive_reset = false; > + return; > + } > + > + argsz = sizeof(*irq_set) + sizeof(*pfd); > + > + irq_set = g_malloc0(argsz); > + irq_set->argsz = argsz; > + irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | > + VFIO_IRQ_SET_ACTION_TRIGGER; > + irq_set->index = VFIO_PCI_PASSIVE_RESET_IRQ_INDEX; > + irq_set->start = 0; > + irq_set->count = 1; > + pfd = (int32_t *)&irq_set->data; > + > + *pfd = event_notifier_get_fd(&vdev->passive_reset_notifier); > + qemu_set_fd_handler(*pfd, vfio_passive_reset_notifier_handler, NULL, > vdev); > + > + ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set); > + if (ret) { > + error_report("vfio: Failed to set up passive reset notification"); > + qemu_set_fd_handler(*pfd, NULL, NULL, vdev); > + event_notifier_cleanup(&vdev->passive_reset_notifier); > + vdev->passive_reset = false; > + } > + g_free(irq_set); > +} > + > +static void vfio_unregister_passive_reset_notifier(VFIOPCIDevice *vdev) > +{ > + int argsz; > + struct vfio_irq_set *irq_set; > + int32_t *pfd; > + int ret; > + > + if (!vdev->passive_reset) { > + return; > + } > + > + argsz = sizeof(*irq_set) + sizeof(*pfd); > + > + irq_set = g_malloc0(argsz); > + irq_set->argsz = argsz; > + irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | > + VFIO_IRQ_SET_ACTION_TRIGGER; > + irq_set->index = VFIO_PCI_PASSIVE_RESET_IRQ_INDEX; > + irq_set->start = 0; > + irq_set->count = 1; > + pfd = (int32_t *)&irq_set->data; > + *pfd = -1; > + > + ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set); > + if (ret) { > + error_report("vfio: Failed to de-assign error fd: %m"); > + } > + g_free(irq_set); > + qemu_set_fd_handler(event_notifier_get_fd(&vdev->passive_reset_notifier), > + NULL, NULL, vdev); > + event_notifier_cleanup(&vdev->passive_reset_notifier); > +} > + > static void vfio_err_notifier_handler(void *opaque) > { > VFIOPCIDevice *vdev = opaque; > @@ -2860,6 +3103,8 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) > } > } > > + vfio_register_passive_reset_notifier(vdev); > + vfio_register_non_fatal_err_notifier(vdev); > vfio_register_err_notifier(vdev); > vfio_register_req_notifier(vdev); > vfio_setup_resetfn_quirk(vdev); > @@ -2900,6 +3145,8 @@ static void vfio_exitfn(PCIDevice *pdev) > > vfio_unregister_req_notifier(vdev); > vfio_unregister_err_notifier(vdev); > + vfio_unregister_non_fatal_err_notifier(vdev); > + vfio_unregister_passive_reset_notifier(vdev); > pci_device_set_intx_routing_notifier(&vdev->pdev, NULL); > vfio_disable_interrupts(vdev); > if (vdev->intx.mmap_timer) { > diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h > index 34e8b04..dcb0e0a 100644 > --- a/hw/vfio/pci.h > +++ b/hw/vfio/pci.h > @@ -119,6 +119,8 @@ typedef struct VFIOPCIDevice { > void *igd_opregion; > PCIHostDeviceAddress host; > EventNotifier err_notifier; > + EventNotifier non_fatal_err_notifier; > + EventNotifier passive_reset_notifier; > EventNotifier req_notifier; > int (*resetfn)(struct VFIOPCIDevice *); > uint32_t vendor_id; > @@ -137,6 +139,8 @@ typedef struct VFIOPCIDevice { > uint32_t igd_gms; > uint8_t pm_cap; > bool pci_aer; > + bool pci_aer_non_fatal; > + bool passive_reset; > bool req_enabled; > bool has_flr; > bool has_pm_reset; Why do you need these flags? Why isn't the notifier sufficient? > diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h > index 759b850..726ddbe 100644 > --- a/linux-headers/linux/vfio.h > +++ b/linux-headers/linux/vfio.h > @@ -433,6 +433,8 @@ enum { > VFIO_PCI_MSIX_IRQ_INDEX, > VFIO_PCI_ERR_IRQ_INDEX, > VFIO_PCI_REQ_IRQ_INDEX, > + VFIO_PCI_NON_FATAL_ERR_IRQ_INDEX, > + VFIO_PCI_PASSIVE_RESET_IRQ_INDEX, > VFIO_PCI_NUM_IRQS > }; > > -- > 1.8.3.1 > >