Currently we bind a managed hostdev back to the host driver (or "unbind" from the perspective of the stub driver) immediately upon receiving a DEVICE_DELETED event from QEMU. In cases where we have more one device from the group attached to a guest, this runs the risk of putting the group in a "non-viable" state where both a guest and host are using devices from a group simultaneously.
This patch addresses this by deferring the unbind step until all hostdevs from a group have been detached from the guest. In the meantime, they are left on the drvManager's inactiveList, in a similar state as they would be if they were unmanaged devices that were bound to VFIO via nodedev-detach but not yet plugged into a guest. Signed-off-by: Michael Roth <mdr...@linux.vnet.ibm.com> --- src/libvirt_private.syms | 3 ++ src/qemu/qemu_hostdev.c | 16 +++++++++ src/qemu/qemu_hostdev.h | 4 +++ src/qemu/qemu_hotplug.c | 16 ++++++++- src/util/virhostdev.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++++ src/util/virhostdev.h | 8 +++++ src/util/virpci.c | 27 +++++++++++++++ src/util/virpci.h | 1 + 8 files changed, 164 insertions(+), 1 deletion(-) diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms index 2bd3581..ba7fa39 100644 --- a/src/libvirt_private.syms +++ b/src/libvirt_private.syms @@ -1783,6 +1783,8 @@ virHostCPUStatsAssign; virHostdevFindUSBDevice; virHostdevIsSCSIDevice; virHostdevManagerGetDefault; +virHostdevPCIDeviceGroupUnbind; +virHostdevPCIDeviceGroupUnbindable; virHostdevPCINodeDeviceDetach; virHostdevPCINodeDeviceReAttach; virHostdevPCINodeDeviceReset; @@ -2342,6 +2344,7 @@ virPCIDeviceWaitForCleanup; virPCIEDeviceInfoFree; virPCIGetDeviceAddressFromSysfsLink; virPCIGetHeaderType; +virPCIGetIOMMUGroupList; virPCIGetNetName; virPCIGetPhysicalFunction; virPCIGetVirtualFunctionIndex; diff --git a/src/qemu/qemu_hostdev.c b/src/qemu/qemu_hostdev.c index 73d26f4..fdc52fe 100644 --- a/src/qemu/qemu_hostdev.c +++ b/src/qemu/qemu_hostdev.c @@ -384,6 +384,22 @@ qemuHostdevPrepareDomainDevices(virQEMUDriverPtr driver, } void +qemuHostdevReleasePCIDevices(virQEMUDriverPtr driver, + const char *name, + virDomainHostdevDefPtr *hostdevs, + int nhostdevs) +{ + virQEMUDriverConfigPtr cfg = virQEMUDriverGetConfig(driver); + const char *oldStateDir = cfg->stateDir; + virHostdevManagerPtr hostdev_mgr = driver->hostdevMgr; + + virHostdevReleasePCIDevices(hostdev_mgr, QEMU_DRIVER_NAME, name, + hostdevs, nhostdevs, oldStateDir); + + virObjectUnref(cfg); +} + +void qemuHostdevReAttachPCIDevices(virQEMUDriverPtr driver, const char *name, virDomainHostdevDefPtr *hostdevs, diff --git a/src/qemu/qemu_hostdev.h b/src/qemu/qemu_hostdev.h index 9a7c7f1..b010085 100644 --- a/src/qemu/qemu_hostdev.h +++ b/src/qemu/qemu_hostdev.h @@ -74,6 +74,10 @@ void qemuHostdevReAttachPCIDevices(virQEMUDriverPtr driver, const char *name, virDomainHostdevDefPtr *hostdevs, int nhostdevs); +void qemuHostdevReleasePCIDevices(virQEMUDriverPtr driver, + const char *name, + virDomainHostdevDefPtr *hostdevs, + int nhostdevs); void qemuHostdevReAttachUSBDevices(virQEMUDriverPtr driver, const char *name, virDomainHostdevDefPtr *hostdevs, diff --git a/src/qemu/qemu_hotplug.c b/src/qemu/qemu_hotplug.c index b557e82..af5ee6f 100644 --- a/src/qemu/qemu_hotplug.c +++ b/src/qemu/qemu_hotplug.c @@ -3896,7 +3896,10 @@ qemuDomainRemoveHostDevice(virQEMUDriverPtr driver, switch ((virDomainHostdevSubsysType) hostdev->source.subsys.type) { case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI: - qemuHostdevReAttachPCIDevices(driver, vm->def->name, &hostdev, 1); + if (is_vfio) + qemuHostdevReleasePCIDevices(driver, vm->def->name, &hostdev, 1); + else + qemuHostdevReAttachPCIDevices(driver, vm->def->name, &hostdev, 1); qemuDomainReleaseDeviceAddress(vm, hostdev->info, NULL); /* QEMU might no longer need to lock as much memory, eg. we just * detached the last VFIO device, so adjust the limit here */ @@ -3925,6 +3928,17 @@ qemuDomainRemoveHostDevice(virQEMUDriverPtr driver, virDomainNetDefFree(net); } + if (is_vfio) { + int iommu_group = + virPCIDeviceAddressGetIOMMUGroupNum(&hostdev->source.subsys.u.pci.addr); + if (virHostdevPCIDeviceGroupUnbindable(driver->hostdevMgr, + iommu_group)) { + virHostdevPCIDeviceGroupUnbind(driver->hostdevMgr, + iommu_group); + } + } + + ret = 0; cleanup: diff --git a/src/util/virhostdev.c b/src/util/virhostdev.c index 2cd3f34..a7f04fe 100644 --- a/src/util/virhostdev.c +++ b/src/util/virhostdev.c @@ -905,6 +905,96 @@ virHostdevPreparePCIDevices(virHostdevManagerPtr mgr, return ret; } +static bool +virHostdevPCIDeviceUnbindableInternal(virHostdevManagerPtr mgr, + int iommu_group) +{ + struct virHostdevIsPCINodeDeviceUsedData data = { mgr, NULL, true }; + + if (virPCIIOMMUGroupIterate(iommu_group, + virHostdevIsPCINodeDeviceUsed, + &data) < 0) { + VIR_DEBUG("IOMMU group %d is not unbindable", iommu_group); + return false; + } + + VIR_DEBUG("IOMMU group %d is unbindable", iommu_group); + return true; +} + +/* + * Check if devices within IOMMU group are in use by any domains + */ +bool +virHostdevPCIDeviceGroupUnbindable(virHostdevManagerPtr mgr, + int iommu_group) +{ + bool result; + + virObjectLock(mgr->activePCIHostdevs); + result = virHostdevPCIDeviceUnbindableInternal(mgr, iommu_group); + virObjectUnlock(mgr->activePCIHostdevs); + + return result; +} + +/* + * Confirm all devices in IOMMU group are in inactiveList + * before attempting to reattach to host driver. Devices in IOMMU + * group that aren't in either activeList or inactiveList are considered + * outside our control, so we treat them as inactive as well. + * + * Callers can check virHostdevPCIDeviceGroupUnbindable() beforehand + * for some indication that the group is ready for reattach to the + * host, but since it's possible for a hostdev from the group to get + * re-attached to a guest prior to subsequently calling this function + * there is no guarantee of this, which should be fine since it would + * only be immediately rebound to the stub driver anyway. + */ +void +virHostdevPCIDeviceGroupUnbind(virHostdevManagerPtr mgr, + int iommu_group) +{ + virPCIDeviceListPtr pcidevs = NULL; + size_t i; + + virObjectLock(mgr->activePCIHostdevs); + virObjectLock(mgr->inactivePCIHostdevs); + + if (!virHostdevPCIDeviceUnbindableInternal(mgr, iommu_group)) { + VIR_DEBUG("IOMMU group %d still in use, deferring reattach " + "of PCI devices to host", iommu_group); + goto cleanup; + } + + pcidevs = virPCIGetIOMMUGroupList(iommu_group); + for (i = 0; i < virPCIDeviceListCount(pcidevs); i++) { + virPCIDevicePtr actual, pci = virPCIDeviceListGet(pcidevs, i); + virPCIDeviceAddressPtr devAddr = virPCIDeviceGetAddress(pci); + + actual = virPCIDeviceListFindByIDs(mgr->inactivePCIHostdevs, + devAddr->domain, + devAddr->bus, + devAddr->slot, + devAddr->function); + if (actual) { + VIR_DEBUG("Reattaching PCI device %s", virPCIDeviceGetName(actual)); + if (virPCIDeviceGetManaged(actual)) + if (virPCIDeviceReattach(actual, mgr->activePCIHostdevs, + mgr->inactivePCIHostdevs) < 0) { + VIR_ERROR(_("Failed to re-attach PCI device: %s"), + virGetLastErrorMessage()); + virResetLastError(); + } + } + } + + cleanup: + virObjectUnref(pcidevs); + virObjectUnlock(mgr->activePCIHostdevs); + virObjectUnlock(mgr->inactivePCIHostdevs); +} + /* * Pre-condition: inactivePCIHostdevs & activePCIHostdevs * are locked diff --git a/src/util/virhostdev.h b/src/util/virhostdev.h index fbc7fbd..2ab8101 100644 --- a/src/util/virhostdev.h +++ b/src/util/virhostdev.h @@ -122,6 +122,14 @@ virHostdevReleasePCIDevices(virHostdevManagerPtr mgr, const char *oldStateDir) ATTRIBUTE_NONNULL(1); void +virHostdevPCIDeviceGroupUnbind(virHostdevManagerPtr mgr, + int iommu_group) + ATTRIBUTE_NONNULL(1); +bool +virHostdevPCIDeviceGroupUnbindable(virHostdevManagerPtr mgr, + int iommu_group) + ATTRIBUTE_NONNULL(1); +void virHostdevReAttachUSBDevices(virHostdevManagerPtr hostdev_mgr, const char *drv_name, const char *dom_name, diff --git a/src/util/virpci.c b/src/util/virpci.c index b842f44..a8e5190 100644 --- a/src/util/virpci.c +++ b/src/util/virpci.c @@ -2298,6 +2298,33 @@ virPCIDeviceGetIOMMUGroupList(virPCIDevicePtr dev) } +/* + * virPCIGetIOMMUGroupList - return a virPCIDeviceList containing + * all of the devices in @iommu_group. + * + * Return the new list, or NULL on failure + */ +virPCIDeviceListPtr +virPCIGetIOMMUGroupList(int iommu_group) +{ + virPCIDeviceListPtr groupList = virPCIDeviceListNew(); + + if (!groupList) + goto error; + + if (virPCIIOMMUGroupIterate(iommu_group, + virPCIDeviceGetIOMMUGroupAddOne, + groupList) < 0) + goto error; + + return groupList; + + error: + virObjectUnref(groupList); + return NULL; +} + + typedef struct { virPCIDeviceAddressPtr **iommuGroupDevices; size_t *nIommuGroupDevices; diff --git a/src/util/virpci.h b/src/util/virpci.h index 5ec1306..5bcacb2 100644 --- a/src/util/virpci.h +++ b/src/util/virpci.h @@ -180,6 +180,7 @@ int virPCIIOMMUGroupIterate(int iommu_group, virPCIDeviceAddressActor actor, void *opaque); virPCIDeviceListPtr virPCIDeviceGetIOMMUGroupList(virPCIDevicePtr dev); +virPCIDeviceListPtr virPCIGetIOMMUGroupList(int iommu_group); int virPCIDeviceAddressGetIOMMUGroupAddresses(virPCIDeviceAddressPtr devAddr, virPCIDeviceAddressPtr **iommuGroupDevices, size_t *nIommuGroupDevices); -- 2.7.4