Open the VFIO device FDs in the libvirt backend without exposing them to XML users: one FD is opened per iommufd hostdev from /dev/vfio/devices/vfioX and passed to QEMU on the command line.
Signed-off-by: Nathan Chen <[email protected]>
---
 src/conf/domain_conf.h   |   2 +
 src/libvirt_private.syms |   1 +
 src/qemu/qemu_command.c  |  35 ++++++++++
 src/qemu/qemu_domain.c   |  47 ++++++++++++++
 src/qemu/qemu_domain.h   |  17 +++++
 src/qemu/qemu_process.c  | 128 +++++++++++++++++++++++++++++++++++++++
 src/util/virpci.c        |  91 +++++++++++++++++++++++++++
 src/util/virpci.h        |   2 +
 8 files changed, 323 insertions(+)

diff --git a/src/conf/domain_conf.h b/src/conf/domain_conf.h
index 4fd8342950..da4ce9fc86 100644
--- a/src/conf/domain_conf.h
+++ b/src/conf/domain_conf.h
@@ -364,6 +364,8 @@ struct _virDomainHostdevDef {
      */
     virDomainNetDef *parentnet;
 
+    virObject *privateData;
+
     virDomainHostdevMode mode;
     virDomainStartupPolicy startupPolicy;
     bool managed;
diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms
index 4e57e4a8f6..ed2b0d381e 100644
--- a/src/libvirt_private.syms
+++ b/src/libvirt_private.syms
@@ -3159,6 +3159,7 @@ virPCIDeviceGetStubDriverName;
 virPCIDeviceGetStubDriverType;
 virPCIDeviceGetUnbindFromStub;
 virPCIDeviceGetUsedBy;
+virPCIDeviceGetVfioPath;
 virPCIDeviceGetVPD;
 virPCIDeviceHasPCIExpressLink;
 virPCIDeviceIsAssignable;
diff --git a/src/qemu/qemu_command.c b/src/qemu/qemu_command.c
index 95d1c2ee98..9b08f66175 100644
--- a/src/qemu/qemu_command.c
+++ b/src/qemu/qemu_command.c
@@ -4756,6 +4756,12 @@ qemuBuildPCIHostdevDevProps(const virDomainDef *def,
     const char *iommufdId = NULL;
     /* 'ramfb' property must be omitted unless it's to be enabled */
     bool ramfb = pcisrc->ramfb == VIR_TRISTATE_SWITCH_ON;
+    bool useIommufd = false;
+
+    if (pcisrc->driver.name == VIR_DEVICE_HOSTDEV_PCI_DRIVER_NAME_VFIO &&
+        pcisrc->driver.iommufd == VIR_TRISTATE_BOOL_YES) {
+        useIommufd = true;
+    }
 
     /* caller has to assign proper passthrough driver name */
     switch (pcisrc->driver.name) {
@@ -4802,6 +4808,21 @@ qemuBuildPCIHostdevDevProps(const virDomainDef *def,
                               NULL) < 0)
         return NULL;
 
+    if (useIommufd && dev->privateData) {
+        qemuDomainHostdevPrivate *hostdevPriv = QEMU_DOMAIN_HOSTDEV_PRIVATE(dev);
+
+        if (hostdevPriv->vfioDeviceFd >= 0) {
+            /* The JSON helper copies string values, so the formatted FD
+             * number has to be owned and freed here to avoid leaking it */
+            g_autofree char *fdstr = g_strdup_printf("%d", hostdevPriv->vfioDeviceFd);
+
+            if (virJSONValueObjectAdd(&props,
+                                      "S:fd", fdstr,
+                                      NULL) < 0)
+                return NULL;
+        }
+    }
+
     if (qemuBuildDeviceAddressProps(props, def, dev->info) < 0)
         return NULL;
 
@@ -5260,6 +5281,20 @@ qemuBuildHostdevCommandLine(virCommand *cmd,
             if (qemuCommandAddExtDevice(cmd, hostdev->info, def, qemuCaps) < 0)
                 return -1;
 
+            if (subsys->u.pci.driver.iommufd == VIR_TRISTATE_BOOL_YES) {
+                qemuDomainHostdevPrivate *hostdevPriv = QEMU_DOMAIN_HOSTDEV_PRIVATE(hostdev);
+
+                if (hostdevPriv && hostdevPriv->vfioDeviceFd >= 0) {
+                    /* NOTE(review): CLOSE_PARENT hands the FD over to @cmd,
+                     * yet vfioDeviceFd keeps the number and the private data
+                     * may close it a second time. It cannot be reset here as
+                     * the props builder below still reads it - confirm the
+                     * intended ownership of the FD. */
+                    virCommandPassFD(cmd, hostdevPriv->vfioDeviceFd,
+                                     VIR_COMMAND_PASS_FD_CLOSE_PARENT);
+                }
+            }
+
             if (!(devprops = qemuBuildPCIHostdevDevProps(def, hostdev)))
                 return -1;
 
diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
index ac56fc7cb4..7601bdbb2b 100644
--- a/src/qemu/qemu_domain.c
+++ b/src/qemu/qemu_domain.c
@@ -1238,6 +1238,53 @@ qemuDomainNetworkPrivateFormat(const virDomainNetDef *net,
 }
 
 
+static virClass *qemuDomainHostdevPrivateClass;
+
+/* Closes the VFIO device FD held by the private data */
+static void
+qemuDomainHostdevPrivateDispose(void *obj)
+{
+    qemuDomainHostdevPrivate *priv = obj;
+
+    VIR_FORCE_CLOSE(priv->vfioDeviceFd);
+}
+
+
+static int
+qemuDomainHostdevPrivateOnceInit(void)
+{
+    if (!VIR_CLASS_NEW(qemuDomainHostdevPrivate, virClassForObject()))
+        return -1;
+
+    return 0;
+}
+
+VIR_ONCE_GLOBAL_INIT(qemuDomainHostdevPrivate);
+
+/**
+ * qemuDomainHostdevPrivateNew:
+ *
+ * Allocates hostdev private data with the VFIO device FD marked as unset.
+ *
+ * Returns: the new object, or NULL on failure
+ */
+virObject *
+qemuDomainHostdevPrivateNew(void)
+{
+    qemuDomainHostdevPrivate *priv;
+
+    if (qemuDomainHostdevPrivateInitialize() < 0)
+        return NULL;
+
+    if (!(priv = virObjectNew(qemuDomainHostdevPrivateClass)))
+        return NULL;
+
+    priv->vfioDeviceFd = -1;
+
+    return (virObject *) priv;
+}
+
+
 /* qemuDomainSecretInfoSetup:
  * @priv: pointer to domain private object
  * @alias: alias of the secret
diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h
index 3396f929fd..4736f1ede5 100644
--- a/src/qemu/qemu_domain.h
+++ b/src/qemu/qemu_domain.h
@@ -461,6 +461,17 @@ struct _qemuDomainTPMPrivate {
 };
 
 
+#define QEMU_DOMAIN_HOSTDEV_PRIVATE(hostdev) \
+    ((qemuDomainHostdevPrivate *) (hostdev)->privateData)
+
+typedef struct _qemuDomainHostdevPrivate qemuDomainHostdevPrivate;
+struct _qemuDomainHostdevPrivate {
+    virObject parent;
+
+    /* VFIO device file descriptor for iommufd passthrough */
+    int vfioDeviceFd;
+};
+
 void
 qemuDomainNetworkPrivateClearFDs(qemuDomainNetworkPrivate *priv);
 
@@ -1174,3 +1185,9 @@ qemuDomainCheckCPU(virArch arch,
 bool
 qemuDomainMachineSupportsFloppy(const char *machine,
                                 virQEMUCaps *qemuCaps);
+
+virObject *
+qemuDomainHostdevPrivateNew(void);
+
+int qemuProcessOpenVfioFds(virDomainObj *vm);
+void qemuProcessCloseVfioFds(virDomainObj *vm);
diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
index 45fc32a663..bf245ee8af 100644
--- a/src/qemu/qemu_process.c
+++ b/src/qemu/qemu_process.c
@@ -106,6 +106,7 @@
 
 #include "logging/log_manager.h"
 #include "logging/log_protocol.h"
+#include "util/virpci.h"
 
 #define VIR_FROM_THIS VIR_FROM_QEMU
 
@@ -8091,6 +8092,9 @@ qemuProcessLaunch(virConnectPtr conn,
     if (qemuExtDevicesStart(driver, vm, incomingMigrationExtDevices) < 0)
         goto cleanup;
 
+    if (qemuProcessOpenVfioFds(vm) < 0)
+        goto cleanup;
+
     if (!(cmd = qemuBuildCommandLine(vm,
                                      incoming ? "defer" : NULL,
                                      vmop,
@@ -10267,3 +10271,127 @@ qemuProcessHandleNbdkitExit(qemuNbdkitProcess *nbdkit,
     qemuProcessEventSubmit(vm, QEMU_PROCESS_EVENT_NBDKIT_EXITED, 0, 0, nbdkit);
     virObjectUnlock(vm);
 }
+
+/**
+ * qemuProcessOpenVfioDeviceFd:
+ * @hostdev: host device definition
+ * @vfioFd: filled with the opened file descriptor on success
+ *
+ * Opens the VFIO device node of a PCI hostdev for reading and writing.
+ *
+ * Returns: 0 on success, -1 on failure
+ */
+static int
+qemuProcessOpenVfioDeviceFd(virDomainHostdevDef *hostdev,
+                            int *vfioFd)
+{
+    g_autofree char *vfioPath = NULL;
+    int fd = -1;
+
+    if (hostdev->mode != VIR_DOMAIN_HOSTDEV_MODE_SUBSYS ||
+        hostdev->source.subsys.type != VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI) {
+        virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
+                       _("VFIO FD only supported for PCI hostdevs"));
+        return -1;
+    }
+
+    if (virPCIDeviceGetVfioPath(&hostdev->source.subsys.u.pci.addr, &vfioPath) < 0)
+        return -1;
+
+    VIR_DEBUG("Opening VFIO device %s", vfioPath);
+
+    if ((fd = open(vfioPath, O_RDWR | O_CLOEXEC)) < 0) {
+        if (errno == ENOENT) {
+            virReportError(VIR_ERR_CONFIG_UNSUPPORTED,
+                           _("VFIO device %1$s not found - ensure device is bound to vfio-pci driver"),
+                           vfioPath);
+        } else {
+            virReportSystemError(errno,
+                                 _("cannot open VFIO device %1$s"), vfioPath);
+        }
+        return -1;
+    }
+
+    *vfioFd = fd;
+    VIR_DEBUG("Opened VFIO device FD %d for %s", *vfioFd, vfioPath);
+    return 0;
+}
+
+/**
+ * qemuProcessOpenVfioFds:
+ * @vm: domain object
+ *
+ * Opens a VFIO device FD for every VFIO PCI hostdev of @vm that has iommufd
+ * enabled and stores it in the hostdev private data (allocated on demand).
+ * On failure the FDs stored in the hostdev private data are closed again.
+ *
+ * Returns: 0 on success, -1 on failure
+ */
+int
+qemuProcessOpenVfioFds(virDomainObj *vm)
+{
+    size_t i;
+
+    for (i = 0; i < vm->def->nhostdevs; i++) {
+        virDomainHostdevDef *hostdev = vm->def->hostdevs[i];
+        virDomainHostdevSubsysPCI *pcisrc = &hostdev->source.subsys.u.pci;
+        qemuDomainHostdevPrivate *hostdevPriv = NULL;
+
+        /* Only VFIO PCI hostdevs with iommufd enabled need a device FD */
+        if (hostdev->mode != VIR_DOMAIN_HOSTDEV_MODE_SUBSYS ||
+            hostdev->source.subsys.type != VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI ||
+            pcisrc->driver.name != VIR_DEVICE_HOSTDEV_PCI_DRIVER_NAME_VFIO ||
+            pcisrc->driver.iommufd != VIR_TRISTATE_BOOL_YES)
+            continue;
+
+        if (!hostdev->privateData &&
+            !(hostdev->privateData = qemuDomainHostdevPrivateNew()))
+            goto error;
+
+        hostdevPriv = QEMU_DOMAIN_HOSTDEV_PRIVATE(hostdev);
+
+        if (qemuProcessOpenVfioDeviceFd(hostdev, &hostdevPriv->vfioDeviceFd) < 0)
+            goto error;
+
+        VIR_DEBUG("Stored VFIO FD %d in hostdev %04x:%02x:%02x.%d private data",
+                  hostdevPriv->vfioDeviceFd,
+                  pcisrc->addr.domain,
+                  pcisrc->addr.bus,
+                  pcisrc->addr.slot,
+                  pcisrc->addr.function);
+    }
+
+    return 0;
+
+ error:
+    qemuProcessCloseVfioFds(vm);
+    return -1;
+}
+
+/**
+ * qemuProcessCloseVfioFds:
+ * @vm: domain object
+ *
+ * Closes every VFIO device FD stored in the private data of @vm's hostdevs.
+ */
+void
+qemuProcessCloseVfioFds(virDomainObj *vm)
+{
+    size_t i;
+
+    /* Close all VFIO device FDs */
+    for (i = 0; i < vm->def->nhostdevs; i++) {
+        virDomainHostdevDef *hostdev = vm->def->hostdevs[i];
+        qemuDomainHostdevPrivate *hostdevPriv;
+
+        if (!hostdev->privateData)
+            continue;
+
+        hostdevPriv = QEMU_DOMAIN_HOSTDEV_PRIVATE(hostdev);
+
+        if (hostdevPriv->vfioDeviceFd >= 0) {
+            VIR_DEBUG("Closing VFIO device FD %d", hostdevPriv->vfioDeviceFd);
+            VIR_FORCE_CLOSE(hostdevPriv->vfioDeviceFd);
+        }
+    }
+}
diff --git a/src/util/virpci.c b/src/util/virpci.c
index 90617e69c6..da62ece0f6 100644
--- a/src/util/virpci.c
+++ b/src/util/virpci.c
@@ -3320,3 +3320,94 @@ virPCIDeviceAddressFree(virPCIDeviceAddress *address)
 {
     g_free(address);
 }
+
+/**
+ * virPCIDeviceGetVfioPath:
+ * @addr: host device PCI address
+ * @vfioPath: returned VFIO device path, to be freed by the caller
+ *
+ * Looks up the VFIO device node (/dev/vfio/devices/vfioX) that belongs to
+ * the PCI device @addr by consulting sysfs.
+ *
+ * Returns: 0 on success, -1 on failure
+ */
+int
+virPCIDeviceGetVfioPath(virPCIDeviceAddress *addr,
+                        char **vfioPath)
+{
+    g_autofree char *addrStr = NULL;
+
+    *vfioPath = NULL;
+    addrStr = virPCIDeviceAddressAsString(addr);
+
+    /* First try: Direct lookup in device's vfio-dev subdirectory */
+    {
+        g_autofree char *sysfsPath = NULL;
+        g_autoptr(DIR) dir = NULL;
+        struct dirent *entry = NULL;
+        int rc;
+
+        sysfsPath = g_strdup_printf("/sys/bus/pci/devices/%s/vfio-dev/", addrStr);
+
+        /* A missing directory is not an error by itself: fall through to
+         * the second lookup instead of reporting it */
+        if ((rc = virDirOpenIfExists(&dir, sysfsPath)) < 0)
+            return -1;
+
+        if (rc > 0) {
+            while ((rc = virDirRead(dir, &entry, sysfsPath)) > 0) {
+                if (STRPREFIX(entry->d_name, "vfio")) {
+                    *vfioPath = g_strdup_printf("/dev/vfio/devices/%s", entry->d_name);
+                    return 0;
+                }
+            }
+
+            if (rc < 0)
+                return -1;
+        }
+    }
+
+    /* Second try: Scan /sys/class/vfio-dev */
+    {
+        const char *sysfsPath = "/sys/class/vfio-dev";
+        g_autoptr(DIR) dir = NULL;
+        struct dirent *entry = NULL;
+        int rc;
+
+        if ((rc = virDirOpenIfExists(&dir, sysfsPath)) < 0)
+            return -1;
+
+        if (rc > 0) {
+            while ((rc = virDirRead(dir, &entry, sysfsPath)) > 0) {
+                g_autofree char *devLink = NULL;
+                g_autofree char *target = NULL;
+                g_autofree char *targetName = NULL;
+
+                if (!STRPREFIX(entry->d_name, "vfio"))
+                    continue;
+
+                devLink = g_strdup_printf("/sys/class/vfio-dev/%s/device", entry->d_name);
+
+                if (virFileResolveLink(devLink, &target) < 0)
+                    continue;
+
+                /* Match the last path component exactly: a substring match
+                 * would also accept a resolved path that merely contains
+                 * the address, e.g. as a parent component */
+                targetName = g_path_get_basename(target);
+                if (STREQ(targetName, addrStr)) {
+                    *vfioPath = g_strdup_printf("/dev/vfio/devices/%s", entry->d_name);
+                    return 0;
+                }
+            }
+
+            if (rc < 0)
+                return -1;
+        }
+    }
+
+    virReportError(VIR_ERR_INTERNAL_ERROR,
+                   _("cannot find VFIO device for PCI device %1$s"),
+                   addrStr);
+    return -1;
+}
diff --git a/src/util/virpci.h b/src/util/virpci.h
index fc538566e1..24ede10755 100644
--- a/src/util/virpci.h
+++ b/src/util/virpci.h
@@ -296,6 +296,8 @@ void virPCIEDeviceInfoFree(virPCIEDeviceInfo *dev);
 
 void virPCIDeviceAddressFree(virPCIDeviceAddress *address);
 
+int virPCIDeviceGetVfioPath(virPCIDeviceAddress *addr, char **vfioPath);
+
 G_DEFINE_AUTOPTR_CLEANUP_FUNC(virPCIDevice, virPCIDeviceFree);
 G_DEFINE_AUTOPTR_CLEANUP_FUNC(virPCIDeviceAddress, virPCIDeviceAddressFree);
 G_DEFINE_AUTOPTR_CLEANUP_FUNC(virPCIEDeviceInfo, virPCIEDeviceInfoFree);
-- 
2.43.0
