On ARM, when a device is behind an IOMMU, its MSI doorbell address is subject to translation by the IOMMU. This behavior affects vfio-pci passthrough devices assigned to guests using an accelerated SMMUv3.
In this setup, we configure the host SMMUv3 in nested mode, where VFIO sets up the Stage-2 (S2) mappings for guest RAM, while the guest controls Stage-1 (S1). To allow VFIO to correctly configure S2 mappings, we currently return the system address space via the get_address_space() callback for vfio-pci devices. However, QEMU/KVM also uses this same callback path when resolving the address space for MSI doorbells: kvm_irqchip_add_msi_route() -> kvm_arch_fixup_msi_route() -> pci_device_iommu_address_space(). This leads to problems when MSI doorbells need to be translated, because the doorbell lookup is then done against the system address space rather than the IOMMU address space. To fix this, introduce an optional get_msi_address_space() callback. In the SMMUv3 accelerated case, this callback returns the IOMMU address space if the guest has set up S1 translations for the vfio-pci device. Otherwise, it returns the system address space. Suggested-by: Nicolin Chen <nicol...@nvidia.com> Signed-off-by: Shameer Kolothum <shameerali.kolothum.th...@huawei.com> --- hw/arm/smmuv3-accel.c | 25 +++++++++++++++++++++++++ hw/pci/pci.c | 19 +++++++++++++++++++ include/hw/pci/pci.h | 16 ++++++++++++++++ target/arm/kvm.c | 2 +- 4 files changed, 61 insertions(+), 1 deletion(-) diff --git a/hw/arm/smmuv3-accel.c b/hw/arm/smmuv3-accel.c index f1584dd775..04c665ccf5 100644 --- a/hw/arm/smmuv3-accel.c +++ b/hw/arm/smmuv3-accel.c @@ -346,6 +346,30 @@ static void smmuv3_accel_unset_iommu_device(PCIBus *bus, void *opaque, } } +static AddressSpace *smmuv3_accel_find_msi_as(PCIBus *bus, void *opaque, + int devfn) +{ + SMMUState *bs = opaque; + SMMUPciBus *sbus; + SMMUv3AccelDevice *accel_dev; + SMMUDevice *sdev; + + sbus = smmu_get_sbus(bs, bus); + accel_dev = smmuv3_accel_get_dev(bs, sbus, bus, devfn); + sdev = &accel_dev->sdev; + + /* + * If the assigned vfio-pci dev has S1 translation enabled by + * Guest, return IOMMU address space for MSI translation. + * Otherwise, return system address space. 
+ */ + if (accel_dev->s1_hwpt) { + return &sdev->as; + } else { + return &accel_dev->as_sysmem; + } +} + static bool smmuv3_accel_pdev_allowed(PCIDevice *pdev, bool *vfio_pci) { @@ -407,6 +431,7 @@ static const PCIIOMMUOps smmuv3_accel_ops = { .get_viommu_cap = smmuv3_accel_get_viommu_cap, .set_iommu_device = smmuv3_accel_set_iommu_device, .unset_iommu_device = smmuv3_accel_unset_iommu_device, + .get_msi_address_space = smmuv3_accel_find_msi_as, }; void smmuv3_accel_init(SMMUv3State *s) diff --git a/hw/pci/pci.c b/hw/pci/pci.c index 13de0e2809..404aeb643d 100644 --- a/hw/pci/pci.c +++ b/hw/pci/pci.c @@ -2957,6 +2957,25 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice *dev) return &address_space_memory; } +AddressSpace *pci_device_iommu_msi_address_space(PCIDevice *dev) +{ + PCIBus *bus; + PCIBus *iommu_bus; + int devfn; + + pci_device_get_iommu_bus_devfn(dev, &iommu_bus, &bus, &devfn); + if (iommu_bus) { + if (iommu_bus->iommu_ops->get_msi_address_space) { + return iommu_bus->iommu_ops->get_msi_address_space(bus, + iommu_bus->iommu_opaque, devfn); + } else { + return iommu_bus->iommu_ops->get_address_space(bus, + iommu_bus->iommu_opaque, devfn); + } + } + return &address_space_memory; +} + int pci_iommu_init_iotlb_notifier(PCIDevice *dev, IOMMUNotifier *n, IOMMUNotify fn, void *opaque) { diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h index d1d43e9fb9..55138c406e 100644 --- a/include/hw/pci/pci.h +++ b/include/hw/pci/pci.h @@ -639,12 +639,28 @@ typedef struct PCIIOMMUOps { uint32_t pasid, bool priv_req, bool exec_req, hwaddr addr, bool lpig, uint16_t prgi, bool is_read, bool is_write); + /** + * @get_msi_address_space: get the address space for MSI doorbell address + * for devices + * + * Optional callback which returns a pointer to an #AddressSpace. This + * is required if MSI doorbell also gets translated through IOMMU(eg: ARM) + * + * @bus: the #PCIBus being accessed. + * + * @opaque: the data passed to pci_setup_iommu(). 
+ * + * @devfn: device and function number + */ + AddressSpace * (*get_msi_address_space)(PCIBus *bus, void *opaque, + int devfn); } PCIIOMMUOps; AddressSpace *pci_device_iommu_address_space(PCIDevice *dev); bool pci_device_set_iommu_device(PCIDevice *dev, HostIOMMUDevice *hiod, Error **errp); void pci_device_unset_iommu_device(PCIDevice *dev); +AddressSpace *pci_device_iommu_msi_address_space(PCIDevice *dev); /** * pci_device_get_viommu_cap: get vIOMMU capabilities. diff --git a/target/arm/kvm.c b/target/arm/kvm.c index 6672344855..c78d0d59bb 100644 --- a/target/arm/kvm.c +++ b/target/arm/kvm.c @@ -1535,7 +1535,7 @@ int kvm_arm_set_irq(int cpu, int irqtype, int irq, int level) int kvm_arch_fixup_msi_route(struct kvm_irq_routing_entry *route, uint64_t address, uint32_t data, PCIDevice *dev) { - AddressSpace *as = pci_device_iommu_address_space(dev); + AddressSpace *as = pci_device_iommu_msi_address_space(dev); hwaddr xlat, len, doorbell_gpa; MemoryRegionSection mrs; MemoryRegion *mr; -- 2.34.1