The sPAPR IOMMU manages two copies of a TCE table:
1) a guest view of the table - this is what emulated devices use and
this is where H_GET_TCE reads from;
2) a hardware TCE table - only present if there is at least one vfio-pci
device on a PHB; it is updated via a memory listener on the PHB address
space which forwards map/unmap requests to the vfio-pci IOMMU host driver.
At the moment the presence of vfio-pci devices on a bus affects the way
the guest view table is allocated. If there is no vfio-pci device on a PHB
and the host kernel supports KVM acceleration of H_PUT_TCE, the table
is allocated in KVM. However, if there is a vfio-pci device and we do not
yet support KVM acceleration for it, the table has to be allocated
by userspace.
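For illustration only, a minimal sketch of that allocation decision; this
is not the patch itself, and the userspace fallback shown here (including
the g_malloc0() call and the local fd variable) is an assumption about the
existing spapr_tce_alloc_table() code, which the diff below only touches
in part:

    /* Sketch: decide where the guest view TCE table lives. */
    if (kvm_enabled() && !force_userspace && !(window_size >> 32)) {
        /* KVM accelerates H_PUT_TCE, but only for emulated devices. */
        table = kvmppc_create_spapr_tce(liobn, window_size, &fd, vfio_accel);
    }
    if (!table) {
        /* vfio-pci present (or KVM declined): keep the table in userspace
         * so QEMU sees every H_PUT_TCE and can forward map/unmap requests
         * to the host IOMMU driver. */
        fd = -1;
        table = g_malloc0((size_t)nb_table * sizeof(uint64_t));
    }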
When a vfio-pci device is hotplugged and there were no vfio-pci devices
on the PHB before, the guest view table may have been allocated by KVM,
which means that H_PUT_TCE is handled by the host kernel; and since we
do not support vfio-pci in KVM, the hardware table would not be updated.
This reallocates the guest view table in QEMU when the first vfio-pci
device has just been plugged; spapr_tce_realloc_userspace() handles this.
All existing mappings are then replayed to make sure the two tables are in sync.
The replay will not have a visible effect though, as the guest kernel will
allocate-and-map new addresses for a new device, so existing mappings
created for emulated devices will not be used by vfio-pci devices.
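As a rough sketch of what the hotplug-time sync amounts to (the function
name and the tcet->fd >= 0 check for "table currently lives in KVM" are
illustrative assumptions; the real logic is spapr_phb_hotplug_dma_sync()
in the diff below):

    /* Sketch: when the PHB gains its first vfio-pci device, move the
     * guest view table to userspace and replay existing mappings so the
     * hardware TCE table catches up. */
    static int hotplug_dma_sync_sketch(sPAPRPHBState *sphb, sPAPRTCETable *tcet)
    {
        bool had_vfio = sphb->has_vfio;

        spapr_phb_dma_capabilities_update(sphb);

        if (!had_vfio && sphb->has_vfio && tcet->fd >= 0) {
            return spapr_tce_realloc_userspace(tcet, true /* replay */);
        }
        return 0;
    }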
This adds calls to spapr_phb_dma_capabilities_update() in PCI hotplug
hooks.
Signed-off-by: Alexey Kardashevskiy <a...@ozlabs.ru>
---
Changes:
v10:
* removed unnecessary memory_region_del_subregion() and
memory_region_add_subregion() as
"vfio: Unregister IOMMU notifiers when container is destroyed" removes
notifiers in a more correct way
v9:
* spapr_phb_hotplug_dma_sync() enumerates TCE tables explicitly rather than
via object_child_foreach()
* spapr_phb_hotplug_dma_sync() does memory_region_del_subregion() +
memory_region_add_subregion() as otherwise vfio_listener_region_del() is not
called and we end up with vfio_iommu_map_notify registered twice if we do
hotplug + hotunplug + hotplug of the same device (comments welcome!)
* moved spapr_phb_hotplug_dma_sync() on the unplug event to RCU: before
calling spapr_phb_hotplug_dma_sync(), we need VFIO to release the container,
otherwise spapr_phb_dma_capabilities_update() will decide that the PHB still
has a VFIO device.
The actual VFIO PCI device release happens from RCU, and since we register
our callback later, it runs later and we are fine (see the sketch after this
list).
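For reference, a hedged sketch of deferring that sync to RCU; the wrapper
struct and callback names are made up for illustration, and only the stock
QEMU helpers call_rcu1() and container_of() are assumed:

    /* Sketch: queue the DMA sync behind the VFIO device release, which
     * also runs from RCU, so the container is gone by the time we check. */
    typedef struct sPAPRPHBUnplugSync {
        struct rcu_head rcu;    /* must be embedded for call_rcu1() */
        sPAPRPHBState *sphb;
    } sPAPRPHBUnplugSync;

    static void spapr_phb_unplug_dma_sync_rcu(struct rcu_head *head)
    {
        sPAPRPHBUnplugSync *s = container_of(head, sPAPRPHBUnplugSync, rcu);

        spapr_phb_hotplug_dma_sync(s->sphb);
        g_free(s);
    }

    /* ...at unplug time: */
    sPAPRPHBUnplugSync *s = g_new0(sPAPRPHBUnplugSync, 1);
    s->sphb = sphb;
    call_rcu1(&s->rcu, spapr_phb_unplug_dma_sync_rcu);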
---
hw/ppc/spapr_iommu.c | 51 ++++++++++++++++++++++++++++++++++++++++++---
hw/ppc/spapr_pci.c | 47 +++++++++++++++++++++++++++++++++++++++++
include/hw/pci-host/spapr.h | 1 +
include/hw/ppc/spapr.h | 2 ++
trace-events | 2 ++
5 files changed, 100 insertions(+), 3 deletions(-)
diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c
index 45c00d8..2d99c3b 100644
--- a/hw/ppc/spapr_iommu.c
+++ b/hw/ppc/spapr_iommu.c
@@ -78,12 +78,13 @@ static uint64_t *spapr_tce_alloc_table(uint32_t liobn,
                                        uint32_t nb_table,
                                        uint32_t page_shift,
                                        int *fd,
-                                       bool vfio_accel)
+                                       bool vfio_accel,
+                                       bool force_userspace)
 {
     uint64_t *table = NULL;
     uint64_t window_size = (uint64_t)nb_table << page_shift;
 
-    if (kvm_enabled() && !(window_size >> 32)) {
+    if (kvm_enabled() && !force_userspace && !(window_size >> 32)) {
         table = kvmppc_create_spapr_tce(liobn, window_size, fd, vfio_accel);
     }
@@ -222,7 +223,8 @@ static void spapr_tce_table_do_enable(sPAPRTCETable *tcet, bool vfio_accel)
                                         tcet->nb_table,
                                         tcet->page_shift,
                                         &tcet->fd,
-                                        vfio_accel);
+                                        vfio_accel,
+                                        false);
 
     memory_region_set_size(&tcet->iommu,
                            (uint64_t)tcet->nb_table << tcet->page_shift);
@@ -495,6 +497,49 @@ int spapr_dma_dt(void *fdt, int node_off, const char *propname,
     return 0;
 }
 
+static int spapr_tce_do_replay(sPAPRTCETable *tcet, uint64_t *table)
+{
+    target_ulong ioba = tcet->bus_offset, pgsz = (1ULL << tcet->page_shift);
+    long i, ret = 0;
+
+    for (i = 0; i < tcet->nb_table; ++i, ioba += pgsz) {
+        ret = put_tce_emu(tcet, ioba, table[i]);
+        if (ret) {
+            break;
+        }
+    }
+
+    return ret;
+}
+
+int spapr_tce_replay(sPAPRTCETable *tcet)
+{
+    return spapr_tce_do_replay(tcet, tcet->table);
+}
+
+int spapr_tce_realloc_userspace(sPAPRTCETable *tcet, bool replay)
+{
+    int ret = 0, oldfd;
+    uint64_t *oldtable;
+
+    oldtable = tcet->table;
+    oldfd = tcet->fd;
+    tcet->table = spapr_tce_alloc_table(tcet->liobn,
+                                        tcet->nb_table,
+                                        tcet->page_shift,
+                                        &tcet->fd,
+                                        false,
+                                        true); /* force_userspace */
+
+    if (replay) {
+        ret = spapr_tce_do_replay(tcet, oldtable);
+    }
+
+    spapr_tce_free_table(oldtable, oldfd, tcet->nb_table);
+
+    return ret;
+}
+
 int spapr_tcet_dma_dt(void *fdt, int node_off, const char *propname,
                       sPAPRTCETable *tcet)
 {
diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
index 76c988f..d1fa157 100644
--- a/hw/ppc/spapr_pci.c
+++ b/hw/ppc/spapr_pci.c
@@ -827,6 +827,43 @@ int spapr_phb_dma_reset(sPAPRPHBState *sphb)
     return 0;
 }
 
+static int spapr_phb_hotplug_dma_sync(sPAPRPHBState *sphb)
+{
+    int ret = 0, i;
+    bool had_vfio = sphb->has_vfio;
+    sPAPRTCETable *tcet;
+
+    spapr_phb_dma_capabilities_update(sphb);