This adds support for Dynamic DMA Windows (DDW) option defined by
the SPAPR specification which allows to have additional DMA window(s)
This implements DDW for emulated and VFIO devices. As all TCE root regions
are mapped at 0 and 64bit long (and actual tables are child regions),
this replaces memory_region_add_subregion() with _overlap() to make
QEMU memory API happy.
This reserves RTAS token numbers for DDW calls.
This implements helpers to interact with VFIO kernel interface.
This changes the TCE table migration descriptor to support dynamic
tables as from now on, PHB will create as many stub TCE table objects
as PHB can possibly support but not all of them might be initialized at
the time of migration because DDW might or might not be requested by
the guest.
The "ddw" property is enabled by default on a PHB but for compatibility
the pseries-2.3 machine and older disable it.
This implements DDW for VFIO. The host kernel support is required.
This adds a "levels" property to PHB to control the number of levels
in the actual TCE table allocated by the host kernel, 0 is the default
value to tell QEMU to calculate the correct value. Current hardware
supports up to 5 levels.
The existing linux guests try creating one additional huge DMA window
with 64K or 16MB pages and map the entire guest RAM to. If succeeded,
the guest switches to dma_direct_ops and never calls TCE hypercalls
(H_PUT_TCE,...) again. This enables VFIO devices to use the entire RAM
and not waste time on map/unmap later.
This adds 4 RTAS handlers:
* ibm,query-pe-dma-window
* ibm,create-pe-dma-window
* ibm,remove-pe-dma-window
* ibm,reset-pe-dma-window
These are registered from type_init() callback.
These RTAS handlers are implemented in a separate file to avoid polluting
spapr_iommu.c with PCI.
Signed-off-by: Alexey Kardashevskiy <a...@ozlabs.ru>
diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
index c8ab06e..0b2ff6d 100644
--- a/hw/ppc/Makefile.objs
+++ b/hw/ppc/Makefile.objs
@@ -7,6 +7,9 @@ obj-$(CONFIG_PSERIES) += spapr_pci.o spapr_rtc.o spapr_drc.o
ifeq ($(CONFIG_PCI)$(CONFIG_PSERIES)$(CONFIG_LINUX), yyy)
obj-y += spapr_pci_vfio.o
endif
+ifeq ($(CONFIG_PCI)$(CONFIG_PSERIES), yy)
+obj-y += spapr_rtas_ddw.o
+endif
# PowerPC 4xx boards
obj-y += ppc405_boards.o ppc4xx_devs.o ppc405_uc.o ppc440_bamboo.o
obj-y += ppc4xx_pci.o
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 5ca817c..d50d50b 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -1860,6 +1860,11 @@ static const TypeInfo spapr_machine_info = {
.driver = "spapr-pci-host-bridge",\
.property = "dynamic-reconfiguration",\
.value = "off",\
+ },\
+ {\
+ .driver = TYPE_SPAPR_PCI_HOST_BRIDGE,\
+ .property = "ddw",\
+ .value = stringify(off),\
},
#define SPAPR_COMPAT_2_2 \
diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c
index 5e6bdb4..eaa1943 100644
--- a/hw/ppc/spapr_iommu.c
+++ b/hw/ppc/spapr_iommu.c
@@ -136,6 +136,15 @@ static IOMMUTLBEntry
spapr_tce_translate_iommu(MemoryRegion *iommu, hwaddr addr,
return ret;
}
+static void spapr_tce_table_pre_save(void *opaque)
+{
+ sPAPRTCETable *tcet = SPAPR_TCE_TABLE(opaque);
+
+ tcet->migtable = tcet->table;
+}
+
+static void spapr_tce_table_do_enable(sPAPRTCETable *tcet, bool vfio_accel);
+
static int spapr_tce_table_post_load(void *opaque, int version_id)
{
sPAPRTCETable *tcet = SPAPR_TCE_TABLE(opaque);
@@ -144,22 +153,43 @@ static int spapr_tce_table_post_load(void *opaque, int
version_id)
spapr_vio_set_bypass(tcet->vdev, tcet->bypass);
}
+ if (!tcet->migtable) {
+ return 0;
+ }
+
+ if (tcet->enabled) {
+ if (!tcet->table) {
+ tcet->enabled = false;
+ /* VFIO does not migrate so pass vfio_accel == false */
+ spapr_tce_table_do_enable(tcet, false);
+ }
+ memcpy(tcet->table, tcet->migtable,
+ tcet->nb_table * sizeof(tcet->table[0]));
+ free(tcet->migtable);
+ tcet->migtable = NULL;
+ }
+
return 0;
}
static const VMStateDescription vmstate_spapr_tce_table = {
.name = "spapr_iommu",
- .version_id = 2,
+ .version_id = 3,
.minimum_version_id = 2,
+ .pre_save = spapr_tce_table_pre_save,
.post_load = spapr_tce_table_post_load,
.fields = (VMStateField []) {
/* Sanity check */
VMSTATE_UINT32_EQUAL(liobn, sPAPRTCETable),
- VMSTATE_UINT32_EQUAL(nb_table, sPAPRTCETable),
/* IOMMU state */
+ VMSTATE_BOOL_V(enabled, sPAPRTCETable, 3),
+ VMSTATE_UINT64_V(bus_offset, sPAPRTCETable, 3),
+ VMSTATE_UINT32_V(page_shift, sPAPRTCETable, 3),
+ VMSTATE_UINT32(nb_table, sPAPRTCETable),
VMSTATE_BOOL(bypass, sPAPRTCETable),
- VMSTATE_VARRAY_UINT32(table, sPAPRTCETable, nb_table, 0,
vmstate_info_uint64, uint64_t),
+ VMSTATE_VARRAY_UINT32_ALLOC(migtable, sPAPRTCETable, nb_table, 0,
+ vmstate_info_uint64, uint64_t),
VMSTATE_END_OF_LIST()
},
diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c
index 1f980fa..ab2d650 100644
--- a/hw/ppc/spapr_pci.c
+++ b/hw/ppc/spapr_pci.c
@@ -719,6 +719,8 @@ static AddressSpace *spapr_pci_dma_iommu(PCIBus *bus, void
*opaque, int devfn)
static int spapr_phb_dma_update(Object *child, void *opaque)
{
int ret = 0;
+ uint64_t bus_offset = 0;
+ sPAPRPHBState *sphb = opaque;
sPAPRTCETable *tcet = (sPAPRTCETable *)
object_dynamic_cast(child, TYPE_SPAPR_TCE_TABLE);
@@ -726,6 +728,17 @@ static int spapr_phb_dma_update(Object *child, void
*opaque)
return 0;
}
+ ret = spapr_phb_vfio_dma_init_window(sphb,
+ tcet->page_shift,
+ tcet->nb_table << tcet->page_shift,
+ &bus_offset);
+ if (ret) {
+ return ret;
+ }
+ if (bus_offset != tcet->bus_offset) {
+ return -EFAULT;
+ }
+
if (tcet->fd >= 0) {
/*
* We got first vfio-pci device on accelerated table.
@@ -749,6 +762,9 @@ static int spapr_phb_dma_capabilities_update(sPAPRPHBState
*sphb)
sphb->dma32_window_start = 0;
sphb->dma32_window_size = SPAPR_PCI_DMA32_SIZE;
+ sphb->windows_supported = SPAPR_PCI_DMA_MAX_WINDOWS;
+ sphb->page_size_mask = (1ULL << 12) | (1ULL << 16) | (1ULL << 24);
+ sphb->dma64_window_size = pow2ceil(ram_size);