On Sat, Apr 25, 2015 at 10:24:43PM +1000, Alexey Kardashevskiy wrote: > This adds support for Dynamic DMA Windows (DDW) option defined by > the SPAPR specification which allows to have additional DMA window(s) > > This implements DDW for emulated and VFIO devices. As all TCE root regions > are mapped at 0 and 64bit long (and actual tables are child regions), > this replaces memory_region_add_subregion() with _overlap() to make > QEMU memory API happy. > > This reserves RTAS token numbers for DDW calls. > > This implements helpers to interact with VFIO kernel interface. > > This changes the TCE table migration descriptor to support dynamic > tables as from now on, PHB will create as many stub TCE table objects > as PHB can possibly support but not all of them might be initialized at > the time of migration because DDW might or might not be requested by > the guest. > > The "ddw" property is enabled by default on a PHB but for compatibility > the pseries-2.3 machine and older disable it. > > This implements DDW for VFIO. The host kernel support is required. > This adds a "levels" property to PHB to control the number of levels > in the actual TCE table allocated by the host kernel, 0 is the default > value to tell QEMU to calculate the correct value. Current hardware > supports up to 5 levels. > > The existing linux guests try creating one additional huge DMA window > with 64K or 16MB pages and map the entire guest RAM to. If succeeded, > the guest switches to dma_direct_ops and never calls TCE hypercalls > (H_PUT_TCE,...) again. This enables VFIO devices to use the entire RAM > and not waste time on map/unmap later. > > This adds 4 RTAS handlers: > * ibm,query-pe-dma-window > * ibm,create-pe-dma-window > * ibm,remove-pe-dma-window > * ibm,reset-pe-dma-window > These are registered from type_init() callback. > > These RTAS handlers are implemented in a separate file to avoid polluting > spapr_iommu.c with PCI. > > Signed-off-by: Alexey Kardashevskiy <a...@ozlabs.ru>
Reviewed-by: David Gibson <da...@gibson.dropbear.id.au> > --- > Changes: > v6: > * rework as there is no more special device for VFIO PHB > > v5: > * total rework > * enabled for machines >2.3 > * fixed migration > * merged rtas handlers here > > v4: > * reset handler is back in generalized form > > v3: > * removed reset > * windows_num is now 1 or bigger rather than 0-based value and it is only > changed in PHB code, not in RTAS > * added page mask check in create() > * added SPAPR_PCI_DDW_MAX_WINDOWS to track how many windows are already > created > > v2: > * tested on hacked emulated E1000 > * implemented DDW reset on the PHB reset > * spapr_pci_ddw_remove/spapr_pci_ddw_reset are public for reuse by VFIO > --- > hw/ppc/Makefile.objs | 3 + > hw/ppc/spapr.c | 10 +- > hw/ppc/spapr_iommu.c | 35 +++++- > hw/ppc/spapr_pci.c | 66 ++++++++-- > hw/ppc/spapr_pci_vfio.c | 80 ++++++++++++ > hw/ppc/spapr_rtas_ddw.c | 300 > ++++++++++++++++++++++++++++++++++++++++++++ > include/hw/pci-host/spapr.h | 21 ++++ > include/hw/ppc/spapr.h | 17 ++- > trace-events | 4 + > 9 files changed, 521 insertions(+), 15 deletions(-) > create mode 100644 hw/ppc/spapr_rtas_ddw.c > > diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs > index 437955d..c6b344f 100644 > --- a/hw/ppc/Makefile.objs > +++ b/hw/ppc/Makefile.objs > @@ -7,6 +7,9 @@ obj-$(CONFIG_PSERIES) += spapr_pci.o spapr_rtc.o > ifeq ($(CONFIG_PCI)$(CONFIG_PSERIES)$(CONFIG_LINUX), yyy) > obj-y += spapr_pci_vfio.o > endif > +ifeq ($(CONFIG_PCI)$(CONFIG_PSERIES), yy) > +obj-y += spapr_rtas_ddw.o > +endif > # PowerPC 4xx boards > obj-y += ppc405_boards.o ppc4xx_devs.o ppc405_uc.o ppc440_bamboo.o > obj-y += ppc4xx_pci.o > diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c > index b28209f..fd7fdb3 100644 > --- a/hw/ppc/spapr.c > +++ b/hw/ppc/spapr.c > @@ -1801,7 +1801,15 @@ static const TypeInfo spapr_machine_info = { > }, > }; > > +#define SPAPR_COMPAT_2_3 \ > + {\ > + .driver = TYPE_SPAPR_PCI_HOST_BRIDGE,\ > + .property = "ddw",\ > + 
.value = stringify(off),\ > + } > + > #define SPAPR_COMPAT_2_2 \ > + SPAPR_COMPAT_2_3, \ > {\ > .driver = TYPE_SPAPR_PCI_HOST_BRIDGE,\ > .property = "mem_win_size",\ > @@ -1853,7 +1861,7 @@ static const TypeInfo spapr_machine_2_2_info = { > static void spapr_machine_2_3_class_init(ObjectClass *oc, void *data) > { > static GlobalProperty compat_props[] = { > - SPAPR_COMPAT_2_2, > + SPAPR_COMPAT_2_3, > { /* end of list */ } > }; > MachineClass *mc = MACHINE_CLASS(oc); > diff --git a/hw/ppc/spapr_iommu.c b/hw/ppc/spapr_iommu.c > index 245534f..df4c72d 100644 > --- a/hw/ppc/spapr_iommu.c > +++ b/hw/ppc/spapr_iommu.c > @@ -90,6 +90,15 @@ static IOMMUTLBEntry > spapr_tce_translate_iommu(MemoryRegion *iommu, hwaddr addr, > return ret; > } > > +static void spapr_tce_table_pre_save(void *opaque) > +{ > + sPAPRTCETable *tcet = SPAPR_TCE_TABLE(opaque); > + > + tcet->migtable = tcet->table; > +} > + > +static void spapr_tce_table_do_enable(sPAPRTCETable *tcet); > + > static int spapr_tce_table_post_load(void *opaque, int version_id) > { > sPAPRTCETable *tcet = SPAPR_TCE_TABLE(opaque); > @@ -98,22 +107,42 @@ static int spapr_tce_table_post_load(void *opaque, int > version_id) > spapr_vio_set_bypass(tcet->vdev, tcet->bypass); > } > > + if (!tcet->migtable) { What's the case where migtable will be NULL? IIUC an old->new migration will result in the data saved for "table" being loaded into "migtable". So "migtable" should only be NULL when tcet->enabled is also false?
> + return 0; > + } > + > + if (tcet->enabled) { > + if (!tcet->table) { > + tcet->enabled = false; > + spapr_tce_table_do_enable(tcet); > + } > + memcpy(tcet->table, tcet->migtable, > + tcet->nb_table * sizeof(tcet->table[0])); > + free(tcet->migtable); > + tcet->migtable = NULL; > + } > + > return 0; > } > > static const VMStateDescription vmstate_spapr_tce_table = { > .name = "spapr_iommu", > - .version_id = 2, > + .version_id = 3, > .minimum_version_id = 2, > + .pre_save = spapr_tce_table_pre_save, > .post_load = spapr_tce_table_post_load, > .fields = (VMStateField []) { > /* Sanity check */ > VMSTATE_UINT32_EQUAL(liobn, sPAPRTCETable), > - VMSTATE_UINT32_EQUAL(nb_table, sPAPRTCETable), > > /* IOMMU state */ > + VMSTATE_BOOL_V(enabled, sPAPRTCETable, 3), > + VMSTATE_UINT64_V(bus_offset, sPAPRTCETable, 3), > + VMSTATE_UINT32_V(page_shift, sPAPRTCETable, 3), > + VMSTATE_UINT32(nb_table, sPAPRTCETable), > VMSTATE_BOOL(bypass, sPAPRTCETable), > - VMSTATE_VARRAY_UINT32(table, sPAPRTCETable, nb_table, 0, > vmstate_info_uint64, uint64_t), > + VMSTATE_VARRAY_UINT32_ALLOC(migtable, sPAPRTCETable, nb_table, 0, > + vmstate_info_uint64, uint64_t), > > VMSTATE_END_OF_LIST() > }, > diff --git a/hw/ppc/spapr_pci.c b/hw/ppc/spapr_pci.c > index d097cce..d3d8f12 100644 > --- a/hw/ppc/spapr_pci.c > +++ b/hw/ppc/spapr_pci.c > @@ -849,15 +849,17 @@ static void spapr_phb_realize(DeviceState *dev, Error > **errp) > sphb->lsi_table[i].irq = irq; > } > > - tcet = spapr_tce_new_table(DEVICE(sphb), sphb->dma_liobn); > - if (!tcet) { > - error_setg(errp, "failed to create TCE table"); > + for (i = 0; i < SPAPR_PCI_DMA_MAX_WINDOWS; ++i) { > + tcet = spapr_tce_new_table(DEVICE(sphb), > + SPAPR_PCI_LIOBN(sphb->index, i)); > + if (!tcet) { > + error_setg(errp, "spapr_tce_new_table failed"); > return; > + } > + memory_region_add_subregion_overlap(&sphb->iommu_root, 0, > + spapr_tce_get_iommu(tcet), 0); > } > > - memory_region_add_subregion(&sphb->iommu_root, 0, > - spapr_tce_get_iommu(tcet)); 
> - > sphb->msi = g_hash_table_new_full(g_int_hash, g_int_equal, g_free, > g_free); > } > > @@ -867,6 +869,9 @@ static int > spapr_phb_dma_capabilities_update(sPAPRPHBState *sphb) > > sphb->dma32_window_start = 0; > sphb->dma32_window_size = SPAPR_PCI_DMA32_SIZE; > + sphb->windows_supported = SPAPR_PCI_DMA_MAX_WINDOWS; > + sphb->page_size_mask = (1 << 12) | (1 << 16) | (1 << 24); > + sphb->dma64_window_size = pow2ceil(ram_size); > > ret = spapr_phb_vfio_dma_capabilities_update(sphb); > sphb->has_vfio = (ret == 0); > @@ -874,12 +879,29 @@ static int > spapr_phb_dma_capabilities_update(sPAPRPHBState *sphb) > return 0; > } > > -static int spapr_phb_dma_init_window(sPAPRPHBState *sphb, > - uint32_t liobn, uint32_t page_shift, > - uint64_t window_size) > +int spapr_phb_dma_init_window(sPAPRPHBState *sphb, > + uint32_t liobn, uint32_t page_shift, > + uint64_t window_size) > { > uint64_t bus_offset = sphb->dma32_window_start; > sPAPRTCETable *tcet = spapr_tce_find_by_liobn(liobn); > + int ret; > + > + if (SPAPR_PCI_DMA_WINDOW_NUM(liobn) && !sphb->ddw_enabled) { > + return -1; > + } > + > + if (sphb->ddw_enabled) { > + if (sphb->has_vfio) { > + ret = spapr_phb_vfio_dma_init_window(sphb, > + page_shift, window_size, > + &bus_offset); > + } > + > + if (ret && SPAPR_PCI_DMA_WINDOW_NUM(liobn)) { > + bus_offset = SPAPR_PCI_DMA64_START; > + } > + } > > spapr_tce_table_enable(tcet, bus_offset, page_shift, > window_size >> page_shift, > @@ -891,9 +913,14 @@ static int spapr_phb_dma_init_window(sPAPRPHBState *sphb, > int spapr_phb_dma_remove_window(sPAPRPHBState *sphb, > sPAPRTCETable *tcet) > { > + int ret; > + > + if (sphb->has_vfio && sphb->ddw_enabled) { > + ret = spapr_phb_vfio_dma_remove_window(sphb, tcet); > + } > spapr_tce_table_disable(tcet); > > - return 0; > + return ret; > } > > static int spapr_phb_disable_dma_windows(Object *child, void *opaque) > @@ -950,6 +977,8 @@ static Property spapr_phb_properties[] = { > DEFINE_PROP_UINT64("io_win_addr", sPAPRPHBState, 
io_win_addr, -1), > DEFINE_PROP_UINT64("io_win_size", sPAPRPHBState, io_win_size, > SPAPR_PCI_IO_WIN_SIZE), > + DEFINE_PROP_BOOL("ddw", sPAPRPHBState, ddw_enabled, true), > + DEFINE_PROP_UINT8("levels", sPAPRPHBState, levels, 0), > DEFINE_PROP_END_OF_LIST(), > }; > > @@ -1140,6 +1169,15 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb, > uint32_t interrupt_map_mask[] = { > cpu_to_be32(b_ddddd(-1)|b_fff(0)), 0x0, 0x0, cpu_to_be32(-1)}; > uint32_t interrupt_map[PCI_SLOT_MAX * PCI_NUM_PINS][7]; > + uint32_t ddw_applicable[] = { > + cpu_to_be32(RTAS_IBM_QUERY_PE_DMA_WINDOW), > + cpu_to_be32(RTAS_IBM_CREATE_PE_DMA_WINDOW), > + cpu_to_be32(RTAS_IBM_REMOVE_PE_DMA_WINDOW) > + }; > + uint32_t ddw_extensions[] = { > + cpu_to_be32(1), > + cpu_to_be32(RTAS_IBM_RESET_PE_DMA_WINDOW) > + }; > sPAPRTCETable *tcet; > > /* Start populating the FDT */ > @@ -1170,6 +1208,14 @@ int spapr_populate_pci_dt(sPAPRPHBState *phb, > _FDT(fdt_setprop_cell(fdt, bus_off, "ibm,pci-config-space-type", 0x1)); > _FDT(fdt_setprop_cell(fdt, bus_off, "ibm,pe-total-#msi", XICS_IRQS)); > > + /* Dynamic DMA window */ > + if (phb->ddw_enabled) { > + _FDT(fdt_setprop(fdt, bus_off, "ibm,ddw-applicable", &ddw_applicable, > + sizeof(ddw_applicable))); > + _FDT(fdt_setprop(fdt, bus_off, "ibm,ddw-extensions", > + &ddw_extensions, sizeof(ddw_extensions))); > + } > + > /* Build the interrupt-map, this must matches what is done > * in pci_spapr_map_irq > */ > diff --git a/hw/ppc/spapr_pci_vfio.c b/hw/ppc/spapr_pci_vfio.c > index 6f91b39..7372d91 100644 > --- a/hw/ppc/spapr_pci_vfio.c > +++ b/hw/ppc/spapr_pci_vfio.c > @@ -41,6 +41,86 @@ int spapr_phb_vfio_dma_capabilities_update(sPAPRPHBState > *sphb) > sphb->dma32_window_start = info.dma32_window_start; > sphb->dma32_window_size = info.dma32_window_size; > > + if (sphb->ddw_enabled && (info.flags & VFIO_IOMMU_SPAPR_INFO_DDW)) { > + sphb->windows_supported = info.ddw.max_dynamic_windows_supported; > + sphb->page_size_mask = info.ddw.pgsizes; > + 
sphb->dma64_window_size = pow2ceil(ram_size); > + sphb->max_levels = info.ddw.levels; > + } else { > + /* If VFIO_IOMMU_INFO_DDW is not set, disable DDW */ > + sphb->ddw_enabled = false; > + } > + > + return ret; > +} > + > +static int spapr_phb_vfio_levels(uint32_t entries) > +{ > + unsigned pages = (entries * sizeof(uint64_t)) / getpagesize(); > + int levels; > + > + if (pages <= 64) { > + levels = 1; > + } else if (pages <= 64*64) { > + levels = 2; > + } else if (pages <= 64*64*64) { > + levels = 3; > + } else { > + levels = 4; > + } > + > + return levels; > +} > + > +int spapr_phb_vfio_dma_init_window(sPAPRPHBState *sphb, > + uint32_t page_shift, > + uint64_t window_size, > + uint64_t *bus_offset) > +{ > + int ret; > + struct vfio_iommu_spapr_tce_create create = { > + .argsz = sizeof(create), > + .page_shift = page_shift, > + .window_size = window_size, > + .levels = sphb->levels, > + .start_addr = 0, > + }; > + > + /* > + * Dynamic windows are supported, that means that there is no > + * pre-created window and we have to create one. 
> + */ > + if (!create.levels) { > + create.levels = spapr_phb_vfio_levels(create.window_size >> > + page_shift); > + } > + > + if (create.levels > sphb->max_levels) { > + return -EINVAL; > + } > + > + ret = vfio_container_ioctl(&sphb->iommu_as, > + VFIO_IOMMU_SPAPR_TCE_CREATE, &create); > + if (ret) { > + return ret; > + } > + *bus_offset = create.start_addr; > + > + return 0; > +} > + > +int spapr_phb_vfio_dma_remove_window(sPAPRPHBState *sphb, > + sPAPRTCETable *tcet) > +{ > + struct vfio_iommu_spapr_tce_remove remove = { > + .argsz = sizeof(remove), > + .start_addr = tcet->bus_offset > + }; > + int ret; > + > + ret = vfio_container_ioctl(&sphb->iommu_as, > + VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove); > + > return ret; > } > > diff --git a/hw/ppc/spapr_rtas_ddw.c b/hw/ppc/spapr_rtas_ddw.c > new file mode 100644 > index 0000000..7ab7572 > --- /dev/null > +++ b/hw/ppc/spapr_rtas_ddw.c > @@ -0,0 +1,300 @@ > +/* > + * QEMU sPAPR Dynamic DMA windows support > + * > + * Copyright (c) 2014 Alexey Kardashevskiy, IBM Corporation. > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License as published by > + * the Free Software Foundation; either version 2 of the License, > + * or (at your option) any later version. > + * > + * This program is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > + * GNU General Public License for more details. > + * > + * You should have received a copy of the GNU General Public License > + * along with this program; if not, see <http://www.gnu.org/licenses/>. 
> + */ > + > +#include "qemu/error-report.h" > +#include "hw/ppc/spapr.h" > +#include "hw/pci-host/spapr.h" > +#include "trace.h" > + > +static int spapr_phb_get_active_win_num_cb(Object *child, void *opaque) > +{ > + sPAPRTCETable *tcet; > + > + tcet = (sPAPRTCETable *) object_dynamic_cast(child, > TYPE_SPAPR_TCE_TABLE); > + if (tcet && tcet->enabled) { > + ++*(unsigned *)opaque; > + } > + return 0; > +} > + > +static unsigned spapr_phb_get_active_win_num(sPAPRPHBState *sphb) > +{ > + unsigned ret = 0; > + > + object_child_foreach(OBJECT(sphb), spapr_phb_get_active_win_num_cb, > &ret); > + > + return ret; > +} > + > +static int spapr_phb_get_free_liobn_cb(Object *child, void *opaque) > +{ > + sPAPRTCETable *tcet; > + > + tcet = (sPAPRTCETable *) object_dynamic_cast(child, > TYPE_SPAPR_TCE_TABLE); > + if (tcet && !tcet->enabled) { > + *(uint32_t *)opaque = tcet->liobn; > + return 1; > + } > + return 0; > +} > + > +static unsigned spapr_phb_get_free_liobn(sPAPRPHBState *sphb) > +{ > + uint32_t liobn = 0; > + > + object_child_foreach(OBJECT(sphb), spapr_phb_get_free_liobn_cb, &liobn); > + > + return liobn; > +} > + > +static uint32_t spapr_query_mask(struct ppc_one_seg_page_size *sps, > + uint64_t page_mask) > +{ > + int i, j; > + uint32_t mask = 0; > + const struct { int shift; uint32_t mask; } masks[] = { > + { 12, RTAS_DDW_PGSIZE_4K }, > + { 16, RTAS_DDW_PGSIZE_64K }, > + { 24, RTAS_DDW_PGSIZE_16M }, > + { 25, RTAS_DDW_PGSIZE_32M }, > + { 26, RTAS_DDW_PGSIZE_64M }, > + { 27, RTAS_DDW_PGSIZE_128M }, > + { 28, RTAS_DDW_PGSIZE_256M }, > + { 34, RTAS_DDW_PGSIZE_16G }, > + }; > + > + for (i = 0; i < PPC_PAGE_SIZES_MAX_SZ; i++) { > + for (j = 0; j < ARRAY_SIZE(masks); ++j) { > + if ((sps[i].page_shift == masks[j].shift) && > + (page_mask & (1ULL << masks[j].shift))) { > + mask |= masks[j].mask; > + } > + } > + } > + > + return mask; > +} > + > +static void rtas_ibm_query_pe_dma_window(PowerPCCPU *cpu, > + sPAPREnvironment *spapr, > + uint32_t token, uint32_t nargs, > + 
target_ulong args, > + uint32_t nret, target_ulong rets) > +{ > + CPUPPCState *env = &cpu->env; > + sPAPRPHBState *sphb; > + uint64_t buid; > + uint32_t avail, addr, pgmask = 0; > + unsigned current; > + > + if ((nargs != 3) || (nret != 5)) { > + goto param_error_exit; > + } > + > + buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2); > + addr = rtas_ld(args, 0); > + sphb = spapr_pci_find_phb(spapr, buid); > + if (!sphb || !sphb->ddw_enabled) { > + goto param_error_exit; > + } > + > + current = spapr_phb_get_active_win_num(sphb); > + avail = (sphb->windows_supported > current) ? > + (sphb->windows_supported - current) : 0; > + > + /* Work out supported page masks */ > + pgmask = spapr_query_mask(env->sps.sps, sphb->page_size_mask); > + > + rtas_st(rets, 0, RTAS_OUT_SUCCESS); > + rtas_st(rets, 1, avail); > + > + /* > + * This is "Largest contiguous block of TCEs allocated specifically > + * for (that is, are reserved for) this PE". > + * Return the maximum number as all RAM was in 4K pages. 
> + */ > + rtas_st(rets, 2, sphb->dma64_window_size >> SPAPR_TCE_PAGE_SHIFT); > + rtas_st(rets, 3, pgmask); > + rtas_st(rets, 4, 0); /* DMA migration mask, not supported */ > + > + trace_spapr_iommu_ddw_query(buid, addr, avail, sphb->dma64_window_size, > + pgmask); > + return; > + > +param_error_exit: > + rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR); > +} > + > +static void rtas_ibm_create_pe_dma_window(PowerPCCPU *cpu, > + sPAPREnvironment *spapr, > + uint32_t token, uint32_t nargs, > + target_ulong args, > + uint32_t nret, target_ulong rets) > +{ > + sPAPRPHBState *sphb; > + sPAPRTCETable *tcet = NULL; > + uint32_t addr, page_shift, window_shift, liobn; > + uint64_t buid; > + long ret; > + > + if ((nargs != 5) || (nret != 4)) { > + goto param_error_exit; > + } > + > + buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2); > + addr = rtas_ld(args, 0); > + sphb = spapr_pci_find_phb(spapr, buid); > + if (!sphb || !sphb->ddw_enabled) { > + goto param_error_exit; > + } > + > + page_shift = rtas_ld(args, 3); > + window_shift = rtas_ld(args, 4); > + liobn = spapr_phb_get_free_liobn(sphb); > + > + if (!liobn || !(sphb->page_size_mask & (1ULL << page_shift))) { > + goto hw_error_exit; > + } > + > + ret = spapr_phb_dma_init_window(sphb, liobn, page_shift, > + 1ULL << window_shift); > + tcet = spapr_tce_find_by_liobn(liobn); > + trace_spapr_iommu_ddw_create(buid, addr, 1ULL << page_shift, > + 1ULL << window_shift, > + tcet ? 
tcet->bus_offset : 0xbaadf00d, > + liobn, ret); > + if (ret || !tcet) { > + goto hw_error_exit; > + } > + > + rtas_st(rets, 0, RTAS_OUT_SUCCESS); > + rtas_st(rets, 1, liobn); > + rtas_st(rets, 2, tcet->bus_offset >> 32); > + rtas_st(rets, 3, tcet->bus_offset & ((uint32_t) -1)); > + > + return; > + > +hw_error_exit: > + rtas_st(rets, 0, RTAS_OUT_HW_ERROR); > + return; > + > +param_error_exit: > + rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR); > +} > + > +static void rtas_ibm_remove_pe_dma_window(PowerPCCPU *cpu, > + sPAPREnvironment *spapr, > + uint32_t token, uint32_t nargs, > + target_ulong args, > + uint32_t nret, target_ulong rets) > +{ > + sPAPRPHBState *sphb; > + sPAPRTCETable *tcet; > + uint32_t liobn; > + long ret; > + > + if ((nargs != 1) || (nret != 1)) { > + goto param_error_exit; > + } > + > + liobn = rtas_ld(args, 0); > + tcet = spapr_tce_find_by_liobn(liobn); > + if (!tcet) { > + goto param_error_exit; > + } > + > + sphb = SPAPR_PCI_HOST_BRIDGE(OBJECT(tcet)->parent); > + if (!sphb || !sphb->ddw_enabled) { > + goto param_error_exit; > + } > + > + ret = spapr_phb_dma_remove_window(sphb, tcet); > + trace_spapr_iommu_ddw_remove(liobn, ret); > + if (ret) { > + goto hw_error_exit; > + } > + > + rtas_st(rets, 0, RTAS_OUT_SUCCESS); > + return; > + > +hw_error_exit: > + rtas_st(rets, 0, RTAS_OUT_HW_ERROR); > + return; > + > +param_error_exit: > + rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR); > +} > + > +static void rtas_ibm_reset_pe_dma_window(PowerPCCPU *cpu, > + sPAPREnvironment *spapr, > + uint32_t token, uint32_t nargs, > + target_ulong args, > + uint32_t nret, target_ulong rets) > +{ > + sPAPRPHBState *sphb; > + uint64_t buid; > + uint32_t addr; > + long ret; > + > + if ((nargs != 3) || (nret != 1)) { > + goto param_error_exit; > + } > + > + buid = ((uint64_t)rtas_ld(args, 1) << 32) | rtas_ld(args, 2); > + addr = rtas_ld(args, 0); > + sphb = spapr_pci_find_phb(spapr, buid); > + if (!sphb || !sphb->ddw_enabled) { > + goto param_error_exit; > + } > + > + ret = 
spapr_phb_dma_reset(sphb); > + trace_spapr_iommu_ddw_reset(buid, addr, ret); > + if (ret) { > + goto hw_error_exit; > + } > + > + rtas_st(rets, 0, RTAS_OUT_SUCCESS); > + > + return; > + > +hw_error_exit: > + rtas_st(rets, 0, RTAS_OUT_HW_ERROR); > + return; > + > +param_error_exit: > + rtas_st(rets, 0, RTAS_OUT_PARAM_ERROR); > +} > + > +static void spapr_rtas_ddw_init(void) > +{ > + spapr_rtas_register(RTAS_IBM_QUERY_PE_DMA_WINDOW, > + "ibm,query-pe-dma-window", > + rtas_ibm_query_pe_dma_window); > + spapr_rtas_register(RTAS_IBM_CREATE_PE_DMA_WINDOW, > + "ibm,create-pe-dma-window", > + rtas_ibm_create_pe_dma_window); > + spapr_rtas_register(RTAS_IBM_REMOVE_PE_DMA_WINDOW, > + "ibm,remove-pe-dma-window", > + rtas_ibm_remove_pe_dma_window); > + spapr_rtas_register(RTAS_IBM_RESET_PE_DMA_WINDOW, > + "ibm,reset-pe-dma-window", > + rtas_ibm_reset_pe_dma_window); > +} > + > +type_init(spapr_rtas_ddw_init) > diff --git a/include/hw/pci-host/spapr.h b/include/hw/pci-host/spapr.h > index 484291c..1d2ea8d 100644 > --- a/include/hw/pci-host/spapr.h > +++ b/include/hw/pci-host/spapr.h > @@ -87,6 +87,12 @@ struct sPAPRPHBState { > uint32_t dma32_window_size; > bool has_vfio; > int32_t iommugroupid; /* obsolete */ > + bool ddw_enabled; > + uint32_t windows_supported; > + uint64_t page_size_mask; > + uint64_t dma64_window_size; > + uint8_t max_levels; > + uint8_t levels; > > QLIST_ENTRY(sPAPRPHBState) list; > }; > @@ -109,6 +115,12 @@ struct sPAPRPHBState { > > #define SPAPR_PCI_DMA32_SIZE 0x40000000 > > +/* Default 64bit dynamic window offset */ > +#define SPAPR_PCI_DMA64_START 0x8000000000000000ULL > + > +/* Maximum allowed number of DMA windows for emulated PHB */ > +#define SPAPR_PCI_DMA_MAX_WINDOWS 2 > + > static inline qemu_irq spapr_phb_lsi_qirq(struct sPAPRPHBState *phb, int pin) > { > return xics_get_qirq(spapr->icp, phb->lsi_table[pin].irq); > @@ -127,11 +139,20 @@ void spapr_pci_rtas_init(void); > sPAPRPHBState *spapr_pci_find_phb(sPAPREnvironment *spapr, uint64_t buid); 
> PCIDevice *spapr_pci_find_dev(sPAPREnvironment *spapr, uint64_t buid, > uint32_t config_addr); > +int spapr_phb_dma_init_window(sPAPRPHBState *sphb, > + uint32_t liobn, uint32_t page_shift, > + uint64_t window_size); > int spapr_phb_dma_remove_window(sPAPRPHBState *sphb, > sPAPRTCETable *tcet); > int spapr_phb_dma_reset(sPAPRPHBState *sphb); > > int spapr_phb_vfio_dma_capabilities_update(sPAPRPHBState *sphb); > +int spapr_phb_vfio_dma_init_window(sPAPRPHBState *sphb, > + uint32_t page_shift, > + uint64_t window_size, > + uint64_t *bus_offset); > +int spapr_phb_vfio_dma_remove_window(sPAPRPHBState *sphb, > + sPAPRTCETable *tcet); > int spapr_phb_vfio_eeh_set_option(sPAPRPHBState *sphb, > unsigned int addr, int option); > int spapr_phb_vfio_eeh_get_state(sPAPRPHBState *sphb, int *state); > diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h > index c8ac03f..873c661 100644 > --- a/include/hw/ppc/spapr.h > +++ b/include/hw/ppc/spapr.h > @@ -381,6 +381,16 @@ int spapr_allocate_irq_block(int num, bool lsi, bool > msi); > #define RTAS_OUT_NOT_SUPPORTED -3 > #define RTAS_OUT_NOT_AUTHORIZED -9002 > > +/* DDW pagesize mask values from ibm,query-pe-dma-window */ > +#define RTAS_DDW_PGSIZE_4K 0x01 > +#define RTAS_DDW_PGSIZE_64K 0x02 > +#define RTAS_DDW_PGSIZE_16M 0x04 > +#define RTAS_DDW_PGSIZE_32M 0x08 > +#define RTAS_DDW_PGSIZE_64M 0x10 > +#define RTAS_DDW_PGSIZE_128M 0x20 > +#define RTAS_DDW_PGSIZE_256M 0x40 > +#define RTAS_DDW_PGSIZE_16G 0x80 > + > /* RTAS tokens */ > #define RTAS_TOKEN_BASE 0x2000 > > @@ -422,8 +432,12 @@ int spapr_allocate_irq_block(int num, bool lsi, bool > msi); > #define RTAS_IBM_SET_SLOT_RESET (RTAS_TOKEN_BASE + 0x23) > #define RTAS_IBM_CONFIGURE_PE (RTAS_TOKEN_BASE + 0x24) > #define RTAS_IBM_SLOT_ERROR_DETAIL (RTAS_TOKEN_BASE + 0x25) > +#define RTAS_IBM_QUERY_PE_DMA_WINDOW (RTAS_TOKEN_BASE + 0x26) > +#define RTAS_IBM_CREATE_PE_DMA_WINDOW (RTAS_TOKEN_BASE + 0x27) > +#define RTAS_IBM_REMOVE_PE_DMA_WINDOW (RTAS_TOKEN_BASE + 0x28) > +#define 
RTAS_IBM_RESET_PE_DMA_WINDOW (RTAS_TOKEN_BASE + 0x29) > > -#define RTAS_TOKEN_MAX (RTAS_TOKEN_BASE + 0x26) > +#define RTAS_TOKEN_MAX (RTAS_TOKEN_BASE + 0x2A) > > /* RTAS ibm,get-system-parameter token values */ > #define RTAS_SYSPARM_SPLPAR_CHARACTERISTICS 20 > @@ -504,6 +518,7 @@ struct sPAPRTCETable { > uint64_t bus_offset; > uint32_t page_shift; > uint64_t *table; > + uint64_t *migtable; > bool bypass; > bool vfio_accel; > int fd; > diff --git a/trace-events b/trace-events > index 2739140..fd8ea7a 100644 > --- a/trace-events > +++ b/trace-events > @@ -1344,6 +1344,10 @@ spapr_iommu_pci_indirect(uint64_t liobn, uint64_t > ioba, uint64_t tce, uint64_t i > spapr_iommu_pci_stuff(uint64_t liobn, uint64_t ioba, uint64_t tce_value, > uint64_t npages, uint64_t ret) "liobn=%"PRIx64" ioba=0x%"PRIx64" > tcevalue=0x%"PRIx64" npages=%"PRId64" ret=%"PRId64 > spapr_iommu_xlate(uint64_t liobn, uint64_t ioba, uint64_t tce, unsigned > perm, unsigned pgsize) "liobn=%"PRIx64" 0x%"PRIx64" -> 0x%"PRIx64" perm=%u > mask=%x" > spapr_iommu_new_table(uint64_t liobn, void *tcet, void *table, int fd) > "liobn=%"PRIx64" tcet=%p table=%p fd=%d" > +spapr_iommu_ddw_query(uint64_t buid, uint32_t cfgaddr, unsigned wa, uint64_t > win_size, uint32_t pgmask) "buid=%"PRIx64" addr=%"PRIx32", %u windows > available, max window size=%"PRIx64", mask=%"PRIx32 > +spapr_iommu_ddw_create(uint64_t buid, uint32_t cfgaddr, unsigned long long > pg_size, unsigned long long req_size, uint64_t start, uint32_t liobn, long > ret) "buid=%"PRIx64" addr=%"PRIx32", page size=0x%llx, requested=0x%llx, > start addr=%"PRIx64", liobn=%"PRIx32", ret = %ld" > +spapr_iommu_ddw_remove(uint32_t liobn, long ret) "liobn=%"PRIx32", ret = %ld" > +spapr_iommu_ddw_reset(uint64_t buid, uint32_t cfgaddr, long ret) > "buid=%"PRIx64" addr=%"PRIx32", ret = %ld" > > # hw/ppc/ppc.c > ppc_tb_adjust(uint64_t offs1, uint64_t offs2, int64_t diff, int64_t seconds) > "adjusted from 0x%"PRIx64" to 0x%"PRIx64", diff %"PRId64" (%"PRId64"s)" -- David 
Gibson <david AT gibson.dropbear.id.au> | I'll have my music baroque, and my code minimalist, thank you. NOT _the_ _other_ _way_ _around_!
http://www.ozlabs.org/~dgibson
pgpNE0azswQaD.pgp
Description: PGP signature