On Wed, 2020-05-13 at 11:55 -0600, Alex Williamson wrote: > On Wed, 13 May 2020 00:35:47 +0000 > "Derrick, Jonathan" <jonathan.derr...@intel.com> wrote: > > > Hi Alex, > > > > [snip]
> > Thanks for the confirmation. The approach seems ok, but a subtle > side-effect of MemoryRegion quirks is that we're going to trap the > entire PAGE_SIZE range overlapping the quirk out to QEMU on guest > access. The two registers at 0x2000 might be reserved for this > purpose, but is there anything else that could be performance sensitive > anywhere in that page? If so, it might be less transparent, but more > efficient to invent a new quirk making use of config space (ex. adding > an emulated vendor capability somewhere known to be unused on the > device). Thanks, > > Alex Seems like that could be a problem if we're running with huge pages and overlapping msix tables. Gerd also recommended adding a vendor specific capability. I don't see that there's any vendor-specific capabilities we can accidentally shadow, so I'll go ahead and do that. Thanks, Jon > > > > > In order to support existing VMDs, this quirk emulates the VMLOCK and > > > > HPA shadow registers for all VMD device ids which don't natively assist > > > > with passthrough. The Linux VMD driver is updated to allow existing VMD > > > > devices to query VMLOCK for passthrough support. > > > > > > > > Signed-off-by: Jon Derrick <jonathan.derr...@intel.com> > > > > --- > > > > hw/vfio/pci-quirks.c | 103 +++++++++++++++++++++++++++++++++++++++++++ > > > > hw/vfio/pci.c | 7 +++ > > > > hw/vfio/pci.h | 2 + > > > > hw/vfio/trace-events | 3 ++ > > > > 4 files changed, 115 insertions(+) > > > > > > > > diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c > > > > index 2d348f8237..4060a6a95d 100644 > > > > --- a/hw/vfio/pci-quirks.c > > > > +++ b/hw/vfio/pci-quirks.c > > > > @@ -1709,3 +1709,106 @@ free_exit: > > > > > > > > return ret; > > > > } > > > > + > > > > +/* > > > > + * The VMD endpoint provides a real PCIe domain to the guest and the > > > > guest > > > > + * kernel performs enumeration of the VMD sub-device domain. Guest > > > > transactions > > > > + * to VMD sub-devices go through IOMMU translation from guest > > > > addresses to > > > > + * physical addresses. When MMIO goes to an endpoint after being > > > > translated to > > > > + * physical addresses, the bridge rejects the transaction because the > > > > window > > > > + * has been programmed with guest addresses. > > > > + * > > > > + * VMD can use the Host Physical Address in order to correctly program > > > > the > > > > + * bridge windows in its PCIe domain. VMD device 28C0 has HPA shadow > > > > registers > > > > + * located at offset 0x2000 in MEMBAR2 (BAR 4). The shadow registers > > > > are valid > > > > + * if bit 1 is set in the VMD VMLOCK config register 0x70. VMD devices > > > > without > > > > + * this native assistance can have these registers safely emulated as > > > > these > > > > + * registers are reserved. > > > > + */ > > > > +typedef struct VFIOVMDQuirk { > > > > + VFIOPCIDevice *vdev; > > > > + uint64_t membar_phys[2]; > > > > +} VFIOVMDQuirk; > > > > + > > > > +static uint64_t vfio_vmd_quirk_read(void *opaque, hwaddr addr, > > > > unsigned size) > > > > +{ > > > > + VFIOVMDQuirk *data = opaque; > > > > + uint64_t val = 0; > > > > + > > > > + memcpy(&val, (void *)data->membar_phys + addr, size); > > > > + return val; > > > > +} > > > > + > > > > +static const MemoryRegionOps vfio_vmd_quirk = { > > > > + .read = vfio_vmd_quirk_read, > > > > + .endianness = DEVICE_LITTLE_ENDIAN, > > > > +}; > > > > + > > > > +#define VMD_VMLOCK 0x70 > > > > +#define VMD_SHADOW 0x2000 > > > > +#define VMD_MEMBAR2 4 > > > > + > > > > +static int vfio_vmd_emulate_shadow_registers(VFIOPCIDevice *vdev) > > > > +{ > > > > + VFIOQuirk *quirk; > > > > + VFIOVMDQuirk *data; > > > > + PCIDevice *pdev = &vdev->pdev; > > > > + int ret; > > > > + > > > > + data = g_malloc0(sizeof(*data)); > > > > + ret = pread(vdev->vbasedev.fd, data->membar_phys, 16, > > > > + vdev->config_offset + PCI_BASE_ADDRESS_2); > > > > + if (ret != 16) { > > > > + error_report("VMD %s cannot read MEMBARs (%d)", > > > > + vdev->vbasedev.name, ret); > > > > + g_free(data); > > > > + return -EFAULT; > > > > + } > > > > + > > > > + quirk = vfio_quirk_alloc(1); > > > > + quirk->data = data; > > > > + data->vdev = vdev; > > > > + > > > > + /* Emulate Shadow Registers */ > > > > + memory_region_init_io(quirk->mem, OBJECT(vdev), &vfio_vmd_quirk, > > > > data, > > > > + "vfio-vmd-quirk", sizeof(data->membar_phys)); > > > > + > > > > memory_region_add_subregion_overlap(vdev->bars[VMD_MEMBAR2].region.mem, > > > > + VMD_SHADOW, quirk->mem, 1); > > > > + memory_region_set_readonly(quirk->mem, true); > > > > + memory_region_set_enabled(quirk->mem, true); > > > > + > > > > + QLIST_INSERT_HEAD(&vdev->bars[VMD_MEMBAR2].quirks, quirk, next); > > > > + > > > > + trace_vfio_pci_vmd_quirk_shadow_regs(vdev->vbasedev.name, > > > > + data->membar_phys[0], > > > > + data->membar_phys[1]); > > > > + > > > > + /* Advertise Shadow Register support */ > > > > + pci_byte_test_and_set_mask(pdev->config + VMD_VMLOCK, 0x2); > > > > + pci_set_byte(pdev->wmask + VMD_VMLOCK, 0); > > > > + pci_set_byte(vdev->emulated_config_bits + VMD_VMLOCK, 0x2); > > > > + > > > > + trace_vfio_pci_vmd_quirk_vmlock(vdev->vbasedev.name, > > > > + pci_get_byte(pdev->config + > > > > VMD_VMLOCK)); > > > > + > > > > + return 0; > > > > +} > > > > + > > > > +int vfio_pci_vmd_init(VFIOPCIDevice *vdev) > > > > +{ > > > > + int ret = 0; > > > > + > > > > + switch (vdev->device_id) { > > > > + case 0x28C0: /* Native passthrough support */ > > > > + break; > > > > + /* Emulates Native passthrough support */ > > > > + case 0x201D: > > > > + case 0x467F: > > > > + case 0x4C3D: > > > > + case 0x9A0B: > > > > + ret = vfio_vmd_emulate_shadow_registers(vdev); > > > > + break; > > > > + } > > > > + > > > > + return ret; > > > > +} > > > > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c > > > > index 5e75a95129..85425a1a6f 100644 > > > > --- a/hw/vfio/pci.c > > > > +++ b/hw/vfio/pci.c > > > > @@ -3024,6 +3024,13 @@ static void vfio_realize(PCIDevice *pdev, Error > > > > **errp) > > > > } > > > > } > > > > > > > > + if (vdev->vendor_id == PCI_VENDOR_ID_INTEL) { > > > > + ret = vfio_pci_vmd_init(vdev); > > > > + if (ret) { > > > > + error_report("Failed to setup VMD"); > > > > + } > > > > + } > > > > + > > > > vfio_register_err_notifier(vdev); > > > > vfio_register_req_notifier(vdev); > > > > vfio_setup_resetfn_quirk(vdev); > > > > diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h > > > > index 0da7a20a7e..e8632d806b 100644 > > > > --- a/hw/vfio/pci.h > > > > +++ b/hw/vfio/pci.h > > > > @@ -217,6 +217,8 @@ int vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev, > > > > int vfio_pci_nvidia_v100_ram_init(VFIOPCIDevice *vdev, Error **errp); > > > > int vfio_pci_nvlink2_init(VFIOPCIDevice *vdev, Error **errp); > > > > > > > > +int vfio_pci_vmd_init(VFIOPCIDevice *vdev); > > > > + > > > > void vfio_display_reset(VFIOPCIDevice *vdev); > > > > int vfio_display_probe(VFIOPCIDevice *vdev, Error **errp); > > > > void vfio_display_finalize(VFIOPCIDevice *vdev); > > > > diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events > > > > index b1ef55a33f..ed64e477db 100644 > > > > --- a/hw/vfio/trace-events > > > > +++ b/hw/vfio/trace-events > > > > @@ -90,6 +90,9 @@ vfio_pci_nvidia_gpu_setup_quirk(const char *name, > > > > uint64_t tgt, uint64_t size) " > > > > vfio_pci_nvlink2_setup_quirk_ssatgt(const char *name, uint64_t tgt, > > > > uint64_t size) "%s tgt=0x%"PRIx64" size=0x%"PRIx64 > > > > vfio_pci_nvlink2_setup_quirk_lnkspd(const char *name, uint32_t > > > > link_speed) "%s link_speed=0x%x" > > > > > > > > +vfio_pci_vmd_quirk_shadow_regs(const char *name, uint64_t mb1, > > > > uint64_t mb2) "%s membar1_phys=0x%"PRIx64" membar2_phys=0x%"PRIx64 > > > > +vfio_pci_vmd_quirk_vmlock(const char *name, uint8_t vmlock) "%s > > > > vmlock=0x%x" > > > > + > > > > # common.c > > > > vfio_region_write(const char *name, int index, uint64_t addr, uint64_t > > > > data, unsigned size) " (%s:region%d+0x%"PRIx64", 0x%"PRIx64 ", %d)" > > > > vfio_region_read(char *name, int index, uint64_t addr, unsigned size, > > > > uint64_t data) " (%s:region%d+0x%"PRIx64", %d) = 0x%"PRIx64