On Wed, 2020-05-13 at 11:55 -0600, Alex Williamson wrote:
> On Wed, 13 May 2020 00:35:47 +0000
> "Derrick, Jonathan" <jonathan.derr...@intel.com> wrote:
> 
> > Hi Alex,
> > 
> > 
[snip]

> 
> Thanks for the confirmation.  The approach seems ok, but a subtle
> side-effect of MemoryRegion quirks is that we're going to trap the
> entire PAGE_SIZE range overlapping the quirk out to QEMU on guest
> access.  The two registers at 0x2000 might be reserved for this
> purpose, but is there anything else that could be performance sensitive
> anywhere in that page?  If so, it might be less transparent, but more
> efficient to invent a new quirk making use of config space (ex. adding
> an emulated vendor capability somewhere known to be unused on the
> device).  Thanks,
> 
> Alex

Seems like that could be a problem if we're running with huge pages and
overlapping msix tables.
 
Gerd also recommended adding a vendor specific capability. I don't see
that there's any vendor-specific capabilities we can accidentally
shadow, so I'll go ahead and do that.

Thanks,
Jon

> 
> > > > In order to support existing VMDs, this quirk emulates the VMLOCK and
> > > > HPA shadow registers for all VMD device ids which don't natively assist
> > > > with passthrough. The Linux VMD driver is updated to allow existing VMD
> > > > devices to query VMLOCK for passthrough support.
> > > > 
> > > > Signed-off-by: Jon Derrick <jonathan.derr...@intel.com>
> > > > ---
> > > >  hw/vfio/pci-quirks.c | 103 +++++++++++++++++++++++++++++++++++++++++++
> > > >  hw/vfio/pci.c        |   7 +++
> > > >  hw/vfio/pci.h        |   2 +
> > > >  hw/vfio/trace-events |   3 ++
> > > >  4 files changed, 115 insertions(+)
> > > > 
> > > > diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c
> > > > index 2d348f8237..4060a6a95d 100644
> > > > --- a/hw/vfio/pci-quirks.c
> > > > +++ b/hw/vfio/pci-quirks.c
> > > > @@ -1709,3 +1709,106 @@ free_exit:
> > > >  
> > > >      return ret;
> > > >  }
> > > > +
> > > > +/*
> > > > + * The VMD endpoint provides a real PCIe domain to the guest and the 
> > > > guest
> > > > + * kernel performs enumeration of the VMD sub-device domain. Guest 
> > > > transactions
> > > > + * to VMD sub-devices go through IOMMU translation from guest 
> > > > addresses to
> > > > + * physical addresses. When MMIO goes to an endpoint after being 
> > > > translated to
> > > > + * physical addresses, the bridge rejects the transaction because the 
> > > > window
> > > > + * has been programmed with guest addresses.
> > > > + *
> > > > + * VMD can use the Host Physical Address in order to correctly program 
> > > > the
> > > > + * bridge windows in its PCIe domain. VMD device 28C0 has HPA shadow 
> > > > registers
> > > > + * located at offset 0x2000 in MEMBAR2 (BAR 4). The shadow registers 
> > > > are valid
> > > > + * if bit 1 is set in the VMD VMLOCK config register 0x70. VMD devices 
> > > > without
> > > > + * this native assistance can have these registers safely emulated as 
> > > > these
> > > > + * registers are reserved.
> > > > + */
> > > > +typedef struct VFIOVMDQuirk {
> > > > +    VFIOPCIDevice *vdev;
> > > > +    uint64_t membar_phys[2];
> > > > +} VFIOVMDQuirk;
> > > > +
> > > > +static uint64_t vfio_vmd_quirk_read(void *opaque, hwaddr addr, 
> > > > unsigned size)
> > > > +{
> > > > +    VFIOVMDQuirk *data = opaque;
> > > > +    uint64_t val = 0;
> > > > +
> > > > +    memcpy(&val, (void *)data->membar_phys + addr, size);
> > > > +    return val;
> > > > +}
> > > > +
> > > > +static const MemoryRegionOps vfio_vmd_quirk = {
> > > > +    .read = vfio_vmd_quirk_read,
> > > > +    .endianness = DEVICE_LITTLE_ENDIAN,
> > > > +};
> > > > +
> > > > +#define VMD_VMLOCK  0x70
> > > > +#define VMD_SHADOW  0x2000
> > > > +#define VMD_MEMBAR2 4
> > > > +
> > > > +static int vfio_vmd_emulate_shadow_registers(VFIOPCIDevice *vdev)
> > > > +{
> > > > +    VFIOQuirk *quirk;
> > > > +    VFIOVMDQuirk *data;
> > > > +    PCIDevice *pdev = &vdev->pdev;
> > > > +    int ret;
> > > > +
> > > > +    data = g_malloc0(sizeof(*data));
> > > > +    ret = pread(vdev->vbasedev.fd, data->membar_phys, 16,
> > > > +                vdev->config_offset + PCI_BASE_ADDRESS_2);
> > > > +    if (ret != 16) {
> > > > +        error_report("VMD %s cannot read MEMBARs (%d)",
> > > > +                     vdev->vbasedev.name, ret);
> > > > +        g_free(data);
> > > > +        return -EFAULT;
> > > > +    }
> > > > +
> > > > +    quirk = vfio_quirk_alloc(1);
> > > > +    quirk->data = data;
> > > > +    data->vdev = vdev;
> > > > +
> > > > +    /* Emulate Shadow Registers */
> > > > +    memory_region_init_io(quirk->mem, OBJECT(vdev), &vfio_vmd_quirk, 
> > > > data,
> > > > +                          "vfio-vmd-quirk", sizeof(data->membar_phys));
> > > > +    
> > > > memory_region_add_subregion_overlap(vdev->bars[VMD_MEMBAR2].region.mem,
> > > > +                                        VMD_SHADOW, quirk->mem, 1);
> > > > +    memory_region_set_readonly(quirk->mem, true);
> > > > +    memory_region_set_enabled(quirk->mem, true);
> > > > +
> > > > +    QLIST_INSERT_HEAD(&vdev->bars[VMD_MEMBAR2].quirks, quirk, next);
> > > > +
> > > > +    trace_vfio_pci_vmd_quirk_shadow_regs(vdev->vbasedev.name,
> > > > +                                         data->membar_phys[0],
> > > > +                                         data->membar_phys[1]);
> > > > +
> > > > +    /* Advertise Shadow Register support */
> > > > +    pci_byte_test_and_set_mask(pdev->config + VMD_VMLOCK, 0x2);
> > > > +    pci_set_byte(pdev->wmask + VMD_VMLOCK, 0);
> > > > +    pci_set_byte(vdev->emulated_config_bits + VMD_VMLOCK, 0x2);
> > > > +
> > > > +    trace_vfio_pci_vmd_quirk_vmlock(vdev->vbasedev.name,
> > > > +                                    pci_get_byte(pdev->config + 
> > > > VMD_VMLOCK));
> > > > +
> > > > +    return 0;
> > > > +}
> > > > +
> > > > +int vfio_pci_vmd_init(VFIOPCIDevice *vdev)
> > > > +{
> > > > +    int ret = 0;
> > > > +
> > > > +    switch (vdev->device_id) {
> > > > +    case 0x28C0: /* Native passthrough support */
> > > > +        break;
> > > > +    /* Emulates Native passthrough support */
> > > > +    case 0x201D:
> > > > +    case 0x467F:
> > > > +    case 0x4C3D:
> > > > +    case 0x9A0B:
> > > > +        ret = vfio_vmd_emulate_shadow_registers(vdev);
> > > > +        break;
> > > > +    }
> > > > +
> > > > +    return ret;
> > > > +}
> > > > diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
> > > > index 5e75a95129..85425a1a6f 100644
> > > > --- a/hw/vfio/pci.c
> > > > +++ b/hw/vfio/pci.c
> > > > @@ -3024,6 +3024,13 @@ static void vfio_realize(PCIDevice *pdev, Error 
> > > > **errp)
> > > >          }
> > > >      }
> > > >  
> > > > +    if (vdev->vendor_id == PCI_VENDOR_ID_INTEL) {
> > > > +        ret = vfio_pci_vmd_init(vdev);
> > > > +        if (ret) {
> > > > +            error_report("Failed to setup VMD");
> > > > +        }
> > > > +    }
> > > > +
> > > >      vfio_register_err_notifier(vdev);
> > > >      vfio_register_req_notifier(vdev);
> > > >      vfio_setup_resetfn_quirk(vdev);
> > > > diff --git a/hw/vfio/pci.h b/hw/vfio/pci.h
> > > > index 0da7a20a7e..e8632d806b 100644
> > > > --- a/hw/vfio/pci.h
> > > > +++ b/hw/vfio/pci.h
> > > > @@ -217,6 +217,8 @@ int vfio_pci_igd_opregion_init(VFIOPCIDevice *vdev,
> > > >  int vfio_pci_nvidia_v100_ram_init(VFIOPCIDevice *vdev, Error **errp);
> > > >  int vfio_pci_nvlink2_init(VFIOPCIDevice *vdev, Error **errp);
> > > >  
> > > > +int vfio_pci_vmd_init(VFIOPCIDevice *vdev);
> > > > +
> > > >  void vfio_display_reset(VFIOPCIDevice *vdev);
> > > >  int vfio_display_probe(VFIOPCIDevice *vdev, Error **errp);
> > > >  void vfio_display_finalize(VFIOPCIDevice *vdev);
> > > > diff --git a/hw/vfio/trace-events b/hw/vfio/trace-events
> > > > index b1ef55a33f..ed64e477db 100644
> > > > --- a/hw/vfio/trace-events
> > > > +++ b/hw/vfio/trace-events
> > > > @@ -90,6 +90,9 @@ vfio_pci_nvidia_gpu_setup_quirk(const char *name, 
> > > > uint64_t tgt, uint64_t size) "
> > > >  vfio_pci_nvlink2_setup_quirk_ssatgt(const char *name, uint64_t tgt, 
> > > > uint64_t size) "%s tgt=0x%"PRIx64" size=0x%"PRIx64
> > > >  vfio_pci_nvlink2_setup_quirk_lnkspd(const char *name, uint32_t 
> > > > link_speed) "%s link_speed=0x%x"
> > > >  
> > > > +vfio_pci_vmd_quirk_shadow_regs(const char *name, uint64_t mb1, 
> > > > uint64_t mb2) "%s membar1_phys=0x%"PRIx64" membar2_phys=0x%"PRIx64
> > > > +vfio_pci_vmd_quirk_vmlock(const char *name, uint8_t vmlock) "%s 
> > > > vmlock=0x%x"
> > > > +
> > > >  # common.c
> > > >  vfio_region_write(const char *name, int index, uint64_t addr, uint64_t 
> > > > data, unsigned size) " (%s:region%d+0x%"PRIx64", 0x%"PRIx64 ", %d)"
> > > >  vfio_region_read(char *name, int index, uint64_t addr, unsigned size, 
> > > > uint64_t data) " (%s:region%d+0x%"PRIx64", %d) = 0x%"PRIx64  

Reply via email to