Re: [PATCH v4 01/15] pcie: Add support for Single Root I/O Virtualization (SR/IOV)

2022-01-26 Thread Knut Omang
On Wed, 2022-01-26 at 18:11 +0100, Lukasz Maniak wrote:
> From: Knut Omang 
> 
> This patch provides the building blocks for creating an SR/IOV
> PCIe Extended Capability header and register/unregister
> SR/IOV Virtual Functions.
> 
> Signed-off-by: Knut Omang 
> ---
>  hw/pci/meson.build  |   1 +
>  hw/pci/pci.c    | 100 +---
>  hw/pci/pcie.c   |   5 +
>  hw/pci/pcie_sriov.c | 294 
>  hw/pci/trace-events |   5 +
>  include/hw/pci/pci.h    |  12 +-
>  include/hw/pci/pcie.h   |   6 +
>  include/hw/pci/pcie_sriov.h |  71 +
>  include/qemu/typedefs.h |   2 +
>  9 files changed, 470 insertions(+), 26 deletions(-)
>  create mode 100644 hw/pci/pcie_sriov.c
>  create mode 100644 include/hw/pci/pcie_sriov.h
> 
> diff --git a/hw/pci/meson.build b/hw/pci/meson.build
> index 5c4bbac817..bcc9c75919 100644
> --- a/hw/pci/meson.build
> +++ b/hw/pci/meson.build
> @@ -5,6 +5,7 @@ pci_ss.add(files(
>    'pci.c',
>    'pci_bridge.c',
>    'pci_host.c',
> +  'pcie_sriov.c',
>    'shpc.c',
>    'slotid_cap.c'
>  ))
> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> index 5d30f9ca60..ba8fb92efc 100644
> --- a/hw/pci/pci.c
> +++ b/hw/pci/pci.c
> @@ -239,6 +239,9 @@ int pci_bar(PCIDevice *d, int reg)
>  {
>  uint8_t type;
>  
> +    /* PCIe virtual functions do not have their own BARs */
> +    assert(!pci_is_vf(d));
> +
>  if (reg != PCI_ROM_SLOT)
>  return PCI_BASE_ADDRESS_0 + reg * 4;
>  
> @@ -304,10 +307,30 @@ void pci_device_deassert_intx(PCIDevice *dev)
>  }
>  }
>  
> -static void pci_do_device_reset(PCIDevice *dev)
> +static void pci_reset_regions(PCIDevice *dev)
>  {
>  int r;
> +    if (pci_is_vf(dev)) {
> +    return;
> +    }
> +
> +    for (r = 0; r < PCI_NUM_REGIONS; ++r) {
> +    PCIIORegion *region = >io_regions[r];
> +    if (!region->size) {
> +    continue;
> +    }
>  
> +    if (!(region->type & PCI_BASE_ADDRESS_SPACE_IO) &&
> +    region->type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
> +    pci_set_quad(dev->config + pci_bar(dev, r), region->type);
> +    } else {
> +    pci_set_long(dev->config + pci_bar(dev, r), region->type);
> +    }
> +    }
> +}
> +
> +static void pci_do_device_reset(PCIDevice *dev)
> +{
>  pci_device_deassert_intx(dev);
>  assert(dev->irq_state == 0);
>  
> @@ -323,19 +346,7 @@ static void pci_do_device_reset(PCIDevice *dev)
>    pci_get_word(dev->wmask + PCI_INTERRUPT_LINE) |
>    pci_get_word(dev->w1cmask + 
> PCI_INTERRUPT_LINE));
>  dev->config[PCI_CACHE_LINE_SIZE] = 0x0;
> -    for (r = 0; r < PCI_NUM_REGIONS; ++r) {
> -    PCIIORegion *region = >io_regions[r];
> -    if (!region->size) {
> -    continue;
> -    }
> -
> -    if (!(region->type & PCI_BASE_ADDRESS_SPACE_IO) &&
> -    region->type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
> -    pci_set_quad(dev->config + pci_bar(dev, r), region->type);
> -    } else {
> -    pci_set_long(dev->config + pci_bar(dev, r), region->type);
> -    }
> -    }
> +    pci_reset_regions(dev);
>  pci_update_mappings(dev);
>  
>  msi_reset(dev);
> @@ -884,6 +895,16 @@ static void pci_init_multifunction(PCIBus *bus, 
> PCIDevice *dev,
> Error **errp)
>  dev->config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION;
>  }
>  
> +    /*
> + * With SR/IOV and ARI, a device at function 0 need not be a 
> multifunction
> + * device, as it may just be a VF that ended up with function 0 in
> + * the legacy PCI interpretation. Avoid failing in such cases:
> + */
> +    if (pci_is_vf(dev) &&
> +    dev->exp.sriov_vf.pf->cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
> +    return;
> +    }
> +
>  /*
>   * multifunction bit is interpreted in two ways as follows.
>   *   - all functions must set the bit to 1.
> @@ -1083,6 +1104,7 @@ static PCIDevice *do_pci_register_device(PCIDevice 
> *pci_dev,
>     bus->devices[devfn]->name);
>  return NULL;
>  } else if (dev->hotplugged &&
> +   !pci_is_vf(pci_dev) &&
>     pci_get_function_0(pci_dev)) {
>  error_setg(errp, "PCI: slot %d function 0 already occupied by %s,"
>     " new func %s cannot be exposed to guest.",
> @@ 

Re: [PATCH v3 01/15] pcie: Add support for Single Root I/O Virtualization (SR/IOV)

2022-01-26 Thread Knut Omang
On Wed, 2022-01-26 at 14:23 +0100, Łukasz Gieryk wrote:
> I'm sorry for the delayed response. We (I and the other Lukasz) somehow
> had hoped that Knut, the original author of this patch, would have
> responded.

Yes, sorry - this one flushed past me here for some reason,

> 
> Let me address your questions, up to my best knowledge.
>   
> > > -static pcibus_t pci_bar_address(PCIDevice *d,
> > > -    int reg, uint8_t type, pcibus_t size)
> > > +static pcibus_t pci_config_get_bar_addr(PCIDevice *d, int reg,
> > > +    uint8_t type, pcibus_t size)
> > > +{
> > > +    pcibus_t new_addr;
> > > +    if (!pci_is_vf(d)) {
> > > +    int bar = pci_bar(d, reg);
> > > +    if (type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
> > > +    new_addr = pci_get_quad(d->config + bar);
> > > +    } else {
> > > +    new_addr = pci_get_long(d->config + bar);
> > > +    }
> > > +    } else {
> > > +    PCIDevice *pf = d->exp.sriov_vf.pf;
> > > +    uint16_t sriov_cap = pf->exp.sriov_cap;
> > > +    int bar = sriov_cap + PCI_SRIOV_BAR + reg * 4;
> > > +    uint16_t vf_offset = pci_get_word(pf->config + sriov_cap +
> > > PCI_SRIOV_VF_OFFSET);
> > > +    uint16_t vf_stride = pci_get_word(pf->config + sriov_cap +
> > > PCI_SRIOV_VF_STRIDE);
> > > +    uint32_t vf_num = (d->devfn - (pf->devfn + vf_offset)) / 
> > > vf_stride;
> > > +
> > > +    if (type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
> > > +    new_addr = pci_get_quad(pf->config + bar);
> > > +    } else {
> > > +    new_addr = pci_get_long(pf->config + bar);
> > > +    }
> > > +    new_addr += vf_num * size;
> > > +    }
> > > +    if (reg != PCI_ROM_SLOT) {
> > > +    /* Preserve the rom enable bit */
> > > +    new_addr &= ~(size - 1);
> > 
> > This comment puzzles me. How does clearing low bits preserve
> > any bits? Looks like this will clear low bits if any.
> > 
> 
> I think the comment applies to (reg != PCI_ROM_SLOT), i.e., the bits are
> cleared for BARs, but not for expansion ROM. I agree the placement of this
> comment is slightly misleading. We will move it up and rephrase slightly.

I agree - it's maybe better to just put the comment above the if(...) 
other than that I believe it is correct.

Knut

>  
> > > +pcibus_t pci_bar_address(PCIDevice *d,
> > > + int reg, uint8_t type, pcibus_t size)
> > >   {
> > >   pcibus_t new_addr, last_addr;
> > > -    int bar = pci_bar(d, reg);
> > >   uint16_t cmd = pci_get_word(d->config + PCI_COMMAND);
> > >   Object *machine = qdev_get_machine();
> > >   ObjectClass *oc = object_get_class(machine);
> > > @@ -1309,7 +1363,7 @@ static pcibus_t pci_bar_address(PCIDevice *d,
> > >   if (!(cmd & PCI_COMMAND_IO)) {
> > >   return PCI_BAR_UNMAPPED;
> > >   }
> > > -    new_addr = pci_get_long(d->config + bar) & ~(size - 1);
> > > +    new_addr = pci_config_get_bar_addr(d, reg, type, size);
> > >   last_addr = new_addr + size - 1;
> > >   /* Check if 32 bit BAR wraps around explicitly.
> > >    * TODO: make priorities correct and remove this work around.
> > > @@ -1324,11 +1378,7 @@ static pcibus_t pci_bar_address(PCIDevice *d,
> > >   if (!(cmd & PCI_COMMAND_MEMORY)) {
> > >   return PCI_BAR_UNMAPPED;
> > >   }
> > > -    if (type & PCI_BASE_ADDRESS_MEM_TYPE_64) {
> > > -    new_addr = pci_get_quad(d->config + bar);
> > > -    } else {
> > > -    new_addr = pci_get_long(d->config + bar);
> > > -    }
> > > +    new_addr = pci_config_get_bar_addr(d, reg, type, size);
> > >   /* the ROM slot has a specific enable bit */
> > >   if (reg == PCI_ROM_SLOT && !(new_addr & PCI_ROM_ADDRESS_ENABLE)) {
> > 
> > And in fact here we check the low bit and handle it specially.
> 
> The code seems correct for me. The bit is preserved for ROM case.
> 
> > > diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c
> > > index d7d73a31e4..182a225054 100644
> > > --- a/hw/pci/pcie.c
> > > +++ b/hw/pci/pcie.c
> > > @@ -446,6 +446,11 @@ void pcie_cap_slot_plug_cb(HotplugHandler 
> > > *hotplug_dev,
> > > DeviceState *dev,
> > >   PCIDevice *pci_dev = PCI_DEVICE(dev);
> > >   uint32_t lnkcap = pci_get_long(exp_cap + PCI_EXP_LNKCAP);
> > >   
> > > +    if(pci_is_vf(pci_dev)) {
> > > +    /* We don't want to change any state in hotplug_dev for SR/IOV 
> > > virtual
> > > functions */
> > > +    return;
> > > +    }
> > > +
> > 
> > Coding style violation here.  And pls document the why not the what.
> > E.g. IIRC the reason is that VFs don't have an express capability,
> > right?
> 
> I think the reason is that virtual functions don’t exist physically, so
> they cannot be individually disconnected. Only PF should respond to
> hotplug events, implicitly disconnecting (thus: destroying) all child
> VFs.
> 
> Anyway, we will update this comment to state *why* and add the missing
> 

Re: [PATCH 01/15] pcie: Set default and supported MaxReadReq to 512

2021-10-26 Thread Knut Omang
On Tue, 2021-10-26 at 16:36 +0200, Lukasz Maniak wrote:
> On Thu, Oct 07, 2021 at 06:12:41PM -0400, Michael S. Tsirkin wrote:
> > On Thu, Oct 07, 2021 at 06:23:52PM +0200, Lukasz Maniak wrote:
> > > From: Knut Omang 
> > > 
> > > Make the default PCI Express Capability for PCIe devices set
> > > MaxReadReq to 512.
> > 
> > code says 256
> > 
> > > Tyipcal modern devices people would want to
> > 
> > 
> > typo
> > 
> > > emulate or simulate would want this. The previous value would
> > > cause warnings from the root port driver on some kernels.
> > 
> > 
> > which specifically?
> > 
> > > 
> > > Signed-off-by: Knut Omang 
> > 
> > we can't make changes like this unconditionally, this will
> > break migration across versions.
> > Pls tie this to a machine version.
> > 
> > Thanks!
> > > ---
> > >   hw/pci/pcie.c | 5 -
> > >   1 file changed, 4 insertions(+), 1 deletion(-)
> > > 
> > > diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c
> > > index 6e95d82903..c1a12f3744 100644
> > > --- a/hw/pci/pcie.c
> > > +++ b/hw/pci/pcie.c
> > > @@ -62,8 +62,9 @@ pcie_cap_v1_fill(PCIDevice *dev, uint8_t port, uint8_t 
> > > type, uint8_t
> > > version)
> > >    * Functions conforming to the ECN, PCI Express Base
> > >    * Specification, Revision 1.1., or subsequent PCI Express Base
> > >    * Specification revisions.
> > > + *  + set max payload size to 256, which seems to be a common value
> > >    */
> > > -    pci_set_long(exp_cap + PCI_EXP_DEVCAP, PCI_EXP_DEVCAP_RBER);
> > > +    pci_set_long(exp_cap + PCI_EXP_DEVCAP, PCI_EXP_DEVCAP_RBER | (0x1 &
> > > PCI_EXP_DEVCAP_PAYLOAD));
> > >   
> > >   pci_set_long(exp_cap + PCI_EXP_LNKCAP,
> > >    (port << PCI_EXP_LNKCAP_PN_SHIFT) |
> > > @@ -179,6 +180,8 @@ int pcie_cap_init(PCIDevice *dev, uint8_t offset,
> > >   pci_set_long(exp_cap + PCI_EXP_DEVCAP2,
> > >    PCI_EXP_DEVCAP2_EFF | PCI_EXP_DEVCAP2_EETLPP);
> > >   
> > > +    pci_set_word(exp_cap + PCI_EXP_DEVCTL, PCI_EXP_DEVCTL_READRQ_256B);
> > > +
> > >   pci_set_word(dev->wmask + pos + PCI_EXP_DEVCTL2, 
> > > PCI_EXP_DEVCTL2_EETLPPB);
> > >   
> > >   if (dev->cap_present & QEMU_PCIE_EXTCAP_INIT) {
> > > -- 
> > > 2.25.1
> > 
> 
> Hi Michael,
> 
> The reason Knut keeps rebasing this fix along with SR-IOV patch is not
> clear for us.

Sorry for the slow response - I seem to have messed up my mail filters so this
thread slipped past my attention.

> Since we have tested the NVMe device without this fix and did not notice
> any issues mentioned by Knut on kernel 5.4.0, we decided to drop it for
> v2.

I agree, let's just drop it.

It was likely in the 3.x kernels I had to relate to back then, 
likely discovered in Oracle Linux given that I did not specifically point to a 
kernel
range already back then.

> However, I have posted your comments to this patch on Knut's github so
> they can be addressed in case Knut decides to resubmit it later though.

Thanks for that ping, Lukasz, and great to see the patch finally being used in a
functional device!

Knut

> Thanks,
> Lukasz





Re: [PATCH 13/15] pcie: Add helpers to the SR/IOV API

2021-10-26 Thread Knut Omang
On Thu, 2021-10-07 at 18:24 +0200, Lukasz Maniak wrote:
> From: Łukasz Gieryk 
> 
> Two convenience functions for retrieving:
>  - the total number of VFs,
>  - the PCIDevice object of the N-th VF.
> 
> Signed-off-by: Łukasz Gieryk 
> ---
>  hw/pci/pcie_sriov.c | 14 ++
>  include/hw/pci/pcie_sriov.h |  8 
>  2 files changed, 22 insertions(+)
> 
> diff --git a/hw/pci/pcie_sriov.c b/hw/pci/pcie_sriov.c
> index cac2aee061..5a8e92d5ab 100644
> --- a/hw/pci/pcie_sriov.c
> +++ b/hw/pci/pcie_sriov.c
> @@ -292,8 +292,22 @@ uint16_t pcie_sriov_vf_number(PCIDevice *dev)
>  return dev->exp.sriov_vf.vf_number;
>  }
>  
> +uint16_t pcie_sriov_vf_number_total(PCIDevice *dev)
> +{
> +    assert(!pci_is_vf(dev));
> +    return dev->exp.sriov_pf.num_vfs;
> +}
>  
>  PCIDevice *pcie_sriov_get_pf(PCIDevice *dev)
>  {
>  return dev->exp.sriov_vf.pf;
>  }
> +
> +PCIDevice *pcie_sriov_get_vf_at_index(PCIDevice *dev, int n)
> +{
> +    assert(!pci_is_vf(dev));
> +    if (n < dev->exp.sriov_pf.num_vfs) {
> +    return dev->exp.sriov_pf.vf[n];
> +    }
> +    return NULL;
> +}
> diff --git a/include/hw/pci/pcie_sriov.h b/include/hw/pci/pcie_sriov.h
> index 9ab48b79c0..d1f39b7223 100644
> --- a/include/hw/pci/pcie_sriov.h
> +++ b/include/hw/pci/pcie_sriov.h
> @@ -65,9 +65,17 @@ void pcie_sriov_pf_disable_vfs(PCIDevice *dev);
>  /* Get logical VF number of a VF - only valid for VFs */
>  uint16_t pcie_sriov_vf_number(PCIDevice *dev);
>  
> +/* Get the total number of VFs - only valid for PF */
> +uint16_t pcie_sriov_vf_number_total(PCIDevice *dev);
> +
>  /* Get the physical function that owns this VF.
>   * Returns NULL if dev is not a virtual function
>   */
>  PCIDevice *pcie_sriov_get_pf(PCIDevice *dev);
>  
> +/* Get the n-th VF of this physical function - only valid for PF.
> + * Returns NULL if index is invalid
> + */
> +PCIDevice *pcie_sriov_get_vf_at_index(PCIDevice *dev, int n);
> +
>  #endif /* QEMU_PCIE_SRIOV_H */


These look like natural improvements to me, thanks!

Reviewed-by: Knut Omang