Re: [PATCH v4 01/15] pcie: Add support for Single Root I/O Virtualization (SR/IOV)
On Wed, 2022-01-26 at 18:11 +0100, Lukasz Maniak wrote: > From: Knut Omang > > This patch provides the building blocks for creating an SR/IOV > PCIe Extended Capability header and register/unregister > SR/IOV Virtual Functions. > > Signed-off-by: Knut Omang > --- > hw/pci/meson.build | 1 + > hw/pci/pci.c | 100 +--- > hw/pci/pcie.c | 5 + > hw/pci/pcie_sriov.c | 294 > hw/pci/trace-events | 5 + > include/hw/pci/pci.h | 12 +- > include/hw/pci/pcie.h | 6 + > include/hw/pci/pcie_sriov.h | 71 + > include/qemu/typedefs.h | 2 + > 9 files changed, 470 insertions(+), 26 deletions(-) > create mode 100644 hw/pci/pcie_sriov.c > create mode 100644 include/hw/pci/pcie_sriov.h > > diff --git a/hw/pci/meson.build b/hw/pci/meson.build > index 5c4bbac817..bcc9c75919 100644 > --- a/hw/pci/meson.build > +++ b/hw/pci/meson.build > @@ -5,6 +5,7 @@ pci_ss.add(files( > 'pci.c', > 'pci_bridge.c', > 'pci_host.c', > + 'pcie_sriov.c', > 'shpc.c', > 'slotid_cap.c' > )) > diff --git a/hw/pci/pci.c b/hw/pci/pci.c > index 5d30f9ca60..ba8fb92efc 100644 > --- a/hw/pci/pci.c > +++ b/hw/pci/pci.c > @@ -239,6 +239,9 @@ int pci_bar(PCIDevice *d, int reg) > { > uint8_t type; > > + /* PCIe virtual functions do not have their own BARs */ > + assert(!pci_is_vf(d)); > + > if (reg != PCI_ROM_SLOT) > return PCI_BASE_ADDRESS_0 + reg * 4; > > @@ -304,10 +307,30 @@ void pci_device_deassert_intx(PCIDevice *dev) > } > } > > -static void pci_do_device_reset(PCIDevice *dev) > +static void pci_reset_regions(PCIDevice *dev) > { > int r; > + if (pci_is_vf(dev)) { > + return; > + } > + > + for (r = 0; r < PCI_NUM_REGIONS; ++r) { > + PCIIORegion *region = >io_regions[r]; > + if (!region->size) { > + continue; > + } > > + if (!(region->type & PCI_BASE_ADDRESS_SPACE_IO) && > + region->type & PCI_BASE_ADDRESS_MEM_TYPE_64) { > + pci_set_quad(dev->config + pci_bar(dev, r), region->type); > + } else { > + pci_set_long(dev->config + pci_bar(dev, r), region->type); > + } > + } > +} > + > +static void pci_do_device_reset(PCIDevice *dev) > +{ > pci_device_deassert_intx(dev); > assert(dev->irq_state == 0); > > @@ -323,19 +346,7 @@ static void pci_do_device_reset(PCIDevice *dev) > pci_get_word(dev->wmask + PCI_INTERRUPT_LINE) | > pci_get_word(dev->w1cmask + > PCI_INTERRUPT_LINE)); > dev->config[PCI_CACHE_LINE_SIZE] = 0x0; > - for (r = 0; r < PCI_NUM_REGIONS; ++r) { > - PCIIORegion *region = >io_regions[r]; > - if (!region->size) { > - continue; > - } > - > - if (!(region->type & PCI_BASE_ADDRESS_SPACE_IO) && > - region->type & PCI_BASE_ADDRESS_MEM_TYPE_64) { > - pci_set_quad(dev->config + pci_bar(dev, r), region->type); > - } else { > - pci_set_long(dev->config + pci_bar(dev, r), region->type); > - } > - } > + pci_reset_regions(dev); > pci_update_mappings(dev); > > msi_reset(dev); > @@ -884,6 +895,16 @@ static void pci_init_multifunction(PCIBus *bus, > PCIDevice *dev, > Error **errp) > dev->config[PCI_HEADER_TYPE] |= PCI_HEADER_TYPE_MULTI_FUNCTION; > } > > + /* > + * With SR/IOV and ARI, a device at function 0 need not be a > multifunction > + * device, as it may just be a VF that ended up with function 0 in > + * the legacy PCI interpretation. Avoid failing in such cases: > + */ > + if (pci_is_vf(dev) && > + dev->exp.sriov_vf.pf->cap_present & QEMU_PCI_CAP_MULTIFUNCTION) { > + return; > + } > + > /* > * multifunction bit is interpreted in two ways as follows. > * - all functions must set the bit to 1. > @@ -1083,6 +1104,7 @@ static PCIDevice *do_pci_register_device(PCIDevice > *pci_dev, > bus->devices[devfn]->name); > return NULL; > } else if (dev->hotplugged && > + !pci_is_vf(pci_dev) && > pci_get_function_0(pci_dev)) { > error_setg(errp, "PCI: slot %d function 0 already occupied by %s," > " new func %s cannot be exposed to guest.", > @@
Re: [PATCH v3 01/15] pcie: Add support for Single Root I/O Virtualization (SR/IOV)
On Wed, 2022-01-26 at 14:23 +0100, Łukasz Gieryk wrote: > I'm sorry for the delayed response. We (I and the other Lukasz) somehow > had hoped that Knut, the original author of this patch, would have > responded. Yes, sorry - this one flushed past me here for some reason, > > Let me address your questions, up to my best knowledge. > > > > -static pcibus_t pci_bar_address(PCIDevice *d, > > > - int reg, uint8_t type, pcibus_t size) > > > +static pcibus_t pci_config_get_bar_addr(PCIDevice *d, int reg, > > > + uint8_t type, pcibus_t size) > > > +{ > > > + pcibus_t new_addr; > > > + if (!pci_is_vf(d)) { > > > + int bar = pci_bar(d, reg); > > > + if (type & PCI_BASE_ADDRESS_MEM_TYPE_64) { > > > + new_addr = pci_get_quad(d->config + bar); > > > + } else { > > > + new_addr = pci_get_long(d->config + bar); > > > + } > > > + } else { > > > + PCIDevice *pf = d->exp.sriov_vf.pf; > > > + uint16_t sriov_cap = pf->exp.sriov_cap; > > > + int bar = sriov_cap + PCI_SRIOV_BAR + reg * 4; > > > + uint16_t vf_offset = pci_get_word(pf->config + sriov_cap + > > > PCI_SRIOV_VF_OFFSET); > > > + uint16_t vf_stride = pci_get_word(pf->config + sriov_cap + > > > PCI_SRIOV_VF_STRIDE); > > > + uint32_t vf_num = (d->devfn - (pf->devfn + vf_offset)) / > > > vf_stride; > > > + > > > + if (type & PCI_BASE_ADDRESS_MEM_TYPE_64) { > > > + new_addr = pci_get_quad(pf->config + bar); > > > + } else { > > > + new_addr = pci_get_long(pf->config + bar); > > > + } > > > + new_addr += vf_num * size; > > > + } > > > + if (reg != PCI_ROM_SLOT) { > > > + /* Preserve the rom enable bit */ > > > + new_addr &= ~(size - 1); > > > > This comment puzzles me. How does clearing low bits preserve > > any bits? Looks like this will clear low bits if any. > > > > I think the comment applies to (reg != PCI_ROM_SLOT), i.e., the bits are > cleared for BARs, but not for expansion ROM. I agree the placement of this > comment is slightly misleading. We will move it up and rephrase slightly. I agree - it's maybe better to just put the comment above the if(...) other than that I believe it is correct. Knut > > > > +pcibus_t pci_bar_address(PCIDevice *d, > > > + int reg, uint8_t type, pcibus_t size) > > > { > > > pcibus_t new_addr, last_addr; > > > - int bar = pci_bar(d, reg); > > > uint16_t cmd = pci_get_word(d->config + PCI_COMMAND); > > > Object *machine = qdev_get_machine(); > > > ObjectClass *oc = object_get_class(machine); > > > @@ -1309,7 +1363,7 @@ static pcibus_t pci_bar_address(PCIDevice *d, > > > if (!(cmd & PCI_COMMAND_IO)) { > > > return PCI_BAR_UNMAPPED; > > > } > > > - new_addr = pci_get_long(d->config + bar) & ~(size - 1); > > > + new_addr = pci_config_get_bar_addr(d, reg, type, size); > > > last_addr = new_addr + size - 1; > > > /* Check if 32 bit BAR wraps around explicitly. > > > * TODO: make priorities correct and remove this work around. > > > @@ -1324,11 +1378,7 @@ static pcibus_t pci_bar_address(PCIDevice *d, > > > if (!(cmd & PCI_COMMAND_MEMORY)) { > > > return PCI_BAR_UNMAPPED; > > > } > > > - if (type & PCI_BASE_ADDRESS_MEM_TYPE_64) { > > > - new_addr = pci_get_quad(d->config + bar); > > > - } else { > > > - new_addr = pci_get_long(d->config + bar); > > > - } > > > + new_addr = pci_config_get_bar_addr(d, reg, type, size); > > > /* the ROM slot has a specific enable bit */ > > > if (reg == PCI_ROM_SLOT && !(new_addr & PCI_ROM_ADDRESS_ENABLE)) { > > > > And in fact here we check the low bit and handle it specially. > > The code seems correct for me. The bit is preserved for ROM case. > > > > diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c > > > index d7d73a31e4..182a225054 100644 > > > --- a/hw/pci/pcie.c > > > +++ b/hw/pci/pcie.c > > > @@ -446,6 +446,11 @@ void pcie_cap_slot_plug_cb(HotplugHandler > > > *hotplug_dev, > > > DeviceState *dev, > > > PCIDevice *pci_dev = PCI_DEVICE(dev); > > > uint32_t lnkcap = pci_get_long(exp_cap + PCI_EXP_LNKCAP); > > > > > > + if(pci_is_vf(pci_dev)) { > > > + /* We don't want to change any state in hotplug_dev for SR/IOV > > > virtual > > > functions */ > > > + return; > > > + } > > > + > > > > Coding style violation here. And pls document the why not the what. > > E.g. IIRC the reason is that VFs don't have an express capability, > > right? > > I think the reason is that virtual functions don’t exist physically, so > they cannot be individually disconnected. Only PF should respond to > hotplug events, implicitly disconnecting (thus: destroying) all child > VFs. > > Anyway, we will update this comment to state *why* and add the missing >
Re: [PATCH 01/15] pcie: Set default and supported MaxReadReq to 512
On Tue, 2021-10-26 at 16:36 +0200, Lukasz Maniak wrote: > On Thu, Oct 07, 2021 at 06:12:41PM -0400, Michael S. Tsirkin wrote: > > On Thu, Oct 07, 2021 at 06:23:52PM +0200, Lukasz Maniak wrote: > > > From: Knut Omang > > > > > > Make the default PCI Express Capability for PCIe devices set > > > MaxReadReq to 512. > > > > code says 256 > > > > > Tyipcal modern devices people would want to > > > > > > typo > > > > > emulate or simulate would want this. The previous value would > > > cause warnings from the root port driver on some kernels. > > > > > > which specifically? > > > > > > > > Signed-off-by: Knut Omang > > > > we can't make changes like this unconditionally, this will > > break migration across versions. > > Pls tie this to a machine version. > > > > Thanks! > > > --- > > > hw/pci/pcie.c | 5 - > > > 1 file changed, 4 insertions(+), 1 deletion(-) > > > > > > diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c > > > index 6e95d82903..c1a12f3744 100644 > > > --- a/hw/pci/pcie.c > > > +++ b/hw/pci/pcie.c > > > @@ -62,8 +62,9 @@ pcie_cap_v1_fill(PCIDevice *dev, uint8_t port, uint8_t > > > type, uint8_t > > > version) > > > * Functions conforming to the ECN, PCI Express Base > > > * Specification, Revision 1.1., or subsequent PCI Express Base > > > * Specification revisions. > > > + * + set max payload size to 256, which seems to be a common value > > > */ > > > - pci_set_long(exp_cap + PCI_EXP_DEVCAP, PCI_EXP_DEVCAP_RBER); > > > + pci_set_long(exp_cap + PCI_EXP_DEVCAP, PCI_EXP_DEVCAP_RBER | (0x1 & > > > PCI_EXP_DEVCAP_PAYLOAD)); > > > > > > pci_set_long(exp_cap + PCI_EXP_LNKCAP, > > > (port << PCI_EXP_LNKCAP_PN_SHIFT) | > > > @@ -179,6 +180,8 @@ int pcie_cap_init(PCIDevice *dev, uint8_t offset, > > > pci_set_long(exp_cap + PCI_EXP_DEVCAP2, > > > PCI_EXP_DEVCAP2_EFF | PCI_EXP_DEVCAP2_EETLPP); > > > > > > + pci_set_word(exp_cap + PCI_EXP_DEVCTL, PCI_EXP_DEVCTL_READRQ_256B); > > > + > > > pci_set_word(dev->wmask + pos + PCI_EXP_DEVCTL2, > > > PCI_EXP_DEVCTL2_EETLPPB); > > > > > > if (dev->cap_present & QEMU_PCIE_EXTCAP_INIT) { > > > -- > > > 2.25.1 > > > > Hi Michael, > > The reason Knut keeps rebasing this fix along with SR-IOV patch is not > clear for us. Sorry for the slow response - I seem to have messed up my mail filters so this thread slipped past my attention. > Since we have tested the NVMe device without this fix and did not notice > any issues mentioned by Knut on kernel 5.4.0, we decided to drop it for > v2. I agree, let's just drop it. It was likely in the 3.x kernels I had to relate to back then, likely discovered in Oracle Linux given that I did not specifically point to a kernel range already back then. > However, I have posted your comments to this patch on Knut's github so > they can be addressed in case Knut decides to resubmit it later though. Thanks for that ping, Lukasz, and great to see the patch finally being used in a functional device! Knut > Thanks, > Lukasz
Re: [PATCH 13/15] pcie: Add helpers to the SR/IOV API
On Thu, 2021-10-07 at 18:24 +0200, Lukasz Maniak wrote: > From: Łukasz Gieryk > > Two convenience functions for retrieving: > - the total number of VFs, > - the PCIDevice object of the N-th VF. > > Signed-off-by: Łukasz Gieryk > --- > hw/pci/pcie_sriov.c | 14 ++ > include/hw/pci/pcie_sriov.h | 8 > 2 files changed, 22 insertions(+) > > diff --git a/hw/pci/pcie_sriov.c b/hw/pci/pcie_sriov.c > index cac2aee061..5a8e92d5ab 100644 > --- a/hw/pci/pcie_sriov.c > +++ b/hw/pci/pcie_sriov.c > @@ -292,8 +292,22 @@ uint16_t pcie_sriov_vf_number(PCIDevice *dev) > return dev->exp.sriov_vf.vf_number; > } > > +uint16_t pcie_sriov_vf_number_total(PCIDevice *dev) > +{ > + assert(!pci_is_vf(dev)); > + return dev->exp.sriov_pf.num_vfs; > +} > > PCIDevice *pcie_sriov_get_pf(PCIDevice *dev) > { > return dev->exp.sriov_vf.pf; > } > + > +PCIDevice *pcie_sriov_get_vf_at_index(PCIDevice *dev, int n) > +{ > + assert(!pci_is_vf(dev)); > + if (n < dev->exp.sriov_pf.num_vfs) { > + return dev->exp.sriov_pf.vf[n]; > + } > + return NULL; > +} > diff --git a/include/hw/pci/pcie_sriov.h b/include/hw/pci/pcie_sriov.h > index 9ab48b79c0..d1f39b7223 100644 > --- a/include/hw/pci/pcie_sriov.h > +++ b/include/hw/pci/pcie_sriov.h > @@ -65,9 +65,17 @@ void pcie_sriov_pf_disable_vfs(PCIDevice *dev); > /* Get logical VF number of a VF - only valid for VFs */ > uint16_t pcie_sriov_vf_number(PCIDevice *dev); > > +/* Get the total number of VFs - only valid for PF */ > +uint16_t pcie_sriov_vf_number_total(PCIDevice *dev); > + > /* Get the physical function that owns this VF. > * Returns NULL if dev is not a virtual function > */ > PCIDevice *pcie_sriov_get_pf(PCIDevice *dev); > > +/* Get the n-th VF of this physical function - only valid for PF. > + * Returns NULL if index is invalid > + */ > +PCIDevice *pcie_sriov_get_vf_at_index(PCIDevice *dev, int n); > + > #endif /* QEMU_PCIE_SRIOV_H */ These look like natural improvements to me, thanks! Reviewed-by: Knut Omang