On 20/11/2019 12:28, Oliver O'Halloran wrote:
> The current process for configuring the IODA PEs for normal PCI devices is
> abit stupid.
This phrase should go.
> After assigning MMIO resources for the devices on a bus the
> actual PE asignment occurs when pcibios_setup_bridge() is called for the
> parent bridge. In pcibios_setup_bridge() we:
>
> 1. Assign all 256 devfn's for the subordinate bus to the PE corresponding
> to the bridge window.
> 2. Initialise the IOMMU tables for that PE.
> 3. Traverse each device on the bus below the bridge setting the IOMMU table
> pointer to point at the PE's table.
> 4. Finally, set pci_dn->pe_number to indicate that we've done the
> per-device setup and allow EEH and the platform code to look up
> the PE number.
>
> Although mostly functional, there's a couple of issues with this approach.
> The most glaring is that it mixes the per-bus (per-PE really) setup with
> the per-device setup in a way that's completely asymmetric to what happens
> when tearing down a device.
>
> In step 4. the number of devices in the PE is counted and stored in the
> ioda_pe structure. When releasing a pci_dev the device count is dropped
> until it hits zero where the ioda_pe itself is torn down. However, the bus
> itself remains active and can be re-scanned to bring back the devices that
> were removed. On a rescan we do not re-run the bridge setup so the
> per-device setup is never re-done which results in the re-scanned being
> unusable.
>
> There are a few other minor issues too. Associating all 256 devfns with
> the PE means that config accesses to non-existant PCI devices results
> in a spurious PE freezes. We currently prevent this by only allowing config
> accesses to a BDFN when there is a corresponding pci_dn structure. We
> would like to eliminate that restriction in the future though.
>
> That all said the biggest issue is that the current behaviour is hard to
> follow at the best of times. On top of that the behaviour is slightly (or
> majorly) different across each PHB type (PCIe, OpenCAPI, NVLink) and the
> behaviour for physical devices (described above) and virtual functions is
> again different. To address this we want to merge the paths as much as
> possible so that the PowerNV specific PCI initialisation steps all occur
> at roughly the same point in the PCI device setup path.
>
> We can address most of these problems by moving the PE setup out of
> pcibios_setup_bridge() and into pcibios_bus_add_device(). The latter is
> called on a per-device basis so we have some symmetry between the setup and
> teardown paths. Moving the PE assignments to here should also allow us to
> converge how PE assignment works on all PHB types so it's always done in
> one place.
>
> Signed-off-by: Oliver O'Halloran <ooh...@gmail.com>
> ---
> arch/powerpc/platforms/powernv/pci-ioda.c | 112 +++++++++++-----------
> 1 file changed, 58 insertions(+), 54 deletions(-)
>
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c
> b/arch/powerpc/platforms/powernv/pci-ioda.c
> index c6ea7a504e04..c74521e5f3ab 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -51,6 +51,7 @@ static const char * const pnv_phb_names[] = { "IODA1",
> "IODA2", "NPU_NVLINK",
> "NPU_OCAPI" };
>
> static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
> +static void pnv_pci_configure_bus(struct pci_bus *bus);
Do not need this, at least, after 46/46.
>
> void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
> const char *fmt, ...)
> @@ -1104,34 +1105,6 @@ static struct pnv_ioda_pe
> *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
> return pe;
> }
>
> -static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe
> *pe)
> -{
> - struct pci_dev *dev;
> -
> - list_for_each_entry(dev, &bus->devices, bus_list) {
> - struct pci_dn *pdn = pci_get_pdn(dev);
> -
> - if (pdn == NULL) {
> - pr_warn("%s: No device node associated with device !\n",
> - pci_name(dev));
> - continue;
> - }
> -
> - /*
> - * In partial hotplug case, the PCI device might be still
> - * associated with the PE and needn't attach it to the PE
> - * again.
> - */
> - if (pdn->pe_number != IODA_INVALID_PE)
> - continue;
> -
> - pe->device_count++;
> - pdn->pe_number = pe->pe_number;
> - if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
> - pnv_ioda_setup_same_PE(dev->subordinate, pe);
> - }
> -}
> -
> /*
> * There're 2 types of PCI bus sensitive PEs: One that is compromised of
> * single PCI bus. Another one that contains the primary PCI bus and its
> @@ -1152,7 +1125,6 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct
> pci_bus *bus, bool all)
> pe_num = phb->ioda.pe_rmap[bus->number << 8];
> if (pe_num != IODA_INVALID_PE) {
> pe = &phb->ioda.pe_array[pe_num];
> - pnv_ioda_setup_same_PE(bus, pe);
> return NULL;
> }
>
> @@ -1196,9 +1168,6 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct
> pci_bus *bus, bool all)
> return NULL;
> }
>
> - /* Associate it with all child devices */
> - pnv_ioda_setup_same_PE(bus, pe);
> -
> /* Put PE to the list */
> list_add_tail(&pe->list, &phb->ioda.pe_list);
>
> @@ -1758,23 +1727,20 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb
> *phb, struct pci_dev *pdev
> struct pci_dn *pdn = pci_get_pdn(pdev);
> struct pnv_ioda_pe *pe;
>
> - /*
> - * The function can be called while the PE#
> - * hasn't been assigned. Do nothing for the
> - * case.
> - */
> - if (!pdn || pdn->pe_number == IODA_INVALID_PE)
> - return;
> -
> pe = &phb->ioda.pe_array[pdn->pe_number];
> WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops);
> pdev->dev.archdata.dma_offset = pe->tce_bypass_base;
> set_iommu_table_base(&pdev->dev, pe->table_group.tables[0]);
> +
> + pe->device_count++;
> +
> /*
> * Note: iommu_add_device() will fail here as
> * for physical PE: the device is already added by now;
> * for virtual PE: sysfs entries are not ready yet and
> * tce_iommu_bus_notifier will add the device to a group later.
> + *
> + * XXX: this is wrong since the re-ordering patch.
What is that re-ordering patch and why it did not remove this comment then?
> */
> }
>
> @@ -2288,9 +2254,6 @@ static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb
> *phb,
> pe->table_group.tce32_size = tbl->it_size << tbl->it_page_shift;
> iommu_init_table(tbl, phb->hose->node, 0, 0);
>
> - if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
> - pnv_ioda_setup_bus_dma(pe, pe->pbus);
> -
> return;
> fail:
> /* XXX Failure: Try to fallback to 64-bit only ? */
> @@ -2626,9 +2589,9 @@ static void pnv_pci_ioda_setup_iommu_api(void)
> /*
> * There are 4 types of PEs:
> * - PNV_IODA_PE_BUS: a downstream port with an adapter,
> - * created from pnv_pci_setup_bridge();
> + * created from pnv_pci_configure_bus();
> * - PNV_IODA_PE_BUS_ALL: a PCI-PCIX bridge with devices behind it,
> - * created from pnv_pci_setup_bridge();
> + * created from pnv_pci_configure_bus();
> * - PNV_IODA_PE_VF: a SRIOV virtual function,
> * created from pnv_pcibios_sriov_enable();
> * - PNV_IODA_PE_DEV: an NPU or OCAPI device,
> @@ -2748,8 +2711,10 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb
> *phb,
> if (rc)
> return;
>
> - if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
> - pnv_ioda_setup_bus_dma(pe, pe->pbus);
> + /*
> + * The IOMMU table for the PE is associated with the device in
> + * pnv_pcibios_bus_add_device()
> + */
Is it really? eeh_add_device_late or pnv_eeh_probe_pdev do not seem to
be doing this.
> }
>
> int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq)
> @@ -3324,16 +3289,13 @@ static void pnv_pci_fixup_bridge_resources(struct
> pci_bus *bus,
> }
> }
>
> -static void pnv_pci_setup_bridge(struct pci_bus *bus, unsigned long type)
> +static void pnv_pci_configure_bus(struct pci_bus *bus)
> {
> struct pci_controller *hose = pci_bus_to_host(bus);
> struct pnv_phb *phb = hose->private_data;
> struct pci_dev *bridge = bus->self;
> struct pnv_ioda_pe *pe;
> - bool all = (pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE);
> -
> - /* Extend bridge's windows if necessary */
> - pnv_pci_fixup_bridge_resources(bus, type);
> + bool all = (bridge && pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE);
>
> /* The PE for root bus should be realized before any one else */
> if (!phb->ioda.root_pe_populated) {
> @@ -3342,12 +3304,21 @@ static void pnv_pci_setup_bridge(struct pci_bus *bus,
> unsigned long type)
> phb->ioda.root_pe_idx = pe->pe_number;
> phb->ioda.root_pe_populated = true;
> }
> +
> + /* no need to re-configure the root bus */
> + if (bus == phb->hose->bus)
> + return;
> }
>
> /* Don't assign PE to PCI bus, which doesn't have subordinate devices */
> if (list_empty(&bus->devices))
> return;
>
> + /* PE should never be re-configured */
> + pe = __pnv_ioda_get_pe(phb, bus->number << 8);
> + if (WARN_ON(pe))
> + return;
> +
> /* Reserve PEs according to used M64 resources */
> pnv_ioda_reserve_m64_pe(bus, NULL, all);
>
> @@ -3654,6 +3625,39 @@ void pnv_pci_dma_dev_setup(struct pci_dev *pdev)
> {
> struct pci_controller *hose = pci_bus_to_host(pdev->bus);
> struct pnv_phb *phb = hose->private_data;
> + struct pci_dn *pdn = pci_get_pdn(pdev);
> + struct pnv_ioda_pe *pe;
> +
> + /* Check if the BDFN for this device is associated with a PE yet */
> + pe = __pnv_ioda_get_pe(phb, pdev->devfn | (pdev->bus->number << 8));
> + if (!pe) {
> + /*
> + * We should only hit this path for "normal" PCI PHBs. The
We should only hit this path only for PCIe-PCIx bridges, right? If yes,
should not this be in pnv_pci_dma_bus_setup()? If no, I am lost :(
> + * special PHBs used for OpenCAPI and NVLink don't have to
> + * deal with eeh-on-mmio so they assign PEs at probe time
> + * rather than after resources are allocated.
> + */
> + WARN_ON(phb->type != PNV_PHB_IODA2 && phb->type !=
> PNV_PHB_IODA1);
> + /* PEs for VFs should have been assigned in sriov_enable() */
> + WARN_ON(pdev->is_virtfn);
> +
> + pnv_pci_configure_bus(pdev->bus);
> + pe = __pnv_ioda_get_pe(phb, pdev->devfn | (pdev->bus->number <<
> 8));
> + pci_err(pdev, "Configured new pe PE#%x\n", pe ? pe->pe_number :
> 0xfffff);
> +
> +
An extra empty line.
> + /*
> + * If we can't setup the IODA PE something has gone horribly
> + * wrong and we can't enable DMA for the device.
> + */
> + if (WARN_ON(!pe))
> + return;
> + } else {
> + pci_err(pdev, "Added to existing PE#%x\n", pe->pe_number);
> + }
> +
> + if (pdn)
> + pdn->pe_number = pe->pe_number;
>
> pnv_pci_ioda_dma_dev_setup(phb, pdev);
Open code pnv_pci_ioda_dma_dev_setup()?
Or at least do pe->device_count++ here?
> }
> @@ -3680,14 +3684,14 @@ void pnv_pci_dma_bus_setup(struct pci_bus *bus)
>
> static const struct pci_controller_ops pnv_pci_ioda_controller_ops = {
> .dma_dev_setup = pnv_pci_dma_dev_setup,
> - .dma_bus_setup = pnv_pci_dma_bus_setup,
> + .dma_bus_setup = pnv_pci_dma_bus_setup, /* NB: DMA setup
> actually happens in dma_dev_setup */
The comment does not help much imho.
> .iommu_bypass_supported = pnv_pci_ioda_iommu_bypass_supported,
> .setup_msi_irqs = pnv_setup_msi_irqs,
> .teardown_msi_irqs = pnv_teardown_msi_irqs,
> .enable_device_hook = pnv_pci_enable_device_hook,
> .release_device = pnv_pci_release_device,
> .window_alignment = pnv_pci_window_alignment,
> - .setup_bridge = pnv_pci_setup_bridge,
> + .setup_bridge = pnv_pci_fixup_bridge_resources,
Using "fixup" in the name of a "setup" hook is ... mmmm.... contemporary :)
> .reset_secondary_bus = pnv_pci_reset_secondary_bus,
> .shutdown = pnv_pci_ioda_shutdown,
> };
>
--
Alexey