On Thu, Apr 04, 2019 at 02:22:25PM -0600, Alex Williamson wrote:
> On Thu,  4 Apr 2019 16:23:24 +1100
> Alexey Kardashevskiy <a...@ozlabs.ru> wrote:
> 
> > The NVIDIA V100 SXM2 GPUs are connected to the CPU via PCIe links and
> > (on POWER9) NVLinks. In addition to that, GPUs themselves have direct
> > peer to peer NVLinks in groups of 2 to 4 GPUs. At the moment the POWERNV
> > platform puts all interconnected GPUs to the same IOMMU group.
> > 
> > However the user may want to pass individual GPUs to the userspace so
> > in order to do so we need to put them into separate IOMMU groups and
> > cut off the interconnects.
> > 
> > Thankfully V100 GPUs implement an interface to do by programming link
> > disabling mask to BAR0 of a GPU. Once a link is disabled in a GPU using
> > this interface, it cannot be re-enabled until the secondary bus reset is
> > issued to the GPU.
> > 
> > This adds an extra step to the secondary bus reset handler (the one used
> > for such GPUs) to block NVLinks to GPUs which do not belong to the same
> > group as the GPU being reset.
> > 
> > This adds a new "isolate_nvlink" kernel parameter to allow GPU isolation;
> > when enabled, every GPU gets its own IOMMU group. The new parameter is off
> > by default to preserve the existing behaviour.
> > 
> > Signed-off-by: Alexey Kardashevskiy <a...@ozlabs.ru>
> > ---
> > Changes:
> > v2:
> > * this is rework of [PATCH kernel RFC 0/2] vfio, powerpc/powernv: Isolate 
> > GV100GL
> > but this time it is contained in the powernv platform
> > ---
> >  arch/powerpc/platforms/powernv/Makefile      |   2 +-
> >  arch/powerpc/platforms/powernv/pci.h         |   1 +
> >  arch/powerpc/platforms/powernv/eeh-powernv.c |   1 +
> >  arch/powerpc/platforms/powernv/npu-dma.c     |  24 +++-
> >  arch/powerpc/platforms/powernv/nvlinkgpu.c   | 131 +++++++++++++++++++
> >  5 files changed, 156 insertions(+), 3 deletions(-)
> >  create mode 100644 arch/powerpc/platforms/powernv/nvlinkgpu.c
> > 
> > diff --git a/arch/powerpc/platforms/powernv/Makefile 
> > b/arch/powerpc/platforms/powernv/Makefile
> > index da2e99efbd04..60a10d3b36eb 100644
> > --- a/arch/powerpc/platforms/powernv/Makefile
> > +++ b/arch/powerpc/platforms/powernv/Makefile
> > @@ -6,7 +6,7 @@ obj-y                       += opal-msglog.o opal-hmi.o 
> > opal-power.o opal-irqchip.o
> >  obj-y                      += opal-kmsg.o opal-powercap.o opal-psr.o 
> > opal-sensor-groups.o
> >  
> >  obj-$(CONFIG_SMP)  += smp.o subcore.o subcore-asm.o
> > -obj-$(CONFIG_PCI)  += pci.o pci-ioda.o npu-dma.o pci-ioda-tce.o
> > +obj-$(CONFIG_PCI)  += pci.o pci-ioda.o npu-dma.o pci-ioda-tce.o nvlinkgpu.o
> >  obj-$(CONFIG_CXL_BASE)     += pci-cxl.o
> >  obj-$(CONFIG_EEH)  += eeh-powernv.o
> >  obj-$(CONFIG_PPC_SCOM)     += opal-xscom.o
> > diff --git a/arch/powerpc/platforms/powernv/pci.h 
> > b/arch/powerpc/platforms/powernv/pci.h
> > index 8e36da379252..9fd3f391482c 100644
> > --- a/arch/powerpc/platforms/powernv/pci.h
> > +++ b/arch/powerpc/platforms/powernv/pci.h
> > @@ -250,5 +250,6 @@ extern void pnv_pci_unlink_table_and_group(struct 
> > iommu_table *tbl,
> >  extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
> >             void *tce_mem, u64 tce_size,
> >             u64 dma_offset, unsigned int page_shift);
> > +extern void pnv_try_isolate_nvidia_v100(struct pci_dev *gpdev);
> >  
> >  #endif /* __POWERNV_PCI_H */
> > diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c 
> > b/arch/powerpc/platforms/powernv/eeh-powernv.c
> > index f38078976c5d..464b097d9635 100644
> > --- a/arch/powerpc/platforms/powernv/eeh-powernv.c
> > +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
> > @@ -937,6 +937,7 @@ void pnv_pci_reset_secondary_bus(struct pci_dev *dev)
> >             pnv_eeh_bridge_reset(dev, EEH_RESET_HOT);
> >             pnv_eeh_bridge_reset(dev, EEH_RESET_DEACTIVATE);
> >     }
> > +   pnv_try_isolate_nvidia_v100(dev);
> >  }
> >  
> >  static void pnv_eeh_wait_for_pending(struct pci_dn *pdn, const char *type,
> > diff --git a/arch/powerpc/platforms/powernv/npu-dma.c 
> > b/arch/powerpc/platforms/powernv/npu-dma.c
> > index dc23d9d2a7d9..017eae8197e7 100644
> > --- a/arch/powerpc/platforms/powernv/npu-dma.c
> > +++ b/arch/powerpc/platforms/powernv/npu-dma.c
> > @@ -529,6 +529,23 @@ static void pnv_comp_attach_table_group(struct 
> > npu_comp *npucomp,
> >     ++npucomp->pe_num;
> >  }
> >  
> > +static bool isolate_nvlink;
> > +
> > +static int __init parse_isolate_nvlink(char *p)
> > +{
> > +   bool val;
> > +
> > +   if (!p)
> > +           val = true;
> > +   else if (kstrtobool(p, &val))
> > +           return -EINVAL;
> > +
> > +   isolate_nvlink = val;
> > +
> > +   return 0;
> > +}
> > +early_param("isolate_nvlink", parse_isolate_nvlink);
> > +
> >  struct iommu_table_group *pnv_try_setup_npu_table_group(struct pnv_ioda_pe 
> > *pe)
> >  {
> >     struct iommu_table_group *table_group;
> > @@ -549,7 +566,7 @@ struct iommu_table_group 
> > *pnv_try_setup_npu_table_group(struct pnv_ioda_pe *pe)
> >  
> >     hose = pci_bus_to_host(npdev->bus);
> >  
> > -   if (hose->npu) {
> > +   if (hose->npu && !isolate_nvlink) {
> >             table_group = &hose->npu->npucomp.table_group;
> >  
> >             if (!table_group->group) {
> > @@ -559,7 +576,10 @@ struct iommu_table_group 
> > *pnv_try_setup_npu_table_group(struct pnv_ioda_pe *pe)
> >                                     pe->pe_number);
> >             }
> >     } else {
> > -           /* Create a group for 1 GPU and attached NPUs for POWER8 */
> > +           /*
> > +            * Create a group for 1 GPU and attached NPUs for
> > +            * POWER8 (always) or POWER9 (when isolate_nvlink).
> > +            */
> >             pe->npucomp = kzalloc(sizeof(*pe->npucomp), GFP_KERNEL);
> >             table_group = &pe->npucomp->table_group;
> >             table_group->ops = &pnv_npu_peers_ops;
> > diff --git a/arch/powerpc/platforms/powernv/nvlinkgpu.c 
> > b/arch/powerpc/platforms/powernv/nvlinkgpu.c
> > new file mode 100644
> > index 000000000000..dbd8e9d47a05
> > --- /dev/null
> > +++ b/arch/powerpc/platforms/powernv/nvlinkgpu.c
> > @@ -0,0 +1,131 @@
> > +// SPDX-License-Identifier: GPL-2.0+
> > +/*
> > + * A helper to disable NVLinks between GPUs on IBM Withersponn platform.
> > + *
> > + * Copyright (C) 2019 IBM Corp.  All rights reserved.
> > + *     Author: Alexey Kardashevskiy <a...@ozlabs.ru>
> > + *
> > + * This program is free software; you can redistribute it and/or modify
> > + * it under the terms of the GNU General Public License version 2 as
> > + * published by the Free Software Foundation.
> > + */
> > +
> > +#include <linux/module.h>
> > +#include <linux/device.h>
> > +#include <linux/of.h>
> > +#include <linux/iommu.h>
> > +#include <linux/pci.h>
> > +
> > +static int nvlinkgpu_is_ph_in_group(struct device *dev, void *data)
> > +{
> > +   return dev->of_node->phandle == *(phandle *) data;
> > +}
> > +
> > +static u32 nvlinkgpu_get_disable_mask(struct device *dev)
> > +{
> > +   int npu, peer;
> > +   u32 mask;
> > +   struct device_node *dn;
> > +   struct iommu_group *group;
> > +
> > +   dn = dev->of_node;
> > +   if (!of_find_property(dn, "ibm,nvlink-peers", NULL))
> > +           return 0;
> > +
> > +   group = iommu_group_get(dev);
> > +   if (!group)
> > +           return 0;
> > +
> > +   /*
> > +    * Collect links to keep which includes links to NPU and links to
> > +    * other GPUs in the same IOMMU group.
> > +    */
> > +   for (npu = 0, mask = 0; ; ++npu) {
> > +           u32 npuph = 0;
> > +
> > +           if (of_property_read_u32_index(dn, "ibm,npu", npu, &npuph))
> > +                   break;
> > +
> > +           for (peer = 0; ; ++peer) {
> > +                   u32 peerph = 0;
> > +
> > +                   if (of_property_read_u32_index(dn, "ibm,nvlink-peers",
> > +                                   peer, &peerph))
> > +                           break;
> > +
> > +                   if (peerph != npuph &&
> > +                           !iommu_group_for_each_dev(group, &peerph,
> > +                                   nvlinkgpu_is_ph_in_group))
> > +                           continue;
> > +
> > +                   mask |= 1 << (peer + 16);
> > +           }
> > +   }
> > +   iommu_group_put(group);
> > +
> > +   /* Disabling mechanism takes links to disable so invert it here */
> > +   mask = ~mask & 0x3F0000;
> > +
> > +   return mask;
> > +}
> > +
> > +void pnv_try_isolate_nvidia_v100(struct pci_dev *bridge)
> > +{
> > +   u32 mask;
> > +   void __iomem *bar0_0, *bar0_120000, *bar0_a00000;
> > +   struct pci_dev *pdev;
> > +
> > +   if (!bridge->subordinate)
> > +           return;
> > +
> > +   pdev = list_first_entry_or_null(&bridge->subordinate->devices,
> > +                   struct pci_dev, bus_list);
> > +   if (!pdev)
> > +           return;
> > +
> > +   if (pdev->vendor != PCI_VENDOR_ID_NVIDIA)
> > +           return;
> > +
> > +   mask = nvlinkgpu_get_disable_mask(&pdev->dev);
> > +   if (!mask)
> > +           return;
> > +
> > +   bar0_0 = pci_iomap_range(pdev, 0, 0, 0x10000);
> > +   bar0_120000 = pci_iomap_range(pdev, 0, 0x120000, 0x10000);
> > +   bar0_a00000 = pci_iomap_range(pdev, 0, 0xA00000, 0x10000);
> > +
> > +   if (bar0_120000 && bar0_0 && bar0_a00000) {
> > +           u32 val;
> > +           u16 cmd = 0, cmdmask = PCI_COMMAND_MEMORY;
> > +
> > +           pci_restore_state(pdev);
> > +           pci_read_config_word(pdev, PCI_COMMAND, &cmd);
> > +           if ((cmd & cmdmask) != cmdmask)
> > +                   pci_write_config_word(pdev, PCI_COMMAND, cmd | cmdmask);
> > +
> > +           /*
> > +            * The sequence is from "Tesla P100 and V100 SXM2 NVLink
> > +            * Isolation on Multi-Tenant Systems".
> > +            * The register names are not provided there either,
> > +            * hence raw values.
> > +            */
> > +           iowrite32(0x4, bar0_120000 + 0x4C);
> > +           iowrite32(0x2, bar0_120000 + 0x2204);
> > +           val = ioread32(bar0_0 + 0x200);
> > +           val |= 0x02000000;
> > +           iowrite32(val, bar0_0 + 0x200);
> > +           val = ioread32(bar0_a00000 + 0x148);
> > +           val |= mask;
> > +           iowrite32(val, bar0_a00000 + 0x148);
> > +
> > +           if ((cmd | cmdmask) != cmd)
> > +                   pci_write_config_word(pdev, PCI_COMMAND, cmd);
> > +   }
> > +
> 
> 
> It's a little troubling that if something goes wrong with an iomap
> we'll silently fail to re-enable the expected isolation.  Seems worthy
> of at least a pci_warn/err.  Thanks,

Agreed, but apart from that LGTM.

Regardless of Alex and my disagreement on what the best way to handle
this long term is, this seems like a reasonable interim approach.

-- 
David Gibson                    | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
                                | _way_ _around_!
http://www.ozlabs.org/~dgibson

Attachment: signature.asc
Description: PGP signature

Reply via email to