Re: [PATCHv2 1/3] mm/numa: change the topo of build_zonelist_xx()

2018-12-20 Thread kbuild test robot
Hi Pingfan,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linus/master]
[also build test ERROR on v4.20-rc7 next-20181220]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:
https://github.com/0day-ci/linux/commits/Pingfan-Liu/mm-bugfix-for-NULL-reference-in-mm-on-all-archs/20181221-152625
config: riscv-tinyconfig (attached as .config)
compiler: riscv64-linux-gcc (GCC) 8.1.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        GCC_VERSION=8.1.0 make.cross ARCH=riscv

All errors (new ones prefixed by >>):

   mm/page_alloc.c: In function 'build_zonelists':
>> mm/page_alloc.c:5288:12: error: 'local_node' redeclared as different kind of symbol
     int node, local_node;
               ^~
   mm/page_alloc.c:5286:66: note: previous definition of 'local_node' was here
   static void build_zonelists(struct zonelist *node_zonelists, int local_node)
                                                                    ^~

vim +/local_node +5288 mm/page_alloc.c

^1da177e4 Linus Torvalds    2005-04-16  5285  
e6ee0d8bd Pingfan Liu       2018-12-20  5286  static void build_zonelists(struct zonelist *node_zonelists, int local_node)
^1da177e4 Linus Torvalds    2005-04-16  5287  {
19655d348 Christoph Lameter 2006-09-25 @5288  	int node, local_node;
9d3be21bf Michal Hocko      2017-09-06  5289  	struct zoneref *zonerefs;
9d3be21bf Michal Hocko      2017-09-06  5290  	int nr_zones;
^1da177e4 Linus Torvalds    2005-04-16  5291  
e6ee0d8bd Pingfan Liu       2018-12-20  5292  	zonerefs = node_zonelists[ZONELIST_FALLBACK]._zonerefs;
e6ee0d8bd Pingfan Liu       2018-12-20  5293  	nr_zones = build_zonerefs_node(local_node, zonerefs);
9d3be21bf Michal Hocko      2017-09-06  5294  	zonerefs += nr_zones;
^1da177e4 Linus Torvalds    2005-04-16  5295  
^1da177e4 Linus Torvalds    2005-04-16  5296  	/*
^1da177e4 Linus Torvalds    2005-04-16  5297  	 * Now we build the zonelist so that it contains the zones
^1da177e4 Linus Torvalds    2005-04-16  5298  	 * of all the other nodes.
^1da177e4 Linus Torvalds    2005-04-16  5299  	 * We don't want to pressure a particular node, so when
^1da177e4 Linus Torvalds    2005-04-16  5300  	 * building the zones for node N, we make sure that the
^1da177e4 Linus Torvalds    2005-04-16  5301  	 * zones coming right after the local ones are those from
^1da177e4 Linus Torvalds    2005-04-16  5302  	 * node N+1 (modulo N)
^1da177e4 Linus Torvalds    2005-04-16  5303  	 */
^1da177e4 Linus Torvalds    2005-04-16  5304  	for (node = local_node + 1; node < MAX_NUMNODES; node++) {
^1da177e4 Linus Torvalds    2005-04-16  5305  		if (!node_online(node))
^1da177e4 Linus Torvalds    2005-04-16  5306  			continue;
e6ee0d8bd Pingfan Liu       2018-12-20  5307  		nr_zones = build_zonerefs_node(node, zonerefs);
9d3be21bf Michal Hocko      2017-09-06  5308  		zonerefs += nr_zones;
^1da177e4 Linus Torvalds    2005-04-16  5309  	}
^1da177e4 Linus Torvalds    2005-04-16  5310  	for (node = 0; node < local_node; node++) {
^1da177e4 Linus Torvalds    2005-04-16  5311  		if (!node_online(node))
^1da177e4 Linus Torvalds    2005-04-16  5312  			continue;
e6ee0d8bd Pingfan Liu       2018-12-20  5313  		nr_zones = build_zonerefs_node(node, zonerefs);
9d3be21bf Michal Hocko      2017-09-06  5314  		zonerefs += nr_zones;
^1da177e4 Linus Torvalds    2005-04-16  5315  	}
^1da177e4 Linus Torvalds    2005-04-16  5316  
9d3be21bf Michal Hocko      2017-09-06  5317  	zonerefs->zone = NULL;
9d3be21bf Michal Hocko      2017-09-06  5318  	zonerefs->zone_idx = 0;
^1da177e4 Linus Torvalds    2005-04-16  5319  }
^1da177e4 Linus Torvalds    2005-04-16  5320  

:: The code at line 5288 was first introduced by commit
::   19655d3487001d7df0e10e9cbfc27c758b77c2b5 [PATCH] linearly index zone->node_zonelists[]

:: TO: Christoph Lameter 
:: CC: Linus Torvalds 

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation




Re: [RFC PATCH v2 04/11] powerpc/mm: Add a framework for Kernel Userspace Access Protection

2018-12-20 Thread Christophe Leroy




On 21/12/2018 at 06:07, Michael Ellerman wrote:

Christophe Leroy  writes:


This patch implements a framework for Kernel Userspace Access
Protection.

Then subarches will have the possibility to provide their own
implementation by providing setup_kuap() and lock/unlock_user_access().

Some platforms will need to know the area accessed and whether it is
accessed for read, write or both. Therefore source, destination and
size are handed over to the two functions.

Signed-off-by: Christophe Leroy 


I think some of this code came from Russell's original patch?

In which case we should have his signed-off-by here.



Yes, that's right, the ppc64 part is from Russell. As it's still an RFC and 
there is still some work to be done, I didn't pay much attention to 
Signed-off-by and other tags yet.


Signed-off-by: Russell Currey 


Christophe


Re: [PATCHv2 2/3] mm/numa: build zonelist when alloc for device on offline node

2018-12-20 Thread Pingfan Liu
On Thu, Dec 20, 2018 at 8:44 PM Michal Hocko  wrote:
>
> On Thu 20-12-18 20:26:28, Pingfan Liu wrote:
> > On Thu, Dec 20, 2018 at 7:35 PM Michal Hocko  wrote:
> > >
> > > On Thu 20-12-18 17:50:38, Pingfan Liu wrote:
> > > [...]
> > > > @@ -453,7 +456,12 @@ static inline int gfp_zonelist(gfp_t flags)
> > > >   */
> > > >  static inline struct zonelist *node_zonelist(int nid, gfp_t flags)
> > > >  {
> > > > - return NODE_DATA(nid)->node_zonelists + gfp_zonelist(flags);
> > > > + if (unlikely(!possible_zonelists[nid])) {
> > > > + WARN_ONCE(1, "alloc from offline node: %d\n", nid);
> > > > + if (unlikely(build_fallback_zonelists(nid)))
> > > > + nid = first_online_node;
> > > > + }
> > > > + return possible_zonelists[nid] + gfp_zonelist(flags);
> > > >  }
> > >
> > > No, please don't do this. We do not want to make things work magically
> >
> > By "magically", if you mean relying directly on the zonelist instead of
> > on the pgdat struct, then that is easy to change
>
> No, I mean that we _know_ which nodes are possible. Platform is supposed
> to tell us. We should just do the intialization properly. What we do now
> instead is a pile of hacks that fit magically together. And that should
> be changed.
>
I don't agree. This is the typical place to do it lazily, and at this point
the possible-node info is still available for us to check and to build the
pgdat instance.

> > > and we definitely do not want to put something like that into the hot
> >
> > But the cost of the "unlikely" branch can be ignored, so why can it not
> > be placed in the path?
>
> unlikely will simply put the code outside of the hot path. The condition
> is still there. There are people desperately fighting to get every
> single cycle out of the page allocator. Now you want them to pay a
> branch which is relevant only for few obscure HW setups.
>
Data is more convincing. I tested with the following program, built
with -O2 on x86. There is no observable performance difference from
adding the extra unlikely condition. And in this test the unlikely
check is evaluated far more often than it would be in my patch.
#include 
#define unlikely_notrace(x) __builtin_expect(!!(x), 0)
#define unlikely(x) unlikely_notrace(x)
#define TEST_UNLIKELY 1

int main(int argc, char *argv[])
{
	unsigned long i, j;
	unsigned long end = (unsigned long)1 << 36;
	unsigned long x = 9;

	for (i = 1; i < end; i++) {
#ifdef TEST_UNLIKELY
		if (unlikely(i == end - 1))
			x *= 8;
#endif
		x *= i;
		x = x % 10 + 1;
	}
	return 0;
}

> > > path. We definitely need zonelists to be build transparently for all
> > > possible nodes during the init time.
> >
> > That is the point: whether all nodes should be instantiated at boot
> > time, or not instantiated until there is a requirement.
>
> And that should be done at init time. We have all the information
> necessary at that time.
> --

I'll wait to see other reviewers' comments.

Thanks and regards,
Pingfan


linux-next: manual merge of the kvm tree with the powerpc tree

2018-12-20 Thread Stephen Rothwell
Hi all,

Today's linux-next merge of the kvm tree got a conflict in:

  arch/powerpc/mm/fault.c

between commit:

  49a502ea23bf ("powerpc/mm: Make NULL pointer deferences explicit on bad page faults.")

from the powerpc tree and commit:

  d7b456152230 ("KVM: PPC: Book3S HV: Implement functions to access quadrants 1 & 2")

from the kvm tree.

I fixed it up (see below) and can carry the fix as necessary. This
is now fixed as far as linux-next is concerned, but any non trivial
conflicts should be mentioned to your upstream maintainer when your tree
is submitted for merging.  You may also want to consider cooperating
with the maintainer of the conflicting tree to minimise any particularly
complex conflicts.

-- 
Cheers,
Stephen Rothwell

diff --cc arch/powerpc/mm/fault.c
index c866d9a710fe,2e6fb1d758c3..
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@@ -650,9 -636,9 +650,10 @@@ void bad_page_fault(struct pt_regs *reg
switch (TRAP(regs)) {
case 0x300:
case 0x380:
+   case 0xe00:
 -  printk(KERN_ALERT "Unable to handle kernel paging request for "
 -  "data at address 0x%08lx\n", regs->dar);
 +	pr_alert("BUG: %s at 0x%08lx\n",
 +		 regs->dar < PAGE_SIZE ? "Kernel NULL pointer dereference" :
 +		 "Unable to handle kernel data access", regs->dar);
break;
case 0x400:
case 0x480:




Re: [RFC PATCH v2 04/11] powerpc/mm: Add a framework for Kernel Userspace Access Protection

2018-12-20 Thread Michael Ellerman
Christophe Leroy  writes:

> This patch implements a framework for Kernel Userspace Access
> Protection.
>
> Then subarches will have the possibility to provide their own
> implementation by providing setup_kuap() and lock/unlock_user_access().
>
> Some platforms will need to know the area accessed and whether it is
> accessed for read, write or both. Therefore source, destination and
> size are handed over to the two functions.
>
> Signed-off-by: Christophe Leroy 

I think some of this code came from Russell's original patch?

In which case we should have his signed-off-by here.

cheers


Re: [PATCH] serial: 8250: Default SERIAL_OF_PLATFORM to SERIAL_8250

2018-12-20 Thread Florian Fainelli
On 12/20/18 at 9:38 AM, Guenter Roeck wrote:
> On Thu, Dec 20, 2018 at 04:21:11PM +0100, Greg Kroah-Hartman wrote:
>> On Wed, Nov 14, 2018 at 05:11:25PM -0800, Guenter Roeck wrote:
>>> On Thu, Nov 01, 2018 at 11:26:06AM -0700, Florian Fainelli wrote:
>>>> It is way too easy to miss enabling SERIAL_OF_PLATFORM which would
>>>> result in the inability for the kernel to have a valid console device,
>>>> which can be seen with:
>>>>
>>>> Warning: unable to open an initial console.
>>>>
>>>> and then:
>>>>
>>>> Run /init as init process
>>>> Kernel panic - not syncing: Attempted to kill init! exitcode=0x0100
>>>>
>>>> Since SERIAL_OF_PLATFORM already depends on SERIAL_8250 && OF there
>>>> really is no drawback to defaulting this config to the value of
>>>> SERIAL_8250.
>>>>
>>>> Signed-off-by: Florian Fainelli 
>>>> Signed-off-by: Greg Kroah-Hartman 
>>>
>>> This patch results in situations where CONFIG_SERIAL_OF_PLATFORM is now
>>> defined where it was not previously. Example mpc85xx_defconfig. This in
>>> turn results in boot failures for those configurations, with an error
>>> message of
>>>
>>> of_serial: probe of e0004500.serial failed with error -22
>>>
>>> which wasn't seen before.
>>>
>>> Not sure if replacing a potential problem with a real one is really an
>>> improvement.
>>
>> What ever was the result of this long thread?  Should I revert
>> something?  Or was a patch proposed?
>>
> The problem still exists in next-20181220.

I submitted a tentative patch to fix the problem and discussed it with
Michael; he had an alternative patch to 8250_core.c that should work
equally well, though I was worried more breakage could be created that
way. Since we clearly have not been able to make much progress, a
reversion of the original patch is appropriate; yes, it is now sent as a
reply to this mail!

> 
> Unfortunately this is now just one failure of many in -next. I see more
> than 90 boot failures (out of ~330) there, not counting the build failures.
> And that is on a good day.
> 
> Guenter
> 


-- 
Florian


[PATCH] Revert "serial: 8250: Default SERIAL_OF_PLATFORM to SERIAL_8250"

2018-12-20 Thread Florian Fainelli
This reverts commit 6d11023c345e369bcb9d5a68b271764e362c1f6e ("serial:
8250: Default SERIAL_OF_PLATFORM to SERIAL_8250") since that breaks at
least mpc8544ds (PowerPC) using arch/powerpc/kernel/legacy_serial.c.

See https://lkml.org/lkml/2018/12/5/1491 for discussion and analysis

Fixes: 6d11023c345e ("serial: 8250: Default SERIAL_OF_PLATFORM to SERIAL_8250")
Signed-off-by: Florian Fainelli 
---
 drivers/tty/serial/8250/Kconfig | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/tty/serial/8250/Kconfig b/drivers/tty/serial/8250/Kconfig
index d7737dca0e48..15c2c5463835 100644
--- a/drivers/tty/serial/8250/Kconfig
+++ b/drivers/tty/serial/8250/Kconfig
@@ -484,7 +484,6 @@ config SERIAL_8250_PXA
 config SERIAL_OF_PLATFORM
tristate "Devicetree based probing for 8250 ports"
depends on SERIAL_8250 && OF
-   default SERIAL_8250
help
  This option is used for all 8250 compatible serial ports that
  are probed through devicetree, including Open Firmware based
-- 
2.19.1



Re: [PATCH kernel v7 20/20] vfio_pci: Add NVIDIA GV100GL [Tesla V100 SXM2] subdriver

2018-12-20 Thread Alex Williamson
On Fri, 21 Dec 2018 12:50:00 +1100
Alexey Kardashevskiy  wrote:

> On 21/12/2018 12:37, Alex Williamson wrote:
> > On Fri, 21 Dec 2018 12:23:16 +1100
> > Alexey Kardashevskiy  wrote:
> >   
> >> On 21/12/2018 03:46, Alex Williamson wrote:  
> >>> On Thu, 20 Dec 2018 19:23:50 +1100
> >>> Alexey Kardashevskiy  wrote:
> >>> 
>  POWER9 Witherspoon machines come with 4 or 6 V100 GPUs which are not
>  pluggable PCIe devices but still have PCIe links which are used
>  for config space and MMIO. In addition to that the GPUs have 6 NVLinks
>  which are connected to other GPUs and the POWER9 CPU. POWER9 chips
>  have a special unit on a die called an NPU which is an NVLink2 host bus
>  adapter with p2p connections to 2 to 3 GPUs, 3 or 2 NVLinks to each.
>  These systems also support ATS (address translation services) which is
>  a part of the NVLink2 protocol. Such GPUs also share on-board RAM
>  (16GB or 32GB) to the system via the same NVLink2 so a CPU has
>  cache-coherent access to a GPU RAM.
> 
>  This exports GPU RAM to the userspace as a new VFIO device region. This
>  preregisters the new memory as device memory as it might be used for DMA.
>  This inserts pfns from the fault handler as the GPU memory is not onlined
>  until the vendor driver is loaded and trained the NVLinks so doing this
>  earlier causes low level errors which we fence in the firmware so
>  it does not hurt the host system but still better be avoided; for the same
>  reason this does not map GPU RAM into the host kernel (usual thing for
>  emulated access otherwise).
> 
>  This exports an ATSD (Address Translation Shootdown) register of NPU which
>  allows TLB invalidations inside GPU for an operating system. The register
>  conveniently occupies a single 64k page. It is also presented to
>  the userspace as a new VFIO device region. One NPU has 8 ATSD registers,
>  each of them can be used for TLB invalidation in a GPU linked to this NPU.
>  This allocates one ATSD register per an NVLink bridge allowing passing
>  up to 6 registers. Due to the host firmware bug (just recently fixed),
>  only 1 ATSD register per NPU was actually advertised to the host system
>  so this passes that alone register via the first NVLink bridge device in
>  the group which is still enough as QEMU collects them all back and
>  presents to the guest via vPHB to mimic the emulated NPU PHB on the host.
> 
>  In order to provide the userspace with the information about GPU-to-NVLink
>  connections, this exports an additional capability called "tgt"
>  (which is an abbreviated host system bus address). The "tgt" property
>  tells the GPU its own system address and allows the guest driver to
>  conglomerate the routing information so each GPU knows how to get directly
>  to the other GPUs.
> 
>  For ATS to work, the nest MMU (an NVIDIA block in a P9 CPU) needs to
>  know LPID (a logical partition ID or a KVM guest hardware ID in other
>  words) and PID (a memory context ID of a userspace process, not to be
>  confused with a linux pid). This assigns a GPU to LPID in the NPU and
>  this is why this adds a listener for KVM on an IOMMU group. A PID comes
>  via NVLink from a GPU and NPU uses a PID wildcard to pass it through.
> 
>  This requires coherent memory and ATSD to be available on the host as
>  the GPU vendor only supports configurations with both features enabled
>  and other configurations are known not to work. Because of this and
>  because of the ways the features are advertised to the host system
>  (which is a device tree with very platform specific properties),
>  this requires enabled POWERNV platform.
> 
>  The V100 GPUs do not advertise any of these capabilities via the config
>  space and there are more than just one device ID so this relies on
>  the platform to tell whether these GPUs have special abilities such as
>  NVLinks.
> 
>  Signed-off-by: Alexey Kardashevskiy 
>  ---
>  Changes:
>  v6.1:
>  * fixed outdated comment about VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD
> 
>  v6:
>  * reworked capabilities - tgt for nvlink and gpu and link-speed
>  for nvlink only
> 
>  v5:
>  * do not memremap GPU RAM for emulation, map it only when it is needed
>  * allocate 1 ATSD register per NVLink bridge, if none left, then expose
>  the region with a zero size
>  * separate caps per device type
>  * addressed AW review comments
> 
>  v4:
>  * added nvlink-speed to the NPU bridge capability as this turned out to
>  be not a constant value
>  * instead of looking at the exact device ID (which also changes from system
>  to system), now this (indirectly) looks at the device tree to know
>  if 

Re: trace_hardirqs_on/off vs. extra stack frames

2018-12-20 Thread Steven Rostedt
On Fri, 21 Dec 2018 12:11:35 +1100
Benjamin Herrenschmidt  wrote:

> Hi Steven !
> 
> I'm trying to untangle something, and I need your help :-)
> 
> In commit 3cb5f1a3e58c0bd70d47d9907cc5c65192281dee, you added a dummy
> stack frame around the assembly calls to trace_hardirqs_on/off on the
> grounds that when using the latency tracer (irqsoff), you might poke at
> CALLER_ADDR1 and that could blow up if there's only one frame at hand.
> 
> However, I can't see where it would be doing that. lockdep.c only uses
> CALLER_ADDR0 and irqsoff uses the values passed by it. In fact, that
> was already the case when the above commit was merged.
> 
> I tried removing the dummy stack frame on a 32-bit kernel with no
> issue so far (though I do get bogus values reported with or
> without a stack frame, but I think that's normal; looking into it).

BTW, I only had a 64 bit PPC working, so I would have been testing that.

> 
> The reason I'm asking is that we have other code paths, on return
> from interrupts for example, at least on 32-bit, where we call the
> tracing code without the extra stack frame, and I have yet to see it crash.

Have you tried enabling the irqsoff tracer and running it for a while?

 echo irqsoff > /sys/kernel/debug/tracing/current_tracer

The problem is that when we come from user space, and we disable
interrupts in the entry code, it calls into the irqsoff tracer:

[ in userspace ]

[ in kernel ]
bl .trace_hardirqs_off

  kernel/trace/trace_preemptirq.c:

   trace_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1)

IIRC, without the stack frame, that CALLER_ADDR1 can end up having the
kernel read garbage.

-- Steve


> 
> I wonder if the commit and bug fix above relate to some older code
> that no longer existed even at the point where the commit was
> merged...
> 


Re: [PATCH kernel v7 20/20] vfio_pci: Add NVIDIA GV100GL [Tesla V100 SXM2] subdriver

2018-12-20 Thread Alexey Kardashevskiy



On 21/12/2018 12:37, Alex Williamson wrote:
> On Fri, 21 Dec 2018 12:23:16 +1100
> Alexey Kardashevskiy  wrote:
> 
>> On 21/12/2018 03:46, Alex Williamson wrote:
>>> On Thu, 20 Dec 2018 19:23:50 +1100
>>> Alexey Kardashevskiy  wrote:
>>>   
 POWER9 Witherspoon machines come with 4 or 6 V100 GPUs which are not
 pluggable PCIe devices but still have PCIe links which are used
 for config space and MMIO. In addition to that the GPUs have 6 NVLinks
 which are connected to other GPUs and the POWER9 CPU. POWER9 chips
 have a special unit on a die called an NPU which is an NVLink2 host bus
 adapter with p2p connections to 2 to 3 GPUs, 3 or 2 NVLinks to each.
 These systems also support ATS (address translation services) which is
 a part of the NVLink2 protocol. Such GPUs also share on-board RAM
 (16GB or 32GB) to the system via the same NVLink2 so a CPU has
 cache-coherent access to a GPU RAM.

 This exports GPU RAM to the userspace as a new VFIO device region. This
 preregisters the new memory as device memory as it might be used for DMA.
 This inserts pfns from the fault handler as the GPU memory is not onlined
 until the vendor driver is loaded and trained the NVLinks so doing this
 earlier causes low level errors which we fence in the firmware so
 it does not hurt the host system but still better be avoided; for the same
 reason this does not map GPU RAM into the host kernel (usual thing for
 emulated access otherwise).

 This exports an ATSD (Address Translation Shootdown) register of NPU which
 allows TLB invalidations inside GPU for an operating system. The register
 conveniently occupies a single 64k page. It is also presented to
 the userspace as a new VFIO device region. One NPU has 8 ATSD registers,
 each of them can be used for TLB invalidation in a GPU linked to this NPU.
 This allocates one ATSD register per an NVLink bridge allowing passing
 up to 6 registers. Due to the host firmware bug (just recently fixed),
 only 1 ATSD register per NPU was actually advertised to the host system
 so this passes that alone register via the first NVLink bridge device in
 the group which is still enough as QEMU collects them all back and
 presents to the guest via vPHB to mimic the emulated NPU PHB on the host.

 In order to provide the userspace with the information about GPU-to-NVLink
 connections, this exports an additional capability called "tgt"
 (which is an abbreviated host system bus address). The "tgt" property
 tells the GPU its own system address and allows the guest driver to
 conglomerate the routing information so each GPU knows how to get directly
 to the other GPUs.

 For ATS to work, the nest MMU (an NVIDIA block in a P9 CPU) needs to
 know LPID (a logical partition ID or a KVM guest hardware ID in other
 words) and PID (a memory context ID of a userspace process, not to be
 confused with a linux pid). This assigns a GPU to LPID in the NPU and
 this is why this adds a listener for KVM on an IOMMU group. A PID comes
 via NVLink from a GPU and NPU uses a PID wildcard to pass it through.

 This requires coherent memory and ATSD to be available on the host as
 the GPU vendor only supports configurations with both features enabled
 and other configurations are known not to work. Because of this and
 because of the ways the features are advertised to the host system
 (which is a device tree with very platform specific properties),
 this requires enabled POWERNV platform.

 The V100 GPUs do not advertise any of these capabilities via the config
 space and there are more than just one device ID so this relies on
 the platform to tell whether these GPUs have special abilities such as
 NVLinks.

 Signed-off-by: Alexey Kardashevskiy 
 ---
 Changes:
 v6.1:
 * fixed outdated comment about VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD

 v6:
 * reworked capabilities - tgt for nvlink and gpu and link-speed
 for nvlink only

 v5:
 * do not memremap GPU RAM for emulation, map it only when it is needed
 * allocate 1 ATSD register per NVLink bridge, if none left, then expose
 the region with a zero size
 * separate caps per device type
 * addressed AW review comments

 v4:
 * added nvlink-speed to the NPU bridge capability as this turned out to
 be not a constant value
 * instead of looking at the exact device ID (which also changes from system
 to system), now this (indirectly) looks at the device tree to know
 if GPU and NPU support NVLink

 v3:
 * reworded the commit log about tgt
 * added tracepoints (do we want them enabled for entire vfio-pci?)
 * added code comments
 * added write|mmap flags to the new regions
 * auto enabled VFIO_PCI_NVLINK2 config option
 * added 

Re: [PATCH kernel v7 20/20] vfio_pci: Add NVIDIA GV100GL [Tesla V100 SXM2] subdriver

2018-12-20 Thread Alex Williamson
On Fri, 21 Dec 2018 12:23:16 +1100
Alexey Kardashevskiy  wrote:

> On 21/12/2018 03:46, Alex Williamson wrote:
> > On Thu, 20 Dec 2018 19:23:50 +1100
> > Alexey Kardashevskiy  wrote:
> >   
> >> POWER9 Witherspoon machines come with 4 or 6 V100 GPUs which are not
> >> pluggable PCIe devices but still have PCIe links which are used
> >> for config space and MMIO. In addition to that the GPUs have 6 NVLinks
> >> which are connected to other GPUs and the POWER9 CPU. POWER9 chips
> >> have a special unit on a die called an NPU which is an NVLink2 host bus
> >> adapter with p2p connections to 2 to 3 GPUs, 3 or 2 NVLinks to each.
> >> These systems also support ATS (address translation services) which is
> >> a part of the NVLink2 protocol. Such GPUs also share on-board RAM
> >> (16GB or 32GB) to the system via the same NVLink2 so a CPU has
> >> cache-coherent access to a GPU RAM.
> >>
> >> This exports GPU RAM to the userspace as a new VFIO device region. This
> >> preregisters the new memory as device memory as it might be used for DMA.
> >> This inserts pfns from the fault handler as the GPU memory is not onlined
> >> until the vendor driver is loaded and trained the NVLinks so doing this
> >> earlier causes low level errors which we fence in the firmware so
> >> it does not hurt the host system but still better be avoided; for the same
> >> reason this does not map GPU RAM into the host kernel (usual thing for
> >> emulated access otherwise).
> >>
> >> This exports an ATSD (Address Translation Shootdown) register of NPU which
> >> allows TLB invalidations inside GPU for an operating system. The register
> >> conveniently occupies a single 64k page. It is also presented to
> >> the userspace as a new VFIO device region. One NPU has 8 ATSD registers,
> >> each of them can be used for TLB invalidation in a GPU linked to this NPU.
> >> This allocates one ATSD register per an NVLink bridge allowing passing
> >> up to 6 registers. Due to the host firmware bug (just recently fixed),
> >> only 1 ATSD register per NPU was actually advertised to the host system
> >> so this passes that alone register via the first NVLink bridge device in
> >> the group which is still enough as QEMU collects them all back and
> >> presents to the guest via vPHB to mimic the emulated NPU PHB on the host.
> >>
> >> In order to provide the userspace with the information about GPU-to-NVLink
> >> connections, this exports an additional capability called "tgt"
> >> (which is an abbreviated host system bus address). The "tgt" property
> >> tells the GPU its own system address and allows the guest driver to
> >> conglomerate the routing information so each GPU knows how to get directly
> >> to the other GPUs.
> >>
> >> For ATS to work, the nest MMU (an NVIDIA block in a P9 CPU) needs to
> >> know LPID (a logical partition ID or a KVM guest hardware ID in other
> >> words) and PID (a memory context ID of a userspace process, not to be
> >> confused with a linux pid). This assigns a GPU to LPID in the NPU and
> >> this is why this adds a listener for KVM on an IOMMU group. A PID comes
> >> via NVLink from a GPU and NPU uses a PID wildcard to pass it through.
> >>
> >> This requires coherent memory and ATSD to be available on the host as
> >> the GPU vendor only supports configurations with both features enabled
> >> and other configurations are known not to work. Because of this and
> >> because of the ways the features are advertised to the host system
> >> (which is a device tree with very platform specific properties),
> >> this requires enabled POWERNV platform.
> >>
> >> The V100 GPUs do not advertise any of these capabilities via the config
> >> space and there are more than just one device ID so this relies on
> >> the platform to tell whether these GPUs have special abilities such as
> >> NVLinks.
> >>
> >> Signed-off-by: Alexey Kardashevskiy 
> >> ---
> >> Changes:
> >> v6.1:
> >> * fixed outdated comment about VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD
> >>
> >> v6:
> >> * reworked capabilities - tgt for nvlink and gpu and link-speed
> >> for nvlink only
> >>
> >> v5:
> >> * do not memremap GPU RAM for emulation, map it only when it is needed
> >> * allocate 1 ATSD register per NVLink bridge, if none left, then expose
> >> the region with a zero size
> >> * separate caps per device type
> >> * addressed AW review comments
> >>
> >> v4:
> >> * added nvlink-speed to the NPU bridge capability as this turned out to
> >> be not a constant value
> >> * instead of looking at the exact device ID (which also changes from system
> >> to system), now this (indirectly) looks at the device tree to know
> >> if GPU and NPU support NVLink
> >>
> >> v3:
> >> * reworded the commit log about tgt
> >> * added tracepoints (do we want them enabled for entire vfio-pci?)
> >> * added code comments
> >> * added write|mmap flags to the new regions
> >> * auto enabled VFIO_PCI_NVLINK2 config option
> >> * added 'tgt' capability to a GPU so QEMU can recreate ibm,npu 

Re: [PATCH kernel v7 20/20] vfio_pci: Add NVIDIA GV100GL [Tesla V100 SXM2] subdriver

2018-12-20 Thread Alexey Kardashevskiy



On 21/12/2018 03:46, Alex Williamson wrote:
> On Thu, 20 Dec 2018 19:23:50 +1100
> Alexey Kardashevskiy  wrote:
> 
>> POWER9 Witherspoon machines come with 4 or 6 V100 GPUs which are not
>> pluggable PCIe devices but still have PCIe links which are used
>> for config space and MMIO. In addition to that the GPUs have 6 NVLinks
>> which are connected to other GPUs and the POWER9 CPU. POWER9 chips
>> have a special unit on a die called an NPU which is an NVLink2 host bus
>> adapter with p2p connections to 2 to 3 GPUs, 3 or 2 NVLinks to each.
>> These systems also support ATS (address translation services) which is
>> a part of the NVLink2 protocol. Such GPUs also share on-board RAM
>> (16GB or 32GB) to the system via the same NVLink2 so a CPU has
>> cache-coherent access to a GPU RAM.
>>
>> This exports GPU RAM to the userspace as a new VFIO device region. This
>> preregisters the new memory as device memory as it might be used for DMA.
>> This inserts pfns from the fault handler as the GPU memory is not onlined
>> until the vendor driver is loaded and trained the NVLinks so doing this
>> earlier causes low level errors which we fence in the firmware so
>> it does not hurt the host system but still better be avoided; for the same
>> reason this does not map GPU RAM into the host kernel (usual thing for
>> emulated access otherwise).
>>
>> This exports an ATSD (Address Translation Shootdown) register of NPU which
>> allows TLB invalidations inside GPU for an operating system. The register
>> conveniently occupies a single 64k page. It is also presented to
>> the userspace as a new VFIO device region. One NPU has 8 ATSD registers,
>> each of them can be used for TLB invalidation in a GPU linked to this NPU.
>> This allocates one ATSD register per NVLink bridge, allowing up to
>> 6 registers to be passed. Due to a host firmware bug (just recently fixed),
>> only 1 ATSD register per NPU was actually advertised to the host system,
>> so this passes that lone register via the first NVLink bridge device in
>> the group, which is still enough as QEMU collects them all back and
>> presents them to the guest via a vPHB to mimic the emulated NPU PHB on the host.
>>
>> In order to provide the userspace with the information about GPU-to-NVLink
>> connections, this exports an additional capability called "tgt"
>> (which is an abbreviated host system bus address). The "tgt" property
>> tells the GPU its own system address and allows the guest driver to
>> conglomerate the routing information so each GPU knows how to get directly
>> to the other GPUs.
>>
>> For ATS to work, the nest MMU (an NVIDIA block in a P9 CPU) needs to
>> know LPID (a logical partition ID or a KVM guest hardware ID in other
>> words) and PID (a memory context ID of a userspace process, not to be
>> confused with a linux pid). This assigns a GPU to LPID in the NPU and
>> this is why this adds a listener for KVM on an IOMMU group. A PID comes
>> via NVLink from a GPU and NPU uses a PID wildcard to pass it through.
>>
>> This requires coherent memory and ATSD to be available on the host as
>> the GPU vendor only supports configurations with both features enabled
>> and other configurations are known not to work. Because of this and
>> because of the ways the features are advertised to the host system
>> (which is a device tree with very platform specific properties),
>> this requires the POWERNV platform to be enabled.
>>
>> The V100 GPUs do not advertise any of these capabilities via the config
>> space and there is more than one device ID, so this relies on
>> the platform to tell whether these GPUs have special abilities such as
>> NVLinks.
>>
>> Signed-off-by: Alexey Kardashevskiy 
>> ---
>> Changes:
>> v6.1:
>> * fixed outdated comment about VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD
>>
>> v6:
>> * reworked capabilities - tgt for nvlink and gpu and link-speed
>> for nvlink only
>>
>> v5:
>> * do not memremap GPU RAM for emulation, map it only when it is needed
>> * allocate 1 ATSD register per NVLink bridge, if none left, then expose
>> the region with a zero size
>> * separate caps per device type
>> * addressed AW review comments
>>
>> v4:
>> * added nvlink-speed to the NPU bridge capability as this turned out to
>> be not a constant value
>> * instead of looking at the exact device ID (which also changes from system
>> to system), now this (indirectly) looks at the device tree to know
>> if GPU and NPU support NVLink
>>
>> v3:
>> * reworded the commit log about tgt
>> * added tracepoints (do we want them enabled for entire vfio-pci?)
>> * added code comments
>> * added write|mmap flags to the new regions
>> * auto enabled VFIO_PCI_NVLINK2 config option
>> * added 'tgt' capability to a GPU so QEMU can recreate ibm,npu and ibm,gpu
>> references; these are required by the NVIDIA driver
>> * keep notifier registered only for short time
>> ---
>>  drivers/vfio/pci/Makefile   |   1 +
>>  drivers/vfio/pci/trace.h| 102 ++
>>  

trace_hardirqs_on/off vs. extra stack frames

2018-12-20 Thread Benjamin Herrenschmidt
Hi Steven !

I'm trying to untangle something, and I need your help :-)

In commit 3cb5f1a3e58c0bd70d47d9907cc5c65192281dee, you added a dummy
stack frame around the assembly calls to trace_hardirqs_on/off on the
grounds that when using the latency tracer (irqsoff), you might poke at
CALLER_ADDR1 and that could blow up if there's only one frame at hand.

However, I can't see where it would be doing that. lockdep.c only uses
CALLER_ADDR0 and irqsoff uses the values passed by it. In fact, that
was already the case when the above commit was merged.

I tried on a 32-bit kernel to remove the dummy stack frame with no
issue so far (though I do get stupid values reported with or
without a stack frame, but I think that's normal; looking into it).

The reason I'm asking is that we have other code paths, on return
from interrupts for example, at least on 32-bit, where we call the
tracing without the extra stack frame, and I have yet to see it crash.

I wonder if the commit and bug fix above relate to some older code
that no longer existed even at the point where the commit was merged...

Any idea ?

Cheers,
Ben.



Re: [PATCH v2] powerpc/pkeys: copy pkey-tracking-information at fork()

2018-12-20 Thread Michael Ellerman
Ram Pai  writes:

> Pkey tracking information is not copied over to the mm_struct of the
> child during fork(). This can cause the child to erroneously allocate
> keys that were already allocated. Any allocated execute-only key is lost
> as well.
> 
> Add code, called by dup_mmap(), to copy the pkey state from parent to
> child explicitly.
> 
> This problem was originally found by Dave Hansen on x86, which turns out
> to be a problem on powerpc as well.
> 
> Reviewed-by: Thiago Jung Bauermann 
> Signed-off-by: Ram Pai 
> 
> v2: do not copy if pkeys is disabled.
>   -- comment by Michael Ellerman

Thanks.

I changed the subject to:

  powerpc/pkeys: Fix handling of pkey state across fork()

And added tags:

  Fixes: cf43d3b26452 ("powerpc: Enable pkey subsystem")
  Cc: sta...@vger.kernel.org # v4.16+

cheers


Re: [PATCH kernel v7 20/20] vfio_pci: Add NVIDIA GV100GL [Tesla V100 SXM2] subdriver

2018-12-20 Thread Michael Ellerman
Murilo Opsfelder Araujo  writes:
> On Thu, Dec 20, 2018 at 07:23:50PM +1100, Alexey Kardashevskiy wrote:
...
>> diff --git a/drivers/vfio/pci/trace.h b/drivers/vfio/pci/trace.h
>> new file mode 100644
>> index 000..b80d2d3
>> --- /dev/null
>> +++ b/drivers/vfio/pci/trace.h
...
>> +TRACE_EVENT(vfio_pci_npu2_mmap,
>> +TP_PROTO(struct pci_dev *pdev, unsigned long hpa, unsigned long ua,
>> +unsigned long size, int ret),
>> +TP_ARGS(pdev, hpa, ua, size, ret),
>> +
>> +TP_STRUCT__entry(
>> +__field(const char *, name)
>> +__field(unsigned long, hpa)
>> +__field(unsigned long, ua)
>> +__field(unsigned long, size)
>> +__field(int, ret)
>> +),
>> +
>> +TP_fast_assign(
>> +__entry->name = dev_name(&pdev->dev);
>> +__entry->hpa = hpa;
>> +__entry->ua = ua;
>> +__entry->size = size;
>> +__entry->ret = ret;
>> +),
>> +
>> +TP_printk("%s: %lx -> %lx size=%lx ret=%d", __entry->name, __entry->hpa,
>> +__entry->ua, __entry->size, __entry->ret)
>> +);
>> +
>> +#endif /* _TRACE_SUBSYS_H */
>
> I think it's too late but this line I guess should read:
>
>   #endif /* _TRACE_VFIO_PCI_H */

Thanks I'll fix it up.

cheers



Re: [RFC/WIP] powerpc: Fix 32-bit handling of MSR_EE on exceptions

2018-12-20 Thread Benjamin Herrenschmidt


> >   /*
> >* MSR_KERNEL is > 0x1 on 4xx/Book-E since it include MSR_CE.
> > @@ -205,20 +208,46 @@ transfer_to_handler_cont:
> > mflrr9
> > lwz r11,0(r9)   /* virtual address of handler */
> > lwz r9,4(r9)/* where to go when done */
> > +#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS)
> > +   mtspr   SPRN_NRI, r0
> > +#endif
> 
> That's not part of your patch, it's already in the tree.

Yup rebase glitch.

 .../...

> I tested it on the 8xx with the below changes in addition. No issue seen 
> so far.

Thanks !

I'll merge that in.

The main obscure area is that business with the irqsoff tracer and thus
the need to create stack frames around calls to trace_hardirqs_* ... we
do it in some places and not others, but I've not managed to make it
crash either. I need to get to the bottom of that, and possibly provide
proper macro helpers like ppc64 has to do it.

Cheers,
Ben.



Re: [PATCH 1/2] PCI/IOV: provide flag to skip VF scanning

2018-12-20 Thread Bjorn Helgaas
Hi Sebastian,

On Tue, Dec 18, 2018 at 11:16:49AM +0100, Sebastian Ott wrote:
> Provide a flag to skip scanning for new VFs after SRIOV enablement.
> This can be set by implementations for which the VFs are already
> reported by other means.
> 
> Signed-off-by: Sebastian Ott 
> ---
>  drivers/pci/iov.c   | 48 
>  include/linux/pci.h |  1 +
>  2 files changed, 37 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
> index 9616eca3182f..3aa115ed3a65 100644
> --- a/drivers/pci/iov.c
> +++ b/drivers/pci/iov.c
> @@ -252,6 +252,27 @@ int __weak pcibios_sriov_disable(struct pci_dev *pdev)
>   return 0;
>  }
>  
> +static int sriov_add_vfs(struct pci_dev *dev, u16 num_vfs)
> +{
> + unsigned int i;
> + int rc;
> +
> + if (dev->no_vf_scan)
> + return 0;
> +
> + for (i = 0; i < num_vfs; i++) {
> + rc = pci_iov_add_virtfn(dev, i);
> + if (rc)
> + goto failed;
> + }
> + return 0;
> +failed:
> + while (i--)
> + pci_iov_remove_virtfn(dev, i);
> +
> + return rc;
> +}

I think the strategy is fine, but can you restructure the patches
like this:

  1) Factor out sriov_add_vfs() and sriov_del_vfs().  This makes no
 functional change at all.

  2) Add dev->no_vf_scan, set it in the s390 pcibios_add_device(), and
 test it in sriov_add_vfs(), and sriov_del_vfs().

I think both pieces will be easier to review that way.

>  static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
>  {
>   int rc;
> @@ -337,21 +358,15 @@ static int sriov_enable(struct pci_dev *dev, int 
> nr_virtfn)
>   msleep(100);
>   pci_cfg_access_unlock(dev);
>  
> - for (i = 0; i < initial; i++) {
> - rc = pci_iov_add_virtfn(dev, i);
> - if (rc)
> - goto failed;
> - }
> + rc = sriov_add_vfs(dev, initial);
> + if (rc)
> + goto err_pcibios;
>  
>   kobject_uevent(&dev->dev.kobj, KOBJ_CHANGE);
>   iov->num_VFs = nr_virtfn;
>  
>   return 0;
>  
> -failed:
> - while (i--)
> - pci_iov_remove_virtfn(dev, i);
> -
>  err_pcibios:
>   iov->ctrl &= ~(PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE);
>   pci_cfg_access_lock(dev);
> @@ -368,17 +383,26 @@ static int sriov_enable(struct pci_dev *dev, int 
> nr_virtfn)
>   return rc;
>  }
>  
> -static void sriov_disable(struct pci_dev *dev)
> +static void sriov_del_vfs(struct pci_dev *dev)
>  {
> - int i;
>   struct pci_sriov *iov = dev->sriov;
> + int i;
>  
> - if (!iov->num_VFs)
> + if (dev->no_vf_scan)
>   return;
>  
>   for (i = 0; i < iov->num_VFs; i++)
>   pci_iov_remove_virtfn(dev, i);
> +}
> +
> +static void sriov_disable(struct pci_dev *dev)
> +{
> + struct pci_sriov *iov = dev->sriov;
> +
> + if (!iov->num_VFs)
> + return;
>  
> + sriov_del_vfs(dev);
>   iov->ctrl &= ~(PCI_SRIOV_CTRL_VFE | PCI_SRIOV_CTRL_MSE);
>   pci_cfg_access_lock(dev);
>   pci_write_config_word(dev, iov->pos + PCI_SRIOV_CTRL, iov->ctrl);
> diff --git a/include/linux/pci.h b/include/linux/pci.h
> index 11c71c4ecf75..f70b9ccd3e86 100644
> --- a/include/linux/pci.h
> +++ b/include/linux/pci.h
> @@ -405,6 +405,7 @@ struct pci_dev {
>   unsigned intnon_compliant_bars:1;   /* Broken BARs; ignore them */
>   unsigned intis_probed:1;/* Device probing in progress */
>   unsigned intlink_active_reporting:1;/* Device capable of reporting 
> link active */
> + unsigned int   no_vf_scan:1;   /* Don't scan for VFs after VF
> enablement */
>   pci_dev_flags_t dev_flags;
>   atomic_tenable_cnt; /* pci_enable_device has been called */
>  
> -- 
> 2.13.4
> 


[PATCH v2] powerpc/pkeys: copy pkey-tracking-information at fork()

2018-12-20 Thread Ram Pai
Pkey tracking information is not copied over to the mm_struct of the
child during fork(). This can cause the child to erroneously allocate
keys that were already allocated. Any allocated execute-only key is lost
as well.

Add code, called by dup_mmap(), to copy the pkey state from parent to
child explicitly.

This problem was originally found by Dave Hansen on x86, which turns out
to be a problem on powerpc as well.

Reviewed-by: Thiago Jung Bauermann 
Signed-off-by: Ram Pai 

v2: do not copy if pkeys is disabled.
-- comment by Michael Ellerman

diff --git a/arch/powerpc/include/asm/mmu_context.h 
b/arch/powerpc/include/asm/mmu_context.h
index 0381394..cb16146 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -217,12 +217,6 @@ static inline void enter_lazy_tlb(struct mm_struct *mm,
 #endif
 }
 
-static inline int arch_dup_mmap(struct mm_struct *oldmm,
-   struct mm_struct *mm)
-{
-   return 0;
-}
-
 #ifndef CONFIG_PPC_BOOK3S_64
 static inline void arch_exit_mmap(struct mm_struct *mm)
 {
@@ -247,6 +241,7 @@ static inline void arch_bprm_mm_init(struct mm_struct *mm,
 #ifdef CONFIG_PPC_MEM_KEYS
 bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write,
   bool execute, bool foreign);
+void arch_dup_pkeys(struct mm_struct *oldmm, struct mm_struct *mm);
 #else /* CONFIG_PPC_MEM_KEYS */
 static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
bool write, bool execute, bool foreign)
@@ -259,6 +254,7 @@ static inline bool arch_vma_access_permitted(struct 
vm_area_struct *vma,
 #define thread_pkey_regs_save(thread)
 #define thread_pkey_regs_restore(new_thread, old_thread)
 #define thread_pkey_regs_init(thread)
+#define arch_dup_pkeys(oldmm, mm)
 
 static inline u64 pte_to_hpte_pkey_bits(u64 pteflags)
 {
@@ -267,5 +263,12 @@ static inline u64 pte_to_hpte_pkey_bits(u64 pteflags)
 
 #endif /* CONFIG_PPC_MEM_KEYS */
 
+static inline int arch_dup_mmap(struct mm_struct *oldmm,
+   struct mm_struct *mm)
+{
+   arch_dup_pkeys(oldmm, mm);
+   return 0;
+}
+
 #endif /* __KERNEL__ */
 #endif /* __ASM_POWERPC_MMU_CONTEXT_H */
diff --git a/arch/powerpc/mm/pkeys.c b/arch/powerpc/mm/pkeys.c
index b271b28..25a8dd9 100644
--- a/arch/powerpc/mm/pkeys.c
+++ b/arch/powerpc/mm/pkeys.c
@@ -414,3 +414,13 @@ bool arch_vma_access_permitted(struct vm_area_struct *vma, 
bool write,
 
return pkey_access_permitted(vma_pkey(vma), write, execute);
 }
+
+void arch_dup_pkeys(struct mm_struct *oldmm, struct mm_struct *mm)
+{
+   if (static_branch_likely(&pkey_disabled))
+   return;
+
+   /* Duplicate the oldmm pkey state in mm: */
+   mm_pkey_allocation_map(mm) = mm_pkey_allocation_map(oldmm);
+   mm->context.execute_only_pkey = oldmm->context.execute_only_pkey;
+}



Re: [PATCH] powerpc/pkeys: copy pkey-tracking-information at fork()

2018-12-20 Thread Ram Pai
On Fri, Dec 21, 2018 at 12:19:13AM +1100, Michael Ellerman wrote:
> Hi Ram,
> 
> Thanks for fixing this.
> 
> Ram Pai  writes:
> > diff --git a/arch/powerpc/mm/pkeys.c b/arch/powerpc/mm/pkeys.c
> > index b271b28..5d65c47 100644
> > --- a/arch/powerpc/mm/pkeys.c
> > +++ b/arch/powerpc/mm/pkeys.c
> > @@ -414,3 +414,10 @@ bool arch_vma_access_permitted(struct vm_area_struct 
> > *vma, bool write,
> >  
> > return pkey_access_permitted(vma_pkey(vma), write, execute);
> >  }
> > +
> > +void arch_dup_pkeys(struct mm_struct *oldmm, struct mm_struct *mm)
> > +{
> > +   /* Duplicate the oldmm pkey state in mm: */
> > +   mm_pkey_allocation_map(mm) = mm_pkey_allocation_map(oldmm);
> > +   mm->context.execute_only_pkey = oldmm->context.execute_only_pkey;
> > +}
> 
> Shouldn't this check if pkeys are actually in use?

Yes, that will make it better. However, not checking it will not hurt;
it will just do some unnecessary copying.

Will spin out a new patch with the fix, and send it your way.
RP


> 
> eg:
> 
> diff --git a/arch/powerpc/mm/pkeys.c b/arch/powerpc/mm/pkeys.c
> index cf87dddefbdc..587807763737 100644
> --- a/arch/powerpc/mm/pkeys.c
> +++ b/arch/powerpc/mm/pkeys.c
> @@ -418,6 +418,9 @@ bool arch_vma_access_permitted(struct vm_area_struct 
> *vma, bool write,
> 
>  void arch_dup_pkeys(struct mm_struct *oldmm, struct mm_struct *mm)
>  {
> + if (static_branch_likely(&pkey_disabled))
> + return;
> +
>   /* Duplicate the oldmm pkey state in mm: */
>   mm_pkey_allocation_map(mm) = mm_pkey_allocation_map(oldmm);
>   mm->context.execute_only_pkey = oldmm->context.execute_only_pkey;
> 
> 
> Ideally we'd actually do it in the inline so that the function call to
> arch_dup_pkeys() can be avoided. But it looks like header dependencies
> might make that hard.
> 
> cheers

-- 
Ram Pai



[PATCH 9/9] powerpc/fadump: Update documentation about OPAL platform support

2018-12-20 Thread Hari Bathini
With FADump support now available on both pseries and OPAL platforms,
update the FADump documentation with these details. Also, add details
about the backup area and why it is used.

Signed-off-by: Hari Bathini 
---
 Documentation/powerpc/firmware-assisted-dump.txt |  102 ++
 1 file changed, 64 insertions(+), 38 deletions(-)

diff --git a/Documentation/powerpc/firmware-assisted-dump.txt 
b/Documentation/powerpc/firmware-assisted-dump.txt
index 326f89c..eff9f38 100644
--- a/Documentation/powerpc/firmware-assisted-dump.txt
+++ b/Documentation/powerpc/firmware-assisted-dump.txt
@@ -70,7 +70,8 @@ as follows:
normal.
 
 -- The freshly booted kernel will notice that there is a new
-   node (ibm,dump-kernel) in the device tree, indicating that
+   node (ibm,dump-kernel on PSeries or ibm,opal/dump/result-table
+   on OPAL platform) in the device tree, indicating that
there is crash data available from a previous boot. During
the early boot OS will reserve rest of the memory above
boot memory size effectively booting with restricted memory
@@ -92,7 +93,20 @@ as follows:
 
 Please note that the firmware-assisted dump feature
 is only available on Power6 and above systems with recent
-firmware versions.
+firmware versions on PSeries (PowerVM) platform and Power9
+and above systems with recent firmware versions on PowerNV
+(OPAL) platform.
+
+To process a dump on the OPAL platform, additional metadata (a PIR to
+logical CPU map) from the crashing kernel is required. This info
+has to be backed up by the crashing kernel for the capture kernel to
+use in making sense of the register state data provided by the
+F/W. The start address of the area where this info is backed up
+is stored at the tail end of the FADump crash info header. To indicate
+the presence of this additional metadata (backup info), the magic
+number field in the FADump crash info header is overloaded as a version
+identifier.
+
 
 Implementation details:
 --
@@ -108,56 +122,65 @@ that are run. If there is dump data, then the
 memory is held.
 
 If there is no waiting dump data, then only the memory required
-to hold CPU state, HPTE region, boot memory dump and elfcore
-header, is usually reserved at an offset greater than boot memory
-size (see Fig. 1). This area is *not* released: this region will
-be kept permanently reserved, so that it can act as a receptacle
-for a copy of the boot memory content in addition to CPU state
-and HPTE region, in the case a crash does occur. Since this reserved
-memory area is used only after the system crash, there is no point in
-blocking this significant chunk of memory from production kernel.
-Hence, the implementation uses the Linux kernel's Contiguous Memory
-Allocator (CMA) for memory reservation if CMA is configured for kernel.
-With CMA reservation this memory will be available for applications to
-use it, while kernel is prevented from using it. With this FADump will
-still be able to capture all of the kernel memory and most of the user
-space memory except the user pages that were present in CMA region.
+to hold CPU state, HPTE region, boot memory dump, FADump header,
+elfcore header and backup area, is usually reserved at an offset
+greater than boot memory size (see Fig. 1). This area is *not*
+released: this region will be kept permanently reserved, so that
+it can act as a receptacle for a copy of the boot memory content in
+addition to CPU state and HPTE region, in the case a crash does occur.
+Since this reserved memory area is used only after the system crash,
+there is no point in blocking this significant chunk of memory from
+production kernel. Hence, the implementation uses the Linux kernel's
+Contiguous Memory Allocator (CMA) for memory reservation if CMA is
+configured for kernel. With CMA reservation this memory will be
+available for applications to use it, while kernel is prevented from
+using it. With this FADump will still be able to capture all of the
+kernel memory and most of the user space memory except the user pages
+that were present in CMA region.
 
   o Memory Reservation during first kernel
 
-  Low memoryTop of memory
-  0  boot memory size  |<--Reserved dump area --->|  |
-  |   ||   Permanent Reservation  |  |
-  V   V|   (Preserve area)|  V
-  +---+--/ /---+---+++---++--+
-  |   ||CPU|HPTE|  DUMP  |HDR|ELF |  |
-  +---+--/ /---+---+++---++--+
-|   ^  ^
-|   |  |
-\   /  |
- --- FADump Header
-  Boot memory content gets transferred   (meta area)
-  to reserved area by firmware at the
-  time of crash
-
+  Low memory

[PATCH 8/9] powerpc/fadump: use FADump instead of fadump for how it is pronounced

2018-12-20 Thread Hari Bathini
Signed-off-by: Hari Bathini 
---
 Documentation/powerpc/firmware-assisted-dump.txt |   56 +++---
 1 file changed, 28 insertions(+), 28 deletions(-)

diff --git a/Documentation/powerpc/firmware-assisted-dump.txt 
b/Documentation/powerpc/firmware-assisted-dump.txt
index 4897665..326f89c 100644
--- a/Documentation/powerpc/firmware-assisted-dump.txt
+++ b/Documentation/powerpc/firmware-assisted-dump.txt
@@ -8,18 +8,18 @@ a crashed system, and to do so from a fully-reset system, and
 to minimize the total elapsed time until the system is back
 in production use.
 
-- Firmware assisted dump (fadump) infrastructure is intended to replace
+- Firmware-Assisted Dump (FADump) infrastructure is intended to replace
   the existing phyp assisted dump.
 - Fadump uses the same firmware interfaces and memory reservation model
   as phyp assisted dump.
-- Unlike phyp dump, fadump exports the memory dump through /proc/vmcore
+- Unlike phyp dump, FADump exports the memory dump through /proc/vmcore
   in the ELF format in the same way as kdump. This helps us reuse the
   kdump infrastructure for dump capture and filtering.
 - Unlike phyp dump, userspace tool does not need to refer any sysfs
   interface while reading /proc/vmcore.
-- Unlike phyp dump, fadump allows user to release all the memory reserved
+- Unlike phyp dump, FADump allows user to release all the memory reserved
   for dump, with a single operation of echo 1 > /sys/kernel/fadump_release_mem.
-- Once enabled through kernel boot parameter, fadump can be
+- Once enabled through kernel boot parameter, FADump can be
   started/stopped through /sys/kernel/fadump_registered interface (see
   sysfs files section below) and can be easily integrated with kdump
   service start/stop init scripts.
@@ -33,7 +33,7 @@ dump offers several strong, practical advantages:
in a clean, consistent state.
 -- Once the dump is copied out, the memory that held the dump
is immediately available to the running kernel. And therefore,
-   unlike kdump, fadump doesn't need a 2nd reboot to get back
+   unlike kdump, FADump doesn't need a 2nd reboot to get back
the system to the production configuration.
 
 The above can only be accomplished by coordination with,
@@ -61,7 +61,7 @@ as follows:
  boot successfully. For syntax of crashkernel= parameter,
  refer to Documentation/kdump/kdump.txt. If any offset is
  provided in crashkernel= parameter, it will be ignored
- as fadump uses a predefined offset to reserve memory
+ as FADump uses a predefined offset to reserve memory
  for boot memory dump preservation in case of a crash.
 
 -- After the low memory (boot memory) area has been saved, the
@@ -119,7 +119,7 @@ blocking this significant chunk of memory from production 
kernel.
 Hence, the implementation uses the Linux kernel's Contiguous Memory
 Allocator (CMA) for memory reservation if CMA is configured for kernel.
 With CMA reservation this memory will be available for applications to
-use it, while kernel is prevented from using it. With this fadump will
+use it, while kernel is prevented from using it. With this FADump will
 still be able to capture all of the kernel memory and most of the user
 space memory except the user pages that were present in CMA region.
 
@@ -169,14 +169,14 @@ KDump, as dump mechanism.
 The tools to examine the dump will be same as the ones
 used for kdump.
 
-How to enable firmware-assisted dump (fadump):
+How to enable firmware-assisted dump (FADump):
 -
 
 1. Set config option CONFIG_FA_DUMP=y and build kernel.
-2. Boot into linux kernel with 'fadump=on' kernel cmdline option.
-   By default, fadump reserved memory will be initialized as CMA area.
-   Alternatively, user can boot linux kernel with 'fadump=nocma' to
-   prevent fadump to use CMA.
+2. Boot into linux kernel with 'fadump=on' kernel cmdline option.
+   By default, FADump reserved memory will be initialized as CMA area.
+   Alternatively, user can boot linux kernel with 'fadump=nocma' to
+   prevent FADump from using CMA.
 3. Optionally, user can also set 'crashkernel=' kernel cmdline
to specify size of the memory to reserve for boot memory dump
preservation.
@@ -189,7 +189,7 @@ NOTE: 1. 'fadump_reserve_mem=' parameter has been 
deprecated. Instead
  option is set at kernel cmdline.
   3. if user wants to capture all of user space memory and ok with
  reserved memory not available to production system, then
- 'fadump=nocma' kernel parameter can be used to fallback to
- 'fadump=nocma' kernel parameter can be used to fall back to
  old behaviour.
 
 Sysfs/debugfs files:
@@ -202,29 +202,29 @@ Here is the list of files under kernel sysfs:
 
  /sys/kernel/fadump_enabled
 
-This is used to display the fadump status.
-0 = fadump is disabled
-1 = fadump is enabled
+This is used to display the FADump status.
+0 = FADump is disabled

[PATCH 7/9] powerpc/fadump: add support to preserve crash data on FADUMP disabled kernel

2018-12-20 Thread Hari Bathini
Add a new kernel config option, CONFIG_PRESERVE_FA_DUMP, that ensures
that crash data from a previously crashed kernel is preserved. This
helps in cases where FADump is not enabled but the subsequent
memory-preserving kernel boot is likely to process this crash data. One
typical use case for this config option is the petitboot kernel.

Signed-off-by: Hari Bathini 
---
 arch/powerpc/Kconfig |9 ++
 arch/powerpc/include/asm/fadump.h|   12 
 arch/powerpc/kernel/Makefile |6 +++-
 arch/powerpc/kernel/fadump.c |   41 ++
 arch/powerpc/kernel/fadump_internal.h|6 
 arch/powerpc/kernel/prom.c   |4 +--
 arch/powerpc/platforms/powernv/Makefile  |6 +++-
 arch/powerpc/platforms/powernv/opal-fadump.c |   35 ++
 8 files changed, 104 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 08add7a..afa4e79 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -579,6 +579,15 @@ config FA_DUMP
  If unsure, say "y". Only special kernels like petitboot may
  need to say "N" here.
 
+config PRESERVE_FA_DUMP
+   bool "Preserve Firmware-assisted dump"
+   depends on PPC64 && PPC_POWERNV && !FA_DUMP
+   help
+ On a kernel with FA_DUMP disabled, this option helps to preserve
+ crash data from a previously crashed kernel. Useful when the next
+ memory preserving kernel boot would process this crash data.
+ The petitboot kernel is the typical use case for this option.
+
 config IRQ_ALL_CPUS
bool "Distribute interrupts on all CPUs by default"
depends on SMP
diff --git a/arch/powerpc/include/asm/fadump.h 
b/arch/powerpc/include/asm/fadump.h
index db9465f..92a9ddf 100644
--- a/arch/powerpc/include/asm/fadump.h
+++ b/arch/powerpc/include/asm/fadump.h
@@ -22,14 +22,16 @@
 #ifndef __PPC64_FA_DUMP_H__
 #define __PPC64_FA_DUMP_H__
 
-#ifdef CONFIG_FA_DUMP
+#if defined(CONFIG_FA_DUMP) || defined(CONFIG_PRESERVE_FA_DUMP)
+extern int early_init_dt_scan_fw_dump(unsigned long node, const char *uname,
+ int depth, void *data);
+extern int fadump_reserve_mem(void);
+#endif
 
+#ifdef CONFIG_FA_DUMP
 extern int crashing_cpu;
 
 extern int is_fadump_memory_area(u64 addr, ulong size);
-extern int early_init_dt_scan_fw_dump(unsigned long node, const char *uname,
- int depth, void *data);
-extern int fadump_reserve_mem(void);
 extern int setup_fadump(void);
 extern int is_fadump_active(void);
 extern int should_fadump_crash(void);
@@ -41,4 +43,4 @@ static inline int is_fadump_active(void) { return 0; }
 static inline int should_fadump_crash(void) { return 0; }
 static inline void crash_fadump(struct pt_regs *regs, const char *str) { }
 #endif
-#endif
+#endif /* __PPC64_FA_DUMP_H__ */
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 8e4bade..8ed84d2 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -65,7 +65,11 @@ obj-$(CONFIG_EEH)  += eeh.o eeh_pe.o eeh_dev.o 
eeh_cache.o \
  eeh_driver.o eeh_event.o eeh_sysfs.o
 obj-$(CONFIG_GENERIC_TBSYNC)   += smp-tbsync.o
 obj-$(CONFIG_CRASH_DUMP)   += crash_dump.o
-obj-$(CONFIG_FA_DUMP)  += fadump.o fadump_internal.o
+ifeq ($(CONFIG_FA_DUMP),y)
+obj-y  += fadump.o fadump_internal.o
+else
+obj-$(CONFIG_PRESERVE_FA_DUMP) += fadump.o
+endif
 ifdef CONFIG_PPC32
 obj-$(CONFIG_E500) += idle_e500.o
 endif
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index d9cf809..2c9457b 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -47,6 +47,7 @@
 
 static struct fw_dump fw_dump;
 
+#ifdef CONFIG_FA_DUMP
 #ifdef CONFIG_CMA
 static struct cma *fadump_cma;
 #endif
@@ -117,6 +118,7 @@ int __init fadump_cma_init(void)
 #else
 static int __init fadump_cma_init(void) { return 1; }
 #endif /* CONFIG_CMA */
+#endif /* CONFIG_FA_DUMP */
 
 /* Scan the Firmware Assisted dump configuration details. */
 int __init early_init_dt_scan_fw_dump(unsigned long node, const char *uname,
@@ -125,8 +127,10 @@ int __init early_init_dt_scan_fw_dump(unsigned long node, 
const char *uname,
if (depth != 1)
return 0;
 
+#ifdef CONFIG_FA_DUMP
if (strcmp(uname, "rtas") == 0)
return pseries_dt_scan_fadump(&fw_dump, node);
+#endif
 
if (strcmp(uname, "ibm,opal") == 0)
return opal_dt_scan_fadump(&fw_dump, node);
@@ -134,6 +138,7 @@ int __init early_init_dt_scan_fw_dump(unsigned long node, 
const char *uname,
return 0;
 }
 
+#ifdef CONFIG_FA_DUMP
 /*
  * If fadump is registered, check if the memory provided
  * falls within boot memory area and reserved memory area.
@@ -366,6 +371,7 @@ static int __init fadump_get_rmr_regions(void)
 

[PATCH 6/9] powerpc/powernv: export /proc/opalcore for analysing opal crashes

2018-12-20 Thread Hari Bathini
From: Hari Bathini 

Export a /proc/opalcore file to analyze OPAL crashes.

Signed-off-by: Hari Bathini 
---
 arch/powerpc/platforms/powernv/Makefile  |2 
 arch/powerpc/platforms/powernv/opal-core.c   |  385 ++
 arch/powerpc/platforms/powernv/opal-core.h   |   35 ++
 arch/powerpc/platforms/powernv/opal-fadump.c |   73 +
 4 files changed, 488 insertions(+), 7 deletions(-)
 create mode 100644 arch/powerpc/platforms/powernv/opal-core.c
 create mode 100644 arch/powerpc/platforms/powernv/opal-core.h

diff --git a/arch/powerpc/platforms/powernv/Makefile 
b/arch/powerpc/platforms/powernv/Makefile
index adc0de6..9420631 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -6,7 +6,7 @@ obj-y   += opal-msglog.o opal-hmi.o 
opal-power.o opal-irqchip.o
 obj-y  += opal-kmsg.o opal-powercap.o opal-psr.o 
opal-sensor-groups.o
 
 obj-$(CONFIG_SMP)  += smp.o subcore.o subcore-asm.o
-obj-$(CONFIG_FA_DUMP)  += opal-fadump.o
+obj-$(CONFIG_FA_DUMP)  += opal-fadump.o opal-core.o
 obj-$(CONFIG_PCI)  += pci.o pci-ioda.o npu-dma.o pci-ioda-tce.o
 obj-$(CONFIG_CXL_BASE) += pci-cxl.o
 obj-$(CONFIG_EEH)  += eeh-powernv.o
diff --git a/arch/powerpc/platforms/powernv/opal-core.c 
b/arch/powerpc/platforms/powernv/opal-core.c
new file mode 100644
index 000..1d75526
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/opal-core.c
@@ -0,0 +1,385 @@
+/*
+ * Interface for exporting the OPAL ELF core.
+ * Heavily inspired from fs/proc/vmcore.c
+ *
+ * Copyright 2018-2019, IBM Corp.
+ * Author: Hari Bathini 
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#include "opal-core.h"
+
+struct opalcore {
+   struct list_head list;
+   unsigned long long paddr;
+   unsigned long long size;
+   loff_t offset;
+};
+
+static LIST_HEAD(opalcore_list);
+
+/* Total size of opalcore file. */
+static size_t opalcore_size;
+
+/* This buffer includes all the ELF core headers and the PT_NOTE */
+static char *opalcorebuf;
+static size_t  opalcorebuf_sz;
+
+/* NT_AUXV buffer */
+static char auxv_buf[AUXV_DESC_SZ];
+
+/* Pointer to the first PT_LOAD in the ELF file */
+Elf64_Phdr *ptload_phdr;
+unsigned int ptload_cnt;
+
+static struct proc_dir_entry *proc_opalcore;
+
+static struct opalcore * __init get_new_element(void)
+{
+   return kzalloc(sizeof(struct opalcore), GFP_KERNEL);
+}
+
+static inline int is_opalcore_usable(void)
+{
+   return (opalcorebuf != NULL) ? 1 : 0;
+}
+
+static Elf64_Word *append_elf64_note(Elf64_Word *buf, char *name,
+unsigned int type, void *data,
+size_t data_len)
+{
+   Elf64_Nhdr *note = (Elf64_Nhdr *)buf;
+   Elf64_Word namesz = strlen(name) + 1;
+
+   note->n_namesz = cpu_to_be32(namesz);
+   note->n_descsz = cpu_to_be32(data_len);
+   note->n_type   = cpu_to_be32(type);
+   buf += DIV_ROUND_UP(sizeof(*note), sizeof(Elf64_Word));
+   memcpy(buf, name, namesz);
+   buf += DIV_ROUND_UP(namesz, sizeof(Elf64_Word));
+   memcpy(buf, data, data_len);
+   buf += DIV_ROUND_UP(data_len, sizeof(Elf64_Word));
+
+   return buf;
+}
+
+static void fill_prstatus(struct elf_prstatus *prstatus, int cpu,
+ struct opalcore_config *oc_conf)
+{
+   memset(prstatus, 0, sizeof(struct elf_prstatus));
+   elf_core_copy_kernel_regs(&(prstatus->pr_reg), &(oc_conf->regs[cpu]));
+
+   /*
+* Overload PID with PIR value.
+* As a PIR value could also be '0', add an offset of '100'
+* to every PIR to avoid misinterpretations in GDB.
+*/
+   prstatus->pr_pid  = cpu_to_be32(100 + oc_conf->thread_pir[cpu]);
+   prstatus->pr_ppid = cpu_to_be32(1);
+
+   /*
+* Indicate SIGTERM for crash initiated from OPAL.
+* SIGUSR1 otherwise.
+*/
+   if (cpu == oc_conf->crashing_cpu) {
+   short sig;
+
+   sig = oc_conf->is_opal_initiated ? SIGTERM : SIGUSR1;
+   prstatus->pr_cursig = cpu_to_be16(sig);
+   }
+}
+
+static Elf64_Word *regs_to_elf64_notes(Elf64_Word *buf,
+  struct opalcore_config *oc_conf)
+{
+   int i;
+   struct elf_prstatus prstatus;
+
+   /*
+* First NT_PRSTATUS note should be crashing cpu info
+* for GDB to interpret it appropriately.
+*/
+   fill_prstatus(&prstatus, oc_conf->crashing_cpu, oc_conf);
+   buf = append_elf64_note(buf, CRASH_CORE_NOTE_NAME, NT_PRSTATUS,
+   &prstatus, sizeof(prstatus));
+
+   for_each_cpu(i, &(oc_conf->online_mask)) 

[PATCH 5/9] powerpc/fadump: process architected register state data provided by firmware

2018-12-20 Thread Hari Bathini
From: Hari Bathini 

Firmware provides architected register state data at the time of crash.
This data contains the PIR value. We need to store the PIR values of the
logical CPUs to match the data provided by firmware with the
corresponding logical CPU.

Signed-off-by: Hari Bathini 
Signed-off-by: Vasant Hegde 
---
 arch/powerpc/kernel/fadump.c|   40 +---
 arch/powerpc/kernel/fadump_internal.c   |  129 ++
 arch/powerpc/kernel/fadump_internal.h   |   32 +++
 arch/powerpc/platforms/powernv/opal-fadump.c|  216 +--
 arch/powerpc/platforms/powernv/opal-fadump.h|9 +
 arch/powerpc/platforms/pseries/pseries_fadump.c |1 
 6 files changed, 384 insertions(+), 43 deletions(-)

diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 190f7ed..d9cf809 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -276,10 +276,10 @@ static unsigned long get_fadump_area_size(void)
size += fw_dump.hpte_region_size;
size += fw_dump.boot_memory_size;
size += sizeof(struct fadump_crash_info_header);
-   size += sizeof(struct elfhdr); /* ELF core header.*/
-   size += sizeof(struct elf_phdr); /* place holder for cpu notes */
-   /* Program headers for crash memory regions. */
-   size += sizeof(struct elf_phdr) * (memblock_num_regions(memory) + 2);
+   /* To store the start address of backup area */
+   size += sizeof(unsigned long *);
+   size += get_fadump_elfcore_hdr_size();
+   size += fw_dump.backup_area_size;
 
size = PAGE_ALIGN(size);
return size;
@@ -892,26 +892,6 @@ static int fadump_create_elfcore_headers(char *bufp)
return 0;
 }
 
-static unsigned long init_fadump_header(unsigned long addr)
-{
-   struct fadump_crash_info_header *fdh;
-
-   if (!addr)
-   return 0;
-
-   fw_dump.fadumphdr_addr = addr;
-   fdh = __va(addr);
-   addr += sizeof(struct fadump_crash_info_header);
-
-   memset(fdh, 0, sizeof(struct fadump_crash_info_header));
-   fdh->magic_number = FADUMP_CRASH_INFO_MAGIC;
-   fdh->elfcorehdr_addr = addr;
-   /* We will set the crashing cpu id in crash_fadump() during crash. */
-   fdh->crashing_cpu = CPU_UNKNOWN;
-
-   return addr;
-}
-
 static int register_fadump(void)
 {
unsigned long addr;
@@ -929,15 +909,15 @@ static int register_fadump(void)
if (ret)
return ret;
 
-   addr = fw_dump.meta_area_start;
-
/* Initialize fadump crash info header. */
-   addr = init_fadump_header(addr);
+   addr = fw_dump.ops->init_fadump_header(&fw_dump);
vaddr = __va(addr);
 
pr_debug("Creating ELF core headers at %#016lx\n", addr);
fadump_create_elfcore_headers(vaddr);
 
+   fadump_populate_backup_area(&fw_dump);
+
/* register the future kernel dump with firmware. */
pr_debug("Registering for firmware-assisted kernel dump...\n");
return fw_dump.ops->register_fadump(&fw_dump);
@@ -1242,8 +1222,12 @@ int __init setup_fadump(void)
fadump_invalidate_release_mem();
}
/* Initialize the kernel dump memory structure for FAD registration. */
-   else if (fw_dump.reserve_dump_area_size)
+   else if (fw_dump.reserve_dump_area_size) {
fw_dump.ops->init_fadump_mem_struct(&fw_dump);
+   fw_dump.ops->init_fadump_header(&fw_dump);
+   init_fadump_backup_area(&fw_dump);
+   fadump_populate_backup_area(&fw_dump);
+   }
 
fadump_init_files();
 
diff --git a/arch/powerpc/kernel/fadump_internal.c 
b/arch/powerpc/kernel/fadump_internal.c
index b46c7da..ea6f8ba 100644
--- a/arch/powerpc/kernel/fadump_internal.c
+++ b/arch/powerpc/kernel/fadump_internal.c
@@ -20,6 +20,34 @@
 
 #include "fadump_internal.h"
 
+/*
+ * Initializes the legacy fadump header format.
+ * Platform specific code can reuse/overwrite this format.
+ * OPAL platform overrides this data to add backup area support.
+ *
+ * TODO: Extend backup area support to pseries to make it robust?
+ */
+unsigned long generic_init_fadump_header(struct fw_dump *fadump_conf)
+{
+   unsigned long addr = fadump_conf->meta_area_start;
+   struct fadump_crash_info_header *fdh;
+
+   if (!addr)
+   return 0;
+
+   fadump_conf->fadumphdr_addr = addr;
+   fdh = __va(addr);
+   addr += sizeof(struct fadump_crash_info_header);
+
+   memset(fdh, 0, sizeof(struct fadump_crash_info_header));
+   fdh->magic_number = FADUMP_CRASH_INFO_MAGIC;
+   fdh->elfcorehdr_addr = addr;
+   /* We will set the crashing cpu id in crash_fadump() during crash. */
+   fdh->crashing_cpu = CPU_UNKNOWN;
+
+   return addr;
+}
+
 void *fadump_cpu_notes_buf_alloc(unsigned long size)
 {
void *vaddr;
@@ -106,6 +134,43 @@ void fadump_set_regval(struct pt_regs *regs, u64 reg_id, 
u64 reg_val)
regs->dsisr = (unsigned 

[PATCH 4/9] powerpc/fadump: enable fadump support on OPAL based POWER platform

2018-12-20 Thread Hari Bathini
From: Hari Bathini 

Firmware-assisted dump support is enabled for OPAL based POWER platforms
in P9 firmware. Make the corresponding updates in the kernel to enable
fadump support on such platforms.

Signed-off-by: Hari Bathini 
---
 arch/powerpc/Kconfig|5 
 arch/powerpc/include/asm/opal-api.h |   35 ++
 arch/powerpc/include/asm/opal.h |1 
 arch/powerpc/kernel/fadump.c|  243 +++
 arch/powerpc/kernel/fadump_internal.c   |   27 +-
 arch/powerpc/kernel/fadump_internal.h   |   44 ++-
 arch/powerpc/platforms/powernv/Makefile |1 
 arch/powerpc/platforms/powernv/opal-fadump.c|  373 +++
 arch/powerpc/platforms/powernv/opal-fadump.h|   40 ++
 arch/powerpc/platforms/powernv/opal-wrappers.S  |1 
 arch/powerpc/platforms/pseries/pseries_fadump.c |   18 -
 11 files changed, 705 insertions(+), 83 deletions(-)
 create mode 100644 arch/powerpc/platforms/powernv/opal-fadump.c
 create mode 100644 arch/powerpc/platforms/powernv/opal-fadump.h

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 8be3126..08add7a 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -565,7 +565,7 @@ config CRASH_DUMP
 
 config FA_DUMP
bool "Firmware-assisted dump"
-   depends on PPC64 && PPC_RTAS
+   depends on PPC64 && (PPC_RTAS || PPC_POWERNV)
select CRASH_CORE
select CRASH_DUMP
help
@@ -576,7 +576,8 @@ config FA_DUMP
  is meant to be a kdump replacement offering robustness and
  speed not possible without system firmware assistance.
 
- If unsure, say "N"
+ If unsure, say "y". Only special kernels like petitboot may
+ need to say "N" here.
 
 config IRQ_ALL_CPUS
bool "Distribute interrupts on all CPUs by default"
diff --git a/arch/powerpc/include/asm/opal-api.h 
b/arch/powerpc/include/asm/opal-api.h
index 870fb7b..6076e51 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -210,7 +210,8 @@
 #define OPAL_PCI_GET_PBCQ_TUNNEL_BAR   164
 #define OPAL_PCI_SET_PBCQ_TUNNEL_BAR   165
 #defineOPAL_NX_COPROC_INIT 167
-#define OPAL_LAST  167
+#define OPAL_CONFIGURE_FADUMP  170
+#define OPAL_LAST  170
 
 #define QUIESCE_HOLD   1 /* Spin all calls at entry */
 #define QUIESCE_REJECT 2 /* Fail all calls with OPAL_BUSY */
@@ -972,6 +973,37 @@ struct opal_sg_list {
 };
 
 /*
+ * Firmware-Assisted Dump (FADump)
+ */
+
+/* The maximum number of dump sections supported by OPAL */
+#define OPAL_FADUMP_NR_SECTIONS64
+
+/* Kernel Dump section info */
+struct opal_fadump_section {
+   u8  src_type;
+   u8  reserved[7];
+   __be64  src_addr;
+   __be64  src_size;
+   __be64  dest_addr;
+   __be64  dest_size;
+};
+
+/*
+ * FADump memory structure for registering dump support with
+ * POWER f/w through opal call.
+ */
+struct opal_fadump_mem_struct {
+
   __be16  section_size;   /* sizeof(struct fadump_section) */
+   __be16  section_count;  /* number of sections */
+   __be32  crashing_cpu;   /* Thread on which OPAL crashed */
+   __be64  reserved;
+
+   struct opal_fadump_section  section[OPAL_FADUMP_NR_SECTIONS];
+};
+
+/*
  * Dump region ID range usable by the OS
  */
 #define OPAL_DUMP_REGION_HOST_START0x80
@@ -1051,6 +1083,7 @@ enum {
OPAL_REBOOT_NORMAL  = 0,
OPAL_REBOOT_PLATFORM_ERROR  = 1,
OPAL_REBOOT_FULL_IPL= 2,
+   OPAL_REBOOT_OS_ERROR= 3,
 };
 
 /* Argument to OPAL_PCI_TCE_KILL */
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index ff38664..08cc09f 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -43,6 +43,7 @@ int64_t opal_npu_spa_clear_cache(uint64_t phb_id, uint32_t 
bdfn,
uint64_t PE_handle);
 int64_t opal_npu_tl_set(uint64_t phb_id, uint32_t bdfn, long cap,
uint64_t rate_phys, uint32_t size);
+int64_t opal_configure_fadump(uint64_t command, void *data, uint64_t 
data_size);
 int64_t opal_console_write(int64_t term_number, __be64 *length,
   const uint8_t *buffer);
 int64_t opal_console_read(int64_t term_number, __be64 *length,
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 36d9d48..190f7ed 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -46,12 +46,13 @@
 #include "fadump_internal.h"
 
 static struct fw_dump fw_dump;
+
 #ifdef CONFIG_CMA
 static struct cma *fadump_cma;
 #endif
 
 static DEFINE_MUTEX(fadump_mutex);
-struct fad_crash_memory_ranges *crash_memory_ranges;
+struct fadump_memory_range *crash_memory_ranges;
 

[PATCH 3/9] pseries/fadump: move out platform specific support from generic code

2018-12-20 Thread Hari Bathini
Introduce callbacks for platform-specific operations like register,
unregister and invalidate, and move the pseries-specific code into
platform code.

Signed-off-by: Hari Bathini 
---
 arch/powerpc/include/asm/fadump.h   |   71 ---
 arch/powerpc/kernel/fadump.c|  502 +-
 arch/powerpc/kernel/fadump_internal.h   |   38 ++
 arch/powerpc/platforms/pseries/Makefile |1 
 arch/powerpc/platforms/pseries/pseries_fadump.c |  537 +++
 arch/powerpc/platforms/pseries/pseries_fadump.h |   96 
 6 files changed, 707 insertions(+), 538 deletions(-)
 create mode 100644 arch/powerpc/platforms/pseries/pseries_fadump.c
 create mode 100644 arch/powerpc/platforms/pseries/pseries_fadump.h

diff --git a/arch/powerpc/include/asm/fadump.h 
b/arch/powerpc/include/asm/fadump.h
index 028a8ef..db9465f 100644
--- a/arch/powerpc/include/asm/fadump.h
+++ b/arch/powerpc/include/asm/fadump.h
@@ -24,79 +24,8 @@
 
 #ifdef CONFIG_FA_DUMP
 
-/* Firmware provided dump sections */
-#define FADUMP_CPU_STATE_DATA  0x0001
-#define FADUMP_HPTE_REGION 0x0002
-#define FADUMP_REAL_MODE_REGION0x0011
-
-/* Dump request flag */
-#define FADUMP_REQUEST_FLAG0x0001
-
-/* Dump status flag */
-#define FADUMP_ERROR_FLAG  0x2000
-
-/* Utility macros */
-#define SKIP_TO_NEXT_CPU(reg_entry)\
-({ \
-   while (be64_to_cpu(reg_entry->reg_id) != REG_ID("CPUEND"))  \
-   reg_entry++;\
-   reg_entry++;\
-})
-
 extern int crashing_cpu;
 
-/* Kernel Dump section info */
-struct fadump_section {
-   __be32  request_flag;
-   __be16  source_data_type;
-   __be16  error_flags;
-   __be64  source_address;
-   __be64  source_len;
-   __be64  bytes_dumped;
-   __be64  destination_address;
-};
-
-/* ibm,configure-kernel-dump header. */
-struct fadump_section_header {
-   __be32  dump_format_version;
-   __be16  dump_num_sections;
-   __be16  dump_status_flag;
-   __be32  offset_first_dump_section;
-
-   /* Fields for disk dump option. */
-   __be32  dd_block_size;
-   __be64  dd_block_offset;
-   __be64  dd_num_blocks;
-   __be32  dd_offset_disk_path;
-
-   /* Maximum time allowed to prevent an automatic dump-reboot. */
-   __be32  max_time_auto;
-};
-
-/*
- * Firmware Assisted dump memory structure. This structure is required for
- * registering future kernel dump with power firmware through rtas call.
- *
- * No disk dump option. Hence disk dump path string section is not included.
- */
-struct fadump_mem_struct {
-   struct fadump_section_headerheader;
-
-   /* Kernel dump sections */
-   struct fadump_section   cpu_state_data;
-   struct fadump_section   hpte_region;
-   struct fadump_section   rmr_region;
-};
-
-#define REGSAVE_AREA_MAGIC STR_TO_HEX("REGSAVE")
-
-/* Register save area header. */
-struct fadump_reg_save_area_header {
-   __be64  magic_number;
-   __be32  version;
-   __be32  num_cpu_offset;
-};
-
 extern int is_fadump_memory_area(u64 addr, ulong size);
 extern int early_init_dt_scan_fw_dump(unsigned long node, const char *uname,
  int depth, void *data);
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index e3f989f..36d9d48 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -40,15 +40,12 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 
 #include "fadump_internal.h"
 
 static struct fw_dump fw_dump;
-static struct fadump_mem_struct fdm;
-static const struct fadump_mem_struct *fdm_active;
 #ifdef CONFIG_CMA
 static struct cma *fadump_cma;
 #endif
@@ -124,63 +121,13 @@ static int __init fadump_cma_init(void) { return 1; }
 int __init early_init_dt_scan_fw_dump(unsigned long node, const char *uname,
  int depth, void *data)
 {
-   const __be32 *sections;
-   int i, num_sections;
-   int size;
-   const __be32 *token;
-
-   if (depth != 1 || strcmp(uname, "rtas") != 0)
+   if (depth != 1)
return 0;
 
-   /*
-* Check if Firmware Assisted dump is supported. if yes, check
-* if dump has been initiated on last reboot.
-*/
-   token = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump", NULL);
-   if (!token)
-   return 1;
-
-   fw_dump.fadump_supported = 1;
-   fw_dump.ibm_configure_kernel_dump = be32_to_cpu(*token);
-
-   /*
-* The 'ibm,kernel-dump' rtas node is present only if there is
-* dump data waiting for us.
-*/
-   fdm_active = of_get_flat_dt_prop(node, "ibm,kernel-dump", NULL);
- 

[PATCH 2/9] powerpc/fadump: Improve fadump documentation

2018-12-20 Thread Hari Bathini
The figures depicting FADump's (Firmware-Assisted Dump) memory layout
are missing some finer details like different memory regions and what
they represent. Improve the documentation by updating those details.

Signed-off-by: Hari Bathini 
---
 Documentation/powerpc/firmware-assisted-dump.txt |   56 --
 1 file changed, 30 insertions(+), 26 deletions(-)

diff --git a/Documentation/powerpc/firmware-assisted-dump.txt 
b/Documentation/powerpc/firmware-assisted-dump.txt
index 18c5fee..4897665 100644
--- a/Documentation/powerpc/firmware-assisted-dump.txt
+++ b/Documentation/powerpc/firmware-assisted-dump.txt
@@ -125,42 +125,46 @@ space memory except the user pages that were present in 
CMA region.
 
   o Memory Reservation during first kernel
 
-  Low memory Top of memory
-  0  boot memory size   |
-  |   ||<--Reserved dump area -->|  |
-  V   V|   Permanent Reservation |  V
-  +---+--/ /---+---++---++--+
-  |   ||CPU|HPTE|  DUMP |ELF |  |
-  +---+--/ /---+---++---++--+
-|   ^
-|   |
-\   /
- ---
-  Boot memory content gets transferred to
-  reserved area by firmware at the time of
-  crash
+  Low memoryTop of memory
+  0  boot memory size  |<--Reserved dump area --->|  |
+  |   ||   Permanent Reservation  |  |
+  V   V|   (Preserve area)|  V
+  +---+--/ /---+---+++---++--+
+  |   ||CPU|HPTE|  DUMP  |HDR|ELF |  |
+  +---+--/ /---+---+++---++--+
+|   ^  ^
+|   |  |
+\   /  |
+ --- FADump Header
+  Boot memory content gets transferred   (meta area)
+  to reserved area by firmware at the
+  time of crash
+
Fig. 1
 
+
   o Memory Reservation during second kernel after crash
 
-  Low memoryTop of memory
-  0  boot memory size   |
-  |   |<- Reserved dump area --- -->|
-  V   V V
-  +---+--/ /---+---++---++--+
-  |   ||CPU|HPTE|  DUMP |ELF |  |
-  +---+--/ /---+---++---++--+
+  Low memoryTop of memory
+  0  boot memory size|
+  |   |<- Reserved dump area --->|
+  V   V|< Preserve area ->|  V
+  +---+--/ /---+---+++---++--+
+  |   ||CPU|HPTE|  DUMP  |HDR|ELF |  |
+  +---+--/ /---+---+++---++--+
 |  |
 V  V
   Used by second                /proc/vmcore
   kernel to boot
Fig. 2
 
-Currently the dump will be copied from /proc/vmcore to a
-a new file upon user intervention. The dump data available through
-/proc/vmcore will be in ELF format. Hence the existing kdump
-infrastructure (kdump scripts) to save the dump works fine with
-minor modifications.
+Currently the dump will be copied from /proc/vmcore to a new file upon
+user intervention. The dump data available through /proc/vmcore will be
+in ELF format. Hence the existing kdump infrastructure (kdump scripts)
+to save the dump works fine with minor modifications. KDump scripts on
+major distro releases have already been modified to work seamlessly (no
+user intervention in saving the dump) when FADump is used as the dump
+mechanism instead of KDump.
 
 The tools to examine the dump will be same as the ones
 used for kdump.



[PATCH 1/9] powerpc/fadump: move internal fadump code to a new file

2018-12-20 Thread Hari Bathini
Refactoring the fadump code means internal fadump code is referenced from
different places. For ease of maintenance, move the internal code to a
new file.

Signed-off-by: Hari Bathini 
---
 arch/powerpc/include/asm/fadump.h |  112 ---
 arch/powerpc/kernel/Makefile  |2 
 arch/powerpc/kernel/fadump.c  |  190 ++---
 arch/powerpc/kernel/fadump_internal.c |  184 
 arch/powerpc/kernel/fadump_internal.h |  126 ++
 5 files changed, 322 insertions(+), 292 deletions(-)
 create mode 100644 arch/powerpc/kernel/fadump_internal.c
 create mode 100644 arch/powerpc/kernel/fadump_internal.h

diff --git a/arch/powerpc/include/asm/fadump.h 
b/arch/powerpc/include/asm/fadump.h
index 188776b..028a8ef 100644
--- a/arch/powerpc/include/asm/fadump.h
+++ b/arch/powerpc/include/asm/fadump.h
@@ -24,34 +24,6 @@
 
 #ifdef CONFIG_FA_DUMP
 
-/*
- * The RMA region will be saved for later dumping when kernel crashes.
- * RMA is Real Mode Area, the first block of logical memory address owned
- * by logical partition, containing the storage that may be accessed with
- * translate off.
- */
-#define RMA_START  0x0
-#define RMA_END(ppc64_rma_size)
-
-/*
- * On some Power systems where RMO is 128MB, it still requires minimum of
- * 256MB for kernel to boot successfully. When kdump infrastructure is
- * configured to save vmcore over network, we run into OOM issue while
- * loading modules related to network setup. Hence we need aditional 64M
- * of memory to avoid OOM issue.
- */
-#define MIN_BOOT_MEM   (((RMA_END < (0x1UL << 28)) ? (0x1UL << 28) : RMA_END) \
-   + (0x1UL << 26))
-
-/* The upper limit percentage for user specified boot memory size (25%) */
-#define MAX_BOOT_MEM_RATIO 4
-
-#define memblock_num_regions(memblock_type)(memblock.memblock_type.cnt)
-
-/* Alignement per CMA requirement. */
-#define FADUMP_CMA_ALIGNMENT   (PAGE_SIZE <<   \
-   max_t(unsigned long, MAX_ORDER - 1, pageblock_order))
-
 /* Firmware provided dump sections */
 #define FADUMP_CPU_STATE_DATA  0x0001
 #define FADUMP_HPTE_REGION 0x0002
@@ -60,18 +32,9 @@
 /* Dump request flag */
 #define FADUMP_REQUEST_FLAG0x0001
 
-/* FAD commands */
-#define FADUMP_REGISTER1
-#define FADUMP_UNREGISTER  2
-#define FADUMP_INVALIDATE  3
-
 /* Dump status flag */
 #define FADUMP_ERROR_FLAG  0x2000
 
-#define FADUMP_CPU_ID_MASK ((1UL << 32) - 1)
-
-#define CPU_UNKNOWN(~((u32)0))
-
 /* Utility macros */
 #define SKIP_TO_NEXT_CPU(reg_entry)\
 ({ \
@@ -125,59 +88,8 @@ struct fadump_mem_struct {
struct fadump_section   rmr_region;
 };
 
-/* Firmware-assisted dump configuration details. */
-struct fw_dump {
-   unsigned long   cpu_state_data_size;
-   unsigned long   hpte_region_size;
-   unsigned long   boot_memory_size;
-   unsigned long   reserve_dump_area_start;
-   unsigned long   reserve_dump_area_size;
-   /* cmd line option during boot */
-   unsigned long   reserve_bootvar;
-
-   unsigned long   fadumphdr_addr;
-   unsigned long   cpu_notes_buf;
-   unsigned long   cpu_notes_buf_size;
-
-   int ibm_configure_kernel_dump;
-
-   unsigned long   fadump_enabled:1;
-   unsigned long   fadump_supported:1;
-   unsigned long   dump_active:1;
-   unsigned long   dump_registered:1;
-   unsigned long   nocma:1;
-};
-
-/*
- * Copy the ascii values for first 8 characters from a string into u64
- * variable at their respective indexes.
- * e.g.
- *  The string "FADMPINF" will be converted into 0x4641444d50494e46
- */
-static inline u64 str_to_u64(const char *str)
-{
-   u64 val = 0;
-   int i;
-
-   for (i = 0; i < sizeof(val); i++)
-   val = (*str) ? (val << 8) | *str++ : val << 8;
-   return val;
-}
-#define STR_TO_HEX(x)  str_to_u64(x)
-#define REG_ID(x)  str_to_u64(x)
-
-#define FADUMP_CRASH_INFO_MAGICSTR_TO_HEX("FADMPINF")
 #define REGSAVE_AREA_MAGIC STR_TO_HEX("REGSAVE")
 
-/* The firmware-assisted dump format.
- *
- * The register save area is an area in the partition's memory used to preserve
- * the register contents (CPU state data) for the active CPUs during a firmware
- * assisted dump. The dump format contains register save area header followed
- * by register entries. Each list of registers for a CPU starts with
- * "CPUSTRT" and ends with "CPUEND".
- */
-
 /* Register save area header. */
 struct fadump_reg_save_area_header {
__be64  magic_number;
@@ -185,29 +97,9 @@ struct fadump_reg_save_area_header {
__be32  num_cpu_offset;
 };
 
-/* Register entry. */
-struct fadump_reg_entry {
-   __be64  reg_id;
-   __be64  

[PATCH 0/9] Add FADump support on PowerNV platform

2018-12-20 Thread Hari Bathini
Firmware-Assisted Dump (FADump) is currently supported only on the pseries
platform. This patch series adds support for the powernv platform too.

The first and third patches refactor the FADump code to share common code
across multiple platforms. The fourth patch adds basic FADump support for
the powernv platform. The next patch processes the CPU state data provided
by firmware and adds core notes to the core file. The sixth patch adds
support to export opalcore, to make debugging failures in OPAL code easier.
The remaining patches update the firmware-assisted dump documentation
appropriately.

The patch series is tested with the latest firmware plus the below skiboot
changes for MPIPL support:

https://patchwork.ozlabs.org/project/skiboot/list/?series=78497
("MPIPL support")

The patches are based on top of the below fadump changes:

https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=61500
("powerpc/fadump: Improvements for firmware-assisted dump")

---

Hari Bathini (9):
  powerpc/fadump: move internal fadump code to a new file
  powerpc/fadump: Improve fadump documentation
  pseries/fadump: move out platform specific support from generic code
  powerpc/fadump: enable fadump support on OPAL based POWER platform
  powerpc/fadump: process architected register state data provided by 
firmware
  powerpc/powernv: export /proc/opalcore for analysing opal crashes
  powerpc/fadump: add support to preserve crash data on FADUMP disabled 
kernel
  powerpc/fadump: use FADump instead of fadump for how it is pronounced
  powerpc/fadump: Update documentation about OPAL platform support


 Documentation/powerpc/firmware-assisted-dump.txt |  168 ++--
 arch/powerpc/Kconfig |   14 
 arch/powerpc/include/asm/fadump.h|  191 
 arch/powerpc/include/asm/opal-api.h  |   35 +
 arch/powerpc/include/asm/opal.h  |1 
 arch/powerpc/kernel/Makefile |6 
 arch/powerpc/kernel/fadump.c |  994 ++
 arch/powerpc/kernel/fadump_internal.c|  334 +++
 arch/powerpc/kernel/fadump_internal.h|  228 +
 arch/powerpc/kernel/prom.c   |4 
 arch/powerpc/platforms/powernv/Makefile  |5 
 arch/powerpc/platforms/powernv/opal-core.c   |  385 +
 arch/powerpc/platforms/powernv/opal-core.h   |   35 +
 arch/powerpc/platforms/powernv/opal-fadump.c |  655 ++
 arch/powerpc/platforms/powernv/opal-fadump.h |   49 +
 arch/powerpc/platforms/powernv/opal-wrappers.S   |1 
 arch/powerpc/platforms/pseries/Makefile  |1 
 arch/powerpc/platforms/pseries/pseries_fadump.c  |  534 
 arch/powerpc/platforms/pseries/pseries_fadump.h  |   96 ++
 19 files changed, 2749 insertions(+), 987 deletions(-)
 create mode 100644 arch/powerpc/kernel/fadump_internal.c
 create mode 100644 arch/powerpc/kernel/fadump_internal.h
 create mode 100644 arch/powerpc/platforms/powernv/opal-core.c
 create mode 100644 arch/powerpc/platforms/powernv/opal-core.h
 create mode 100644 arch/powerpc/platforms/powernv/opal-fadump.c
 create mode 100644 arch/powerpc/platforms/powernv/opal-fadump.h
 create mode 100644 arch/powerpc/platforms/pseries/pseries_fadump.c
 create mode 100644 arch/powerpc/platforms/pseries/pseries_fadump.h



Re: [PATCH] ocxl: Clarify error path in setup_xsl_irq()

2018-12-20 Thread Greg Kurz
On Tue, 11 Dec 2018 11:19:55 +1100
Andrew Donnellan  wrote:

> On 11/12/18 2:18 am, Greg Kurz wrote:
> > Implementing rollback with goto and labels is a common practice that
> > leads to prettier and more maintainable code. FWIW, this design pattern
> > is already being used in alloc_link() a few lines below in this file.
> > 
> > Do the same in setup_xsl_irq().
> > 
> > Signed-off-by: Greg Kurz   
> 
> This is good, thanks.
> 
> Acked-by: Andrew Donnellan 
> 

Friendly ping before Xmas break :)

> > ---
> >   drivers/misc/ocxl/link.c |   23 ++-
> >   1 file changed, 14 insertions(+), 9 deletions(-)
> > 
> > diff --git a/drivers/misc/ocxl/link.c b/drivers/misc/ocxl/link.c
> > index eed92055184d..659977a17405 100644
> > --- a/drivers/misc/ocxl/link.c
> > +++ b/drivers/misc/ocxl/link.c
> > @@ -273,9 +273,9 @@ static int setup_xsl_irq(struct pci_dev *dev, struct 
> > link *link)
> > spa->irq_name = kasprintf(GFP_KERNEL, "ocxl-xsl-%x-%x-%x",
> > link->domain, link->bus, link->dev);
> > if (!spa->irq_name) {
> > -   unmap_irq_registers(spa);
> > dev_err(&dev->dev, "Can't allocate name for xsl interrupt\n");
> > -   return -ENOMEM;
> > +   rc = -ENOMEM;
> > +   goto err_xsl;
> > }
> > /*
> >  * At some point, we'll need to look into allowing a higher
> > @@ -283,11 +283,10 @@ static int setup_xsl_irq(struct pci_dev *dev, struct 
> > link *link)
> >  */
> > spa->virq = irq_create_mapping(NULL, hwirq);
> > if (!spa->virq) {
> > -   kfree(spa->irq_name);
> > -   unmap_irq_registers(spa);
> > dev_err(&dev->dev,
> > "irq_create_mapping failed for translation 
> > interrupt\n");
> > -   return -EINVAL;
> > +   rc = -EINVAL;
> > +   goto err_name;
> > }
> >   
> > dev_dbg(&dev->dev, "hwirq %d mapped to virq %d\n", hwirq, spa->virq);
> > @@ -295,15 +294,21 @@ static int setup_xsl_irq(struct pci_dev *dev, struct 
> > link *link)
> > rc = request_irq(spa->virq, xsl_fault_handler, 0, spa->irq_name,
> > link);
> > if (rc) {
> > -   irq_dispose_mapping(spa->virq);
> > -   kfree(spa->irq_name);
> > -   unmap_irq_registers(spa);
> > dev_err(&dev->dev,
> > "request_irq failed for translation interrupt: %d\n",
> > rc);
> > -   return -EINVAL;
> > +   rc = -EINVAL;
> > +   goto err_mapping;
> > }
> > return 0;
> > +
> > +err_mapping:
> > +   irq_dispose_mapping(spa->virq);
> > +err_name:
> > +   kfree(spa->irq_name);
> > +err_xsl:
> > +   unmap_irq_registers(spa);
> > +   return rc;
> >   }
> >   
> >   static void release_xsl_irq(struct link *link)
> >   
> 



Re: [PATCH] ocxl/afu_irq: Don't include

2018-12-20 Thread Greg Kurz
On Tue, 11 Dec 2018 11:09:39 +1100
Andrew Donnellan  wrote:

> Acked-by: Andrew Donnellan 
> 

Friendly ping before Xmas break :)

> On 11/12/18 2:13 am, Greg Kurz wrote:
> > The AFU irq code doesn't need to reach out to the platform.
> > 
> > Signed-off-by: Greg Kurz 
> > ---
> >   drivers/misc/ocxl/afu_irq.c |1 -
> >   1 file changed, 1 deletion(-)
> > 
> > diff --git a/drivers/misc/ocxl/afu_irq.c b/drivers/misc/ocxl/afu_irq.c
> > index e70cfa24577f..11ab996657a2 100644
> > --- a/drivers/misc/ocxl/afu_irq.c
> > +++ b/drivers/misc/ocxl/afu_irq.c
> > @@ -2,7 +2,6 @@
> >   // Copyright 2017 IBM Corp.
> >   #include 
> >   #include 
> > -#include 
> >   #include "ocxl_internal.h"
> >   #include "trace.h"
> >   
> >   
> 


Re: [PATCH kernel v7 20/20] vfio_pci: Add NVIDIA GV100GL [Tesla V100 SXM2] subdriver

2018-12-20 Thread Alex Williamson
On Thu, 20 Dec 2018 19:23:50 +1100
Alexey Kardashevskiy  wrote:

> POWER9 Witherspoon machines come with 4 or 6 V100 GPUs which are not
> pluggable PCIe devices but still have PCIe links which are used
> for config space and MMIO. In addition to that the GPUs have 6 NVLinks
> which are connected to other GPUs and the POWER9 CPU. POWER9 chips
> have a special unit on a die called an NPU which is an NVLink2 host bus
> adapter with p2p connections to 2 to 3 GPUs, 3 or 2 NVLinks to each.
> These systems also support ATS (address translation services) which is
> a part of the NVLink2 protocol. Such GPUs also expose on-board RAM
> (16GB or 32GB) to the system via the same NVLink2, so a CPU has
> cache-coherent access to the GPU RAM.
> 
> This exports GPU RAM to the userspace as a new VFIO device region. This
> preregisters the new memory as device memory as it might be used for DMA.
> This inserts pfns from the fault handler, as the GPU memory is not onlined
> until the vendor driver is loaded and has trained the NVLinks; doing this
> earlier causes low-level errors which we fence in the firmware, so it does
> not hurt the host system but is still better avoided. For the same reason
> this does not map GPU RAM into the host kernel (the usual thing for
> emulated access otherwise).
> 
> This exports an ATSD (Address Translation Shootdown) register of NPU which
> allows TLB invalidations inside GPU for an operating system. The register
> conveniently occupies a single 64k page. It is also presented to
> the userspace as a new VFIO device region. One NPU has 8 ATSD registers,
> each of them can be used for TLB invalidation in a GPU linked to this NPU.
> This allocates one ATSD register per an NVLink bridge allowing passing
> up to 6 registers. Due to a host firmware bug (just recently fixed),
> only 1 ATSD register per NPU was actually advertised to the host system,
> so this passes that lone register via the first NVLink bridge device in
> the group, which is still enough as QEMU collects them all back and
> presents to the guest via vPHB to mimic the emulated NPU PHB on the host.
> 
> In order to provide the userspace with the information about GPU-to-NVLink
> connections, this exports an additional capability called "tgt"
> (which is an abbreviated host system bus address). The "tgt" property
> tells the GPU its own system address and allows the guest driver to
> conglomerate the routing information so each GPU knows how to get directly
> to the other GPUs.
> 
> For ATS to work, the nest MMU (an NVIDIA block in a P9 CPU) needs to
> know LPID (a logical partition ID or a KVM guest hardware ID in other
> words) and PID (a memory context ID of a userspace process, not to be
> confused with a linux pid). This assigns a GPU to LPID in the NPU and
> this is why this adds a listener for KVM on an IOMMU group. A PID comes
> via NVLink from a GPU and NPU uses a PID wildcard to pass it through.
> 
> This requires coherent memory and ATSD to be available on the host as
> the GPU vendor only supports configurations with both features enabled
> and other configurations are known not to work. Because of this and
> because of the ways the features are advertised to the host system
> (which is a device tree with very platform specific properties),
> this requires enabled POWERNV platform.
> 
> The V100 GPUs do not advertise any of these capabilities via the config
> space and there are more than just one device ID so this relies on
> the platform to tell whether these GPUs have special abilities such as
> NVLinks.
> 
> Signed-off-by: Alexey Kardashevskiy 
> ---
> Changes:
> v6.1:
> * fixed outdated comment about VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD
> 
> v6:
> * reworked capabilities - tgt for nvlink and gpu and link-speed
> for nvlink only
> 
> v5:
> * do not memremap GPU RAM for emulation, map it only when it is needed
> * allocate 1 ATSD register per NVLink bridge, if none left, then expose
> the region with a zero size
> * separate caps per device type
> * addressed AW review comments
> 
> v4:
> * added nvlink-speed to the NPU bridge capability as this turned out to
> be not a constant value
> * instead of looking at the exact device ID (which also changes from system
> to system), now this (indirectly) looks at the device tree to know
> if GPU and NPU support NVLink
> 
> v3:
> * reworded the commit log about tgt
> * added tracepoints (do we want them enabled for entire vfio-pci?)
> * added code comments
> * added write|mmap flags to the new regions
> * auto enabled VFIO_PCI_NVLINK2 config option
> * added 'tgt' capability to a GPU so QEMU can recreate ibm,npu and ibm,gpu
> references; there are required by the NVIDIA driver
> * keep notifier registered only for short time
> ---
>  drivers/vfio/pci/Makefile   |   1 +
>  drivers/vfio/pci/trace.h| 102 ++
>  drivers/vfio/pci/vfio_pci_private.h |  14 +
>  include/uapi/linux/vfio.h   |  37 +++
>  drivers/vfio/pci/vfio_pci.c |  27 +-
>  

Re: [RFC/WIP] powerpc: Fix 32-bit handling of MSR_EE on exceptions

2018-12-20 Thread Christophe Leroy




On 12/20/2018 05:40 AM, Benjamin Herrenschmidt wrote:

Hi folks !

While trying to figure out why we occasionally had lockdep barf about
interrupt state on ppc32 (440 in my case but I could reproduce on e500
as well using qemu), I realized that we are still doing something
rather gothic and wrong on 32-bit which we stopped doing on 64-bit
a while ago.

We have that thing where some handlers "copy" the EE value from the
original stack frame into the new MSR before transferring to the
handler.

Thus for a number of exceptions, we enter the handlers with interrupts
enabled.

This is rather fishy, some of the stuff that handlers might do early
on such as irq_enter/exit or user_exit, context tracking, etc... should
be run with interrupts off afaik.

Generally our handlers know when to re-enable interrupts if needed
(though some of the FSL specific SPE ones don't).

The problem we were having is that we assumed these interrupts would
return with interrupts enabled. However that isn't the case.

Instead, this changes things so that we always enter exception handlers
with interrupts *off* with the notable exception of syscalls which are
special (and get a fast path).

Currently, the patch only changes BookE (440 and E5xx tested in qemu),
the same recipe needs to be applied to 6xx, 8xx and 40x.

Also I'm not sure whether we need to create a stack frame around some
of the calls to trace_hardirqs_* in asm. ppc64 does it, due to problems
with the irqsoff tracer, but I haven't managed to reproduce those
issues. We need to look into it a bit more.

I'll work more on this in the next few days, comments appreciated.

Not-signed-off-by: Benjamin Herrenschmidt 

---
  arch/powerpc/kernel/entry_32.S   | 113 ++-
  arch/powerpc/kernel/head_44x.S   |   9 +--
  arch/powerpc/kernel/head_booke.h |  34 ---
  arch/powerpc/kernel/head_fsl_booke.S |  28 -
  arch/powerpc/kernel/traps.c  |   8 +++
  5 files changed, 111 insertions(+), 81 deletions(-)

diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index 3841d74..39b4cb5 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -34,6 +34,9 @@
  #include 
  #include 
  #include 
+#include 
+#include 
+#include 
  
  /*

   * MSR_KERNEL is > 0x1 on 4xx/Book-E since it include MSR_CE.
@@ -205,20 +208,46 @@ transfer_to_handler_cont:
mflrr9
lwz r11,0(r9)   /* virtual address of handler */
lwz r9,4(r9)/* where to go when done */
+#if defined(CONFIG_PPC_8xx) && defined(CONFIG_PERF_EVENTS)
+   mtspr   SPRN_NRI, r0
+#endif


That's not part of your patch, it's already in the tree.


+
  #ifdef CONFIG_TRACE_IRQFLAGS
+   /*
+* When tracing IRQ state (lockdep) we enable the MMU before we call
+* the IRQ tracing functions as they might access vmalloc space or
+* perform IOs for console output.
+*
+* To speed up the syscall path where interrupts stay on, let's check
+* first if we are changing the MSR value at all.
+*/
+   lwz r12,_MSR(r1)
+   xor r0,r10,r12
+   andi.   r0,r0,MSR_EE
+   bne 1f
+
+   /* MSR isn't changing, just transition directly */
+   lwz r0,GPR0(r1)
+   mtspr   SPRN_SRR0,r11
+   mtspr   SPRN_SRR1,r10
+   mtlrr9
+   SYNC
+   RFI
+
+1: /* MSR is changing, re-enable MMU so we can notify lockdep. We need to
+* keep interrupts disabled at this point otherwise we might risk
+* taking an interrupt before we tell lockdep they are enabled.
+*/
lis r12,reenable_mmu@h
ori r12,r12,reenable_mmu@l
+   lis r0,MSR_KERNEL@h
+   ori r0,r0,MSR_KERNEL@l
mtspr   SPRN_SRR0,r12
-   mtspr   SPRN_SRR1,r10
+   mtspr   SPRN_SRR1,r0
SYNC
RFI
-reenable_mmu:  /* re-enable mmu so we can */
-   mfmsr   r10
-   lwz r12,_MSR(r1)
-   xor r10,r10,r12
-   andi.   r10,r10,MSR_EE  /* Did EE change? */
-   beq 1f
  
+reenable_mmu:

/*
 * The trace_hardirqs_off will use CALLER_ADDR0 and CALLER_ADDR1.
 * If from user mode there is only one stack frame on the stack, and
@@ -239,8 +268,29 @@ reenable_mmu:  /* re-enable mmu so we can */
stw r3,16(r1)
stw r4,20(r1)
stw r5,24(r1)
-   bl  trace_hardirqs_off
-   lwz r5,24(r1)
+
+   /* Are we enabling or disabling interrupts ? */
+   andi.   r0,r10,MSR_EE
+   beq 1f
+
+   /* If we are enabling interrupt, this is a syscall. They shouldn't
+* happen while interrupts are disabled, so let's do a warning here.
+*/
+0: trap
+   EMIT_BUG_ENTRY 0b,__FILE__,__LINE__, BUGFLAG_WARNING
+   bl  trace_hardirqs_on
+
+   /* Now enable for real */
+   mfmsr  

Re: [PATCH kernel v7 20/20] vfio_pci: Add NVIDIA GV100GL [Tesla V100 SXM2] subdriver

2018-12-20 Thread Murilo Opsfelder Araujo
On Thu, Dec 20, 2018 at 07:23:50PM +1100, Alexey Kardashevskiy wrote:
> POWER9 Witherspoon machines come with 4 or 6 V100 GPUs which are not
> pluggable PCIe devices but still have PCIe links which are used
> for config space and MMIO. In addition to that the GPUs have 6 NVLinks
> which are connected to other GPUs and the POWER9 CPU. POWER9 chips
> have a special unit on a die called an NPU which is an NVLink2 host bus
> adapter with p2p connections to 2 to 3 GPUs, 3 or 2 NVLinks to each.
> These systems also support ATS (address translation services) which is
> a part of the NVLink2 protocol. Such GPUs also share on-board RAM
> (16GB or 32GB) to the system via the same NVLink2 so a CPU has
> cache-coherent access to a GPU RAM.
> 
> This exports GPU RAM to the userspace as a new VFIO device region. This
> preregisters the new memory as device memory as it might be used for DMA.
> This inserts pfns from the fault handler as the GPU memory is not onlined
> until the vendor driver has been loaded and has trained the NVLinks, so doing this
> earlier causes low level errors which we fence in the firmware so
> it does not hurt the host system but is still better avoided; for the same
> reason this does not map GPU RAM into the host kernel (usual thing for
> emulated access otherwise).
> 
> This exports an ATSD (Address Translation Shootdown) register of NPU which
> allows TLB invalidations inside GPU for an operating system. The register
> conveniently occupies a single 64k page. It is also presented to
> the userspace as a new VFIO device region. One NPU has 8 ATSD registers,
> each of them can be used for TLB invalidation in a GPU linked to this NPU.
> This allocates one ATSD register per an NVLink bridge allowing passing
> up to 6 registers. Due to the host firmware bug (just recently fixed),
> only 1 ATSD register per NPU was actually advertised to the host system
> so this passes that lone register via the first NVLink bridge device in
> the group which is still enough as QEMU collects them all back and
> presents to the guest via vPHB to mimic the emulated NPU PHB on the host.
> 
> In order to provide the userspace with the information about GPU-to-NVLink
> connections, this exports an additional capability called "tgt"
> (which is an abbreviated host system bus address). The "tgt" property
> tells the GPU its own system address and allows the guest driver to
> conglomerate the routing information so each GPU knows how to get directly
> to the other GPUs.
> 
> For ATS to work, the nest MMU (an NVIDIA block in a P9 CPU) needs to
> know LPID (a logical partition ID or a KVM guest hardware ID in other
> words) and PID (a memory context ID of a userspace process, not to be
> confused with a linux pid). This assigns a GPU to LPID in the NPU and
> this is why this adds a listener for KVM on an IOMMU group. A PID comes
> via NVLink from a GPU and NPU uses a PID wildcard to pass it through.
> 
> This requires coherent memory and ATSD to be available on the host as
> the GPU vendor only supports configurations with both features enabled
> and other configurations are known not to work. Because of this and
> because of the ways the features are advertised to the host system
> (which is a device tree with very platform specific properties),
> this requires the POWERNV platform to be enabled.
> 
> The V100 GPUs do not advertise any of these capabilities via the config
> space and there is more than one device ID so this relies on
> the platform to tell whether these GPUs have special abilities such as
> NVLinks.
> 
> Signed-off-by: Alexey Kardashevskiy 
> ---
> Changes:
> v6.1:
> * fixed outdated comment about VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD
> 
> v6:
> * reworked capabilities - tgt for nvlink and gpu and link-speed
> for nvlink only
> 
> v5:
> * do not memremap GPU RAM for emulation, map it only when it is needed
> * allocate 1 ATSD register per NVLink bridge, if none left, then expose
> the region with a zero size
> * separate caps per device type
> * addressed AW review comments
> 
> v4:
> * added nvlink-speed to the NPU bridge capability as this turned out to
> be not a constant value
> * instead of looking at the exact device ID (which also changes from system
> to system), now this (indirectly) looks at the device tree to know
> if GPU and NPU support NVLink
> 
> v3:
> * reworded the commit log about tgt
> * added tracepoints (do we want them enabled for entire vfio-pci?)
> * added code comments
> * added write|mmap flags to the new regions
> * auto enabled VFIO_PCI_NVLINK2 config option
> * added 'tgt' capability to a GPU so QEMU can recreate ibm,npu and ibm,gpu
> references; these are required by the NVIDIA driver
> * keep notifier registered only for short time
> ---
>  drivers/vfio/pci/Makefile   |   1 +
>  drivers/vfio/pci/trace.h| 102 ++
>  drivers/vfio/pci/vfio_pci_private.h |  14 +
>  include/uapi/linux/vfio.h   |  37 +++
>  drivers/vfio/pci/vfio_pci.c |  27 +-
>  

Re: [PATCH v2] ocxl: Fix endianness bug in read_afu_name()

2018-12-20 Thread Greg Kurz
On Wed, 12 Dec 2018 13:26:10 +1100
Andrew Donnellan  wrote:

> On 12/12/18 4:58 am, Greg Kurz wrote:
> > The AFU Descriptor Template in the PCI config space has a Name Space
> > field which is a 24 Byte ASCII character string of descriptive name
> > space for the AFU. The OCXL driver read the string four characters at
> > a time with pci_read_config_dword().
> > 
> > This optimization is valid on a little-endian system since this is PCI,
> > but a big-endian system ends up with each subset of four characters in
> > reverse order.
> > 
> > This could be fixed by switching to read characters one by one. Another
> > option is to swap the bytes if we're big-endian.
> > 
> > Go for the latter with le32_to_cpu().
> > 
> > Cc: sta...@vger.kernel.org  # v4.16
> > Signed-off-by: Greg Kurz 
> > Acked-by: Frederic Barrat   
> 
> Acked-by: Andrew Donnellan 
> 

Friendly ping before Xmas break :)

> > ---
> > v2: - silence sparse with (__force __le32) cast
> >  - new changelog
> > ---
> >   drivers/misc/ocxl/config.c |2 +-
> >   1 file changed, 1 insertion(+), 1 deletion(-)
> > 
> > diff --git a/drivers/misc/ocxl/config.c b/drivers/misc/ocxl/config.c
> > index 57a6bb1fd3c9..8f2c5d8bd2ee 100644
> > --- a/drivers/misc/ocxl/config.c
> > +++ b/drivers/misc/ocxl/config.c
> > @@ -318,7 +318,7 @@ static int read_afu_name(struct pci_dev *dev, struct 
> > ocxl_fn_config *fn,
> > if (rc)
> > return rc;
> > 		ptr = (u32 *) &afu->name[i];
> > -   *ptr = val;
> > +   *ptr = le32_to_cpu((__force __le32) val);
> > }
> > afu->name[OCXL_AFU_NAME_SZ - 1] = '\0'; /* play safe */
> > return 0;
> >   
> 



Re: [PATCH] ocxl: Fix endianness bug in ocxl_link_update_pe()

2018-12-20 Thread Greg Kurz
On Mon, 17 Dec 2018 11:38:51 +1100
"Alastair D'Silva"  wrote:

> On Sun, 2018-12-16 at 22:28 +0100, Greg Kurz wrote:
> > All fields in the PE are big-endian. Use cpu_to_be32() like
> > everywhere
> > else something is written to the PE. Otherwise a wrong TID will be
> > used
> > by the NPU. If this TID happens to point to an existing thread
> > sharing
> > the same mm, it could be woken up by error. This is highly improbable
> > though. The likely outcome of this is the NPU not finding the target
> > thread and forcing the AFU into sending an interrupt, which userspace
> > is supposed to handle anyway.
> > 
> > Fixes: e948e06fc63a ("ocxl: Expose the thread_id needed for wait on
> > POWER9")
> > Cc: sta...@vger.kernel.org  # v4.18
> > Signed-off-by: Greg Kurz 
> > ---
> > 
> > This bug remained unnoticed so far because the current OCXL test
> > suite
> > happens to call OCXL_IOCTL_ENABLE_P9_WAIT before attaching a context.
> > This causes ocxl_link_update_pe() to be called before
> > ocxl_link_add_pe()
> > which re-writes the TID in the PE with the appropriate endianness.
> > 
> > I have some patches that change the behavior of the OCXL test suite
> > so that
> > it can catch the issue:
> > 
> > https://github.com/gkurz/libocxl/commits/wake-host-thread-rework
> > ---
> >  drivers/misc/ocxl/link.c |2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> > 
> > diff --git a/drivers/misc/ocxl/link.c b/drivers/misc/ocxl/link.c
> > index 31695a078485..646d16450066 100644
> > --- a/drivers/misc/ocxl/link.c
> > +++ b/drivers/misc/ocxl/link.c
> > @@ -566,7 +566,7 @@ int ocxl_link_update_pe(void *link_handle, int
> > pasid, __u16 tid)
> >  
> > 	mutex_lock(&spa->spa_lock);
> >  
> > -   pe->tid = tid;
> > +   pe->tid = cpu_to_be32(tid);
> >  
> > /*
> >  * The barrier makes sure the PE is updated
> >   
> 
> Good catch, thanks.
> 
> Reviewed-by: Alastair D'Silva 
> 

Friendly ping before Xmas break :)


Re: [PATCH RFCv2 0/4] mm/memory_hotplug: Introduce memory block types

2018-12-20 Thread David Hildenbrand
On 20.12.18 14:08, Michal Hocko wrote:
> On Thu 20-12-18 13:58:16, David Hildenbrand wrote:
>> On 30.11.18 18:59, David Hildenbrand wrote:
>>> This is the second approach, introducing more meaningful memory block
>>> types and not changing online behavior in the kernel. It is based on
>>> latest linux-next.
>>>
>>> As we found out during discussion, user space should always handle onlining
>>> of memory, in any case. However in order to make smart decisions in user
>>> space about if and how to online memory, we have to export more information
>>> about memory blocks. This way, we can formulate rules in user space.
>>>
>>> One such information is the type of memory block we are talking about.
>>> This helps to answer some questions like:
>>> - Does this memory block belong to a DIMM?
>>> - Can this DIMM theoretically ever be unplugged again?
>>> - Was this memory added by a balloon driver that will rely on balloon
>>>   inflation to remove chunks of that memory again? Which zone is advised?
>>> - Is this special standby memory on s390x that is usually not automatically
>>>   onlined?
>>>
>>> And in short it helps to answer to some extent (excluding zone imbalances)
>>> - Should I online this memory block?
>>> - To which zone should I online this memory block?
>>> ... of course special use cases will result in different answers. But that's
>>> why user space has control of onlining memory.
>>>
>>> More details can be found in Patch 1 and Patch 3.
>>> Tested on x86 with hotplugged DIMMs. Cross-compiled for PPC and s390x.
>>>
>>>
>>> Example:
>>> $ udevadm info -q all -a /sys/devices/system/memory/memory0
>>> KERNEL=="memory0"
>>> SUBSYSTEM=="memory"
>>> DRIVER==""
>>> ATTR{online}=="1"
>>> ATTR{phys_device}=="0"
>>> ATTR{phys_index}==""
>>> ATTR{removable}=="0"
>>> ATTR{state}=="online"
>>> ATTR{type}=="boot"
>>> ATTR{valid_zones}=="none"
>>> $ udevadm info -q all -a /sys/devices/system/memory/memory90
>>> KERNEL=="memory90"
>>> SUBSYSTEM=="memory"
>>> DRIVER==""
>>> ATTR{online}=="1"
>>> ATTR{phys_device}=="0"
>>> ATTR{phys_index}=="005a"
>>> ATTR{removable}=="1"
>>> ATTR{state}=="online"
>>> ATTR{type}=="dimm"
>>> ATTR{valid_zones}=="Normal"
>>>
>>>
>>> RFC -> RFCv2:
>>> - Now also taking care of PPC (somehow missed it :/ )
>>> - Split the series up to some degree (some ideas on how to split up patch 3
>>>   would be very welcome)
>>> - Introduce more memory block types. Turns out abstracting too much was
>>>   rather confusing and not helpful. Properly document them.
>>>
>>> Notes:
>>> - I wanted to convert the enum of types into a named enum but this
>>>   provoked all kinds of different errors. For now, I am doing it just like
>>>   the other types (e.g. online_type) we are using in that context.
>>> - The "removable" property should never have been named like that. It
>>>   should have been "offlinable". Can we still rename that? E.g. boot memory
>>>   is sometimes marked as removable ...
>>>
>>
>>
>> Any feedback regarding the suggested block types would be very much
>> appreciated!
> 
> I still do not like this much to be honest. I just didn't get to think
> through this properly. My fear is that this is conflating an actual API
> with the current implementation and as such will cause problems in
> future. But I haven't really looked into your patches closely so I might
> be wrong. Anyway I won't be able to look into it by the end of year.
> 

I guess as long as we have memory block devices and we expect user space
to make a decision we will have this API and the involved problems.

I am open for alternatives, and as I said, any feedback on how to sort
this out will be highly appreciated.

I'll be on vacation for the next two weeks, so this can wait. Just
wanted to note that I am still interested in feedback :)

-- 

Thanks,

David / dhildenb


Re: [PATCH RFCv2 0/4] mm/memory_hotplug: Introduce memory block types

2018-12-20 Thread Michal Hocko
On Thu 20-12-18 13:58:16, David Hildenbrand wrote:
> On 30.11.18 18:59, David Hildenbrand wrote:
> > This is the second approach, introducing more meaningful memory block
> > types and not changing online behavior in the kernel. It is based on
> > latest linux-next.
> > 
> > As we found out during discussion, user space should always handle onlining
> > of memory, in any case. However in order to make smart decisions in user
> > space about if and how to online memory, we have to export more information
> > about memory blocks. This way, we can formulate rules in user space.
> > 
> > One such information is the type of memory block we are talking about.
> > This helps to answer some questions like:
> > - Does this memory block belong to a DIMM?
> > - Can this DIMM theoretically ever be unplugged again?
> > - Was this memory added by a balloon driver that will rely on balloon
> >   inflation to remove chunks of that memory again? Which zone is advised?
> > - Is this special standby memory on s390x that is usually not automatically
> >   onlined?
> > 
> > And in short it helps to answer to some extent (excluding zone imbalances)
> > - Should I online this memory block?
> > - To which zone should I online this memory block?
> > ... of course special use cases will result in different answers. But that's
> > why user space has control of onlining memory.
> > 
> > More details can be found in Patch 1 and Patch 3.
> > Tested on x86 with hotplugged DIMMs. Cross-compiled for PPC and s390x.
> > 
> > 
> > Example:
> > $ udevadm info -q all -a /sys/devices/system/memory/memory0
> > KERNEL=="memory0"
> > SUBSYSTEM=="memory"
> > DRIVER==""
> > ATTR{online}=="1"
> > ATTR{phys_device}=="0"
> > ATTR{phys_index}==""
> > ATTR{removable}=="0"
> > ATTR{state}=="online"
> > ATTR{type}=="boot"
> > ATTR{valid_zones}=="none"
> > $ udevadm info -q all -a /sys/devices/system/memory/memory90
> > KERNEL=="memory90"
> > SUBSYSTEM=="memory"
> > DRIVER==""
> > ATTR{online}=="1"
> > ATTR{phys_device}=="0"
> > ATTR{phys_index}=="005a"
> > ATTR{removable}=="1"
> > ATTR{state}=="online"
> > ATTR{type}=="dimm"
> > ATTR{valid_zones}=="Normal"
> > 
> > 
> > RFC -> RFCv2:
> > - Now also taking care of PPC (somehow missed it :/ )
> > - Split the series up to some degree (some ideas on how to split up patch 3
> >   would be very welcome)
> > - Introduce more memory block types. Turns out abstracting too much was
> >   rather confusing and not helpful. Properly document them.
> > 
> > Notes:
> > - I wanted to convert the enum of types into a named enum but this
> >   provoked all kinds of different errors. For now, I am doing it just like
> >   the other types (e.g. online_type) we are using in that context.
> > - The "removable" property should never have been named like that. It
> >   should have been "offlinable". Can we still rename that? E.g. boot memory
> >   is sometimes marked as removable ...
> > 
> 
> 
> Any feedback regarding the suggested block types would be very much
> appreciated!

I still do not like this much to be honest. I just didn't get to think
through this properly. My fear is that this is conflating an actual API
with the current implementation and as such will cause problems in
future. But I haven't really looked into your patches closely so I might
be wrong. Anyway I won't be able to look into it by the end of year.
-- 
Michal Hocko
SUSE Labs


Re: [PATCH RFCv2 0/4] mm/memory_hotplug: Introduce memory block types

2018-12-20 Thread David Hildenbrand
On 30.11.18 18:59, David Hildenbrand wrote:
> This is the second approach, introducing more meaningful memory block
> types and not changing online behavior in the kernel. It is based on
> latest linux-next.
> 
> As we found out during discussion, user space should always handle onlining
> of memory, in any case. However in order to make smart decisions in user
> space about if and how to online memory, we have to export more information
> about memory blocks. This way, we can formulate rules in user space.
> 
> One such information is the type of memory block we are talking about.
> This helps to answer some questions like:
> - Does this memory block belong to a DIMM?
> - Can this DIMM theoretically ever be unplugged again?
> - Was this memory added by a balloon driver that will rely on balloon
>   inflation to remove chunks of that memory again? Which zone is advised?
> - Is this special standby memory on s390x that is usually not automatically
>   onlined?
> 
> And in short it helps to answer to some extent (excluding zone imbalances)
> - Should I online this memory block?
> - To which zone should I online this memory block?
> ... of course special use cases will result in different answers. But that's
> why user space has control of onlining memory.
> 
> More details can be found in Patch 1 and Patch 3.
> Tested on x86 with hotplugged DIMMs. Cross-compiled for PPC and s390x.
> 
> 
> Example:
> $ udevadm info -q all -a /sys/devices/system/memory/memory0
>   KERNEL=="memory0"
>   SUBSYSTEM=="memory"
>   DRIVER==""
>   ATTR{online}=="1"
>   ATTR{phys_device}=="0"
>   ATTR{phys_index}==""
>   ATTR{removable}=="0"
>   ATTR{state}=="online"
>   ATTR{type}=="boot"
>   ATTR{valid_zones}=="none"
> $ udevadm info -q all -a /sys/devices/system/memory/memory90
>   KERNEL=="memory90"
>   SUBSYSTEM=="memory"
>   DRIVER==""
>   ATTR{online}=="1"
>   ATTR{phys_device}=="0"
>   ATTR{phys_index}=="005a"
>   ATTR{removable}=="1"
>   ATTR{state}=="online"
>   ATTR{type}=="dimm"
>   ATTR{valid_zones}=="Normal"
> 
> 
> RFC -> RFCv2:
> - Now also taking care of PPC (somehow missed it :/ )
> - Split the series up to some degree (some ideas on how to split up patch 3
>   would be very welcome)
> - Introduce more memory block types. Turns out abstracting too much was
>   rather confusing and not helpful. Properly document them.
> 
> Notes:
> - I wanted to convert the enum of types into a named enum but this
>   provoked all kinds of different errors. For now, I am doing it just like
>   the other types (e.g. online_type) we are using in that context.
> - The "removable" property should never have been named like that. It
>   should have been "offlinable". Can we still rename that? E.g. boot memory
>   is sometimes marked as removable ...
> 


Any feedback regarding the suggested block types would be very much
appreciated!


-- 

Thanks,

David / dhildenb


Re: [PATCH] powerpc/pkeys: copy pkey-tracking-information at fork()

2018-12-20 Thread Michael Ellerman
Hi Ram,

Thanks for fixing this.

Ram Pai  writes:
> diff --git a/arch/powerpc/mm/pkeys.c b/arch/powerpc/mm/pkeys.c
> index b271b28..5d65c47 100644
> --- a/arch/powerpc/mm/pkeys.c
> +++ b/arch/powerpc/mm/pkeys.c
> @@ -414,3 +414,10 @@ bool arch_vma_access_permitted(struct vm_area_struct 
> *vma, bool write,
>  
>   return pkey_access_permitted(vma_pkey(vma), write, execute);
>  }
> +
> +void arch_dup_pkeys(struct mm_struct *oldmm, struct mm_struct *mm)
> +{
> + /* Duplicate the oldmm pkey state in mm: */
> + mm_pkey_allocation_map(mm) = mm_pkey_allocation_map(oldmm);
> + mm->context.execute_only_pkey = oldmm->context.execute_only_pkey;
> +}

Shouldn't this check if pkeys are actually in use?

eg:

diff --git a/arch/powerpc/mm/pkeys.c b/arch/powerpc/mm/pkeys.c
index cf87dddefbdc..587807763737 100644
--- a/arch/powerpc/mm/pkeys.c
+++ b/arch/powerpc/mm/pkeys.c
@@ -418,6 +418,9 @@ bool arch_vma_access_permitted(struct vm_area_struct *vma, 
bool write,
 
 void arch_dup_pkeys(struct mm_struct *oldmm, struct mm_struct *mm)
 {
+   if (static_branch_likely(&pkey_disabled))
+   return;
+
/* Duplicate the oldmm pkey state in mm: */
mm_pkey_allocation_map(mm) = mm_pkey_allocation_map(oldmm);
mm->context.execute_only_pkey = oldmm->context.execute_only_pkey;


Ideally we'd actually do it in the inline so that the function call to
arch_dup_pkeys() can be avoided. But it looks like header dependencies
might make that hard.

cheers


Re: [PATCH] selftests/powerpc: New TM signal self test

2018-12-20 Thread Michael Ellerman
Breno Leitao  writes:

> A new self test that forces MSR[TS] to be set without calling any TM
> instruction. This test also tries to cause a page fault at a signal
> handler, exactly between MSR[TS] set and tm_recheckpoint(), forcing
> thread->texasr to be rewritten with TEXASR[FS] = 0, which will cause a BUG
> when tm_recheckpoint() is called.
>
> This test is not deterministic since it is hard to guarantee that the page
> access will cause a page fault. Tests have shown that the bug could be
> exposed with few interactions in a buggy kernel. This test is configured to
> loop 5000x, having a good chance to hit the kernel issue in just one run.
> This self test takes less than two seconds to run.
>
> This test uses set/getcontext because the kernel will recheckpoint
> zeroed structures, causing the test to segfault, which is undesired because
> the test needs to rerun, so, there is a signal handler for SIGSEGV which
> will restart the test.

Hi Breno,

Thanks for the test, some of these TM tests are getting pretty advanced! :)

Unfortunately it doesn't build in a few configurations.

On Ubuntu 18.10 built with powerpc-linux-gnu-gcc I get:

  tm-signal-force-msr.c: In function 'trap_signal_handler':
  tm-signal-force-msr.c:42:19: error: 'union uc_regs_ptr' has no member named 
'gp_regs'; did you mean 'uc_regs'?
ucp->uc_mcontext.gp_regs[PT_MSR] |= MSR_TS_S;
 ^~~
 uc_regs
  tm-signal-force-msr.c:17:29: error: left shift count >= width of type 
[-Werror=shift-count-overflow]
   #define __MASK(X)   (1UL<<(X))
   ^~
  tm-signal-force-msr.c:20:25: note: in expansion of macro '__MASK'
   #define MSR_TS_S__MASK(MSR_TS_S_LG) /* Transaction Suspended */
   ^~
  tm-signal-force-msr.c:42:38: note: in expansion of macro 'MSR_TS_S'
ucp->uc_mcontext.gp_regs[PT_MSR] |= MSR_TS_S;
^~~~

And using powerpc64le-linux-gnu-gcc I get:

  In file included from /usr/powerpc64le-linux-gnu/include/string.h:494,
   from tm-signal-force-msr.c:10:
  In function 'memcpy',
  inlined from 'trap_signal_handler' at tm-signal-force-msr.c:39:2:
  /usr/powerpc64le-linux-gnu/include/bits/string_fortified.h:34:10: error: 
'__builtin_memcpy' accessing 1272 bytes at offsets 8 and 168 overlaps 1112 
bytes at offset 168 [-Werror=restrict]
 return __builtin___memcpy_chk (__dest, __src, __len, __bos0 (__dest));
^~

cheers


Re: [PATCHv2 2/3] mm/numa: build zonelist when alloc for device on offline node

2018-12-20 Thread Michal Hocko
On Thu 20-12-18 20:26:28, Pingfan Liu wrote:
> On Thu, Dec 20, 2018 at 7:35 PM Michal Hocko  wrote:
> >
> > On Thu 20-12-18 17:50:38, Pingfan Liu wrote:
> > [...]
> > > @@ -453,7 +456,12 @@ static inline int gfp_zonelist(gfp_t flags)
> > >   */
> > >  static inline struct zonelist *node_zonelist(int nid, gfp_t flags)
> > >  {
> > > - return NODE_DATA(nid)->node_zonelists + gfp_zonelist(flags);
> > > + if (unlikely(!possible_zonelists[nid])) {
> > > + WARN_ONCE(1, "alloc from offline node: %d\n", nid);
> > > + if (unlikely(build_fallback_zonelists(nid)))
> > > + nid = first_online_node;
> > > + }
> > > + return possible_zonelists[nid] + gfp_zonelist(flags);
> > >  }
> >
> > No, please don't do this. We do not want to make things work magically
> 
> By "magically", if you mean relying directly on the zonelist instead of on
> the pgdat struct, then it is easy to change

No, I mean that we _know_ which nodes are possible. Platform is supposed
to tell us. We should just do the intialization properly. What we do now
instead is a pile of hacks that fit magically together. And that should
be changed.

> > and we definitely do not want to put something like that into the hot
> 
> But  the cose of "unlikely" can be ignored, why can it not be placed
> in the path?

unlikely will simply put the code outside of the hot path. The condition
is still there. There are people desperately fighting to get every
single cycle out of the page allocator. Now you want them to pay a
branch which is relevant only for few obscure HW setups.

> > path. We definitely need zonelists to be built transparently for all
> > possible nodes during the init time.
> 
That is the point: whether all nodes should be instanced at boot
time, or not instanced until there is a requirement.

And that should be done at init time. We have all the information
necessary at that time.
-- 
Michal Hocko
SUSE Labs


Re: [PATCHv2 2/3] mm/numa: build zonelist when alloc for device on offline node

2018-12-20 Thread Pingfan Liu
On Thu, Dec 20, 2018 at 7:35 PM Michal Hocko  wrote:
>
> On Thu 20-12-18 17:50:38, Pingfan Liu wrote:
> [...]
> > @@ -453,7 +456,12 @@ static inline int gfp_zonelist(gfp_t flags)
> >   */
> >  static inline struct zonelist *node_zonelist(int nid, gfp_t flags)
> >  {
> > - return NODE_DATA(nid)->node_zonelists + gfp_zonelist(flags);
> > + if (unlikely(!possible_zonelists[nid])) {
> > + WARN_ONCE(1, "alloc from offline node: %d\n", nid);
> > + if (unlikely(build_fallback_zonelists(nid)))
> > + nid = first_online_node;
> > + }
> > + return possible_zonelists[nid] + gfp_zonelist(flags);
> >  }
>
> No, please don't do this. We do not want to make things work magically

By "magically", if you mean relying directly on the zonelist instead of on
the pgdat struct, then it is easy to change
> and we definitely do not want to put something like that into the hot

But the cost of "unlikely" can be ignored, so why can it not be placed
in the path?
> path. We definitely need zonelists to be built transparently for all
> possible nodes during the init time.

That is the point: whether all nodes should be instanced at boot
time, or not instanced until there is a requirement.

To replace the possible_zonelists, I propose the following draft
(locking issues are not considered):
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 0705164..24e8ae6 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -453,6 +453,11 @@ static inline int gfp_zonelist(gfp_t flags)
  */
 static inline struct zonelist *node_zonelist(int nid, gfp_t flags)
 {
+   if (unlikely(!node_data[nid])) {
+   WARN_ONCE(1, "alloc from offline node: %d\n", nid);
+   if (unlikely(build_offline_node(nid)))
+   nid = first_online_node;
+   }
return NODE_DATA(nid)->node_zonelists + gfp_zonelist(flags);
 }

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2ec9cc4..4ef15fc 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5261,6 +5261,21 @@ static void build_zonelists(pg_data_t *pgdat)
build_thisnode_zonelists(pgdat);
 }

+int build_offline_node(int nid)
+{
+   unsigned long zones_size[MAX_NR_ZONES] = {0};
+   unsigned long zholes_size[MAX_NR_ZONES] = {0};
+   pg_data_t *pgdat;
+
+   pgdat = kzalloc(sizeof(pg_data_t), GFP_ATOMIC);
+   if (!pgdat)
+   return -ENOMEM;
+   node_data[nid] = pgdat;
+   free_area_init_node(nid, zones_size, 0, zholes_size);
+   build_zonelists(pgdat);
+   return 0;
+}
+

Thanks and regards,
Pingfan


Re: [PATCHv2 2/3] mm/numa: build zonelist when alloc for device on offline node

2018-12-20 Thread Michal Hocko
On Thu 20-12-18 17:50:38, Pingfan Liu wrote:
[...]
> @@ -453,7 +456,12 @@ static inline int gfp_zonelist(gfp_t flags)
>   */
>  static inline struct zonelist *node_zonelist(int nid, gfp_t flags)
>  {
> - return NODE_DATA(nid)->node_zonelists + gfp_zonelist(flags);
> + if (unlikely(!possible_zonelists[nid])) {
> + WARN_ONCE(1, "alloc from offline node: %d\n", nid);
> + if (unlikely(build_fallback_zonelists(nid)))
> + nid = first_online_node;
> + }
> + return possible_zonelists[nid] + gfp_zonelist(flags);
>  }

No, please don't do this. We do not want to make things work magically
and we definitely do not want to put something like that into the hot
path. We definitely need zonelists to be built transparently for all
possible nodes during the init time.
-- 
Michal Hocko
SUSE Labs


Re: [PATCH kernel v7 00/20] powerpc/powernv/npu, vfio: NVIDIA V100 + P9 passthrough

2018-12-20 Thread Alexey Kardashevskiy



On 20/12/2018 20:38, Michael Ellerman wrote:
> Alexey Kardashevskiy  writes:
> 
>> My bad, I was not cc-ing everyone but now with v7 I am, sorry about that.
> 
> I've already applied v6, I'll assume this is unchanged from that unless
> you tell me otherwise.

14/20 has a fix for the warning about an uninitialized npdev; 20/20 has a
fixed comment about one capability:

[fstn1-p1 kernel]$ git diff 7e04f09 9128bd1
diff --git a/arch/powerpc/platforms/powernv/npu-dma.c
b/arch/powerpc/platforms/powernv/npu-dma.c
index ed81426..12b8421 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -540,7 +540,7 @@ struct iommu_table_group
*pnv_try_setup_npu_table_group(struct pnv_ioda_pe *pe)
struct npu_comp *npucomp;
struct pci_dev *gpdev = NULL;
struct pci_controller *hose;
-   struct pci_dev *npdev;
+   struct pci_dev *npdev = NULL;

list_for_each_entry(gpdev, &pe->pbus->devices, bus_list) {
npdev = pnv_pci_get_npu_dev(gpdev, 0);
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 22b825c..5562587 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -390,8 +390,7 @@ struct vfio_region_info_cap_nvlink2_ssatgt {
 };

 /*
- * Capability with compressed real address (aka SSA - small system
address),
- * used to match the NVLink bridge with a GPU. Also contains a link speed.
+ * Capability with an NVLink link speed.
  */
 #define VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD 5



> 
> cheers
> 
>> This is for passing through NVIDIA V100 GPUs on POWER9 systems.
>> 20/20 has the details of hardware setup.
>>
>> This implements support for NVIDIA V100 GPU with coherent memory and
>> NPU/ATS support available in the POWER9 CPU. The aim is to support
>> unmodified vendor driver in the guest.
>>
>> This is pushed to (both guest and host kernels):
>> https://github.com/aik/linux/tree/nv2
>>
>> Matching qemu is pushed to github:
>> https://github.com/aik/qemu/tree/nv2
>>
>> Skiboot bits are here:
>> https://github.com/aik/skiboot/tree/nv2
>>
>> The individual patches have changelogs. v7 fixes compile warning
>> and updates a VFIO capability comment in 20/20.
>>
>> Please comment. Thanks.
>>
>>
>>
>> Alexey Kardashevskiy (20):
>>   powerpc/ioda/npu: Call skiboot's hot reset hook when disabling NPU2
>>   powerpc/mm/iommu/vfio_spapr_tce: Change mm_iommu_get to reference a
>> region
>>   powerpc/vfio/iommu/kvm: Do not pin device memory
>>   powerpc/powernv: Move npu struct from pnv_phb to pci_controller
>>   powerpc/powernv/npu: Move OPAL calls away from context manipulation
>>   powerpc/pseries/iommu: Use memory@ nodes in max RAM address
>> calculation
>>   powerpc/pseries/npu: Enable platform support
>>   powerpc/pseries: Remove IOMMU API support for non-LPAR systems
>>   powerpc/powernv/pseries: Rework device adding to IOMMU groups
>>   powerpc/iommu_api: Move IOMMU groups setup to a single place
>>   powerpc/powernv: Reference iommu_table while it is linked to a group
>>   powerpc/powernv/npu: Move single TVE handling to NPU PE
>>   powerpc/powernv/npu: Convert NPU IOMMU helpers to
>> iommu_table_group_ops
>>   powerpc/powernv/npu: Add compound IOMMU groups
>>   powerpc/powernv/npu: Add release_ownership hook
>>   powerpc/powernv/npu: Check mmio_atsd array bounds when populating
>>   powerpc/powernv/npu: Fault user page into the hypervisor's pagetable
>>   vfio_pci: Allow mapping extra regions
>>   vfio_pci: Allow regions to add own capabilities
>>   vfio_pci: Add NVIDIA GV100GL [Tesla V100 SXM2] subdriver
>>
>>  drivers/vfio/pci/Makefile |   1 +
>>  arch/powerpc/include/asm/iommu.h  |  17 +-
>>  arch/powerpc/include/asm/mmu_context.h|  15 +-
>>  arch/powerpc/include/asm/pci-bridge.h |   1 +
>>  arch/powerpc/include/asm/pci.h|   4 +
>>  arch/powerpc/platforms/powernv/pci.h  |  30 +-
>>  drivers/vfio/pci/trace.h  | 102 
>>  drivers/vfio/pci/vfio_pci_private.h   |  20 +
>>  include/uapi/linux/vfio.h |  37 ++
>>  arch/powerpc/kernel/iommu.c   |  69 +--
>>  arch/powerpc/kvm/book3s_64_vio.c  |  18 +-
>>  arch/powerpc/mm/mmu_context_iommu.c   | 110 +++-
>>  arch/powerpc/platforms/powernv/npu-dma.c  | 549 +++---
>>  arch/powerpc/platforms/powernv/pci-ioda-tce.c |   3 +-
>>  arch/powerpc/platforms/powernv/pci-ioda.c | 237 
>>  arch/powerpc/platforms/powernv/pci.c  |  43 +-
>>  arch/powerpc/platforms/pseries/iommu.c|  88 ++-
>>  arch/powerpc/platforms/pseries/pci.c  |  22 +
>>  drivers/vfio/pci/vfio_pci.c   |  42 +-
>>  drivers/vfio/pci/vfio_pci_nvlink2.c   | 482 +++
>>  drivers/vfio/vfio_iommu_spapr_tce.c   |  64 +-
>>  drivers/vfio/pci/Kconfig  |   6 +
>>  22 files changed, 1569 insertions(+), 391 deletions(-)
>>  create mode 

Re: [PATCH v3 2/3] powerpc: Discard dynsym section for !PPC32

2018-12-20 Thread Michael Ellerman
Joel Stanley  writes:

> Alan Modra  explains:
>
>  > Likely you could discard .interp and .dynstr too, and .dynsym when
>  > !CONFIG_PPC32.
>
> Discarding of interp and dynstr happened in a previous patch. The dynsym
> cleanup was a bit less straightforward, so it gets its own patch.
>
> Signed-off-by: Joel Stanley 

And with the same toolchain this gives me:

  
/opt/cross/kisskb/korg/gcc-8.1.0-nolibc/powerpc64-linux/bin/powerpc64-linux-objdump:
 vmlinux: not a dynamic object
  
/opt/cross/kisskb/korg/gcc-8.1.0-nolibc/powerpc64-linux/bin/powerpc64-linux-objdump:
 vmlinux: Invalid operation
  
/opt/cross/kisskb/korg/gcc-8.1.0-nolibc/powerpc64-linux/bin/powerpc64-linux-objdump:
 vmlinux: not a dynamic object
  
/opt/cross/kisskb/korg/gcc-8.1.0-nolibc/powerpc64-linux/bin/powerpc64-linux-objdump:
 vmlinux: Invalid operation
  
/opt/cross/kisskb/korg/gcc-8.1.0-nolibc/powerpc64-linux/bin/powerpc64-linux-objdump:
 --stop-address: bad number: 0x


cheers


Re: [PATCH v3 1/3] powerpc: Discard more sections in linker script

2018-12-20 Thread Michael Ellerman
Joel Stanley  writes:

> Building the ppc64 kernel with a modern binutils results in this
> warning:
>
>  powerpc64le-linux-gnu-ld: warning: orphan section `.gnu.hash' from
>  `linker stubs' being placed in section `.gnu.hash'
>
> Alan Modra  explains:
>
>  > .gnu.hash, like .hash, is used by glibc ld.so for dynamic symbol
>  > lookup.  I imagine you don't need either section in a kernel, so
>  > discarding both sounds reasonable.  Likely you could discard .interp
>  > and .dynstr too, and .dynsym when !CONFIG_PPC32.
>
> Reported-by: Stephen Rothwell 
> Signed-off-by: Joel Stanley 
> ---
> See 
> https://lore.kernel.org/lkml/CACPK8Xft3n5KkpTjN3=7_vucxhfck7mxvzm2rrqu7tppcbo...@mail.gmail.com/T/#m58532c86cf0c7b4fb01cc1fe724e48d4c7d8e4a7
>
> v3: Add dynstr hunk to this patch (it was incorrectly left in patch 2)
> ---
>  arch/powerpc/kernel/vmlinux.lds.S | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)

Building ppc64le_defconfig with gcc 8.1.0 / binutils 2.30, this is giving me:

  
/opt/cross/kisskb/korg/gcc-8.1.0-nolibc/powerpc64-linux/bin/powerpc64-linux-nm: 
.tmp_vmlinux1: attempt to load strings from a non-string section (number 0)
  
/opt/cross/kisskb/korg/gcc-8.1.0-nolibc/powerpc64-linux/bin/powerpc64-linux-nm: 
.tmp_vmlinux1: attempt to load strings from a non-string section (number 0)
  
/opt/cross/kisskb/korg/gcc-8.1.0-nolibc/powerpc64-linux/bin/powerpc64-linux-nm: 
.tmp_vmlinux1: attempt to load strings from a non-string section (number 0)
  
/opt/cross/kisskb/korg/gcc-8.1.0-nolibc/powerpc64-linux/bin/powerpc64-linux-nm: 
.tmp_vmlinux1: attempt to load strings from a non-string section (number 0)
  
/opt/cross/kisskb/korg/gcc-8.1.0-nolibc/powerpc64-linux/bin/powerpc64-linux-nm: 
.tmp_vmlinux1: attempt to load strings from a non-string section (number 0)
  
/opt/cross/kisskb/korg/gcc-8.1.0-nolibc/powerpc64-linux/bin/powerpc64-linux-nm: 
.tmp_vmlinux1: attempt to load strings from a non-string section (number 0)
  
/opt/cross/kisskb/korg/gcc-8.1.0-nolibc/powerpc64-linux/bin/powerpc64-linux-nm: 
.tmp_vmlinux1: attempt to load strings from a non-string section (number 0)
  
/opt/cross/kisskb/korg/gcc-8.1.0-nolibc/powerpc64-linux/bin/powerpc64-linux-nm: 
.tmp_vmlinux1: attempt to load strings from a non-string section (number 0)
  
/opt/cross/kisskb/korg/gcc-8.1.0-nolibc/powerpc64-linux/bin/powerpc64-linux-nm: 
.tmp_vmlinux1: attempt to load strings from a non-string section (number 0)
  
/opt/cross/kisskb/korg/gcc-8.1.0-nolibc/powerpc64-linux/bin/powerpc64-linux-nm: 
.tmp_vmlinux2: attempt to load strings from a non-string section (number 0)
  
/opt/cross/kisskb/korg/gcc-8.1.0-nolibc/powerpc64-linux/bin/powerpc64-linux-nm: 
.tmp_vmlinux2: attempt to load strings from a non-string section (number 0)
  
/opt/cross/kisskb/korg/gcc-8.1.0-nolibc/powerpc64-linux/bin/powerpc64-linux-nm: 
vmlinux: attempt to load strings from a non-string section (number 0)
  
/opt/cross/kisskb/korg/gcc-8.1.0-nolibc/powerpc64-linux/bin/powerpc64-linux-nm: 
vmlinux: attempt to load strings from a non-string section (number 0)
  
/opt/cross/kisskb/korg/gcc-8.1.0-nolibc/powerpc64-linux/bin/powerpc64-linux-nm: 
.tmp_vmlinux2: attempt to load strings from a non-string section (number 0)
  
/opt/cross/kisskb/korg/gcc-8.1.0-nolibc/powerpc64-linux/bin/powerpc64-linux-nm: 
.tmp_vmlinux2: attempt to load strings from a non-string section (number 0)
  
/opt/cross/kisskb/korg/gcc-8.1.0-nolibc/powerpc64-linux/bin/powerpc64-linux-nm: 
vmlinux: attempt to load strings from a non-string section (number 0)
  
/opt/cross/kisskb/korg/gcc-8.1.0-nolibc/powerpc64-linux/bin/powerpc64-linux-nm: 
vmlinux: attempt to load strings from a non-string section (number 0)
  
/opt/cross/kisskb/korg/gcc-8.1.0-nolibc/powerpc64-linux/bin/powerpc64-linux-objdump:
 vmlinux: attempt to load strings from a non-string section (number 0)
  
/opt/cross/kisskb/korg/gcc-8.1.0-nolibc/powerpc64-linux/bin/powerpc64-linux-objdump:
 vmlinux: attempt to load strings from a non-string section (number 0)
  WARNING: 2 bad relocations
  c1490a50 R_PPC64_ADDR64(null)
  c1490a68 R_PPC64_ADDR64(null)
  
/opt/cross/kisskb/korg/gcc-8.1.0-nolibc/powerpc64-linux/bin/powerpc64-linux-objdump:
 vmlinux: attempt to load strings from a non-string section (number 0)
  
/opt/cross/kisskb/korg/gcc-8.1.0-nolibc/powerpc64-linux/bin/powerpc64-linux-objdump:
 vmlinux: attempt to load strings from a non-string section (number 0)
  
/opt/cross/kisskb/korg/gcc-8.1.0-nolibc/powerpc64-linux/bin/powerpc64-linux-objdump:
 vmlinux: attempt to load strings from a non-string section (number 0)
  
/opt/cross/kisskb/korg/gcc-8.1.0-nolibc/powerpc64-linux/bin/powerpc64-linux-objdump:
 vmlinux: attempt to load strings from a non-string section (number 0)

Haven't had time to dig into why yet.

cheers


[PATCHv2 3/3] powerpc/numa: make all possible node be instanced against NULL reference in node_zonelist()

2018-12-20 Thread Pingfan Liu
This patch tries to resolve a bug rooted in mm when using nr_cpus. It was
reported at [1]. The root cause is: device->numa_node info is used as the
preferred_nid param for __alloc_pages_nodemask(), which causes a NULL
reference when ac->zonelist = node_zonelist(preferred_nid, gfp_mask), because
the preferred_nid is not online and not instanced. Hence the bug affects
all archs if a machine has a memory-less NUMA node, but a device on the
node is used and provides numa_node info to __alloc_pages_nodemask().
This patch makes all possible nodes online for ppc.

[1]: https://lore.kernel.org/patchwork/patch/1020838/

Signed-off-by: Pingfan Liu 
Cc: linuxppc-dev@lists.ozlabs.org
Cc: x...@kernel.org
Cc: linux-ker...@vger.kernel.org
Cc: Andrew Morton 
Cc: Michal Hocko 
Cc: Vlastimil Babka 
Cc: Mike Rapoport 
Cc: Bjorn Helgaas 
Cc: Jonathan Cameron 
Cc: David Rientjes 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: Borislav Petkov 
Cc: "H. Peter Anvin" 
Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Michael Ellerman 
---
Note:
[1-2/3] implement one way to fix the bug, while this patch tries another way.
Hence use this patch when [1-2/3] is not acceptable.

 arch/powerpc/mm/numa.c | 13 +++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index ce28ae5..31d81a4 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -864,10 +864,19 @@ void __init initmem_init(void)
 
memblock_dump_all();
 
-   for_each_online_node(nid) {
+   /* Instance all possible nodes to overcome potential NULL reference
+* issue on node_zonelist() when using nr_cpus
+*/
+   for_each_node(nid) {
unsigned long start_pfn, end_pfn;
 
-   get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
+   if (node_online(nid))
+           get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
+   else {
+   start_pfn = end_pfn = 0;
+   /* online it, so later zonelists[] will be built */
+   node_set_online(nid);
+   }
setup_node_data(nid, start_pfn, end_pfn);
sparse_memory_present_with_active_regions(nid);
}
-- 
2.7.4



[PATCHv2 2/3] mm/numa: build zonelist when alloc for device on offline node

2018-12-20 Thread Pingfan Liu
I hit a bug on an AMD machine, with the kexec -l nr_cpus=4 option. It is due
to some pgdat not being instanced when nr_cpus is specified, e.g. on x86 it
is not initialized by init_cpu_to_node()->init_memory_less_node(). But
device->numa_node info is used as the preferred_nid param for
__alloc_pages_nodemask(), which causes a NULL reference in
  ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
Although this bug was detected on x86, it should affect all archs where
a machine has a NUMA node with no memory, nr_cpus prevents the
instancing of the node, and a device on the node tries to allocate memory
with device->numa_node info.
There are two alternative methods to fix the bug:
-1. Make all possible NUMA nodes be instanced. This should be done for all
archs.
-2. Use a zonelist instead of pgdat when encountering an un-instanced node,
and only do this when needed.
This patch adopts the 2nd method: it uses possible_zonelist[] to mirror
node_zonelists[], and tries to build the zonelist for the offline node when needed.

Notes about the crashing info:
-1. kexec -l with nr_cpus=4
-2. system info
  NUMA node0 CPU(s): 0,8,16,24
  NUMA node1 CPU(s): 2,10,18,26
  NUMA node2 CPU(s): 4,12,20,28
  NUMA node3 CPU(s): 6,14,22,30
  NUMA node4 CPU(s): 1,9,17,25
  NUMA node5 CPU(s): 3,11,19,27
  NUMA node6 CPU(s): 5,13,21,29
  NUMA node7 CPU(s): 7,15,23,31
-3. panic stack
[...]
[5.721547] atomic64_test: passed for x86-64 platform with CX8 and with SSE
[5.729187] pcieport :00:01.1: Signaling PME with IRQ 34
[5.735187] pcieport :00:01.2: Signaling PME with IRQ 35
[5.741168] pcieport :00:01.3: Signaling PME with IRQ 36
[5.747189] pcieport :00:07.1: Signaling PME with IRQ 37
[5.754061] pcieport :00:08.1: Signaling PME with IRQ 39
[5.760727] pcieport :20:07.1: Signaling PME with IRQ 40
[5.766955] pcieport :20:08.1: Signaling PME with IRQ 42
[5.772742] BUG: unable to handle kernel paging request at 2088
[5.773618] PGD 0 P4D 0
[5.773618] Oops:  [#1] SMP NOPTI
[5.773618] CPU: 2 PID: 1 Comm: swapper/0 Not tainted 4.20.0-rc1+ #3
[5.773618] Hardware name: Dell Inc. PowerEdge R7425/02MJ3T, BIOS 1.4.3 
06/29/2018
[5.773618] RIP: 0010:__alloc_pages_nodemask+0xe2/0x2a0
[5.773618] Code: 00 00 44 89 ea 80 ca 80 41 83 f8 01 44 0f 44 ea 89 da c1 
ea 08 83 e2 01 88 54 24 20 48 8b 54 24 08 48 85 d2 0f 85 46 01 00 00 <3b> 77 08 
0f 82 3d 01 00 00 48 89 f8 44 89 ea 48 89
e1 44 89 e6 89
[5.773618] RSP: 0018:aa65fb20 EFLAGS: 00010246
[5.773618] RAX:  RBX: 006012c0 RCX: 
[5.773618] RDX:  RSI: 0002 RDI: 2080
[5.773618] RBP: 006012c0 R08:  R09: 0002
[5.773618] R10: 006080c0 R11: 0002 R12: 
[5.773618] R13: 0001 R14:  R15: 0002
[5.773618] FS:  () GS:8c69afe0() 
knlGS:
[5.773618] CS:  0010 DS:  ES:  CR0: 80050033
[5.773618] CR2: 2088 CR3: 00087e00a000 CR4: 003406e0
[5.773618] Call Trace:
[5.773618]  new_slab+0xa9/0x570
[5.773618]  ___slab_alloc+0x375/0x540
[5.773618]  ? pinctrl_bind_pins+0x2b/0x2a0
[5.773618]  __slab_alloc+0x1c/0x38
[5.773618]  __kmalloc_node_track_caller+0xc8/0x270
[5.773618]  ? pinctrl_bind_pins+0x2b/0x2a0
[5.773618]  devm_kmalloc+0x28/0x60
[5.773618]  pinctrl_bind_pins+0x2b/0x2a0
[5.773618]  really_probe+0x73/0x420
[5.773618]  driver_probe_device+0x115/0x130
[5.773618]  __driver_attach+0x103/0x110
[5.773618]  ? driver_probe_device+0x130/0x130
[5.773618]  bus_for_each_dev+0x67/0xc0
[5.773618]  ? klist_add_tail+0x3b/0x70
[5.773618]  bus_add_driver+0x41/0x260
[5.773618]  ? pcie_port_setup+0x4d/0x4d
[5.773618]  driver_register+0x5b/0xe0
[5.773618]  ? pcie_port_setup+0x4d/0x4d
[5.773618]  do_one_initcall+0x4e/0x1d4
[5.773618]  ? init_setup+0x25/0x28
[5.773618]  kernel_init_freeable+0x1c1/0x26e
[5.773618]  ? loglevel+0x5b/0x5b
[5.773618]  ? rest_init+0xb0/0xb0
[5.773618]  kernel_init+0xa/0x110
[5.773618]  ret_from_fork+0x22/0x40
[5.773618] Modules linked in:
[5.773618] CR2: 2088
[5.773618] ---[ end trace 1030c9120a03d081 ]---
[...]

Other notes about the reproduction of this bug:
After applying the following patch:
'commit 0d76bcc960e6 ("Revert "ACPI/PCI: Pay attention to device-specific
_PXM node values"")'
this bug is masked and not triggered on my test AMD machine.
But it should still exist, since dev->numa_node info can be set by other
methods on other archs when using the nr_cpus param.

Signed-off-by: Pingfan Liu 
Cc: linuxppc-dev@lists.ozlabs.org
Cc: x...@kernel.org
Cc: linux-ker...@vger.kernel.org
Cc: Andrew Morton 
Cc: Michal Hocko 
Cc: Vlastimil Babka 
Cc: Mike Rapoport 
Cc: Bjorn 

[PATCHv2 1/3] mm/numa: change the topo of build_zonelist_xx()

2018-12-20 Thread Pingfan Liu
The current build_zonelist_xx functions rely on a pgdat instance to build
the zonelist; if a NUMA node is offline, there is no pgdat instance for it.
But in some cases there is still a requirement for the zonelist of an
offline node, especially with the nr_cpus option.
This patch changes these functions' signatures to ease the building of
zonelists for offline nodes.

Signed-off-by: Pingfan Liu 
Cc: linuxppc-dev@lists.ozlabs.org
Cc: x...@kernel.org
Cc: linux-ker...@vger.kernel.org
Cc: Andrew Morton 
Cc: Michal Hocko 
Cc: Vlastimil Babka 
Cc: Mike Rapoport 
Cc: Bjorn Helgaas 
Cc: Jonathan Cameron 
Cc: David Rientjes 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: Borislav Petkov 
Cc: "H. Peter Anvin" 
Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Michael Ellerman 
---
 mm/page_alloc.c | 44 +---
 1 file changed, 21 insertions(+), 23 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2ec9cc4..17dbf6e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5049,7 +5049,7 @@ static void zoneref_set_zone(struct zone *zone, struct 
zoneref *zoneref)
  *
  * Add all populated zones of a node to the zonelist.
  */
-static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs)
+static int build_zonerefs_node(int nid, struct zoneref *zonerefs)
 {
struct zone *zone;
enum zone_type zone_type = MAX_NR_ZONES;
@@ -5057,7 +5057,7 @@ static int build_zonerefs_node(pg_data_t *pgdat, struct 
zoneref *zonerefs)
 
do {
zone_type--;
-   zone = pgdat->node_zones + zone_type;
+   zone = NODE_DATA(nid)->node_zones + zone_type;
if (managed_zone(zone)) {
zoneref_set_zone(zone, [nr_zones++]);
check_highest_zone(zone_type);
@@ -5186,20 +5186,20 @@ static int find_next_best_node(int node, nodemask_t 
*used_node_mask)
  * This results in maximum locality--normal zone overflows into local
  * DMA zone, if any--but risks exhausting DMA zone.
  */
-static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order,
-   unsigned nr_nodes)
+static void build_zonelists_in_node_order(struct zonelist *node_zonelists,
+   int *node_order, unsigned int nr_nodes)
 {
struct zoneref *zonerefs;
int i;
 
-   zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
+   zonerefs = node_zonelists[ZONELIST_FALLBACK]._zonerefs;
 
for (i = 0; i < nr_nodes; i++) {
int nr_zones;
 
pg_data_t *node = NODE_DATA(node_order[i]);
 
-   nr_zones = build_zonerefs_node(node, zonerefs);
+   nr_zones = build_zonerefs_node(node->node_id, zonerefs);
zonerefs += nr_zones;
}
zonerefs->zone = NULL;
@@ -5209,13 +5209,14 @@ static void build_zonelists_in_node_order(pg_data_t 
*pgdat, int *node_order,
 /*
  * Build gfp_thisnode zonelists
  */
-static void build_thisnode_zonelists(pg_data_t *pgdat)
+static void build_thisnode_zonelists(struct zonelist *node_zonelists,
+   int nid)
 {
struct zoneref *zonerefs;
int nr_zones;
 
-   zonerefs = pgdat->node_zonelists[ZONELIST_NOFALLBACK]._zonerefs;
-   nr_zones = build_zonerefs_node(pgdat, zonerefs);
+   zonerefs = node_zonelists[ZONELIST_NOFALLBACK]._zonerefs;
+   nr_zones = build_zonerefs_node(nid, zonerefs);
zonerefs += nr_zones;
zonerefs->zone = NULL;
zonerefs->zone_idx = 0;
@@ -5228,15 +5229,14 @@ static void build_thisnode_zonelists(pg_data_t *pgdat)
  * may still exist in local DMA zone.
  */
 
-static void build_zonelists(pg_data_t *pgdat)
+static void build_zonelists(struct zonelist *node_zonelists, int local_node)
 {
static int node_order[MAX_NUMNODES];
int node, load, nr_nodes = 0;
nodemask_t used_mask;
-   int local_node, prev_node;
+   int prev_node;
 
/* NUMA-aware ordering of nodes */
-   local_node = pgdat->node_id;
load = nr_online_nodes;
prev_node = local_node;
nodes_clear(used_mask);
@@ -5257,8 +5257,8 @@ static void build_zonelists(pg_data_t *pgdat)
load--;
}
 
-   build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
-   build_thisnode_zonelists(pgdat);
+   build_zonelists_in_node_order(node_zonelists, node_order, nr_nodes);
+   build_thisnode_zonelists(node_zonelists, local_node);
 }
 
 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
@@ -5283,16 +5283,14 @@ static void setup_min_unmapped_ratio(void);
 static void setup_min_slab_ratio(void);
 #else  /* CONFIG_NUMA */
 
-static void build_zonelists(pg_data_t *pgdat)
+static void build_zonelists(struct zonelist *node_zonelists, int local_node)
 {
int node, local_node;
struct zoneref *zonerefs;
int nr_zones;
 
-   local_node = pgdat->node_id;
-
-   zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
-   nr_zones = build_zonerefs_node(pgdat, zonerefs);
+ 

[PATCHv2 0/3] mm: bugfix for NULL reference in mm on all archs

2018-12-20 Thread Pingfan Liu
This bug was originally reported at
https://lore.kernel.org/patchwork/patch/1020838/
In short, this bug should affect all archs where a machine has a
NUMA node with no memory, nr_cpus prevents the instancing of nodeA, and a
device on nodeA tries to allocate memory with device->numa_node info.
Then node_zonelist(preferred_nid, gfp_mask) will panic due to the uninstanced nodeA.

There are two alternative methods to fix it:
-1st. Fix it in the mm system.
-2nd. Fix it in all archs independently, by onlining all possible nodes.

Originally, I tried to fix it by the 1st method, while Michal suggests the 2nd
one.
This series [1-2/3] tries to resolve some defects in v1, pointed out by Michal.
For discussion purposes, I send [3/3] in this thread, which tries to show an
example of the 2nd method on the powerpc platform.
For x86, I am still helping Michal to verify his patch on my test machine, please see:
https://lore.kernel.org/patchwork/comment/1208479/
https://lore.kernel.org/patchwork/comment/1210452/

It has already taken quite a long time to find a solution, so cc the x86 and
ppc mailing lists in the hope that their maintainers can give some suggestions
to speed up the final solution.

Pingfan Liu (3):
  mm/numa: change the topo of build_zonelist_xx()
  mm/numa: build zonelist when alloc for device on offline node
  powerpc/numa: make all possible node be instanced against NULL
reference in node_zonelist()

 arch/powerpc/mm/numa.c | 13 ++--
 include/linux/gfp.h| 10 +-
 mm/page_alloc.c| 85 --
 3 files changed, 81 insertions(+), 27 deletions(-)

Cc: linuxppc-dev@lists.ozlabs.org
Cc: x...@kernel.org
Cc: linux-ker...@vger.kernel.org
Cc: Andrew Morton 
Cc: Michal Hocko 
Cc: Vlastimil Babka 
Cc: Mike Rapoport 
Cc: Bjorn Helgaas 
Cc: Jonathan Cameron 
Cc: David Rientjes 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: Borislav Petkov 
Cc: "H. Peter Anvin" 
Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Michael Ellerman 
-- 
2.7.4



Re: [PATCH kernel v7 00/20] powerpc/powernv/npu, vfio: NVIDIA V100 + P9 passthrough

2018-12-20 Thread Michael Ellerman
Alexey Kardashevskiy  writes:

> My bad, I was not cc-ing everyone but now with v7 I am, sorry about that.

I've already applied v6, I'll assume this is unchanged from that unless
you tell me otherwise.

cheers

> This is for passing through NVIDIA V100 GPUs on POWER9 systems.
> 20/20 has the details of hardware setup.
>
> This implements support for NVIDIA V100 GPU with coherent memory and
> NPU/ATS support available in the POWER9 CPU. The aim is to support
> unmodified vendor driver in the guest.
>
> This is pushed to (both guest and host kernels):
> https://github.com/aik/linux/tree/nv2
>
> Matching qemu is pushed to github:
> https://github.com/aik/qemu/tree/nv2
>
> Skiboot bits are here:
> https://github.com/aik/skiboot/tree/nv2
>
> The individual patches have changelogs. v7 fixes compile warning
> and updates a VFIO capability comment in 20/20.
>
> Please comment. Thanks.
>
>
>
> Alexey Kardashevskiy (20):
>   powerpc/ioda/npu: Call skiboot's hot reset hook when disabling NPU2
>   powerpc/mm/iommu/vfio_spapr_tce: Change mm_iommu_get to reference a
> region
>   powerpc/vfio/iommu/kvm: Do not pin device memory
>   powerpc/powernv: Move npu struct from pnv_phb to pci_controller
>   powerpc/powernv/npu: Move OPAL calls away from context manipulation
>   powerpc/pseries/iommu: Use memory@ nodes in max RAM address
> calculation
>   powerpc/pseries/npu: Enable platform support
>   powerpc/pseries: Remove IOMMU API support for non-LPAR systems
>   powerpc/powernv/pseries: Rework device adding to IOMMU groups
>   powerpc/iommu_api: Move IOMMU groups setup to a single place
>   powerpc/powernv: Reference iommu_table while it is linked to a group
>   powerpc/powernv/npu: Move single TVE handling to NPU PE
>   powerpc/powernv/npu: Convert NPU IOMMU helpers to
> iommu_table_group_ops
>   powerpc/powernv/npu: Add compound IOMMU groups
>   powerpc/powernv/npu: Add release_ownership hook
>   powerpc/powernv/npu: Check mmio_atsd array bounds when populating
>   powerpc/powernv/npu: Fault user page into the hypervisor's pagetable
>   vfio_pci: Allow mapping extra regions
>   vfio_pci: Allow regions to add own capabilities
>   vfio_pci: Add NVIDIA GV100GL [Tesla V100 SXM2] subdriver
>
>  drivers/vfio/pci/Makefile |   1 +
>  arch/powerpc/include/asm/iommu.h  |  17 +-
>  arch/powerpc/include/asm/mmu_context.h|  15 +-
>  arch/powerpc/include/asm/pci-bridge.h |   1 +
>  arch/powerpc/include/asm/pci.h|   4 +
>  arch/powerpc/platforms/powernv/pci.h  |  30 +-
>  drivers/vfio/pci/trace.h  | 102 
>  drivers/vfio/pci/vfio_pci_private.h   |  20 +
>  include/uapi/linux/vfio.h |  37 ++
>  arch/powerpc/kernel/iommu.c   |  69 +--
>  arch/powerpc/kvm/book3s_64_vio.c  |  18 +-
>  arch/powerpc/mm/mmu_context_iommu.c   | 110 +++-
>  arch/powerpc/platforms/powernv/npu-dma.c  | 549 +++---
>  arch/powerpc/platforms/powernv/pci-ioda-tce.c |   3 +-
>  arch/powerpc/platforms/powernv/pci-ioda.c | 237 
>  arch/powerpc/platforms/powernv/pci.c  |  43 +-
>  arch/powerpc/platforms/pseries/iommu.c|  88 ++-
>  arch/powerpc/platforms/pseries/pci.c  |  22 +
>  drivers/vfio/pci/vfio_pci.c   |  42 +-
>  drivers/vfio/pci/vfio_pci_nvlink2.c   | 482 +++
>  drivers/vfio/vfio_iommu_spapr_tce.c   |  64 +-
>  drivers/vfio/pci/Kconfig  |   6 +
>  22 files changed, 1569 insertions(+), 391 deletions(-)
>  create mode 100644 drivers/vfio/pci/trace.h
>  create mode 100644 drivers/vfio/pci/vfio_pci_nvlink2.c
>
> -- 
> 2.17.1


Re: [PATCH] powerpc/8xx: Map a second 8M text page at startup when needed.

2018-12-20 Thread Christophe Leroy




On 20/12/2018 at 09:24, Christoph Hellwig wrote:

On Thu, Dec 20, 2018 at 05:48:25AM +, Christophe Leroy wrote:

Some debug setups like CONFIG_KASAN generate huge
kernels with a text size over the 8M limit.

This patch maps a second 8M page when _einittext is over 8M.


Do we also need a check to generate a useful warning if we ever overflow
the 16MB?



I don't think any other platform does that (the 40x also maps 16MB, 
book3s/601 maps 24MB).


But why not; that could be done in another patch.

Is there an easy way to get the link to fail when CONFIG_PIN_TLB_TEXT is 
set and _einittext is higher than 16MB?


Or should we just map up to 24MB on the 8xx and consider we are on the 
safe side with that much?


Christophe


[PATCH kernel v7 19/20] vfio_pci: Allow regions to add own capabilities

2018-12-20 Thread Alexey Kardashevskiy
VFIO regions already support region capabilities with a limited set of
fields. However, the subdriver might have to report additional bits
to userspace.

This adds an add_capability() hook to vfio_pci_regops.

Signed-off-by: Alexey Kardashevskiy 
Acked-by: Alex Williamson 
---
Changes:
v3:
* removed confusing rationale for the patch, the next patch makes
use of it anyway
---
 drivers/vfio/pci/vfio_pci_private.h | 3 +++
 drivers/vfio/pci/vfio_pci.c | 6 ++
 2 files changed, 9 insertions(+)

diff --git a/drivers/vfio/pci/vfio_pci_private.h 
b/drivers/vfio/pci/vfio_pci_private.h
index 86aab05..93c1738 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -62,6 +62,9 @@ struct vfio_pci_regops {
int (*mmap)(struct vfio_pci_device *vdev,
struct vfio_pci_region *region,
struct vm_area_struct *vma);
+   int (*add_capability)(struct vfio_pci_device *vdev,
+ struct vfio_pci_region *region,
+ struct vfio_info_cap *caps);
 };
 
 struct vfio_pci_region {
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 4a6f7c0..6cb70cf 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -763,6 +763,12 @@ static long vfio_pci_ioctl(void *device_data,
if (ret)
return ret;
 
+   if (vdev->region[i].ops->add_capability) {
+   ret = vdev->region[i].ops->add_capability(vdev,
+   &vdev->region[i], &caps);
+   if (ret)
+   return ret;
+   }
}
}
 
-- 
2.17.1



[PATCH kernel v7 20/20] vfio_pci: Add NVIDIA GV100GL [Tesla V100 SXM2] subdriver

2018-12-20 Thread Alexey Kardashevskiy
POWER9 Witherspoon machines come with 4 or 6 V100 GPUs which are not
pluggable PCIe devices but still have PCIe links which are used
for config space and MMIO. In addition to that, the GPUs have 6 NVLinks
which are connected to other GPUs and to the POWER9 CPU. POWER9 chips
have a special unit on the die called an NPU which is an NVLink2 host bus
adapter with p2p connections to 2 or 3 GPUs, with 3 or 2 NVLinks to each.
These systems also support ATS (address translation services), which is
part of the NVLink2 protocol. Such GPUs also share their on-board RAM
(16GB or 32GB) with the system via the same NVLink2, so a CPU has
cache-coherent access to GPU RAM.

This exports GPU RAM to userspace as a new VFIO device region. It
preregisters the new memory as device memory as it might be used for DMA.
Pfns are inserted from the fault handler because the GPU memory is not
onlined until the vendor driver has loaded and trained the NVLinks; doing
this earlier causes low level errors which we fence in the firmware so
they do not hurt the host system, but they are still better avoided. For
the same reason this does not map GPU RAM into the host kernel (the usual
thing for emulated access otherwise).

This exports an ATSD (Address Translation Shootdown) register of the NPU
which allows TLB invalidations inside a GPU for an operating system. The
register conveniently occupies a single 64k page. It is also presented to
userspace as a new VFIO device region. One NPU has 8 ATSD registers;
each of them can be used for TLB invalidation in a GPU linked to this NPU.
This allocates one ATSD register per NVLink bridge, allowing up to
6 registers to be passed. Due to a host firmware bug (just recently fixed),
only 1 ATSD register per NPU was actually advertised to the host system,
so this passes that lone register via the first NVLink bridge device in
the group; that is still enough as QEMU collects them all back and
presents them to the guest via vPHB to mimic the emulated NPU PHB on the host.

In order to provide userspace with information about GPU-to-NVLink
connections, this exports an additional capability called "tgt"
(which is an abbreviated host system bus address). The "tgt" property
tells the GPU its own system address and allows the guest driver to
conglomerate the routing information so each GPU knows how to get directly
to the other GPUs.

For ATS to work, the nest MMU (an NVIDIA block in a P9 CPU) needs to
know the LPID (a logical partition ID, or in other words a KVM guest
hardware ID) and the PID (a memory context ID of a userspace process, not
to be confused with a Linux pid). This assigns a GPU to an LPID in the NPU,
and this is why this adds a listener for KVM on an IOMMU group. A PID comes
via NVLink from a GPU, and the NPU uses a PID wildcard to pass it through.

This requires coherent memory and ATSD to be available on the host as
the GPU vendor only supports configurations with both features enabled;
other configurations are known not to work. Because of this, and
because of the way the features are advertised to the host system
(which is a device tree with very platform specific properties),
this requires the POWERNV platform to be enabled.

The V100 GPUs do not advertise any of these capabilities via the config
space and there is more than one device ID, so this relies on
the platform to tell whether these GPUs have special abilities such as
NVLinks.

Signed-off-by: Alexey Kardashevskiy 
---
Changes:
v6.1:
* fixed outdated comment about VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD

v6:
* reworked capabilities - tgt for nvlink and gpu and link-speed
for nvlink only

v5:
* do not memremap GPU RAM for emulation, map it only when it is needed
* allocate 1 ATSD register per NVLink bridge, if none left, then expose
the region with a zero size
* separate caps per device type
* addressed AW review comments

v4:
* added nvlink-speed to the NPU bridge capability as this turned out to
be not a constant value
* instead of looking at the exact device ID (which also changes from system
to system), now this (indirectly) looks at the device tree to know
if GPU and NPU support NVLink

v3:
* reworded the commit log about tgt
* added tracepoints (do we want them enabled for entire vfio-pci?)
* added code comments
* added write|mmap flags to the new regions
* auto enabled VFIO_PCI_NVLINK2 config option
* added 'tgt' capability to a GPU so QEMU can recreate ibm,npu and ibm,gpu
references; these are required by the NVIDIA driver
* keep notifier registered only for short time
---
 drivers/vfio/pci/Makefile   |   1 +
 drivers/vfio/pci/trace.h| 102 ++
 drivers/vfio/pci/vfio_pci_private.h |  14 +
 include/uapi/linux/vfio.h   |  37 +++
 drivers/vfio/pci/vfio_pci.c |  27 +-
 drivers/vfio/pci/vfio_pci_nvlink2.c | 482 
 drivers/vfio/pci/Kconfig|   6 +
 7 files changed, 667 insertions(+), 2 deletions(-)
 create mode 100644 drivers/vfio/pci/trace.h
 create mode 100644 drivers/vfio/pci/vfio_pci_nvlink2.c

diff 

[PATCH kernel v7 18/20] vfio_pci: Allow mapping extra regions

2018-12-20 Thread Alexey Kardashevskiy
So far we have only allowed mapping of MMIO BARs to userspace. However
there are GPUs with on-board coherent RAM accessible via side
channels which we also want to map to userspace. The first client
for this is the NVIDIA V100 GPU with NVLink2 direct links to a POWER9
NPU-enabled CPU; such GPUs have 16GB RAM which is coherently mapped
into the system address space, and we are going to export it as an extra
PCI region.

We already support extra PCI regions and this adds support for mapping
them to userspace.

Signed-off-by: Alexey Kardashevskiy 
Reviewed-by: David Gibson 
Acked-by: Alex Williamson 
---
Changes:
v2:
* reverted one of mistakenly removed error checks
---
 drivers/vfio/pci/vfio_pci_private.h | 3 +++
 drivers/vfio/pci/vfio_pci.c | 9 +
 2 files changed, 12 insertions(+)

diff --git a/drivers/vfio/pci/vfio_pci_private.h 
b/drivers/vfio/pci/vfio_pci_private.h
index cde3b5d..86aab05 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -59,6 +59,9 @@ struct vfio_pci_regops {
  size_t count, loff_t *ppos, bool iswrite);
void(*release)(struct vfio_pci_device *vdev,
   struct vfio_pci_region *region);
+   int (*mmap)(struct vfio_pci_device *vdev,
+   struct vfio_pci_region *region,
+   struct vm_area_struct *vma);
 };
 
 struct vfio_pci_region {
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index fef5002..4a6f7c0 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -1130,6 +1130,15 @@ static int vfio_pci_mmap(void *device_data, struct 
vm_area_struct *vma)
return -EINVAL;
if ((vma->vm_flags & VM_SHARED) == 0)
return -EINVAL;
+   if (index >= VFIO_PCI_NUM_REGIONS) {
+   int regnum = index - VFIO_PCI_NUM_REGIONS;
+   struct vfio_pci_region *region = vdev->region + regnum;
+
+   if (region && region->ops && region->ops->mmap &&
+   (region->flags & VFIO_REGION_INFO_FLAG_MMAP))
+   return region->ops->mmap(vdev, region, vma);
+   return -EINVAL;
+   }
if (index >= VFIO_PCI_ROM_REGION_INDEX)
return -EINVAL;
if (!vdev->bar_mmap_supported[index])
-- 
2.17.1



[PATCH kernel v7 09/20] powerpc/powernv/pseries: Rework device adding to IOMMU groups

2018-12-20 Thread Alexey Kardashevskiy
The powernv platform registers IOMMU groups and adds devices to them
from the pci_controller_ops::setup_bridge() hook, except in one case when
virtual functions (SRIOV VFs) are added from a bus notifier.

The pseries platform registers IOMMU groups from
the pci_controller_ops::dma_bus_setup() hook and adds devices from
the pci_controller_ops::dma_dev_setup() hook. The very same bus notifier
used for powernv does not add devices for pseries though, as
__of_scan_bus() adds devices first and only then does the bus/dev DMA setup.

Both platforms use iommu_add_device() which takes a device and expects
it to have a valid IOMMU table struct with an iommu_table_group pointer
which in turn points to the iommu_group struct (which represents
an IOMMU group). Although the helper seems easy to use, it relies on
some pre-existing device configuration and associated data structures
which it does not really need.

This simplifies iommu_add_device() to take the table_group pointer
directly. Pseries already has a table_group pointer handy and the bus
notifier is not used anyway. For powernv, this copies the existing bus
notifier and makes it work for powernv only, which gives an easy way of
getting to the table_group pointer. This was tested on VFs but should
also support physical PCI hotplug.

Since iommu_add_device() receives the table_group pointer directly, and
pseries neither does TCE cache invalidation (the hypervisor does) nor
allows multiple groups per VFIO container (in other words, sharing
an IOMMU table between partitionable endpoints), this removes
iommu_table_group_link from pseries.

Signed-off-by: Alexey Kardashevskiy 
Reviewed-by: David Gibson 
---
 arch/powerpc/include/asm/iommu.h  | 12 ++---
 arch/powerpc/kernel/iommu.c   | 58 ++-
 arch/powerpc/platforms/powernv/pci-ioda.c | 10 +---
 arch/powerpc/platforms/powernv/pci.c  | 43 -
 arch/powerpc/platforms/pseries/iommu.c| 46 +-
 5 files changed, 74 insertions(+), 95 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index a8aeac0..e847ff6 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -215,9 +215,9 @@ struct iommu_table_group {
 
 extern void iommu_register_group(struct iommu_table_group *table_group,
 int pci_domain_number, unsigned long pe_num);
-extern int iommu_add_device(struct device *dev);
+extern int iommu_add_device(struct iommu_table_group *table_group,
+   struct device *dev);
 extern void iommu_del_device(struct device *dev);
-extern int __init tce_iommu_bus_notifier_init(void);
 extern long iommu_tce_xchg(struct mm_struct *mm, struct iommu_table *tbl,
unsigned long entry, unsigned long *hpa,
enum dma_data_direction *direction);
@@ -228,7 +228,8 @@ static inline void iommu_register_group(struct 
iommu_table_group *table_group,
 {
 }
 
-static inline int iommu_add_device(struct device *dev)
+static inline int iommu_add_device(struct iommu_table_group *table_group,
+   struct device *dev)
 {
return 0;
 }
@@ -236,11 +237,6 @@ static inline int iommu_add_device(struct device *dev)
 static inline void iommu_del_device(struct device *dev)
 {
 }
-
-static inline int __init tce_iommu_bus_notifier_init(void)
-{
-return 0;
-}
 #endif /* !CONFIG_IOMMU_API */
 
 int dma_iommu_mapping_error(struct device *dev, dma_addr_t dma_addr);
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index cbcc615..9d5d109 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -1078,11 +1078,8 @@ void iommu_release_ownership(struct iommu_table *tbl)
 }
 EXPORT_SYMBOL_GPL(iommu_release_ownership);
 
-int iommu_add_device(struct device *dev)
+int iommu_add_device(struct iommu_table_group *table_group, struct device *dev)
 {
-   struct iommu_table *tbl;
-   struct iommu_table_group_link *tgl;
-
/*
 * The sysfs entries should be populated before
 * binding IOMMU group. If sysfs entries isn't
@@ -1098,32 +1095,10 @@ int iommu_add_device(struct device *dev)
return -EBUSY;
}
 
-   tbl = get_iommu_table_base(dev);
-   if (!tbl) {
-   pr_debug("%s: Skipping device %s with no tbl\n",
-__func__, dev_name(dev));
-   return 0;
-   }
-
-   tgl = list_first_entry_or_null(&tbl->it_group_list,
-   struct iommu_table_group_link, next);
-   if (!tgl) {
-   pr_debug("%s: Skipping device %s with no group\n",
-__func__, dev_name(dev));
-   return 0;
-   }
pr_debug("%s: Adding %s to iommu group %d\n",
-__func__, dev_name(dev),
-iommu_group_id(tgl->table_group->group));
+__func__, dev_name(dev),  iommu_group_id(table_group->group));
 
-   if (PAGE_SIZE < 

[PATCH kernel v7 17/20] powerpc/powernv/npu: Fault user page into the hypervisor's pagetable

2018-12-20 Thread Alexey Kardashevskiy
When a page fault happens in a GPU, the GPU signals the OS and the GPU
driver calls the fault handler which populates a page table; this allows
the GPU to complete an ATS request.

On bare metal get_user_pages() is enough as it adds a pte to
the kernel page table, but under KVM the partition scope tree does not get
updated, so ATS will still fail.

This reads a byte from the effective address, which causes an HV storage
interrupt so that KVM updates the partition scope tree.

Signed-off-by: Alexey Kardashevskiy 
---
 arch/powerpc/platforms/powernv/npu-dma.c | 13 +++--
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/npu-dma.c 
b/arch/powerpc/platforms/powernv/npu-dma.c
index c6163b9..12b8421 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -1133,6 +1133,8 @@ int pnv_npu2_handle_fault(struct npu_context *context, 
uintptr_t *ea,
u64 rc = 0, result = 0;
int i, is_write;
struct page *page[1];
+   const char __user *u;
+   char c;
 
/* mmap_sem should be held so the struct_mm must be present */
struct mm_struct *mm = context->mm;
@@ -1145,18 +1147,17 @@ int pnv_npu2_handle_fault(struct npu_context *context, 
uintptr_t *ea,
is_write ? FOLL_WRITE : 0,
page, NULL, NULL);
 
-   /*
-* To support virtualised environments we will have to do an
-* access to the page to ensure it gets faulted into the
-* hypervisor. For the moment virtualisation is not supported in
-* other areas so leave the access out.
-*/
if (rc != 1) {
status[i] = rc;
result = -EFAULT;
continue;
}
 
+   /* Make sure partition scoped tree gets a pte */
+   u = page_address(page[0]);
+   if (__get_user(c, u))
+   result = -EFAULT;
+
status[i] = 0;
put_page(page[0]);
}
-- 
2.17.1



[PATCH kernel v7 16/20] powerpc/powernv/npu: Check mmio_atsd array bounds when populating

2018-12-20 Thread Alexey Kardashevskiy
A broken device tree might contain more than 8 values and introduce a
hard-to-debug memory corruption bug. This adds a bounds check.

Signed-off-by: Alexey Kardashevskiy 
---
 arch/powerpc/platforms/powernv/npu-dma.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/npu-dma.c 
b/arch/powerpc/platforms/powernv/npu-dma.c
index e06043b..c6163b9 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -1179,8 +1179,9 @@ int pnv_npu2_init(struct pci_controller *hose)
 
npu->nmmu_flush = of_property_read_bool(hose->dn, "ibm,nmmu-flush");
 
-   for (i = 0; !of_property_read_u64_index(hose->dn, "ibm,mmio-atsd",
-   i, &mmio_atsd); i++
+   for (i = 0; i < ARRAY_SIZE(npu->mmio_atsd_regs) &&
+   !of_property_read_u64_index(hose->dn, "ibm,mmio-atsd",
+   i, &mmio_atsd); i++)
npu->mmio_atsd_regs[i] = ioremap(mmio_atsd, 32);
 
pr_info("NPU%d: Found %d MMIO ATSD registers", hose->global_number, i);
-- 
2.17.1



[PATCH kernel v7 15/20] powerpc/powernv/npu: Add release_ownership hook

2018-12-20 Thread Alexey Kardashevskiy
In order to make ATS work and translate addresses for an arbitrary
LPID and PID, we need to program the NPU with the LPID and allow PID wildcard
matching with a specific MSR mask.

This implements a helper to assign a GPU to an LPAR and program the NPU
with a wildcard for the PID, and a helper to do the clean-up. The helper
takes an MSR value (only the DR/HV/PR/SF bits are allowed) to program into
the NPU2 for ATS checkout request support.

This exports pnv_npu2_unmap_lpar_dev() as following patches will use it
from the VFIO driver.

Signed-off-by: Alexey Kardashevskiy 
---
Changes:
v5:
* removed opal_purge_cache as it is a part of reset in skiboot now
---
 arch/powerpc/platforms/powernv/npu-dma.c | 51 
 1 file changed, 51 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/npu-dma.c 
b/arch/powerpc/platforms/powernv/npu-dma.c
index d93a2cd..e06043b 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -300,6 +300,7 @@ static void pnv_npu_take_ownership(struct iommu_table_group 
*table_group)
table_group);
struct pnv_phb *phb = npe->phb;
int64_t rc;
+   struct pci_dev *gpdev = NULL;
 
/*
 * Note: NPU has just a single TVE in the hardware which means that
@@ -321,12 +322,28 @@ static void pnv_npu_take_ownership(struct 
iommu_table_group *table_group)
return;
}
pnv_pci_ioda2_tce_invalidate_entire(npe->phb, false);
+
+   get_gpu_pci_dev_and_pe(npe, &gpdev);
+   if (gpdev)
+   pnv_npu2_unmap_lpar_dev(gpdev);
+}
+
+static void pnv_npu_release_ownership(struct iommu_table_group *table_group)
+{
+   struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe,
+   table_group);
+   struct pci_dev *gpdev = NULL;
+
+   get_gpu_pci_dev_and_pe(npe, &gpdev);
+   if (gpdev)
+   pnv_npu2_map_lpar_dev(gpdev, 0, MSR_DR | MSR_PR | MSR_HV);
 }
 
 static struct iommu_table_group_ops pnv_pci_npu_ops = {
.set_window = pnv_npu_set_window,
.unset_window = pnv_npu_unset_window,
.take_ownership = pnv_npu_take_ownership,
+   .release_ownership = pnv_npu_release_ownership,
 };
 #endif /* !CONFIG_IOMMU_API */
 
@@ -1237,3 +1254,37 @@ void pnv_npu2_map_lpar(struct pnv_ioda_pe *gpe, unsigned 
long msr)
list_for_each_entry(gpdev, &gpe->pbus->devices, bus_list)
pnv_npu2_map_lpar_dev(gpdev, 0, msr);
 }
+
+int pnv_npu2_unmap_lpar_dev(struct pci_dev *gpdev)
+{
+   int ret;
+   struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
+   struct pci_controller *hose;
+   struct pnv_phb *nphb;
+
+   if (!npdev)
+   return -ENODEV;
+
+   hose = pci_bus_to_host(npdev->bus);
+   nphb = hose->private_data;
+
+   dev_dbg(&gpdev->dev, "destroy context opalid=%llu\n",
+   nphb->opal_id);
+   ret = opal_npu_destroy_context(nphb->opal_id, 0/*__unused*/,
+   PCI_DEVID(gpdev->bus->number, gpdev->devfn));
+   if (ret < 0) {
+   dev_err(&gpdev->dev, "Failed to destroy context: %d\n", ret);
+   return ret;
+   }
+
+   /* Set LPID to 0 anyway, just to be safe */
+   dev_dbg(&gpdev->dev, "Map LPAR opalid=%llu lparid=0\n", nphb->opal_id);
+   ret = opal_npu_map_lpar(nphb->opal_id,
+   PCI_DEVID(gpdev->bus->number, gpdev->devfn), 0 /*LPID*/,
+   0 /* LPCR bits */);
+   if (ret)
+   dev_err(&gpdev->dev, "Error %d mapping device to LPAR\n", ret);
+
+   return ret;
+}
+EXPORT_SYMBOL_GPL(pnv_npu2_unmap_lpar_dev);
-- 
2.17.1



[PATCH kernel v7 14/20] powerpc/powernv/npu: Add compound IOMMU groups

2018-12-20 Thread Alexey Kardashevskiy
At the moment the powernv platform registers an IOMMU group for each PE.
There is an exception though: an NVLink bridge is attached to
the corresponding GPU's IOMMU group, making the GPU a master.

Now we have POWER9 systems with GPUs connected to each other directly,
bypassing PCI. At the moment we do not control the state of these links so
we have to put such interconnected GPUs into one IOMMU group, which
means that the old scheme with one GPU as a master won't work - there will
be up to 3 GPUs in such a group.

This introduces a npu_comp struct which represents a compound IOMMU
group made of multiple PEs - PCI PEs (for GPUs) and NPU PEs (for NVLink
bridges). This converts the existing NVLink1 code to use the new scheme.
From now on, each PE must have a valid iommu_table_group_ops which will
either be called directly (for a single PE group) or indirectly from
a compound group handlers.

This moves IOMMU group registration for NVLink-connected GPUs to npu-dma.c.
For POWER8, this stores a new compound group pointer in the PE (so a GPU
is still a master); for POWER9 the new group pointer is stored in an NPU
(which is allocated per PCI host controller).

Signed-off-by: Alexey Kardashevskiy 
---
Changes:
v7:
* fixed compiler warning in pnv_try_setup_npu_table_group()

v5:
* now read page sizes from the PHB NVLink to narrow down what the compound PE
can actually support (hint: 4K/64K only)
---
 arch/powerpc/include/asm/pci.h|   1 +
 arch/powerpc/platforms/powernv/pci.h  |   7 +
 arch/powerpc/platforms/powernv/npu-dma.c  | 291 --
 arch/powerpc/platforms/powernv/pci-ioda.c | 159 
 4 files changed, 322 insertions(+), 136 deletions(-)

diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h
index baf2886..0c72f18 100644
--- a/arch/powerpc/include/asm/pci.h
+++ b/arch/powerpc/include/asm/pci.h
@@ -132,5 +132,6 @@ extern struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev 
*gpdev, int index);
 extern int pnv_npu2_init(struct pci_controller *hose);
 extern int pnv_npu2_map_lpar_dev(struct pci_dev *gpdev, unsigned int lparid,
unsigned long msr);
+extern int pnv_npu2_unmap_lpar_dev(struct pci_dev *gpdev);
 
 #endif /* __ASM_POWERPC_PCI_H */
diff --git a/arch/powerpc/platforms/powernv/pci.h 
b/arch/powerpc/platforms/powernv/pci.h
index cf9f748..aef4bb5 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -62,6 +62,7 @@ struct pnv_ioda_pe {
 
/* "Base" iommu table, ie, 4K TCEs, 32-bit DMA */
struct iommu_table_group table_group;
+   struct npu_comp *npucomp;
 
/* 64-bit TCE bypass region */
booltce_bypass_enabled;
@@ -201,6 +202,8 @@ extern void pnv_teardown_msi_irqs(struct pci_dev *pdev);
 extern struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev);
 extern void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq);
 extern void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
+extern unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
+   __u64 window_size, __u32 levels);
 extern int pnv_eeh_post_init(void);
 
 extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
@@ -216,6 +219,10 @@ extern void pe_level_printk(const struct pnv_ioda_pe *pe, 
const char *level,
 extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass);
 extern void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm);
 extern struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe);
+extern struct iommu_table_group *pnv_try_setup_npu_table_group(
+   struct pnv_ioda_pe *pe);
+extern struct iommu_table_group *pnv_npu_compound_attach(
+   struct pnv_ioda_pe *pe);
 
 /* pci-ioda-tce.c */
 #define POWERNV_IOMMU_DEFAULT_LEVELS   1
diff --git a/arch/powerpc/platforms/powernv/npu-dma.c 
b/arch/powerpc/platforms/powernv/npu-dma.c
index dc629ee..d93a2cd 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -328,31 +328,6 @@ static struct iommu_table_group_ops pnv_pci_npu_ops = {
.unset_window = pnv_npu_unset_window,
.take_ownership = pnv_npu_take_ownership,
 };
-
-struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe)
-{
-   struct pnv_phb *phb = npe->phb;
-   struct pci_bus *pbus = phb->hose->bus;
-   struct pci_dev *npdev, *gpdev = NULL, *gptmp;
-   struct pnv_ioda_pe *gpe = get_gpu_pci_dev_and_pe(npe, &gpdev);
-
-   if (!gpe || !gpdev)
-   return NULL;
-
-   npe->table_group.ops = _pci_npu_ops;
-
-   list_for_each_entry(npdev, &pbus->devices, bus_list) {
-   gptmp = pnv_pci_get_gpu_dev(npdev);
-
-   if (gptmp != gpdev)
-   continue;
-
-   pe_info(gpe, "Attached NPU %s\n", dev_name(&npdev->dev));
-   iommu_group_add_device(gpe->table_group.group, &npdev->dev);
-   }
-
-   return 

[PATCH kernel v7 05/20] powerpc/powernv/npu: Move OPAL calls away from context manipulation

2018-12-20 Thread Alexey Kardashevskiy
When introduced, the NPU context init/destroy helpers called OPAL which
enabled/disabled PID (a userspace memory context ID) filtering in an NPU
per GPU; this was a requirement for P9 DD1.0. However, a newer chip
revision added PID wildcard support so there is no longer a need to
call OPAL every time a new context is initialized. Also, since the PID
wildcard support was added, skiboot does not clear wildcard entries
in the NPU so these remain in the hardware until the system reboots.

This moves LPID and wildcard programming to the PE setup code which
executes once during the booting process so NPU2 context init/destroy
won't need to do additional configuration.

This replaces the check for FW_FEATURE_OPAL with a check for npu!=NULL as
this is the way to tell if the NPU support is present and configured.

This moves pnv_npu2_init() declaration as pseries should be able to use it.
This keeps pnv_npu2_map_lpar() in powernv as pseries is not allowed to
call that. This exports pnv_npu2_map_lpar_dev() as following patches
will use it from the VFIO driver.

While at it, replace redundant list_for_each_entry_safe() with
a simpler list_for_each_entry().

Signed-off-by: Alexey Kardashevskiy 
---
Changes:
v5:
* add few checks for npu!=NULL

v4:
* add flags check in pnv_npu2_init_context()
---
 arch/powerpc/include/asm/pci.h|   3 +
 arch/powerpc/platforms/powernv/pci.h  |   2 +-
 arch/powerpc/platforms/powernv/npu-dma.c  | 111 --
 arch/powerpc/platforms/powernv/pci-ioda.c |  15 ++-
 4 files changed, 77 insertions(+), 54 deletions(-)

diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h
index 2af9ded..baf2886 100644
--- a/arch/powerpc/include/asm/pci.h
+++ b/arch/powerpc/include/asm/pci.h
@@ -129,5 +129,8 @@ extern void pcibios_scan_phb(struct pci_controller *hose);
 
 extern struct pci_dev *pnv_pci_get_gpu_dev(struct pci_dev *npdev);
 extern struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev *gpdev, int index);
+extern int pnv_npu2_init(struct pci_controller *hose);
+extern int pnv_npu2_map_lpar_dev(struct pci_dev *gpdev, unsigned int lparid,
+   unsigned long msr);
 
 #endif /* __ASM_POWERPC_PCI_H */
diff --git a/arch/powerpc/platforms/powernv/pci.h 
b/arch/powerpc/platforms/powernv/pci.h
index f2d50974..ddb4f02 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -190,6 +190,7 @@ extern void pnv_pci_init_ioda_hub(struct device_node *np);
 extern void pnv_pci_init_ioda2_phb(struct device_node *np);
 extern void pnv_pci_init_npu_phb(struct device_node *np);
 extern void pnv_pci_init_npu2_opencapi_phb(struct device_node *np);
+extern void pnv_npu2_map_lpar(struct pnv_ioda_pe *gpe, unsigned long msr);
 extern void pnv_pci_reset_secondary_bus(struct pci_dev *dev);
 extern int pnv_eeh_phb_reset(struct pci_controller *hose, int option);
 
@@ -220,7 +221,6 @@ extern long pnv_npu_set_window(struct pnv_ioda_pe *npe, int 
num,
 extern long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num);
 extern void pnv_npu_take_ownership(struct pnv_ioda_pe *npe);
 extern void pnv_npu_release_ownership(struct pnv_ioda_pe *npe);
-extern int pnv_npu2_init(struct pnv_phb *phb);
 
 /* pci-ioda-tce.c */
 #define POWERNV_IOMMU_DEFAULT_LEVELS   1
diff --git a/arch/powerpc/platforms/powernv/npu-dma.c 
b/arch/powerpc/platforms/powernv/npu-dma.c
index 5e66439..ef1457f 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -512,6 +512,9 @@ static void acquire_atsd_reg(struct npu_context 
*npu_context,
continue;
 
npu = pci_bus_to_host(npdev->bus)->npu;
+   if (!npu)
+   continue;
+
mmio_atsd_reg[i].npu = npu;
mmio_atsd_reg[i].reg = get_mmio_atsd_reg(npu);
while (mmio_atsd_reg[i].reg < 0) {
@@ -676,7 +679,6 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev 
*gpdev,
u32 nvlink_index;
struct device_node *nvlink_dn;
struct mm_struct *mm = current->mm;
-   struct pnv_phb *nphb;
struct npu *npu;
struct npu_context *npu_context;
struct pci_controller *hose;
@@ -687,13 +689,14 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev 
*gpdev,
 */
struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
 
-   if (!firmware_has_feature(FW_FEATURE_OPAL))
-   return ERR_PTR(-ENODEV);
-
if (!npdev)
/* No nvlink associated with this GPU device */
return ERR_PTR(-ENODEV);
 
+   /* We only support DR/PR/HV in pnv_npu2_map_lpar_dev() */
+   if (flags & ~(MSR_DR | MSR_PR | MSR_HV))
+   return ERR_PTR(-EINVAL);
+
nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0);
if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
 

[PATCH kernel v7 13/20] powerpc/powernv/npu: Convert NPU IOMMU helpers to iommu_table_group_ops

2018-12-20 Thread Alexey Kardashevskiy
At the moment the NPU IOMMU is manipulated directly from the IODA2 PCI
PE code; a PCI PE acts as a master to an NPU PE. Soon we will have compound
IOMMU groups with several PEs from several different PHBs (such as
interconnected GPUs and NPUs) so there will be no single master but
one big IOMMU group.

This makes a first step and converts an NPU PE with a set of extern
functions to a table group.

This should cause no behavioral change. Note that
pnv_npu_release_ownership() has never been implemented.

Signed-off-by: Alexey Kardashevskiy 
Reviewed-by: David Gibson 
---
 arch/powerpc/platforms/powernv/pci.h  |  5 
 arch/powerpc/platforms/powernv/npu-dma.c  | 34 ++-
 arch/powerpc/platforms/powernv/pci-ioda.c | 10 +--
 3 files changed, 34 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci.h 
b/arch/powerpc/platforms/powernv/pci.h
index ddb4f02..cf9f748 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -216,11 +216,6 @@ extern void pe_level_printk(const struct pnv_ioda_pe *pe, 
const char *level,
 extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass);
 extern void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm);
 extern struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe);
-extern long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num,
-   struct iommu_table *tbl);
-extern long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num);
-extern void pnv_npu_take_ownership(struct pnv_ioda_pe *npe);
-extern void pnv_npu_release_ownership(struct pnv_ioda_pe *npe);
 
 /* pci-ioda-tce.c */
 #define POWERNV_IOMMU_DEFAULT_LEVELS   1
diff --git a/arch/powerpc/platforms/powernv/npu-dma.c 
b/arch/powerpc/platforms/powernv/npu-dma.c
index 26063fb..dc629ee 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -121,9 +121,14 @@ static struct pnv_ioda_pe *get_gpu_pci_dev_and_pe(struct 
pnv_ioda_pe *npe,
return pe;
 }
 
-long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num,
+static long pnv_npu_unset_window(struct iommu_table_group *table_group,
+   int num);
+
+static long pnv_npu_set_window(struct iommu_table_group *table_group, int num,
struct iommu_table *tbl)
 {
+   struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe,
+   table_group);
struct pnv_phb *phb = npe->phb;
int64_t rc;
const unsigned long size = tbl->it_indirect_levels ?
@@ -134,7 +139,7 @@ long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num,
 
/* NPU has just one TVE so if there is another table, remove it first */
if (npe->table_group.tables[num2])
-   pnv_npu_unset_window(npe, num2);
+		pnv_npu_unset_window(&npe->table_group, num2);
 
pe_info(npe, "Setting up window %llx..%llx pg=%lx\n",
start_addr, start_addr + win_size - 1,
@@ -160,8 +165,10 @@ long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num,
return 0;
 }
 
-long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num)
+static long pnv_npu_unset_window(struct iommu_table_group *table_group, int num)
 {
+   struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe,
+   table_group);
struct pnv_phb *phb = npe->phb;
int64_t rc;
 
@@ -206,7 +213,8 @@ static void pnv_npu_dma_set_32(struct pnv_ioda_pe *npe)
if (!gpe)
return;
 
-   rc = pnv_npu_set_window(npe, 0, gpe->table_group.tables[0]);
+	rc = pnv_npu_set_window(&npe->table_group, 0,
+   gpe->table_group.tables[0]);
 
/*
 * NVLink devices use the same TCE table configuration as
@@ -231,7 +239,7 @@ static int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe)
if (phb->type != PNV_PHB_NPU_NVLINK || !npe->pdev)
return -EINVAL;
 
-   rc = pnv_npu_unset_window(npe, 0);
+	rc = pnv_npu_unset_window(&npe->table_group, 0);
if (rc != OPAL_SUCCESS)
return rc;
 
@@ -284,9 +292,12 @@ void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass)
}
 }
 
+#ifdef CONFIG_IOMMU_API
 /* Switch ownership from platform code to external user (e.g. VFIO) */
-void pnv_npu_take_ownership(struct pnv_ioda_pe *npe)
+static void pnv_npu_take_ownership(struct iommu_table_group *table_group)
 {
+   struct pnv_ioda_pe *npe = container_of(table_group, struct pnv_ioda_pe,
+   table_group);
struct pnv_phb *phb = npe->phb;
int64_t rc;
 
@@ -297,7 +308,7 @@ void pnv_npu_take_ownership(struct pnv_ioda_pe *npe)
 * if it was enabled at the moment of ownership change.
 */
if (npe->table_group.tables[0]) {
-   pnv_npu_unset_window(npe, 0);
+		pnv_npu_unset_window(&npe->table_group, 0);
return;
}
 
@@ -312,6 

[PATCH kernel v7 04/20] powerpc/powernv: Move npu struct from pnv_phb to pci_controller

2018-12-20 Thread Alexey Kardashevskiy
The powernv PCI code stores NPU data in the pnv_phb struct. The latter
is referenced by pci_controller::private_data. We are going to have NPU2
support in the pseries platform as well but it does not store any
private_data in the pci_controller struct; and even if it did,
it would be a different data structure.

This makes npu a pointer and stores it one level higher in
the pci_controller struct.

Signed-off-by: Alexey Kardashevskiy 
---
Changes:
v5:
* removed !npu checks as this is out of scope of this patch
* added WARN_ON_ONCE in WARN_ON_ONCE(pnv_npu2_init(phb))

v4:
* changed subj from "powerpc/powernv: Detach npu struct from pnv_phb"
* got rid of global list of npus - store them now in pci_controller
* got rid of npdev_to_npu() helper
---
 arch/powerpc/include/asm/pci-bridge.h |  1 +
 arch/powerpc/platforms/powernv/pci.h  | 16 -
 arch/powerpc/platforms/powernv/npu-dma.c  | 74 +--
 arch/powerpc/platforms/powernv/pci-ioda.c |  2 +-
 4 files changed, 58 insertions(+), 35 deletions(-)

diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index 94d4490..aee4fcc 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -129,6 +129,7 @@ struct pci_controller {
 #endif /* CONFIG_PPC64 */
 
void *private_data;
+   struct npu *npu;
 };
 
 /* These are used for config access before all the PCI probing
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index 2131373..f2d50974 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -8,9 +8,6 @@
 
 struct pci_dn;
 
-/* Maximum possible number of ATSD MMIO registers per NPU */
-#define NV_NMMU_ATSD_REGS 8
-
 enum pnv_phb_type {
PNV_PHB_IODA1   = 0,
PNV_PHB_IODA2   = 1,
@@ -176,19 +173,6 @@ struct pnv_phb {
unsigned intdiag_data_size;
u8  *diag_data;
 
-   /* Nvlink2 data */
-   struct npu {
-   int index;
-   __be64 *mmio_atsd_regs[NV_NMMU_ATSD_REGS];
-   unsigned int mmio_atsd_count;
-
-   /* Bitmask for MMIO register usage */
-   unsigned long mmio_atsd_usage;
-
-   /* Do we need to explicitly flush the nest mmu? */
-   bool nmmu_flush;
-   } npu;
-
int p2p_target_count;
 };
 
diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
index 91d488f..5e66439 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -327,6 +327,25 @@ struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe)
return gpe;
 }
 
+/*
+ * NPU2 ATS
+ */
+/* Maximum possible number of ATSD MMIO registers per NPU */
+#define NV_NMMU_ATSD_REGS 8
+
+/* An NPU descriptor, valid for POWER9 only */
+struct npu {
+   int index;
+   __be64 *mmio_atsd_regs[NV_NMMU_ATSD_REGS];
+   unsigned int mmio_atsd_count;
+
+   /* Bitmask for MMIO register usage */
+   unsigned long mmio_atsd_usage;
+
+   /* Do we need to explicitly flush the nest mmu? */
+   bool nmmu_flush;
+};
+
 /* Maximum number of nvlinks per npu */
 #define NV_MAX_LINKS 6
 
@@ -478,7 +497,6 @@ static void acquire_atsd_reg(struct npu_context *npu_context,
int i, j;
struct npu *npu;
struct pci_dev *npdev;
-   struct pnv_phb *nphb;
 
for (i = 0; i <= max_npu2_index; i++) {
mmio_atsd_reg[i].reg = -1;
@@ -493,8 +511,7 @@ static void acquire_atsd_reg(struct npu_context *npu_context,
if (!npdev)
continue;
 
-   nphb = pci_bus_to_host(npdev->bus)->private_data;
-		npu = &nphb->npu;
+   npu = pci_bus_to_host(npdev->bus)->npu;
mmio_atsd_reg[i].npu = npu;
mmio_atsd_reg[i].reg = get_mmio_atsd_reg(npu);
while (mmio_atsd_reg[i].reg < 0) {
@@ -662,6 +679,7 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
struct pnv_phb *nphb;
struct npu *npu;
struct npu_context *npu_context;
+   struct pci_controller *hose;
 
/*
 * At present we don't support GPUs connected to multiple NPUs and I'm
@@ -689,8 +707,9 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
return ERR_PTR(-EINVAL);
}
 
-   nphb = pci_bus_to_host(npdev->bus)->private_data;
-	npu = &nphb->npu;
+   hose = pci_bus_to_host(npdev->bus);
+   nphb = hose->private_data;
+   npu = hose->npu;
 
/*
 * Setup the NPU context table for a particular GPU. These need to be
@@ -764,7 +783,7 @@ struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
 */
WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], 

[PATCH kernel v7 12/20] powerpc/powernv/npu: Move single TVE handling to NPU PE

2018-12-20 Thread Alexey Kardashevskiy
Normal PCI PEs have 2 TVEs, one per DMA window; an NPU PE, however, has
only one, which points to one of the two tables of the corresponding PCI PE.

So whenever a new DMA window is programmed to PEs, the NPU PE needs to
release the old table in order to use the new one.

Commit d41ce7b1bcc3e ("powerpc/powernv/npu: Do not try invalidating 32bit
table when 64bit table is enabled") did just that but in pci-ioda.c
while it actually belongs to npu-dma.c.

This moves the single TVE handling to npu-dma.c. This does not implement
restoring, though: it is highly unlikely that setting a table on the PCI PE
succeeds while setting it on the NPU PE fails, and even then the only
fallback would be setting the 32bit table on the NPU PE, a configuration
that is not really supported or wanted.

Signed-off-by: Alexey Kardashevskiy 
---
 arch/powerpc/platforms/powernv/npu-dma.c  |  8 +++
 arch/powerpc/platforms/powernv/pci-ioda.c | 27 +++
 2 files changed, 11 insertions(+), 24 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
index ef1457f..26063fb 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -130,6 +130,11 @@ long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num,
tbl->it_level_size : tbl->it_size;
const __u64 start_addr = tbl->it_offset << tbl->it_page_shift;
const __u64 win_size = tbl->it_size << tbl->it_page_shift;
+   int num2 = (num == 0) ? 1 : 0;
+
+   /* NPU has just one TVE so if there is another table, remove it first */
+   if (npe->table_group.tables[num2])
+   pnv_npu_unset_window(npe, num2);
 
pe_info(npe, "Setting up window %llx..%llx pg=%lx\n",
start_addr, start_addr + win_size - 1,
@@ -160,6 +165,9 @@ long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num)
struct pnv_phb *phb = npe->phb;
int64_t rc;
 
+   if (!npe->table_group.tables[num])
+   return 0;
+
pe_info(npe, "Removing DMA window\n");
 
rc = opal_pci_map_pe_dma_window(phb->opal_id, npe->pe_number,
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index a5879ab..1ee3c5d6 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2672,23 +2672,14 @@ static struct pnv_ioda_pe *gpe_table_group_to_npe(
 static long pnv_pci_ioda2_npu_set_window(struct iommu_table_group *table_group,
int num, struct iommu_table *tbl)
 {
-   struct pnv_ioda_pe *npe = gpe_table_group_to_npe(table_group);
-   int num2 = (num == 0) ? 1 : 0;
long ret = pnv_pci_ioda2_set_window(table_group, num, tbl);
 
if (ret)
return ret;
 
-   if (table_group->tables[num2])
-   pnv_npu_unset_window(npe, num2);
-
-   ret = pnv_npu_set_window(npe, num, tbl);
-   if (ret) {
+   ret = pnv_npu_set_window(gpe_table_group_to_npe(table_group), num, tbl);
+   if (ret)
pnv_pci_ioda2_unset_window(table_group, num);
-   if (table_group->tables[num2])
-   pnv_npu_set_window(npe, num2,
-   table_group->tables[num2]);
-   }
 
return ret;
 }
@@ -2697,24 +2688,12 @@ static long pnv_pci_ioda2_npu_unset_window(
struct iommu_table_group *table_group,
int num)
 {
-   struct pnv_ioda_pe *npe = gpe_table_group_to_npe(table_group);
-   int num2 = (num == 0) ? 1 : 0;
long ret = pnv_pci_ioda2_unset_window(table_group, num);
 
if (ret)
return ret;
 
-   if (!npe->table_group.tables[num])
-   return 0;
-
-   ret = pnv_npu_unset_window(npe, num);
-   if (ret)
-   return ret;
-
-   if (table_group->tables[num2])
-   ret = pnv_npu_set_window(npe, num2, table_group->tables[num2]);
-
-   return ret;
+   return pnv_npu_unset_window(gpe_table_group_to_npe(table_group), num);
 }
 
 static void pnv_ioda2_npu_take_ownership(struct iommu_table_group *table_group)
-- 
2.17.1



[PATCH kernel v7 02/20] powerpc/mm/iommu/vfio_spapr_tce: Change mm_iommu_get to reference a region

2018-12-20 Thread Alexey Kardashevskiy
Normally mm_iommu_get() should add a reference and mm_iommu_put() should
remove it. Historically, however, mm_iommu_find() has done the referencing
and mm_iommu_get() has done both allocation and referencing.

We are going to add another helper to preregister device memory so
instead of having mm_iommu_new() (which pre-registers the normal memory
and references the region), we need separate helpers for pre-registering
and referencing.

This renames:
- mm_iommu_get to mm_iommu_new;
- mm_iommu_find to mm_iommu_get.

This changes mm_iommu_get() to reference the region so the name now
reflects what it does.

This removes the check for exact match from mm_iommu_new() as we want it
to fail on existing regions; mm_iommu_get() should be used instead.

Signed-off-by: Alexey Kardashevskiy 
Reviewed-by: David Gibson 
---
Changes:
v5:
* fixed a bug with uninitialized @found in tce_iommu_unregister_pages()
* reworded the commit log

v4:
* squashed "powerpc/mm/iommu: Make mm_iommu_new() fail on existing regions" 
into this

v2:
* merged 2 patches into one
---
 arch/powerpc/include/asm/mmu_context.h |  4 +--
 arch/powerpc/mm/mmu_context_iommu.c| 19 +++---
 drivers/vfio/vfio_iommu_spapr_tce.c| 35 +-
 3 files changed, 34 insertions(+), 24 deletions(-)

diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index c05efd2..268e112 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -21,7 +21,7 @@ struct mm_iommu_table_group_mem_t;
 
 extern int isolate_lru_page(struct page *page);/* from internal.h */
 extern bool mm_iommu_preregistered(struct mm_struct *mm);
-extern long mm_iommu_get(struct mm_struct *mm,
+extern long mm_iommu_new(struct mm_struct *mm,
unsigned long ua, unsigned long entries,
struct mm_iommu_table_group_mem_t **pmem);
 extern long mm_iommu_put(struct mm_struct *mm,
@@ -32,7 +32,7 @@ extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm,
unsigned long ua, unsigned long size);
 extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm(
struct mm_struct *mm, unsigned long ua, unsigned long size);
-extern struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm,
+extern struct mm_iommu_table_group_mem_t *mm_iommu_get(struct mm_struct *mm,
unsigned long ua, unsigned long entries);
 extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
unsigned long ua, unsigned int pageshift, unsigned long *hpa);
diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c
index 0741d90..25a4b7f7 100644
--- a/arch/powerpc/mm/mmu_context_iommu.c
+++ b/arch/powerpc/mm/mmu_context_iommu.c
@@ -89,7 +89,7 @@ bool mm_iommu_preregistered(struct mm_struct *mm)
 }
 EXPORT_SYMBOL_GPL(mm_iommu_preregistered);
 
-long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
+long mm_iommu_new(struct mm_struct *mm, unsigned long ua, unsigned long entries,
struct mm_iommu_table_group_mem_t **pmem)
 {
struct mm_iommu_table_group_mem_t *mem;
@@ -100,12 +100,6 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
 
	list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list,
next) {
-   if ((mem->ua == ua) && (mem->entries == entries)) {
-   ++mem->used;
-   *pmem = mem;
-   goto unlock_exit;
-   }
-
/* Overlap? */
if ((mem->ua < (ua + (entries << PAGE_SHIFT))) &&
(ua < (mem->ua +
@@ -192,7 +186,7 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
 
return ret;
 }
-EXPORT_SYMBOL_GPL(mm_iommu_get);
+EXPORT_SYMBOL_GPL(mm_iommu_new);
 
 static void mm_iommu_unpin(struct mm_iommu_table_group_mem_t *mem)
 {
@@ -308,21 +302,26 @@ struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm(struct mm_struct *mm,
return ret;
 }
 
-struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm,
+struct mm_iommu_table_group_mem_t *mm_iommu_get(struct mm_struct *mm,
unsigned long ua, unsigned long entries)
 {
struct mm_iommu_table_group_mem_t *mem, *ret = NULL;
 
+	mutex_lock(&mem_list_mutex);
+
	list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next) {
if ((mem->ua == ua) && (mem->entries == entries)) {
ret = mem;
+   ++mem->used;
break;
}
}
 
+	mutex_unlock(&mem_list_mutex);
+
return ret;
 }
-EXPORT_SYMBOL_GPL(mm_iommu_find);
+EXPORT_SYMBOL_GPL(mm_iommu_get);
 
 long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
unsigned long ua, unsigned int pageshift, unsigned long 

[PATCH kernel v7 11/20] powerpc/powernv: Reference iommu_table while it is linked to a group

2018-12-20 Thread Alexey Kardashevskiy
The iommu_table pointer stored in iommu_table_group may get stale
by accident; this adds referencing and removes a redundant comment
about it.

Signed-off-by: Alexey Kardashevskiy 
Reviewed-by: David Gibson 
---
 arch/powerpc/platforms/powernv/pci-ioda-tce.c | 3 ++-
 arch/powerpc/platforms/powernv/pci-ioda.c | 4 
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
index 7639b21..697449a 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
@@ -368,6 +368,7 @@ void pnv_pci_unlink_table_and_group(struct iommu_table *tbl,
found = false;
for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
if (table_group->tables[i] == tbl) {
+   iommu_tce_table_put(tbl);
table_group->tables[i] = NULL;
found = true;
break;
@@ -393,7 +394,7 @@ long pnv_pci_link_table_and_group(int node, int num,
tgl->table_group = table_group;
	list_add_rcu(&tgl->next, &tbl->it_group_list);
 
-   table_group->tables[num] = tbl;
+   table_group->tables[num] = iommu_tce_table_get(tbl);
 
return 0;
 }
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index f6ab13d..a5879ab 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2719,10 +2719,6 @@ static long pnv_pci_ioda2_npu_unset_window(
 
 static void pnv_ioda2_npu_take_ownership(struct iommu_table_group *table_group)
 {
-   /*
-* Detach NPU first as pnv_ioda2_take_ownership() will destroy
-* the iommu_table if 32bit DMA is enabled.
-*/
pnv_npu_take_ownership(gpe_table_group_to_npe(table_group));
pnv_ioda2_take_ownership(table_group);
 }
-- 
2.17.1



[PATCH kernel v7 10/20] powerpc/iommu_api: Move IOMMU groups setup to a single place

2018-12-20 Thread Alexey Kardashevskiy
Registering new IOMMU groups and adding devices to them are separated in
the code, and the latter is buried in the DMA setup code, where it does
not really belong.

This moves IOMMU group setup to a separate helper which registers a group
and adds devices as before. This does not make a difference as IOMMU
groups are not used anyway; the only dependency here is that
iommu_add_device() requires a valid pointer to an iommu_table
(set by set_iommu_table_base()).

To keep the old behaviour, this does not add new IOMMU groups for PEs
with no DMA weight and also skips NVLink bridges which do not have
pci_controller_ops::setup_bridge (the normal way of adding PEs).

Signed-off-by: Alexey Kardashevskiy 
Reviewed-by: David Gibson 
---
Changes:
v5:
* fixed compile with defined but not used pnv_ioda_setup_bus_iommu_group();
unfortunately defining a dummy version looks uglier than #ifdef
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 82 +++
 1 file changed, 68 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index b86a6e0..f6ab13d 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -1538,6 +1538,9 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev)
 
 static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
   struct pnv_ioda_pe *pe);
+#ifdef CONFIG_IOMMU_API
+static void pnv_ioda_setup_bus_iommu_group(struct pnv_ioda_pe *pe);
+#endif
 static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
 {
struct pci_bus*bus;
@@ -1591,6 +1594,9 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
	mutex_unlock(&phb->ioda.pe_list_mutex);
 
pnv_pci_ioda2_setup_dma_pe(phb, pe);
+#ifdef CONFIG_IOMMU_API
+   pnv_ioda_setup_bus_iommu_group(pe);
+#endif
}
 }
 
@@ -1930,21 +1936,16 @@ static u64 pnv_pci_ioda_dma_get_required_mask(struct pci_dev *pdev)
return mask;
 }
 
-static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe,
-  struct pci_bus *bus,
-  bool add_to_group)
+static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus)
 {
struct pci_dev *dev;
 
	list_for_each_entry(dev, &bus->devices, bus_list) {
		set_iommu_table_base(&dev->dev, pe->table_group.tables[0]);
		set_dma_offset(&dev->dev, pe->tce_bypass_base);
-   if (add_to_group)
-			iommu_add_device(&pe->table_group, &dev->dev);
 
if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
-   pnv_ioda_setup_bus_dma(pe, dev->subordinate,
-   add_to_group);
+   pnv_ioda_setup_bus_dma(pe, dev->subordinate);
}
 }
 
@@ -2374,7 +2375,7 @@ static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb,
iommu_init_table(tbl, phb->hose->node);
 
if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
-   pnv_ioda_setup_bus_dma(pe, pe->pbus, true);
+   pnv_ioda_setup_bus_dma(pe, pe->pbus);
 
return;
  fail:
@@ -2607,7 +2608,7 @@ static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group)
pnv_pci_ioda2_set_bypass(pe, false);
	pnv_pci_ioda2_unset_window(&pe->table_group, 0);
if (pe->pbus)
-   pnv_ioda_setup_bus_dma(pe, pe->pbus, false);
+   pnv_ioda_setup_bus_dma(pe, pe->pbus);
iommu_tce_table_put(tbl);
 }
 
@@ -2618,7 +2619,7 @@ static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group)
 
pnv_pci_ioda2_setup_default_config(pe);
if (pe->pbus)
-   pnv_ioda_setup_bus_dma(pe, pe->pbus, false);
+   pnv_ioda_setup_bus_dma(pe, pe->pbus);
 }
 
 static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
@@ -2735,12 +2736,68 @@ static struct iommu_table_group_ops pnv_pci_ioda2_npu_ops = {
.release_ownership = pnv_ioda2_release_ownership,
 };
 
+static void pnv_ioda_setup_bus_iommu_group_add_devices(struct pnv_ioda_pe *pe,
+   struct pci_bus *bus)
+{
+   struct pci_dev *dev;
+
+	list_for_each_entry(dev, &bus->devices, bus_list) {
+		iommu_add_device(&pe->table_group, &dev->dev);
+
+   if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
+   pnv_ioda_setup_bus_iommu_group_add_devices(pe,
+   dev->subordinate);
+   }
+}
+
+static void pnv_ioda_setup_bus_iommu_group(struct pnv_ioda_pe *pe)
+{
+   if (!pnv_pci_ioda_pe_dma_weight(pe))
+   return;
+
+	iommu_register_group(&pe->table_group, pe->phb->hose->global_number,
+   pe->pe_number);
+
+   /*
+	 * set_iommu_table_base(&pe->pdev->dev, tbl) should have been called
+* by now
+*/
+   if 

[PATCH kernel v7 01/20] powerpc/ioda/npu: Call skiboot's hot reset hook when disabling NPU2

2018-12-20 Thread Alexey Kardashevskiy
The skiboot firmware has a hot reset handler which fences the NVIDIA V100
GPU RAM on Witherspoons and makes accesses no-op instead of throwing HMIs:
https://github.com/open-power/skiboot/commit/fca2b2b839a67

Now we are going to pass the V100 through via VFIO, which most certainly
involves KVM guests; these are often terminated without getting a chance
to offline GPU RAM, so we end up with a running machine with misconfigured
memory.
Accessing this memory produces hardware management interrupts (HMI)
which bring the host down.

To suppress HMIs, this wires up this hot reset hook to vfio_pci_disable()
via pci_disable_device() which switches NPU2 to a safe mode and prevents
HMIs.

Signed-off-by: Alexey Kardashevskiy 
Acked-by: Alistair Popple 
Reviewed-by: David Gibson 
---
Changes:
v2:
* updated the commit log
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 9ee7a30..29c6837 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -3676,6 +3676,15 @@ static void pnv_pci_release_device(struct pci_dev *pdev)
pnv_ioda_release_pe(pe);
 }
 
+static void pnv_npu_disable_device(struct pci_dev *pdev)
+{
+   struct eeh_dev *edev = pci_dev_to_eeh_dev(pdev);
+   struct eeh_pe *eehpe = edev ? edev->pe : NULL;
+
+   if (eehpe && eeh_ops && eeh_ops->reset)
+   eeh_ops->reset(eehpe, EEH_RESET_HOT);
+}
+
 static void pnv_pci_ioda_shutdown(struct pci_controller *hose)
 {
struct pnv_phb *phb = hose->private_data;
@@ -3720,6 +3729,7 @@ static const struct pci_controller_ops pnv_npu_ioda_controller_ops = {
.reset_secondary_bus= pnv_pci_reset_secondary_bus,
.dma_set_mask   = pnv_npu_dma_set_mask,
.shutdown   = pnv_pci_ioda_shutdown,
+   .disable_device = pnv_npu_disable_device,
 };
 
 static const struct pci_controller_ops pnv_npu_ocapi_ioda_controller_ops = {
-- 
2.17.1



[PATCH kernel v7 00/20] powerpc/powernv/npu, vfio: NVIDIA V100 + P9 passthrough

2018-12-20 Thread Alexey Kardashevskiy
My bad, I was not cc-ing everyone but now with v7 I am, sorry about that.


This is for passing through NVIDIA V100 GPUs on POWER9 systems.
20/20 has the details of hardware setup.

This implements support for NVIDIA V100 GPU with coherent memory and
NPU/ATS support available in the POWER9 CPU. The aim is to support
unmodified vendor driver in the guest.

This is pushed to (both guest and host kernels):
https://github.com/aik/linux/tree/nv2

Matching qemu is pushed to github:
https://github.com/aik/qemu/tree/nv2

Skiboot bits are here:
https://github.com/aik/skiboot/tree/nv2

The individual patches have changelogs. v7 fixes compile warning
and updates a VFIO capability comment in 20/20.

Please comment. Thanks.



Alexey Kardashevskiy (20):
  powerpc/ioda/npu: Call skiboot's hot reset hook when disabling NPU2
  powerpc/mm/iommu/vfio_spapr_tce: Change mm_iommu_get to reference a
region
  powerpc/vfio/iommu/kvm: Do not pin device memory
  powerpc/powernv: Move npu struct from pnv_phb to pci_controller
  powerpc/powernv/npu: Move OPAL calls away from context manipulation
  powerpc/pseries/iommu: Use memory@ nodes in max RAM address
calculation
  powerpc/pseries/npu: Enable platform support
  powerpc/pseries: Remove IOMMU API support for non-LPAR systems
  powerpc/powernv/pseries: Rework device adding to IOMMU groups
  powerpc/iommu_api: Move IOMMU groups setup to a single place
  powerpc/powernv: Reference iommu_table while it is linked to a group
  powerpc/powernv/npu: Move single TVE handling to NPU PE
  powerpc/powernv/npu: Convert NPU IOMMU helpers to
iommu_table_group_ops
  powerpc/powernv/npu: Add compound IOMMU groups
  powerpc/powernv/npu: Add release_ownership hook
  powerpc/powernv/npu: Check mmio_atsd array bounds when populating
  powerpc/powernv/npu: Fault user page into the hypervisor's pagetable
  vfio_pci: Allow mapping extra regions
  vfio_pci: Allow regions to add own capabilities
  vfio_pci: Add NVIDIA GV100GL [Tesla V100 SXM2] subdriver

 drivers/vfio/pci/Makefile |   1 +
 arch/powerpc/include/asm/iommu.h  |  17 +-
 arch/powerpc/include/asm/mmu_context.h|  15 +-
 arch/powerpc/include/asm/pci-bridge.h |   1 +
 arch/powerpc/include/asm/pci.h|   4 +
 arch/powerpc/platforms/powernv/pci.h  |  30 +-
 drivers/vfio/pci/trace.h  | 102 
 drivers/vfio/pci/vfio_pci_private.h   |  20 +
 include/uapi/linux/vfio.h |  37 ++
 arch/powerpc/kernel/iommu.c   |  69 +--
 arch/powerpc/kvm/book3s_64_vio.c  |  18 +-
 arch/powerpc/mm/mmu_context_iommu.c   | 110 +++-
 arch/powerpc/platforms/powernv/npu-dma.c  | 549 +++---
 arch/powerpc/platforms/powernv/pci-ioda-tce.c |   3 +-
 arch/powerpc/platforms/powernv/pci-ioda.c | 237 
 arch/powerpc/platforms/powernv/pci.c  |  43 +-
 arch/powerpc/platforms/pseries/iommu.c|  88 ++-
 arch/powerpc/platforms/pseries/pci.c  |  22 +
 drivers/vfio/pci/vfio_pci.c   |  42 +-
 drivers/vfio/pci/vfio_pci_nvlink2.c   | 482 +++
 drivers/vfio/vfio_iommu_spapr_tce.c   |  64 +-
 drivers/vfio/pci/Kconfig  |   6 +
 22 files changed, 1569 insertions(+), 391 deletions(-)
 create mode 100644 drivers/vfio/pci/trace.h
 create mode 100644 drivers/vfio/pci/vfio_pci_nvlink2.c

-- 
2.17.1



[PATCH kernel v7 08/20] powerpc/pseries: Remove IOMMU API support for non-LPAR systems

2018-12-20 Thread Alexey Kardashevskiy
The pci_dma_bus_setup_pSeries and pci_dma_dev_setup_pSeries hooks are
registered for pseries platforms which do not have FW_FEATURE_LPAR;
these are pre-powernv platforms for which we never supported PCI
passthrough anyway, so remove the support.

Signed-off-by: Alexey Kardashevskiy 
Reviewed-by: David Gibson 
---
 arch/powerpc/platforms/pseries/iommu.c | 9 ++---
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index cbcc8ce..2783cb7 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -645,7 +645,6 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
iommu_table_setparms(pci->phb, dn, tbl);
	tbl->it_ops = &iommu_table_pseries_ops;
iommu_init_table(tbl, pci->phb->node);
-   iommu_register_group(pci->table_group, pci_domain_nr(bus), 0);
 
/* Divide the rest (1.75GB) among the children */
pci->phb->dma_window_size = 0x8000ul;
@@ -756,10 +755,7 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
iommu_table_setparms(phb, dn, tbl);
	tbl->it_ops = &iommu_table_pseries_ops;
iommu_init_table(tbl, phb->node);
-   iommu_register_group(PCI_DN(dn)->table_group,
-   pci_domain_nr(phb->bus), 0);
	set_iommu_table_base(&dev->dev, tbl);
-	iommu_add_device(&dev->dev);
return;
}
 
@@ -770,11 +766,10 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
while (dn && PCI_DN(dn) && PCI_DN(dn)->table_group == NULL)
dn = dn->parent;
 
-   if (dn && PCI_DN(dn)) {
+   if (dn && PCI_DN(dn))
	set_iommu_table_base(&dev->dev,
	PCI_DN(dn)->table_group->tables[0]);
-	iommu_add_device(&dev->dev);
-   } else
+   else
printk(KERN_WARNING "iommu: Device %s has no iommu table\n",
   pci_name(dev));
 }
-- 
2.17.1



[PATCH kernel v7 07/20] powerpc/pseries/npu: Enable platform support

2018-12-20 Thread Alexey Kardashevskiy
We already changed the NPU API for GPUs not to call OPAL, and the
remaining bit is initializing the NPU structures.

This searches for POWER9 NVLinks attached to any device on a PHB and
initializes an NPU structure if any are found.

Signed-off-by: Alexey Kardashevskiy 
---
Changes:
v5:
* added WARN_ON_ONCE

v4:
* dropped "IBM,npu-vphb" compatible type on PHB and use the type of NVLink
---
 arch/powerpc/platforms/pseries/pci.c | 22 ++
 1 file changed, 22 insertions(+)

diff --git a/arch/powerpc/platforms/pseries/pci.c b/arch/powerpc/platforms/pseries/pci.c
index 41d8a4d..7725825 100644
--- a/arch/powerpc/platforms/pseries/pci.c
+++ b/arch/powerpc/platforms/pseries/pci.c
@@ -29,6 +29,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "pseries.h"
 
 #if 0
@@ -237,6 +238,8 @@ static void __init pSeries_request_regions(void)
 
 void __init pSeries_final_fixup(void)
 {
+   struct pci_controller *hose;
+
pSeries_request_regions();
 
eeh_probe_devices();
@@ -246,6 +249,25 @@ void __init pSeries_final_fixup(void)
ppc_md.pcibios_sriov_enable = pseries_pcibios_sriov_enable;
ppc_md.pcibios_sriov_disable = pseries_pcibios_sriov_disable;
 #endif
+	list_for_each_entry(hose, &hose_list, list_node) {
+   struct device_node *dn = hose->dn, *nvdn;
+
+   while (1) {
+   dn = of_find_all_nodes(dn);
+   if (!dn)
+   break;
+   nvdn = of_parse_phandle(dn, "ibm,nvlink", 0);
+   if (!nvdn)
+   continue;
+   if (!of_device_is_compatible(nvdn, "ibm,npu-link"))
+   continue;
+   if (!of_device_is_compatible(nvdn->parent,
+   "ibm,power9-npu"))
+   continue;
+   WARN_ON_ONCE(pnv_npu2_init(hose));
+   break;
+   }
+   }
 }
 
 /*
-- 
2.17.1



Re: [PATCH] powerpc/8xx: Map a second 8M text page at startup when needed.

2018-12-20 Thread Christoph Hellwig
On Thu, Dec 20, 2018 at 05:48:25AM +, Christophe Leroy wrote:
> Some debug setup like CONFIG_KASAN generate huge
> kernels with text size over the 8M limit.
> 
> This patch maps a second 8M page when _einittext is over 8M.

Do we also need a check to generate a useful warning if we ever overflow
the 16MB?


[PATCH kernel v7 06/20] powerpc/pseries/iommu: Use memory@ nodes in max RAM address calculation

2018-12-20 Thread Alexey Kardashevskiy
We might have memory@ nodes with "linux,usable-memory" set to zero
(for example, to replicate powernv's behaviour for GPU coherent memory),
which means the memory needs extra initialization; but since it can be
used afterwards, the pseries platform will try mapping it for DMA, so
the DMA window needs to cover those memory regions too. If the window
cannot cover the new memory regions, the memory onlining fails.

This walks through the memory nodes to find the highest RAM address to
let a huge DMA window cover that too in case this memory gets onlined
later.

Signed-off-by: Alexey Kardashevskiy 
---
Changes:
v4:
* uses of_read_number directly instead of cut-n-pasted read_n_cells
---
 arch/powerpc/platforms/pseries/iommu.c | 33 +-
 1 file changed, 32 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 06f0296..cbcc8ce 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -964,6 +964,37 @@ struct failed_ddw_pdn {
 
 static LIST_HEAD(failed_ddw_pdn_list);
 
+static phys_addr_t ddw_memory_hotplug_max(void)
+{
+   phys_addr_t max_addr = memory_hotplug_max();
+   struct device_node *memory;
+
+   for_each_node_by_type(memory, "memory") {
+   unsigned long start, size;
+   int ranges, n_mem_addr_cells, n_mem_size_cells, len;
+   const __be32 *memcell_buf;
+
+   memcell_buf = of_get_property(memory, "reg", &len);
+   if (!memcell_buf || len <= 0)
+   continue;
+
+   n_mem_addr_cells = of_n_addr_cells(memory);
+   n_mem_size_cells = of_n_size_cells(memory);
+
+   /* ranges in cell */
+   ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
+
+   start = of_read_number(memcell_buf, n_mem_addr_cells);
+   memcell_buf += n_mem_addr_cells;
+   size = of_read_number(memcell_buf, n_mem_size_cells);
+   memcell_buf += n_mem_size_cells;
+
+   max_addr = max_t(phys_addr_t, max_addr, start + size);
+   }
+
+   return max_addr;
+}
+
 /*
  * If the PE supports dynamic dma windows, and there is space for a table
  * that can map all pages in a linear offset, then setup such a table,
@@ -1053,7 +1084,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn)
}
/* verify the window * number of ptes will map the partition */
/* check largest block * page size > max memory hotplug addr */
-   max_addr = memory_hotplug_max();
+   max_addr = ddw_memory_hotplug_max();
if (query.largest_available_block < (max_addr >> page_shift)) {
		dev_dbg(&dev->dev, "can't map partition max 0x%llx with %u "
  "%llu-sized pages\n", max_addr,  
query.largest_available_block,
-- 
2.17.1



[PATCH kernel v7 03/20] powerpc/vfio/iommu/kvm: Do not pin device memory

2018-12-20 Thread Alexey Kardashevskiy
This new memory does not have page structs as it is not plugged into
the host, so gup() will fail anyway.

This adds 2 helpers:
- mm_iommu_newdev() to preregister the "memory device" memory so
the rest of the API can still be used;
- mm_iommu_is_devmem() to know if the physical address is one of these
new regions, which we must not unpin.

This adds @mm to tce_page_is_contained() and iommu_tce_xchg() to test
if the memory is device memory to avoid pfn_to_page().

This adds a check for device memory in mm_iommu_ua_mark_dirty_rm() which
does delayed pages dirtying.

Signed-off-by: Alexey Kardashevskiy 
Reviewed-by: Paul Mackerras 
---
Changes:
v6:
* added dummy mm_iommu_is_devmem() for !CONFIG_SPAPR_TCE_IOMMU
* removed "extern" from c file

v5:
* mm_iommu_is_devmem() now returns the actual size, which might be smaller
than the pageshift, so tce_page_is_contained() won't do pfn_to_page()
if @hpa..@hpa+64K is preregistered but page_shift is bigger than 16
* removed David's r-by because of the change in mm_iommu_is_devmem

v4:
* added device memory check in the real mode path
---
 arch/powerpc/include/asm/iommu.h   |  5 +-
 arch/powerpc/include/asm/mmu_context.h | 11 +++
 arch/powerpc/kernel/iommu.c| 11 ++-
 arch/powerpc/kvm/book3s_64_vio.c   | 18 ++---
 arch/powerpc/mm/mmu_context_iommu.c| 93 +++---
 drivers/vfio/vfio_iommu_spapr_tce.c| 29 +---
 6 files changed, 135 insertions(+), 32 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 35db0cb..a8aeac0 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -218,8 +218,9 @@ extern void iommu_register_group(struct iommu_table_group *table_group,
 extern int iommu_add_device(struct device *dev);
 extern void iommu_del_device(struct device *dev);
 extern int __init tce_iommu_bus_notifier_init(void);
-extern long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry,
-   unsigned long *hpa, enum dma_data_direction *direction);
+extern long iommu_tce_xchg(struct mm_struct *mm, struct iommu_table *tbl,
+   unsigned long entry, unsigned long *hpa,
+   enum dma_data_direction *direction);
 #else
 static inline void iommu_register_group(struct iommu_table_group *table_group,
int pci_domain_number,
diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index 268e112..c50bd6a 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -24,6 +24,9 @@ extern bool mm_iommu_preregistered(struct mm_struct *mm);
 extern long mm_iommu_new(struct mm_struct *mm,
unsigned long ua, unsigned long entries,
struct mm_iommu_table_group_mem_t **pmem);
+extern long mm_iommu_newdev(struct mm_struct *mm, unsigned long ua,
+   unsigned long entries, unsigned long dev_hpa,
+   struct mm_iommu_table_group_mem_t **pmem);
 extern long mm_iommu_put(struct mm_struct *mm,
struct mm_iommu_table_group_mem_t *mem);
 extern void mm_iommu_init(struct mm_struct *mm);
@@ -39,8 +42,16 @@ extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
 extern long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
unsigned long ua, unsigned int pageshift, unsigned long *hpa);
 extern void mm_iommu_ua_mark_dirty_rm(struct mm_struct *mm, unsigned long ua);
+extern bool mm_iommu_is_devmem(struct mm_struct *mm, unsigned long hpa,
+   unsigned int pageshift, unsigned long *size);
 extern long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem);
 extern void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem);
+#else
+static inline bool mm_iommu_is_devmem(struct mm_struct *mm, unsigned long hpa,
+   unsigned int pageshift, unsigned long *size)
+{
+   return false;
+}
 #endif
 extern void switch_slb(struct task_struct *tsk, struct mm_struct *mm);
 extern void set_context(unsigned long id, pgd_t *pgd);
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index f0dc680..cbcc615 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -47,6 +47,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #define DBG(...)
 
@@ -993,15 +994,19 @@ int iommu_tce_check_gpa(unsigned long page_shift, unsigned long gpa)
 }
 EXPORT_SYMBOL_GPL(iommu_tce_check_gpa);
 
-long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry,
-   unsigned long *hpa, enum dma_data_direction *direction)
+long iommu_tce_xchg(struct mm_struct *mm, struct iommu_table *tbl,
+   unsigned long entry, unsigned long *hpa,
+   enum dma_data_direction *direction)
 {
long ret;
+   unsigned long size = 0;
 
ret = tbl->it_ops->exchange(tbl, entry, hpa, direction);
 
if (!ret && ((*direction == DMA_FROM_DEVICE) ||
-  

Re: [PATCH kernel v6 18/20] vfio_pci: Allow mapping extra regions

2018-12-20 Thread Christoph Hellwig
On Wed, Dec 19, 2018 at 09:43:58AM -0700, Alex Williamson wrote:
> [cc +kvm, +lkml]
> 
> Sorry, just noticed these are only visible on ppc lists or for those
> directly cc'd.  vfio's official development list is the kvm list.  I'll
> let spapr specific changes get away without copying this list, but
> changes like this really need to be visible to everyone.  Thanks,

Please resend the whole series to include everyone.