Re: [RFC PATCH 8/8] powerpc/64s/radix: Only flush local TLB for spurious fault flushes

2017-09-07 Thread Benjamin Herrenschmidt
On Fri, 2017-09-08 at 14:44 +1000, Nicholas Piggin wrote:
> On Fri, 08 Sep 2017 08:05:38 +1000
> Benjamin Herrenschmidt  wrote:
> 
> > On Fri, 2017-09-08 at 00:51 +1000, Nicholas Piggin wrote:
> > > When permissiveness is relaxed, or found to have been relaxed by
> > > another thread, we flush that address out of the TLB to avoid a
> > > future fault or micro-fault due to a stale TLB entry.
> > > 
> > > Currently for processes with TLBs on other CPUs, this flush is always
> > > done with a global tlbie. Although that could reduce faults on remote
> > > CPUs, a broadcast operation seems to be wasteful for something that
> > > can be handled in-core by the remote CPU if it comes to it.
> > > 
> > > This is not benchmarked yet. It does seem to cut some tlbie operations
> > > from the bus.  
> > 
> > What happens with the nest MMU here ?
> 
> Good question, I'm not sure. I can't tell from the UM whether the
> agent and NMMU must discard cached translations if there is a
> translation cached but it has a permission fault. It's not clear 
> from what I've read whether it's relying on the host to send back a
> tlbie.

I think it's supposed to re-do a tablewalk.

> I'll keep digging.
> 
> Thanks,
> Nick


Re: [RFC PATCH 8/8] powerpc/64s/radix: Only flush local TLB for spurious fault flushes

2017-09-07 Thread Aneesh Kumar K.V
Nicholas Piggin  writes:

> When permissiveness is relaxed, or found to have been relaxed by
> another thread, we flush that address out of the TLB to avoid a
> future fault or micro-fault due to a stale TLB entry.
>
> Currently for processes with TLBs on other CPUs, this flush is always
> done with a global tlbie. Although that could reduce faults on remote
> CPUs, a broadcast operation seems to be wasteful for something that
> can be handled in-core by the remote CPU if it comes to it.
>
> This is not benchmarked yet. It does seem to cut some tlbie operations
> from the bus.
>
> Signed-off-by: Nicholas Piggin 
> ---
>  .../powerpc/include/asm/book3s/64/tlbflush-radix.h |  5 
>  arch/powerpc/include/asm/book3s/64/tlbflush.h  | 11 +
>  arch/powerpc/mm/pgtable-book3s64.c |  5 +++-
>  arch/powerpc/mm/pgtable.c  |  2 +-
>  arch/powerpc/mm/tlb-radix.c| 27 
> ++
>  5 files changed, 48 insertions(+), 2 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h 
> b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
> index b12460b306a7..34cd864b8fc1 100644
> --- a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
> +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
> @@ -16,6 +16,8 @@ extern bool radix__flush_tlb_range_psize(struct mm_struct 
> *mm, unsigned long sta
>unsigned long end, int psize);
>  extern void radix__flush_pmd_tlb_range(struct vm_area_struct *vma,
>  unsigned long start, unsigned long end);
> +extern void radix__local_flush_pmd_tlb_range(struct vm_area_struct *vma,
> + unsigned long start, unsigned long end);
>  extern void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long 
> start,
>   unsigned long end);
>  extern void radix__flush_tlb_kernel_range(unsigned long start, unsigned long 
> end);
> @@ -24,6 +26,9 @@ extern void radix__local_flush_tlb_mm(struct mm_struct *mm);
>  extern void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned 
> long vmaddr);
>  extern void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned 
> long vmaddr,
> int psize);
> +extern void radix__local_flush_tlb_range_psize(struct mm_struct *mm,
> + unsigned long start, unsigned long end,
> + int psize);
>  extern void radix__tlb_flush(struct mmu_gather *tlb);
>  #ifdef CONFIG_SMP
>  extern void radix__flush_tlb_mm(struct mm_struct *mm);
> diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h 
> b/arch/powerpc/include/asm/book3s/64/tlbflush.h
> index 72b925f97bab..8a8b3e11a28e 100644
> --- a/arch/powerpc/include/asm/book3s/64/tlbflush.h
> +++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h
> @@ -83,6 +83,17 @@ static inline void flush_tlb_page(struct vm_area_struct 
> *vma,
>  #define flush_tlb_mm(mm) local_flush_tlb_mm(mm)
>  #define flush_tlb_page(vma, addr)local_flush_tlb_page(vma, addr)
>  #endif /* CONFIG_SMP */
> +
> +#define flush_tlb_fix_spurious_fault flush_tlb_fix_spurious_fault
> +static inline void flush_tlb_fix_spurious_fault(struct vm_area_struct *vma,
> + unsigned long address)
> +{
> + if (radix_enabled())
> + radix__local_flush_tlb_page(vma, address);
> + else
> + flush_tlb_page(vma, address);
> +}
> +
>  /*
>   * flush the page walk cache for the address
>   */
> diff --git a/arch/powerpc/mm/pgtable-book3s64.c 
> b/arch/powerpc/mm/pgtable-book3s64.c
> index 3b65917785a5..e46f346388d6 100644
> --- a/arch/powerpc/mm/pgtable-book3s64.c
> +++ b/arch/powerpc/mm/pgtable-book3s64.c
> @@ -40,7 +40,10 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, 
> unsigned long address,
>   if (changed) {
>   __ptep_set_access_flags(vma->vm_mm, pmdp_ptep(pmdp),
>   pmd_pte(entry), address);
> - flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
> + if (radix_enabled())
> + radix__local_flush_pmd_tlb_range(vma, address, address 
> + HPAGE_PMD_SIZE);
> + else
> + flush_pmd_tlb_range(vma, address, address + 
> HPAGE_PMD_SIZE);
  this is no-op for hash.


>   }
>   return changed;
>  }
> diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
> index a03ff3d99e0c..acd6ae8062ce 100644


-aneesh



Re: [PATCH] ASoC: fsl_ssi: Override bit clock rate based on slot number

2017-09-07 Thread Nicolin Chen
On Thu, Sep 07, 2017 at 10:23:43PM -0700, Nicolin Chen wrote:
> The set_sysclk() now is used to override the output bit clock rate.
> But this is not a common way to implement a set_dai_sysclk(). And
> this creates a problem when a general machine driver (simple-card
> for example) tries to do set_dai_sysclk() by passing an input clock
> rate for the baud clock instead of setting the bit clock rate as
> fsl_ssi driver expected.
> 
> So this patch solves this problem by firstly removing set_sysclk()
> since the hw_params() can calculate the bit clock rate. Secondly,
> in order not to break those TDM use cases which previously might
> have been using set_sysclk() to override the bit clock rate, this
> patch changes the driver to override it based on the slot number.
> 
> The patch also removes an obsolete comment of the dir parameter.
> 
> Signed-off-by: Nicolin Chen 

Forgot to mention, I think that it's better to wait for a couple of
Tested-by from those who use the TDM mode of SSI before applying it.

Thanks
Nicolin


Re: [PATCH v2 1/2] powerpc/npu: Use flush_all_mm() instead of flush_tlb_mm()

2017-09-07 Thread kbuild test robot
Hi Alistair,

[auto build test ERROR on powerpc/next]
[also build test ERROR on next-20170907]
[cannot apply to v4.13]
[if your patch is applied to the wrong git tree, please drop us a note to help 
improve the system]

url:
https://github.com/0day-ci/linux/commits/Alistair-Popple/powerpc-npu-Use-flush_all_mm-instead-of-flush_tlb_mm/20170908-080828
base:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
config: powerpc-allmodconfig (attached as .config)
compiler: powerpc64-linux-gnu-gcc (Debian 6.1.1-9) 6.1.1 20160705
reproduce:
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
make.cross ARCH=powerpc 

All errors (new ones prefixed by >>):

   arch/powerpc/platforms/powernv/npu-dma.c: In function 'mmio_invalidate':
>> arch/powerpc/platforms/powernv/npu-dma.c:552:2: error: implicit declaration 
>> of function 'flush_all_mm' [-Werror=implicit-function-declaration]
 flush_all_mm(npu_context->mm);
 ^~~~
   cc1: some warnings being treated as errors

vim +/flush_all_mm +552 arch/powerpc/platforms/powernv/npu-dma.c

   533  
   534  /*
   535   * Invalidate either a single address or an entire PID depending on
   536   * the value of va.
   537   */
   538  static void mmio_invalidate(struct npu_context *npu_context, int va,
   539  unsigned long address, bool flush)
   540  {
   541  int i, j;
   542  struct npu *npu;
   543  struct pnv_phb *nphb;
   544  struct pci_dev *npdev;
   545  struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS];
   546  unsigned long pid = npu_context->mm->context.id;
   547  
   548  /*
   549   * Unfortunately the nest mmu does not support flushing specific
   550   * addresses so we have to flush the whole mm.
   551   */
 > 552  flush_all_mm(npu_context->mm);
   553  
   554  /*
   555   * Loop over all the NPUs this process is active on and launch
   556   * an invalidate.
   557   */
   558  for (i = 0; i <= max_npu2_index; i++) {
   559  mmio_atsd_reg[i].reg = -1;
   560  for (j = 0; j < NV_MAX_LINKS; j++) {
   561  npdev = npu_context->npdev[i][j];
   562  if (!npdev)
   563  continue;
   564  
   565  nphb = 
pci_bus_to_host(npdev->bus)->private_data;
   566  npu = >npu;
   567  mmio_atsd_reg[i].npu = npu;
   568  
   569  if (va)
   570  mmio_atsd_reg[i].reg =
   571  mmio_invalidate_va(npu, 
address, pid,
   572  flush);
   573  else
   574  mmio_atsd_reg[i].reg =
   575  mmio_invalidate_pid(npu, pid, 
flush);
   576  
   577  /*
   578   * The NPU hardware forwards the shootdown to 
all GPUs
   579   * so we only have to launch one shootdown per 
NPU.
   580   */
   581  break;
   582  }
   583  }
   584  
   585  mmio_invalidate_wait(mmio_atsd_reg, flush);
   586  if (flush)
   587  /* Wait for the flush to complete */
   588  mmio_invalidate_wait(mmio_atsd_reg, false);
   589  }
   590  

---
0-DAY kernel test infrastructureOpen Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


.config.gz
Description: application/gzip


[PATCH] ASoC: fsl-asoc-card: Don't error out if ENOTSUPP

2017-09-07 Thread Nicolin Chen
The snd_soc_component_set_sysclk() and snd_soc_dai_set_tdm_slot()
in the soc-core.c will return -ENOTSUPP if there is no function
implementation for them in the dai and component drivers.

So this patch tries to ignore this errno.

Signed-off-by: Nicolin Chen 
---
 sound/soc/fsl/fsl-asoc-card.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/sound/soc/fsl/fsl-asoc-card.c b/sound/soc/fsl/fsl-asoc-card.c
index 2db4d0c..3772abb 100644
--- a/sound/soc/fsl/fsl-asoc-card.c
+++ b/sound/soc/fsl/fsl-asoc-card.c
@@ -166,7 +166,7 @@ static int fsl_asoc_card_hw_params(struct snd_pcm_substream 
*substream,
ret = snd_soc_dai_set_sysclk(rtd->cpu_dai, cpu_priv->sysclk_id[tx],
 cpu_priv->sysclk_freq[tx],
 cpu_priv->sysclk_dir[tx]);
-   if (ret) {
+   if (ret && ret != -ENOTSUPP) {
dev_err(dev, "failed to set sysclk for cpu dai\n");
return ret;
}
@@ -174,7 +174,7 @@ static int fsl_asoc_card_hw_params(struct snd_pcm_substream 
*substream,
if (cpu_priv->slot_width) {
ret = snd_soc_dai_set_tdm_slot(rtd->cpu_dai, 0x3, 0x3, 2,
   cpu_priv->slot_width);
-   if (ret) {
+   if (ret && ret != -ENOTSUPP) {
dev_err(dev, "failed to set TDM slot for cpu dai\n");
return ret;
}
@@ -270,7 +270,7 @@ static int fsl_asoc_card_set_bias_level(struct snd_soc_card 
*card,
 
ret = snd_soc_dai_set_sysclk(codec_dai, codec_priv->fll_id,
 pll_out, SND_SOC_CLOCK_IN);
-   if (ret) {
+   if (ret && ret != -ENOTSUPP) {
dev_err(dev, "failed to set SYSCLK: %d\n", ret);
return ret;
}
@@ -283,7 +283,7 @@ static int fsl_asoc_card_set_bias_level(struct snd_soc_card 
*card,
ret = snd_soc_dai_set_sysclk(codec_dai, codec_priv->mclk_id,
 codec_priv->mclk_freq,
 SND_SOC_CLOCK_IN);
-   if (ret) {
+   if (ret && ret != -ENOTSUPP) {
dev_err(dev, "failed to switch away from FLL: %d\n", 
ret);
return ret;
}
@@ -459,7 +459,7 @@ static int fsl_asoc_card_late_probe(struct snd_soc_card 
*card)
 
ret = snd_soc_dai_set_sysclk(codec_dai, codec_priv->mclk_id,
 codec_priv->mclk_freq, SND_SOC_CLOCK_IN);
-   if (ret) {
+   if (ret && ret != -ENOTSUPP) {
dev_err(dev, "failed to set sysclk in %s\n", __func__);
return ret;
}
-- 
2.7.4



[PATCH] ASoC: fsl_ssi: Override bit clock rate based on slot number

2017-09-07 Thread Nicolin Chen
The set_sysclk() now is used to override the output bit clock rate.
But this is not a common way to implement a set_dai_sysclk(). And
this creates a problem when a general machine driver (simple-card
for example) tries to do set_dai_sysclk() by passing an input clock
rate for the baud clock instead of setting the bit clock rate as
fsl_ssi driver expected.

So this patch solves this problem by firstly removing set_sysclk()
since the hw_params() can calculate the bit clock rate. Secondly,
in order not to break those TDM use cases which previously might
have been using set_sysclk() to override the bit clock rate, this
patch changes the driver to override it based on the slot number.

The patch also removes an obsolete comment of the dir parameter.

Signed-off-by: Nicolin Chen 
---
 sound/soc/fsl/fsl_ssi.c | 26 --
 1 file changed, 8 insertions(+), 18 deletions(-)

diff --git a/sound/soc/fsl/fsl_ssi.c b/sound/soc/fsl/fsl_ssi.c
index 64598d1..3657c88 100644
--- a/sound/soc/fsl/fsl_ssi.c
+++ b/sound/soc/fsl/fsl_ssi.c
@@ -197,12 +197,12 @@ struct fsl_ssi_soc_data {
  * @use_dma: DMA is used or FIQ with stream filter
  * @use_dual_fifo: DMA with support for both FIFOs used
  * @fifo_deph: Depth of the SSI FIFOs
+ * @slots: number of slots
  * @rxtx_reg_val: Specific register settings for receive/transmit configuration
  *
  * @clk: SSI clock
  * @baudclk: SSI baud clock for master mode
  * @baudclk_streams: Active streams that are using baudclk
- * @bitclk_freq: bitclock frequency set by .set_dai_sysclk
  *
  * @dma_params_tx: DMA transmit parameters
  * @dma_params_rx: DMA receive parameters
@@ -233,12 +233,12 @@ struct fsl_ssi_private {
bool use_dual_fifo;
bool has_ipg_clk_name;
unsigned int fifo_depth;
+   unsigned int slots;
struct fsl_ssi_rxtx_reg_val rxtx_reg_val;
 
struct clk *clk;
struct clk *baudclk;
unsigned int baudclk_streams;
-   unsigned int bitclk_freq;
 
/* regcache for volatile regs */
u32 regcache_sfcsr;
@@ -700,8 +700,7 @@ static void fsl_ssi_shutdown(struct snd_pcm_substream 
*substream,
  * Note: This function can be only called when using SSI as DAI master
  *
  * Quick instruction for parameters:
- * freq: Output BCLK frequency = samplerate * 32 (fixed) * channels
- * dir: SND_SOC_CLOCK_OUT -> TxBCLK, SND_SOC_CLOCK_IN -> RxBCLK.
+ * freq: Output BCLK frequency = samplerate * 32 (fixed) * slots (or channels)
  */
 static int fsl_ssi_set_bclk(struct snd_pcm_substream *substream,
struct snd_soc_dai *cpu_dai,
@@ -716,9 +715,9 @@ static int fsl_ssi_set_bclk(struct snd_pcm_substream 
*substream,
unsigned int freq;
bool baudclk_is_used;
 
-   /* Prefer the explicitly set bitclock frequency */
-   if (ssi_private->bitclk_freq)
-   freq = ssi_private->bitclk_freq;
+   /* Generate bit clock based on the slot or channel number */
+   if (ssi_private->slots)
+   freq = ssi_private->slots * 32 * params_rate(hw_params);
else
freq = params_channels(hw_params) * 32 * params_rate(hw_params);
 
@@ -805,16 +804,6 @@ static int fsl_ssi_set_bclk(struct snd_pcm_substream 
*substream,
return 0;
 }
 
-static int fsl_ssi_set_dai_sysclk(struct snd_soc_dai *cpu_dai,
-   int clk_id, unsigned int freq, int dir)
-{
-   struct fsl_ssi_private *ssi_private = snd_soc_dai_get_drvdata(cpu_dai);
-
-   ssi_private->bitclk_freq = freq;
-
-   return 0;
-}
-
 /**
  * fsl_ssi_hw_params - program the sample size
  *
@@ -1121,6 +1110,8 @@ static int fsl_ssi_set_dai_tdm_slot(struct snd_soc_dai 
*cpu_dai, u32 tx_mask,
 
regmap_update_bits(regs, CCSR_SSI_SCR, CCSR_SSI_SCR_SSIEN, val);
 
+   ssi_private->slots = slots;
+
return 0;
 }
 
@@ -1191,7 +1182,6 @@ static const struct snd_soc_dai_ops fsl_ssi_dai_ops = {
.hw_params  = fsl_ssi_hw_params,
.hw_free= fsl_ssi_hw_free,
.set_fmt= fsl_ssi_set_dai_fmt,
-   .set_sysclk = fsl_ssi_set_dai_sysclk,
.set_tdm_slot   = fsl_ssi_set_dai_tdm_slot,
.trigger= fsl_ssi_trigger,
 };
-- 
2.7.4



Re: [RFC PATCH 8/8] powerpc/64s/radix: Only flush local TLB for spurious fault flushes

2017-09-07 Thread Nicholas Piggin
On Fri, 08 Sep 2017 08:05:38 +1000
Benjamin Herrenschmidt  wrote:

> On Fri, 2017-09-08 at 00:51 +1000, Nicholas Piggin wrote:
> > When permissiveness is relaxed, or found to have been relaxed by
> > another thread, we flush that address out of the TLB to avoid a
> > future fault or micro-fault due to a stale TLB entry.
> > 
> > Currently for processes with TLBs on other CPUs, this flush is always
> > done with a global tlbie. Although that could reduce faults on remote
> > CPUs, a broadcast operation seems to be wasteful for something that
> > can be handled in-core by the remote CPU if it comes to it.
> > 
> > This is not benchmarked yet. It does seem to cut some tlbie operations
> > from the bus.  
> 
> What happens with the nest MMU here ?

Good question, I'm not sure. I can't tell from the UM whether the
agent and NMMU must discard cached translations if there is a
translation cached but it has a permission fault. It's not clear 
from what I've read whether it's relying on the host to send back a
tlbie.

I'll keep digging.

Thanks,
Nick


Re: [PATCH 2/2] powerpc/powernv/npu: Don't explicitly flush nmmu tlb

2017-09-07 Thread kbuild test robot
Hi Alistair,

[auto build test ERROR on powerpc/next]
[also build test ERROR on next-20170907]
[cannot apply to v4.13]
[if your patch is applied to the wrong git tree, please drop us a note to help 
improve the system]

url:
https://github.com/0day-ci/linux/commits/Alistair-Popple/powerpc-npu-Use-flush_all_mm-instead-of-flush_tlb_mm/20170908-072908
base:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
config: powerpc-defconfig (attached as .config)
compiler: powerpc64-linux-gnu-gcc (Debian 6.1.1-9) 6.1.1 20160705
reproduce:
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
make.cross ARCH=powerpc 

All errors (new ones prefixed by >>):

   arch/powerpc/platforms/powernv/npu-dma.c: In function 'mmio_invalidate':
   arch/powerpc/platforms/powernv/npu-dma.c:555:3: error: implicit declaration 
of function 'flush_all_mm' [-Werror=implicit-function-declaration]
  flush_all_mm(npu_context->mm);
  ^~~~
   arch/powerpc/platforms/powernv/npu-dma.c: In function 
'pnv_npu2_init_context':
>> arch/powerpc/platforms/powernv/npu-dma.c:744:3: error: implicit declaration 
>> of function 'inc_mm_active_cpus' [-Werror=implicit-function-declaration]
  inc_mm_active_cpus(mm);
  ^~
   arch/powerpc/platforms/powernv/npu-dma.c: In function 
'pnv_npu2_release_context':
>> arch/powerpc/platforms/powernv/npu-dma.c:758:3: error: implicit declaration 
>> of function 'dec_mm_active_cpus' [-Werror=implicit-function-declaration]
  dec_mm_active_cpus(npu_context->mm);
  ^~
   cc1: all warnings being treated as errors

vim +/inc_mm_active_cpus +744 arch/powerpc/platforms/powernv/npu-dma.c

   534  
   535  /*
   536   * Invalidate either a single address or an entire PID depending on
   537   * the value of va.
   538   */
   539  static void mmio_invalidate(struct npu_context *npu_context, int va,
   540  unsigned long address, bool flush)
   541  {
   542  int i, j;
   543  struct npu *npu;
   544  struct pnv_phb *nphb;
   545  struct pci_dev *npdev;
   546  struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS];
   547  unsigned long pid = npu_context->mm->context.id;
   548  
   549  if (npu_context->nmmu_flush)
   550  /*
   551   * Unfortunately the nest mmu does not support flushing 
specific
   552   * addresses so we have to flush the whole mm once 
before
   553   * shooting down the GPU translation.
   554   */
 > 555  flush_all_mm(npu_context->mm);
   556  
   557  /*
   558   * Loop over all the NPUs this process is active on and launch
   559   * an invalidate.
   560   */
   561  for (i = 0; i <= max_npu2_index; i++) {
   562  mmio_atsd_reg[i].reg = -1;
   563  for (j = 0; j < NV_MAX_LINKS; j++) {
   564  npdev = npu_context->npdev[i][j];
   565  if (!npdev)
   566  continue;
   567  
   568  nphb = 
pci_bus_to_host(npdev->bus)->private_data;
   569  npu = >npu;
   570  mmio_atsd_reg[i].npu = npu;
   571  
   572  if (va)
   573  mmio_atsd_reg[i].reg =
   574  mmio_invalidate_va(npu, 
address, pid,
   575  flush);
   576  else
   577  mmio_atsd_reg[i].reg =
   578  mmio_invalidate_pid(npu, pid, 
flush);
   579  
   580  /*
   581   * The NPU hardware forwards the shootdown to 
all GPUs
   582   * so we only have to launch one shootdown per 
NPU.
   583   */
   584  break;
   585  }
   586  }
   587  
   588  mmio_invalidate_wait(mmio_atsd_reg, flush);
   589  if (flush)
   590  /* Wait for the flush to complete */
   591  mmio_invalidate_wait(mmio_atsd_reg, false);
   592  }
   593  
   594  static void pnv_npu2_mn_release(struct mmu_notifier *mn,
   595  struct mm_struct *mm)
   596  {
   597  struct npu_context *npu_context = mn_to_npu_context(mn);
   598  
   599  /* Call into device driver to stop requests to the NMMU */
   600  if (npu_context->release_cb)
   601  npu_context->release_cb(npu_context, npu_

Re: [rfc 2/3] powerpc/mce: Extract physical_address for UE errors

2017-09-07 Thread Mahesh Jagannath Salgaonkar
On 09/05/2017 09:45 AM, Balbir Singh wrote:
> Walk the page table for NIP and extract the instruction. Then
> use the instruction to find the effective address via analyse_instr().
> 
> We might have page table walking races, but we expect them to
> be rare, the physical address extraction is best effort. The idea
> is to then hook up this infrastructure to memory failure eventually.
> 
> Signed-off-by: Balbir Singh 
> ---
>  arch/powerpc/include/asm/mce.h  |  2 +-
>  arch/powerpc/kernel/mce.c   |  6 -
>  arch/powerpc/kernel/mce_power.c | 60 
> +
>  3 files changed, 61 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
> index 75292c7..3a1226e 100644
> --- a/arch/powerpc/include/asm/mce.h
> +++ b/arch/powerpc/include/asm/mce.h
> @@ -204,7 +204,7 @@ struct mce_error_info {
> 
>  extern void save_mce_event(struct pt_regs *regs, long handled,
>  struct mce_error_info *mce_err, uint64_t nip,
> -uint64_t addr);
> +uint64_t addr, uint64_t phys_addr);
>  extern int get_mce_event(struct machine_check_event *mce, bool release);
>  extern void release_mce_event(void);
>  extern void machine_check_queue_event(void);
> diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
> index e254399..f41a75d 100644
> --- a/arch/powerpc/kernel/mce.c
> +++ b/arch/powerpc/kernel/mce.c
> @@ -82,7 +82,7 @@ static void mce_set_error_info(struct machine_check_event 
> *mce,
>   */
>  void save_mce_event(struct pt_regs *regs, long handled,
>   struct mce_error_info *mce_err,
> - uint64_t nip, uint64_t addr)
> + uint64_t nip, uint64_t addr, uint64_t phys_addr)
>  {
>   int index = __this_cpu_inc_return(mce_nest_count) - 1;
>   struct machine_check_event *mce = this_cpu_ptr(_event[index]);
> @@ -140,6 +140,10 @@ void save_mce_event(struct pt_regs *regs, long handled,
>   } else if (mce->error_type == MCE_ERROR_TYPE_UE) {
>   mce->u.ue_error.effective_address_provided = true;
>   mce->u.ue_error.effective_address = addr;
> + if (phys_addr != ULONG_MAX) {
> + mce->u.ue_error.physical_address_provided = true;
> + mce->u.ue_error.physical_address = phys_addr;
> + }
>   }
>   return;
>  }
> diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
> index b76ca19..b77a698 100644
> --- a/arch/powerpc/kernel/mce_power.c
> +++ b/arch/powerpc/kernel/mce_power.c
> @@ -27,6 +27,25 @@
>  #include 
>  #include 
>  #include 
> +#include 
> +#include 
> +#include 
> +
> +static unsigned long addr_to_pfn(struct mm_struct *mm, unsigned long addr)
> +{
> + pte_t *ptep;
> + unsigned long flags;
> +
> + local_irq_save(flags);
> + if (mm == current->mm)
> + ptep = find_current_mm_pte(mm->pgd, addr, NULL, NULL);
> + else
> + ptep = find_init_mm_pte(addr, NULL);
> + local_irq_restore(flags);
> + if (!ptep)
> + return ULONG_MAX;
> + return pte_pfn(*ptep);
> +}
> 
>  static void flush_tlb_206(unsigned int num_sets, unsigned int action)
>  {
> @@ -489,7 +508,8 @@ static int mce_handle_ierror(struct pt_regs *regs,
> 
>  static int mce_handle_derror(struct pt_regs *regs,
>   const struct mce_derror_table table[],
> - struct mce_error_info *mce_err, uint64_t *addr)
> + struct mce_error_info *mce_err, uint64_t *addr,
> + uint64_t *phys_addr)
>  {
>   uint64_t dsisr = regs->dsisr;
>   int handled = 0;
> @@ -555,7 +575,37 @@ static int mce_handle_derror(struct pt_regs *regs,
>   mce_err->initiator = table[i].initiator;
>   if (table[i].dar_valid)
>   *addr = regs->dar;
> -
> + else if (mce_err->severity == MCE_SEV_ERROR_SYNC &&
> + table[i].error_type == MCE_ERROR_TYPE_UE) {
> + /*
> +  * Carefully look at the NIP to determine
> +  * the instruction to analyse. Reading the NIP
> +  * in real-mode is tricky and can lead to recursive
> +  * faults
> +  */
> + int instr;
> + struct mm_struct *mm;
> + unsigned long nip = regs->nip;
> + unsigned long pfn = 0, instr_addr;
> + struct instruction_op op;
> + struct pt_regs tmp = *regs;
> +
> + if (user_mode(regs))
> + mm = current->mm;
> + else
> + mm = _mm;
> +
> + pfn = addr_to_pfn(mm, nip);
> + if (pfn != ULONG_MAX) {
> + instr_addr = (pfn 

Re: Machine Check in P2010(e500v2)

2017-09-07 Thread Scott Wood
On Wed, 2017-09-06 at 10:16 +, Joakim Tjernlund wrote:
> On Wed, 2017-09-06 at 10:05 +, Laurentiu Tudor wrote:
> > Hi Jocke,
> > 
> > On 09/01/2017 02:32 PM, Joakim Tjernlund wrote:
> > > I am trying to debug a Machine Check for a P2010 (e500v2) CPU:
> > > 
> > > [   28.111816] Caused by (from MCSR=10008): Bus - Read Data Bus Error
> > > [   28.117998] Oops: Machine check, sig: 7 [#1]
> > > [   28.122263] P1010 RDB
> > > [   28.124529] Modules linked in: linux_bcm_knet(PO) linux_user_bde(PO)
> > > linux_kernel_bde(PO)
> > > [   28.132718] CPU: 0 PID: 470 Comm: emxp2_hw_bl Tainted:
> > > P   O4.1.38+ #49
> > > [   28.140376] task: db16cd10 ti: df128000 task.ti: df128000
> > > [   28.145770] NIP:  LR: 10a4e404 CTR: 10046c38
> > > [   28.150730] REGS: df129f10 TRAP: 0204   Tainted:
> > > P   O (4.1.38+)
> > > [   28.157776] MSR: 0002d000   CR: 44002428  XER: 
> > > [   28.164140] DEAR: b7187000 ESR: 
> > > GPR00: 10a4e404 bf86ea30 b7ca94a0 132f9fa8 07006000 0700 
> > > 132f9fd8
> > > GPR08: b7149000 b7159000 0003e000 bf86ea20 24004424 11d6cf7c 
> > > 
> > > GPR16: 10f6e29c 10f6c872 10f6db01 b541 b541 11d92fcc 0011
> > > 0001
> > > GPR24: 01a4d12d 132ffbf0 11d6  07006000  132f9fa8
> > > 
> > > [   28.196375] NIP []   (null)
> > > [   28.199859] LR [10a4e404] 0x10a4e404
> > > [   28.203426] Call Trace:
> > > [   28.205866] ---[ end trace f456255ddf9bee83 ]---
> > > 
> > > I cannot figure out why NIP is NULL ? It LOOKs like NIP is set to
> > > MCSRR0 early on but maybe it is lost somehow?
> > > 
> > > Anyhow, looking at entry_32.S:
> > >   .globl  mcheck_transfer_to_handler
> > > mcheck_transfer_to_handler:
> > >   mfspr   r0,SPRN_DSRR0
> > >   stw r0,_DSRR0(r11)
> > >   mfspr   r0,SPRN_DSRR1
> > >   stw r0,_DSRR1(r11)
> > >   /* fall through */
> > > 
> > >   .globl  debug_transfer_to_handler
> > > debug_transfer_to_handler:
> > >   mfspr   r0,SPRN_CSRR0
> > >   stw r0,_CSRR0(r11)
> > >   mfspr   r0,SPRN_CSRR1
> > >   stw r0,_CSRR1(r11)
> > >   /* fall through */
> > > 
> > >   .globl  crit_transfer_to_handler
> > > crit_transfer_to_handler:
> > > 
> > > It looks odd that DSRRx is assigned in mcheck and CSRRx in debug and
> > > crit has none. Should not this assigment be shifted down one level?
> > > 
> > 
> > This does indeed looks weird. Have you tried moving the SPRN_CSRR* 
> > saving in the crit section? Any results?
> 
> After looking at this somwhat I think this is intentional and OK.
> I sorted NIP == NULL too:
> @@ -996,7 +998,7 @@ int fsl_pci_mcheck_exception(struct pt_regs *regs)
> if (is_in_pci_mem_space(addr)) {
> if (user_mode(regs)) {
> pagefault_disable();
> -   ret = get_user(regs->nip, );
> +   ret = get_user(inst, (__u32 __user *)regs->nip);
> pagefault_enable();
> } else {
> ret = probe_kernel_address(regs->nip, inst);

:-(

> 
> But after this, the CPU is still locked after an Machine Check. Is this
> to be expected? I figured the user space process would get a SIGBUS and
> kernel
> would resume normal operations.
> 
> Scott, maybe you have some idea?

The userspace process should exit with SIGBUS (not quite the same as receiving
a SIGBUS that can be handled).  Maybe whatever is causing the machine check
ends up causing more problems that lead to the hang.

-Scott



Re: [PATCH] powerpc/powernv: Increase memory block size to 1GB on radix

2017-09-07 Thread Anton Blanchard
Hi Reza,

> I may be misunderstanding this, but what if we did something like x86 
> does? When trying to unplug a region smaller than the mapping, they
> fill that part of the pagetable with 0xFD instead of freeing the
> whole thing. Once the whole thing is 0xFD, free it.
> 
> See arch/x86/mm/init_64.c:remove_{pte,pmd,pud}_table()
> 
> ---%<---
>   memset((void *)addr, PAGE_INUSE, next - addr);
> 
>   page_addr = page_address(pte_page(*pte));
>   if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) {
>   ...
>   pte_clear(_mm, addr, pte);
>   ...
>   }
> ---%<---

But you only have 1GB ptes at this point, you'd need to start
instantiating a new level in the tree, and populate 2MB ptes.

That is what Ben is suggesting. I'm happy to go any way (fix hotplug
to handle this, or increase the memblock size on PowerNV to 1GB), I just
need a solution.

Anton


Re: [PATCH] cxl: Dump PSL_FIR1/2 registers on PSL9 error irq

2017-09-07 Thread Andrew Donnellan

LGTM

Acked-by: Andrew Donnellan 

On 07/09/17 22:13, Vaibhav Jain wrote:

For PSL9 currently we aren't dumping the PSL FIR1/2 registers when a
PSL error interrupt is triggered. Contents of these registers are
useful in debugging AFU issues.

This patch fixes issue by updating the cxl_native_err_irq_dump_regs()
to dump these regs on PSL error interrupt thereby bringing the
behavior in line with PSL on POWER-8.

Signed-off-by: Vaibhav Jain 
---
  drivers/misc/cxl/native.c | 13 +++--
  drivers/misc/cxl/pci.c|  1 +
  2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/misc/cxl/native.c b/drivers/misc/cxl/native.c
index 4a82c313cf71..60b91e95821d 100644
--- a/drivers/misc/cxl/native.c
+++ b/drivers/misc/cxl/native.c
@@ -1261,8 +1261,17 @@ void cxl_native_err_irq_dump_regs(struct cxl *adapter)
  {
u64 fir1, fir2;
  
-	fir1 = cxl_p1_read(adapter, CXL_PSL_FIR1);

-   fir2 = cxl_p1_read(adapter, CXL_PSL_FIR2);
+   if (cxl_is_power8()) {
+   fir1 = cxl_p1_read(adapter, CXL_PSL_FIR1);
+   fir2 = cxl_p1_read(adapter, CXL_PSL_FIR2);
+   } else if (cxl_is_power9()) {
+   fir1 = cxl_p1_read(adapter, CXL_PSL9_FIR1);
+   fir2 = cxl_p1_read(adapter, CXL_PSL9_FIR2);
+   } else {
+   /* Dont report garbage */
+   fir1 = fir2 = 0;
+   WARN_ON(1);
+   }
  
  	dev_crit(>dev, "PSL_FIR1: 0x%016llx\nPSL_FIR2: 0x%016llx\n", fir1, fir2);

  }
diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c
index d18b3d9292fd..597e145f38e3 100644
--- a/drivers/misc/cxl/pci.c
+++ b/drivers/misc/cxl/pci.c
@@ -1762,6 +1762,7 @@ static const struct cxl_service_layer_ops psl9_ops = {
.debugfs_add_adapter_regs = cxl_debugfs_add_adapter_regs_psl9,
.debugfs_add_afu_regs = cxl_debugfs_add_afu_regs_psl9,
.psl_irq_dump_registers = cxl_native_irq_dump_regs_psl9,
+   .err_irq_dump_registers = cxl_native_err_irq_dump_regs,
.debugfs_stop_trace = cxl_stop_trace_psl9,
.write_timebase_ctrl = write_timebase_ctrl_psl9,
.timebase_read = timebase_read_psl9,



--
Andrew Donnellan  OzLabs, ADL Canberra
andrew.donnel...@au1.ibm.com  IBM Australia Limited



Re: [PATCH] sound: soc: fsl: Do not set DAI sysclk when it is equal to system freq

2017-09-07 Thread Nicolin Chen
On Fri, Sep 08, 2017 at 01:10:12AM +0200, Łukasz Majewski wrote:

> >Just add a property to this cpu node like:
> > clock = <&clks IMX6QDL_CLK_SSI2>;
> 
> This doesn't solve the issue:

I have a patch locally that should be able to solve your problem.
But I need to first verify on my board tonight and will send it
later (will put you in the TO/CC list).


Re: [PATCH] cxl: Dump PSL_FIR1/2 registers on PSL9 error irq

2017-09-07 Thread christophe lombard

Le 07/09/2017 à 14:13, Vaibhav Jain a écrit :

For PSL9 currently we aren't dumping the PSL FIR1/2 registers when a
PSL error interrupt is triggered. Contents of these registers are
useful in debugging AFU issues.

This patch fixes issue by updating the cxl_native_err_irq_dump_regs()
to dump these regs on PSL error interrupt thereby bringing the
behavior in line with PSL on POWER-8.

Signed-off-by: Vaibhav Jain 
---
  drivers/misc/cxl/native.c | 13 +++--
  drivers/misc/cxl/pci.c|  1 +
  2 files changed, 12 insertions(+), 2 deletions(-)


sounds good.

Acked-by:  Christophe Lombard 



Re: [PATCH] sound: soc: fsl: Do not set DAI sysclk when it is equal to system freq

2017-09-07 Thread Łukasz Majewski

Hi Nicolin,


On Wed, Sep 06, 2017 at 08:35:50PM +0200, Łukasz Majewski wrote:
  

clocks = <&clks IMX6QDL_CLK_SSI2_IPG>,
 <&clks IMX6QDL_CLK_SSI2>;
clock-names = "ipg", "baud";



dailink_master: cpu {
sound-dai = <>;
clock = <>;


If possible I do prefer a solution, which uses only DTS.
Side question - how to refer to baud clock from [1]?


Just add a property to this cpu node like:
clock = <&clks IMX6QDL_CLK_SSI2>;


This doesn't solve the issue:

root@display5:~# speaker-test

speaker-test 1.1.3

Playback device is default
Stream parameters are 48000Hz, S16_LE, 1 channels
Using 16 octaves of pink noise
Rate set to 48000Hz (requested 48fsl-ssi-dai 202c000.ssi: bitclk > ipgclk/5
000Hz)
Buffer size range from 64fsl-ssi-dai 202c000.ssi: ASoC: can't set 
202c000.ssi hw params: -22

 to 65536
Period size range from 32 to 8191
Using max buffer size 65536
Periods = 4
Unable to set hw params for playback: Invalid argument
Setting of hwparams failed: Invalid argument





system-clock-frequency = ;


This would not be necessary unless you want to specify a clock rate
so as to override the clock rate configuration in hw_params().


This is the right solution based on current simple-card driver. For
SSI (having two clocks), you have to specify the baud clock in the
cpu node like that. I believe this is what the simple-card designer
expected users to do since the cpu node is the first place that the
driver tries to look at.


I will give a shoot the option with adding the ipg clock.


No, not ipg clock. You should use the second clock -- baud clock.




--
Best regards,

Lukasz Majewski

--

DENX Software Engineering GmbH,  Managing Director: Wolfgang Denk
HRB 165235 Munich, Office: Kirchenstr.5, D-82194 Groebenzell, Germany
Phone: (+49)-8142-66989-10 Fax: (+49)-8142-66989-80 Email: w...@denx.de


Re: [PATCH] sound: soc: fsl: Do not set DAI sysclk when it is equal to system freq

2017-09-07 Thread Nicolin Chen
On Thu, Sep 07, 2017 at 02:44:11PM +0100, Mark Brown wrote:
 
> > On the other hand, the sys clock (baudclk in the driver) should be
> > configured whenever it's related to external clock outputs. When I
> > implemented this set_sysclk() for fsl_ssi.c, I used it to set this
> > sys clock (baudclk) by a machine driver, in order to set bit clock.
> > Then someone patched the driver by moving all the code to set_bclk()
> > to make machine drivers simpler. Now the set_sysclk() is remained
> > to give machine drivers a chance to override clock configurations
> > in the hw_params(). This could be used in TDM or some other special
> > cases (It could also have a purpose for backwards compatibility).
> 
> > So here, we should set baudclk (BCLK generator).
> 
> No, that's just going to cause confusion - if all the other drivers are
> using set_sysclk() to set an input clock rate to the IP rather than an
> output clock but your driver does something else then sooner or later
> someone will run into trouble with that.  

I admit I had that concern. Probably I should have deprecated this
set_sysclk(). I will try to patch it and hw_params() accordingly.


Re: UIO memmap of PCi devices not working?

2017-09-07 Thread Benjamin Herrenschmidt
On Thu, 2017-09-07 at 10:19 +, Joakim Tjernlund wrote:
> > Problem is that pci_mem_offset is gone, the closed I can find is mem_offset
> > but that is an array,maybe just mem_offset[0] ?
> > 
> > > I'm not sure exactly what's going
> > > on in your case, if you have a problem can you add printk to instrument
> > > ?
> > 
> > Seems to be something else going on in out board. Anyhow, the mem_offset 
> > should
> > be fixed to compile, nice to have it behind a CONFIG option. Then
> > one can start the process to remove the special casing easier.
> 
> After sorting the bugs in our app, it works with and without above patch.

Ok. I don't see a pressing need to change what we are doing in the
kernel then.

Cheers,
Ben.



Re: UIO memmap of PCi devices not working?

2017-09-07 Thread Benjamin Herrenschmidt
On Thu, 2017-09-07 at 08:59 +, Joakim Tjernlund wrote:
> 
> > Hrm it's tricky, you shouldn't just turn that ifdef back on without
> > also changing pci_resource_to_user().
> 
> There are two ifdef to change:
> __pci_mmap_make_offset():
> #if 0 /* See comment in pci_resource_to_user() for why this is disabled */
>   *offset += hose->pci_mem_offset;
> #endif
> 
> and
> 
> pci_resource_to_user()
>   /* We pass a fully fixed up address to userland for MMIO instead of
>* a BAR value because X is lame and expects to be able to use that
>* to pass to /dev/mem !
>*
>* That means that we'll have potentially 64 bits values where some
>* userland apps only expect 32 (like X itself since it thinks only
>* Sparc has 64 bits MMIO) but if we don't do that, we break it on
>* 32 bits CHRPs :-(
>*
>* Hopefully, the sysfs insterface is immune to that gunk. Once X
>* has been fixed (and the fix spread enough), we can re-enable the
>* 2 lines below and pass down a BAR value to userland. In that case
>* we'll also have to re-enable the matching code in
>* __pci_mmap_make_offset().
>*
>* BenH.
>*/
> #if 0
>   else if (rsrc->flags & IORESOURCE_MEM)
>   offset = hose->pci_mem_offset;
> #endif
> 
> Problem is that pci_mem_offset is gone, the closed I can find is mem_offset
> but that is an array,maybe just mem_offset[0] ?

No, you'd have to scan the array of resources to find which offset
applies.

> > I'm not sure exactly what's going
> > on in your case, if you have a problem can you add printk to instrument
> > ?
> 
> Seems to be something else going on in out board. Anyhow, the mem_offset 
> should
> be fixed to compile, nice to have it behind a CONFIG option. Then
> one can start the process to remove the special casing easier.

Again, why do you need to remove it ? Can you find anything with the
existing code (with its #if'0) that is broken ?

Cheers,
Ben.




Re: [RFC PATCH 8/8] powerpc/64s/radix: Only flush local TLB for spurious fault flushes

2017-09-07 Thread Benjamin Herrenschmidt
On Fri, 2017-09-08 at 00:51 +1000, Nicholas Piggin wrote:
> When permissiveness is relaxed, or found to have been relaxed by
> another thread, we flush that address out of the TLB to avoid a
> future fault or micro-fault due to a stale TLB entry.
> 
> Currently for processes with TLBs on other CPUs, this flush is always
> done with a global tlbie. Although that could reduce faults on remote
> CPUs, a broadcast operation seems to be wasteful for something that
> can be handled in-core by the remote CPU if it comes to it.
> 
> This is not benchmarked yet. It does seem cut some tlbie operations
> from the bus.

What happens with the nest MMU here ?

> Signed-off-by: Nicholas Piggin 
> ---
>  .../powerpc/include/asm/book3s/64/tlbflush-radix.h |  5 
>  arch/powerpc/include/asm/book3s/64/tlbflush.h  | 11 +
>  arch/powerpc/mm/pgtable-book3s64.c |  5 +++-
>  arch/powerpc/mm/pgtable.c  |  2 +-
>  arch/powerpc/mm/tlb-radix.c| 27 
> ++
>  5 files changed, 48 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h 
> b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
> index b12460b306a7..34cd864b8fc1 100644
> --- a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
> +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
> @@ -16,6 +16,8 @@ extern bool radix__flush_tlb_range_psize(struct mm_struct 
> *mm, unsigned long sta
>unsigned long end, int psize);
>  extern void radix__flush_pmd_tlb_range(struct vm_area_struct *vma,
>  unsigned long start, unsigned long end);
> +extern void radix__local_flush_pmd_tlb_range(struct vm_area_struct *vma,
> + unsigned long start, unsigned long end);
>  extern void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long 
> start,
>   unsigned long end);
>  extern void radix__flush_tlb_kernel_range(unsigned long start, unsigned long 
> end);
> @@ -24,6 +26,9 @@ extern void radix__local_flush_tlb_mm(struct mm_struct *mm);
>  extern void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned 
> long vmaddr);
>  extern void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned 
> long vmaddr,
> int psize);
> +extern void radix__local_flush_tlb_range_psize(struct mm_struct *mm,
> + unsigned long start, unsigned long end,
> + int psize);
>  extern void radix__tlb_flush(struct mmu_gather *tlb);
>  #ifdef CONFIG_SMP
>  extern void radix__flush_tlb_mm(struct mm_struct *mm);
> diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h 
> b/arch/powerpc/include/asm/book3s/64/tlbflush.h
> index 72b925f97bab..8a8b3e11a28e 100644
> --- a/arch/powerpc/include/asm/book3s/64/tlbflush.h
> +++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h
> @@ -83,6 +83,17 @@ static inline void flush_tlb_page(struct vm_area_struct 
> *vma,
>  #define flush_tlb_mm(mm) local_flush_tlb_mm(mm)
>  #define flush_tlb_page(vma, addr)local_flush_tlb_page(vma, addr)
>  #endif /* CONFIG_SMP */
> +
> +#define flush_tlb_fix_spurious_fault flush_tlb_fix_spurious_fault
> +static inline void flush_tlb_fix_spurious_fault(struct vm_area_struct *vma,
> + unsigned long address)
> +{
> + if (radix_enabled())
> + radix__local_flush_tlb_page(vma, address);
> + else
> + flush_tlb_page(vma, address);
> +}
> +
>  /*
>   * flush the page walk cache for the address
>   */
> diff --git a/arch/powerpc/mm/pgtable-book3s64.c 
> b/arch/powerpc/mm/pgtable-book3s64.c
> index 3b65917785a5..e46f346388d6 100644
> --- a/arch/powerpc/mm/pgtable-book3s64.c
> +++ b/arch/powerpc/mm/pgtable-book3s64.c
> @@ -40,7 +40,10 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, 
> unsigned long address,
>   if (changed) {
>   __ptep_set_access_flags(vma->vm_mm, pmdp_ptep(pmdp),
>   pmd_pte(entry), address);
> - flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
> + if (radix_enabled())
> + radix__local_flush_pmd_tlb_range(vma, address, address 
> + HPAGE_PMD_SIZE);
> + else
> + flush_pmd_tlb_range(vma, address, address + 
> HPAGE_PMD_SIZE);
>   }
>   return changed;
>  }
> diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
> index a03ff3d99e0c..acd6ae8062ce 100644
> --- a/arch/powerpc/mm/pgtable.c
> +++ b/arch/powerpc/mm/pgtable.c
> @@ -223,7 +223,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma, 
> unsigned long address,
>   if (!is_vm_hugetlb_page(vma))
>   assert_pte_locked(vma->vm_mm, address);
>   __ptep_set_access_flags(vma->vm_mm, 

Re: [PATCH] powerpc/mm: Fix missing mmap_sem release

2017-09-07 Thread Davidlohr Bueso

On Thu, 07 Sep 2017, Laurent Dufour wrote:


The commit b5c8f0fd595d ("powerpc/mm: Rework mm_fault_error()") reviewed
the way the error path is managed in __do_page_fault() but it was a bit too
agressive when handling a case by returning without releasing the mmap_sem.

By the way, replacing current->mm->mmap_sem by mm->mmap_sem as mm is set to
current->mm.

Fixes: b5c8f0fd595d ("powerpc/mm: Rework mm_fault_error()")
Cc: Benjamin Herrenschmidt 
Signed-off-by: Laurent Dufour 
---
arch/powerpc/mm/fault.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 4797d08581ce..f799ccf37d27 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c


But... here:

/*
 * If we need to retry the mmap_sem has already been released,
 * and if there is a fatal signal pending there is no guarantee
 * that we made any progress. Handle this case first.
 */


@@ -521,10 +521,11 @@ static int __do_page_fault(struct pt_regs *regs, unsigned 
long address,
 * User mode? Just return to handle the fatal exception 
otherwise
 * return to bad_page_fault
 */
+   up_read(&mm->mmap_sem);
return is_user ? 0 : SIGBUS;
}


Per the above comment, for that case handle_mm_fault()
has already released mmap_sem. The same occurs in x86,
for example.

Thanks,
Davidlohr


Re: [PATCH V13 4/4] powerpc/vphn: Fix numa update end-loop bug

2017-09-07 Thread Michael Bringmann
Simplest change IMO:

for_each_cpu(sibling, cpu_sibling_mask(cpu)) {
ud = &updates[i++];
+   ud->next = &updates[i];
ud->cpu = sibling;
ud->new_nid = new_nid;
ud->old_nid = numa_cpu_lookup_table[sibling];
cpumask_set_cpu(sibling, &updated_cpus);
-   if (i < weight)
-   ud->next = &updates[i];
}
cpu = cpu_last_thread_sibling(cpu);

}

if (i)
updates[i-1].next = NULL;

Link all of the updates together, and NULL the link pointer in the
last entry to be filled in.  No worries about invalid comparisons.
Reduced code.

Michael


On 09/07/2017 08:35 AM, Nathan Fontenot wrote:
> On 09/06/2017 05:03 PM, Michael Bringmann wrote:
>>
>>
>> On 09/06/2017 09:45 AM, Nathan Fontenot wrote:
>>> On 09/01/2017 10:48 AM, Michael Bringmann wrote:
 powerpc/vphn: On Power systems with shared configurations of CPUs
 and memory, there are some issues with the association of additional
 CPUs and memory to nodes when hot-adding resources.  This patch
 fixes an end-of-updates processing problem observed occasionally
 in numa_update_cpu_topology().

 Signed-off-by: Michael Bringmann 
 ---
  arch/powerpc/mm/numa.c |7 +++
  1 file changed, 7 insertions(+)

 diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
 index 3a5b334..fccf23f 100644
 --- a/arch/powerpc/mm/numa.c
 +++ b/arch/powerpc/mm/numa.c
 @@ -1410,6 +1410,13 @@ int numa_update_cpu_topology(bool cpus_locked)
cpu = cpu_last_thread_sibling(cpu);
}

 +  /*
 +   * Prevent processing of 'updates' from overflowing array
 +   * in cases where last entry filled in a 'next' pointer.
 +   */
 +  if (i)
 +  updates[i-1].next = NULL;
 +
>>>
>>> This really looks like the bug is in the code above this where we
>>> fill in the updates array for each of the sibling cpus. The code
>>> there assumes that if the current update entry is not the end that
>>> there will be more updates and blindly sets the next pointer.
>>>
>>> Perhaps correcting the logic in that code to next pointers. Set the
>>> ud pointer to NULL before the outer for_each_cpu() loop. Then in the
>>> inner for_each_cpu(sibling,...) loop update the ud-> next pointer as
>>> the first operation.
>>>
>>> for_each_cpu(sibling, cpu_sibling_mask(cpu)) {
>>> if (ud)
>>> ud->next = &updates[i];
>>> ...
>>> }
>>>
>>> Obviously untested, but I think this would prevent setting the next
>>> pointer in the last update entry that is filled out erroneously.
>>
>> The above fragment looks to skip initialization of the 'next' pointer
>> in the first element of the the 'updates'.  That would abort subsequent
>> evaluation of the array too soon, I believe.  I would like to take another 
>> look
>> to see whether the current check 'if (i < weight) ud->next = &updates[i];'
>> is having problems due to i being 0-relative and weight being 1-relative.
> 
> Another thing to keep in mind is that cpus can be skipped by checks earlier
> in the loop. There is not guarantee that we will add 'weight' elements to
> the ud list.
> 
> -Nathan
> 
>>
>>>   
>>> -Nathan
>>
>> Michael
>>
>>>
pr_debug("Topology update for the following CPUs:\n");
if (cpumask_weight(_cpus)) {
for (ud = &updates[0]; ud; ud = ud->next) {

>>>
>>
> 
> 

-- 
Michael W. Bringmann
Linux Technology Center
IBM Corporation
Tie-Line  363-5196
External: (512) 286-5196
Cell:   (512) 466-0650
m...@linux.vnet.ibm.com



RE: Machine Check in P2010(e500v2)

2017-09-07 Thread Leo Li


> -Original Message-
> From: Joakim Tjernlund [mailto:joakim.tjernl...@infinera.com]
> Sent: Thursday, September 07, 2017 3:41 AM
> To: linuxppc-dev@lists.ozlabs.org; Leo Li ; York Sun
> 
> Subject: Re: Machine Check in P2010(e500v2)
> 
> On Thu, 2017-09-07 at 00:50 +0200, Joakim Tjernlund wrote:
> > On Wed, 2017-09-06 at 21:13 +, Leo Li wrote:
> > > > -Original Message-
> > > > From: Joakim Tjernlund [mailto:joakim.tjernl...@infinera.com]
> > > > Sent: Wednesday, September 06, 2017 3:54 PM
> > > > To: linuxppc-dev@lists.ozlabs.org; Leo Li ;
> > > > York Sun 
> > > > Subject: Re: Machine Check in P2010(e500v2)
> > > >
> > > > On Wed, 2017-09-06 at 20:28 +, Leo Li wrote:
> > > > > > -Original Message-
> > > > > > From: Joakim Tjernlund [mailto:joakim.tjernl...@infinera.com]
> > > > > > Sent: Wednesday, September 06, 2017 3:17 PM
> > > > > > To: linuxppc-dev@lists.ozlabs.org; Leo Li
> > > > > > ; York Sun 
> > > > > > Subject: Re: Machine Check in P2010(e500v2)
> > > > > >
> > > > > > On Wed, 2017-09-06 at 19:31 +, Leo Li wrote:
> > > > > > > > -Original Message-
> > > > > > > > From: York Sun
> > > > > > > > Sent: Wednesday, September 06, 2017 10:38 AM
> > > > > > > > To: Joakim Tjernlund ;
> > > > > > > > linuxppc- d...@lists.ozlabs.org; Leo Li
> > > > > > > > 
> > > > > > > > Subject: Re: Machine Check in P2010(e500v2)
> > > > > > > >
> > > > > > > > Scott is no longer with Freescale/NXP. Adding Leo.
> > > > > > > >
> > > > > > > > On 09/05/2017 01:40 AM, Joakim Tjernlund wrote:
> > > > > > > > > So after some debugging I found this bug:
> > > > > > > > > @@ -996,7 +998,7 @@ int fsl_pci_mcheck_exception(struct
> > > > > > > > > pt_regs
> > > >
> > > > *regs)
> > > > > > > > >  if (is_in_pci_mem_space(addr)) {
> > > > > > > > >  if (user_mode(regs)) {
> > > > > > > > >  pagefault_disable();
> > > > > > > > > > -   ret = get_user(regs->nip, &inst);
> > > > > > > > > +   ret = get_user(inst, (__u32
> > > > > > > > > + __user *)regs->nip);
> > > > > > > > >  pagefault_enable();
> > > > > > > > >  } else {
> > > > > > > > >  ret =
> > > > > > > > > probe_kernel_address(regs->nip, inst);
> > > > > > > > >
> > > > > > > > > However, the kernel still locked up after fixing that.
> > > > > > > > > Now I wonder why this fixup is there in the first place?
> > > > > > > > > The routine will not really fixup the insn, just return
> > > > > > > > > 0x for the failing read and then advance the process 
> > > > > > > > > NIP.
> > > > > > >
> > > > > > > You are right.  The code here only gives 0x to the
> > > > > > > load instructions and
> > > > > >
> > > > > > continue with the next instruction when the load instruction
> > > > > > is causing the machine check.  This will prevent a system
> > > > > > lockup when reading from PCI/RapidIO device which is link down.
> > > > > > >
> > > > > > > I don't know what is actual problem in your case.  Maybe it
> > > > > > > is a write
> > > > > >
> > > > > > instruction instead of read?   Or the code is in a infinite loop 
> > > > > > waiting for
> a
> > > >
> > > > valid
> > > > > > read result?  Are you able to do some further debugging with
> > > > > > the NIP correctly printed?
> > > > > > >
> > > > > >
> > > > > > According to the MC it is a Read and the NIP also leads to a
> > > > > > read in the
> > > >
> > > > program.
> > > > > > ATM, I have disabled the fixup but I will enable that again.
> > > > > > Question, is it safe add a small printk when this MC
> > > > > > happens(after fixing up)? I need to see that it has happened
> > > > > > as the error is somewhat
> > > >
> > > > random.
> > > > >
> > > > > I think it is safe to add printk as the current machine check
> > > > > handlers are also
> > > >
> > > > using printk.
> > > >
> > > > I hope so, but if the fixup fires there is no printk at all so I was a 
> > > > bit unsure.
> > > > Don't like this fixup though, is there not a better way than
> > > > faking a read to user space(or kernel for that matter) ?
> > >
> > > I don't have a better idea.  Without the fixup, the offending load 
> > > instruction
> will never finish if there is anything wrong with the backing device and 
> freeze the
> whole system.  Do you have any suggestion in mind?
> > >
> >
> > But it never finishes the load, it just fakes a load of 0xf,
> > for user space I rather have it signal a SIGBUS but that does not seem
> > to work either, at least not for us but that could be a bug in general MC 
> > code
> maybe.
> > This fixup might be valid for kernel only as it has never worked for user 
> > space
> due to the bug I found.
> >
> > Where can I read about this errata ?
> 

[PATCH] powerpc: Expose TSCR via sysfs

2017-09-07 Thread Anton Blanchard
From: Anton Blanchard 

The thread switch control register (TSCR) is a per core register
that configures how the CPU shares resources between SMT threads.

Exposing it via sysfs allows us to tune it at run time.

Signed-off-by: Anton Blanchard 
---
 arch/powerpc/kernel/sysfs.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index 4437c70c7c2b..b60a441092b9 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -485,6 +485,7 @@ SYSFS_PMCSETUP(mmcra, SPRN_MMCRA);
 SYSFS_SPRSETUP(purr, SPRN_PURR);
 SYSFS_SPRSETUP(spurr, SPRN_SPURR);
 SYSFS_SPRSETUP(pir, SPRN_PIR);
+SYSFS_SPRSETUP(tscr, SPRN_TSCR);
 
 /*
   Lets only enable read for phyp resources and
@@ -495,6 +496,7 @@ static DEVICE_ATTR(mmcra, 0600, show_mmcra, store_mmcra);
 static DEVICE_ATTR(spurr, 0400, show_spurr, NULL);
 static DEVICE_ATTR(purr, 0400, show_purr, store_purr);
 static DEVICE_ATTR(pir, 0400, show_pir, NULL);
+static DEVICE_ATTR(tscr, 0600, show_tscr, store_tscr);
 
 /*
  * This is the system wide DSCR register default value. Any
@@ -774,6 +776,9 @@ static int register_cpu_online(unsigned int cpu)
 
if (cpu_has_feature(CPU_FTR_PPCAS_ARCH_V2))
device_create_file(s, _attr_pir);
+
+   if (cpu_has_feature(CPU_FTR_ARCH_206))
+   device_create_file(s, _attr_tscr);
 #endif /* CONFIG_PPC64 */
 
 #ifdef CONFIG_PPC_FSL_BOOK3E
@@ -856,6 +861,9 @@ static int unregister_cpu_online(unsigned int cpu)
 
if (cpu_has_feature(CPU_FTR_PPCAS_ARCH_V2))
device_remove_file(s, _attr_pir);
+
+   if (cpu_has_feature(CPU_FTR_ARCH_206))
+   device_remove_file(s, _attr_tscr);
 #endif /* CONFIG_PPC64 */
 
 #ifdef CONFIG_PPC_FSL_BOOK3E
-- 
2.11.0



[PATCH] powerpc/mm: Fix missing mmap_sem release

2017-09-07 Thread Laurent Dufour
The commit b5c8f0fd595d ("powerpc/mm: Rework mm_fault_error()") reviewed
the way the error path is managed in __do_page_fault() but it was a bit too
agressive when handling a case by returning without releasing the mmap_sem.

By the way, replacing current->mm->mmap_sem by mm->mmap_sem as mm is set to
current->mm.

Fixes: b5c8f0fd595d ("powerpc/mm: Rework mm_fault_error()")
Cc: Benjamin Herrenschmidt 
Signed-off-by: Laurent Dufour 
---
 arch/powerpc/mm/fault.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 4797d08581ce..f799ccf37d27 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -521,10 +521,11 @@ static int __do_page_fault(struct pt_regs *regs, unsigned 
long address,
 * User mode? Just return to handle the fatal exception 
otherwise
 * return to bad_page_fault
 */
+   up_read(&mm->mmap_sem);
return is_user ? 0 : SIGBUS;
}
 
-   up_read(&current->mm->mmap_sem);
+   up_read(>mmap_sem);
 
if (unlikely(fault & VM_FAULT_ERROR))
return mm_fault_error(regs, address, fault);
-- 
2.7.4



Re: [PATCH] powerpc/powernv: Increase memory block size to 1GB on radix

2017-09-07 Thread Reza Arbab

On Thu, Sep 07, 2017 at 05:17:41AM +, Anton Blanchard wrote:

But all of memory on PowerNV should be able to be hot unplugged, so
there are two options as I see it - either increase the memory block
size, or map everything with 2MB pages.


I may be misunderstanding this, but what if we did something like x86 
does? When trying to unplug a region smaller than the mapping, they fill 
that part of the pagetable with 0xFD instead of freeing the whole thing.  
Once the whole thing is 0xFD, free it.


See arch/x86/mm/init_64.c:remove_{pte,pmd,pud}_table()

---%<---
memset((void *)addr, PAGE_INUSE, next - addr);

page_addr = page_address(pte_page(*pte));
if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) {
...
pte_clear(&init_mm, addr, pte);
...
}
---%<---

--
Reza Arbab



[RFC PATCH 8/8] powerpc/64s/radix: Only flush local TLB for spurious fault flushes

2017-09-07 Thread Nicholas Piggin
When permissiveness is relaxed, or found to have been relaxed by
another thread, we flush that address out of the TLB to avoid a
future fault or micro-fault due to a stale TLB entry.

Currently for processes with TLBs on other CPUs, this flush is always
done with a global tlbie. Although that could reduce faults on remote
CPUs, a broadcast operation seems to be wasteful for something that
can be handled in-core by the remote CPU if it comes to it.

This is not benchmarked yet. It does seem cut some tlbie operations
from the bus.

Signed-off-by: Nicholas Piggin 
---
 .../powerpc/include/asm/book3s/64/tlbflush-radix.h |  5 
 arch/powerpc/include/asm/book3s/64/tlbflush.h  | 11 +
 arch/powerpc/mm/pgtable-book3s64.c |  5 +++-
 arch/powerpc/mm/pgtable.c  |  2 +-
 arch/powerpc/mm/tlb-radix.c| 27 ++
 5 files changed, 48 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h 
b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
index b12460b306a7..34cd864b8fc1 100644
--- a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
@@ -16,6 +16,8 @@ extern bool radix__flush_tlb_range_psize(struct mm_struct 
*mm, unsigned long sta
 unsigned long end, int psize);
 extern void radix__flush_pmd_tlb_range(struct vm_area_struct *vma,
   unsigned long start, unsigned long end);
+extern void radix__local_flush_pmd_tlb_range(struct vm_area_struct *vma,
+   unsigned long start, unsigned long end);
 extern void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long 
start,
unsigned long end);
 extern void radix__flush_tlb_kernel_range(unsigned long start, unsigned long 
end);
@@ -24,6 +26,9 @@ extern void radix__local_flush_tlb_mm(struct mm_struct *mm);
 extern void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned 
long vmaddr);
 extern void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned 
long vmaddr,
  int psize);
+extern void radix__local_flush_tlb_range_psize(struct mm_struct *mm,
+   unsigned long start, unsigned long end,
+   int psize);
 extern void radix__tlb_flush(struct mmu_gather *tlb);
 #ifdef CONFIG_SMP
 extern void radix__flush_tlb_mm(struct mm_struct *mm);
diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h 
b/arch/powerpc/include/asm/book3s/64/tlbflush.h
index 72b925f97bab..8a8b3e11a28e 100644
--- a/arch/powerpc/include/asm/book3s/64/tlbflush.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h
@@ -83,6 +83,17 @@ static inline void flush_tlb_page(struct vm_area_struct *vma,
 #define flush_tlb_mm(mm)   local_flush_tlb_mm(mm)
 #define flush_tlb_page(vma, addr)  local_flush_tlb_page(vma, addr)
 #endif /* CONFIG_SMP */
+
+#define flush_tlb_fix_spurious_fault flush_tlb_fix_spurious_fault
+static inline void flush_tlb_fix_spurious_fault(struct vm_area_struct *vma,
+   unsigned long address)
+{
+   if (radix_enabled())
+   radix__local_flush_tlb_page(vma, address);
+   else
+   flush_tlb_page(vma, address);
+}
+
 /*
  * flush the page walk cache for the address
  */
diff --git a/arch/powerpc/mm/pgtable-book3s64.c 
b/arch/powerpc/mm/pgtable-book3s64.c
index 3b65917785a5..e46f346388d6 100644
--- a/arch/powerpc/mm/pgtable-book3s64.c
+++ b/arch/powerpc/mm/pgtable-book3s64.c
@@ -40,7 +40,10 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, 
unsigned long address,
if (changed) {
__ptep_set_access_flags(vma->vm_mm, pmdp_ptep(pmdp),
pmd_pte(entry), address);
-   flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+   if (radix_enabled())
+   radix__local_flush_pmd_tlb_range(vma, address, address 
+ HPAGE_PMD_SIZE);
+   else
+   flush_pmd_tlb_range(vma, address, address + 
HPAGE_PMD_SIZE);
}
return changed;
 }
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index a03ff3d99e0c..acd6ae8062ce 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -223,7 +223,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma, 
unsigned long address,
if (!is_vm_hugetlb_page(vma))
assert_pte_locked(vma->vm_mm, address);
__ptep_set_access_flags(vma->vm_mm, ptep, entry, address);
-   flush_tlb_page(vma, address);
+   flush_tlb_fix_spurious_fault(vma, address);
}
return changed;
 }
diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index 

[RFC PATCH 7/8] powerpc/64s/radix: Improve TLB flushing for unmaps that free a page table

2017-09-07 Thread Nicholas Piggin
Unmaps that free page tables always flush the PID, which is sub
optimal. Allow those to do TLB range flushes with separate PWC flush.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/mm/tlb-radix.c | 51 +++--
 1 file changed, 40 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index 1b0cac656680..7452e1f4aa3c 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -351,23 +351,35 @@ static int radix_get_mmu_psize(int page_size)
return psize;
 }
 
+static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned 
long start,
+ unsigned long end, int psize);
+
 void radix__tlb_flush(struct mmu_gather *tlb)
 {
int psize = 0;
struct mm_struct *mm = tlb->mm;
int page_size = tlb->page_size;
 
-   psize = radix_get_mmu_psize(page_size);
/*
 * if page size is not something we understand, do a full mm flush
 */
-   if (psize != -1 && !tlb->fullmm && !tlb->need_flush_all)
-   radix__flush_tlb_range_psize(mm, tlb->start, tlb->end, psize);
-   else if (tlb->need_flush_all) {
-   tlb->need_flush_all = 0;
+   if (tlb->fullmm) {
radix__flush_all_mm(mm);
-   } else
-   radix__flush_tlb_mm(mm);
+   } else if ( (psize = radix_get_mmu_psize(page_size)) == -1) {
+   if (!tlb->need_flush_all)
+   radix__flush_tlb_mm(mm);
+   else
+   radix__flush_all_mm(mm);
+   } else {
+   unsigned long start = tlb->start;
+   unsigned long end = tlb->end;
+
+   if (!tlb->need_flush_all)
+   radix__flush_tlb_range_psize(mm, start, end, psize);
+   else
+   radix__flush_tlb_pwc_range_psize(mm, start, end, psize);
+   }
+   tlb->need_flush_all = 0;
 }
 
 #define TLB_FLUSH_ALL -1UL
@@ -384,8 +396,9 @@ void radix__tlb_flush(struct mmu_gather *tlb)
 static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
 static unsigned long tlb_local_single_page_flush_ceiling __read_mostly = 
POWER9_TLB_SETS_RADIX * 2;
 
-bool radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
- unsigned long end, int psize)
+static bool __radix__flush_tlb_range_psize(struct mm_struct *mm,
+   unsigned long start, unsigned long end,
+   int psize, bool also_pwc)
 {
unsigned long pid;
unsigned int page_shift = mmu_psize_defs[psize].shift;
@@ -401,17 +414,21 @@ bool radix__flush_tlb_range_psize(struct mm_struct *mm, 
unsigned long start,
if (end == TLB_FLUSH_ALL || ((end - start) >> page_shift) >
tlb_local_single_page_flush_ceiling) {
full = true;
-   _tlbiel_pid(pid, RIC_FLUSH_TLB);
+   _tlbiel_pid(pid, also_pwc ? RIC_FLUSH_ALL : 
RIC_FLUSH_TLB);
} else {
_tlbiel_va_range(start, end, pid, page_size, psize);
+   if (also_pwc)
+   _tlbiel_pid(pid, RIC_FLUSH_PWC);
}
} else {
if (end == TLB_FLUSH_ALL || ((end - start) >> page_shift) >
tlb_single_page_flush_ceiling) {
full = true;
-   _tlbie_pid(pid, RIC_FLUSH_TLB);
+   _tlbie_pid(pid, also_pwc ? RIC_FLUSH_ALL : 
RIC_FLUSH_TLB);
} else {
_tlbie_va_range(start, end, pid, page_size, psize);
+   if (also_pwc)
+   _tlbie_pid(pid, RIC_FLUSH_PWC);
}
}
preempt_enable();
@@ -419,6 +436,18 @@ bool radix__flush_tlb_range_psize(struct mm_struct *mm, 
unsigned long start,
return full;
 }
 
+bool radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
+ unsigned long end, int psize)
+{
+   return __radix__flush_tlb_range_psize(mm, start, end, psize, false);
+}
+
+static void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned 
long start,
+ unsigned long end, int psize)
+{
+   __radix__flush_tlb_range_psize(mm, start, end, psize, true);
+}
+
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
 {
-- 
2.13.3



[RFC PATCH 6/8] powerpc/64s/radix: Optimize flush_tlb_range

2017-09-07 Thread Nicholas Piggin
Currently for radix, flush_tlb_range flushes the entire PID, because
we don't know about THP vs regular pages. This is quite sub-optimal
for small mremap/mprotect/change_protection.

Instead, implement this with two range flush passes, one for each
page size. If the small page range flush ended up doing the full PID
invalidation, then avoid the second flush. If not, the second flush
is an order of magnitude or two fewer operations than the first, so
it's relatively insignificant.

There is still room for improvement here with some changes to generic
APIs, particularly if there are a lot of huge pages in place.

Signed-off-by: Nicholas Piggin 
---
 .../powerpc/include/asm/book3s/64/tlbflush-radix.h |  2 +-
 arch/powerpc/mm/tlb-radix.c| 52 +-
 2 files changed, 42 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h 
b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
index 9b433a624bf3..b12460b306a7 100644
--- a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
@@ -12,7 +12,7 @@ static inline int mmu_get_ap(int psize)
 
 extern void radix__flush_hugetlb_tlb_range(struct vm_area_struct *vma,
   unsigned long start, unsigned long 
end);
-extern void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long 
start,
+extern bool radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long 
start,
 unsigned long end, int psize);
 extern void radix__flush_pmd_tlb_range(struct vm_area_struct *vma,
   unsigned long start, unsigned long end);
diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index 8ec59b57d46c..1b0cac656680 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -299,17 +299,40 @@ void radix__flush_tlb_kernel_range(unsigned long start, 
unsigned long end)
 }
 EXPORT_SYMBOL(radix__flush_tlb_kernel_range);
 
-/*
- * Currently, for range flushing, we just do a full mm flush. Because
- * we use this in code path where we don' track the page size.
- */
 void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
 unsigned long end)
 
 {
struct mm_struct *mm = vma->vm_mm;
+   bool full;
 
-   radix__flush_tlb_mm(mm);
+#ifdef CONFIG_HUGETLB_PAGE
+   if (is_vm_hugetlb_page(vma))
+   return radix__flush_hugetlb_tlb_range(vma, start, end);
+#endif
+   full = radix__flush_tlb_range_psize(mm, start, end, mmu_virtual_psize);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+   if (!full) {
+   /*
+* If the small page flush was not a full PID flush, we have
+* to do a second pass to flush transparent huge pages. This
+* will be a far smaller number of invalidates, so it's not
+* worth calculating.
+*
+* Range flushes are still sub-optimal for cases of all or
+* no hugepages (moreso the former), which should be improved
+* by changing the flush API.
+*/
+   unsigned long hstart, hend;
+   hstart = (start + HPAGE_PMD_SIZE - 1) >> HPAGE_PMD_SHIFT;
+   hend = end >> HPAGE_PMD_SHIFT;
+   if (hstart != hend) {
+   hstart <<= HPAGE_PMD_SHIFT;
+   hend <<= HPAGE_PMD_SHIFT;
+   radix__flush_tlb_range_psize(mm, hstart, hend, 
MMU_PAGE_2M);
+   }
+   }
+#endif
 }
 EXPORT_SYMBOL(radix__flush_tlb_range);
 
@@ -361,32 +384,39 @@ void radix__tlb_flush(struct mmu_gather *tlb)
 static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
 static unsigned long tlb_local_single_page_flush_ceiling __read_mostly = 
POWER9_TLB_SETS_RADIX * 2;
 
-void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
+bool radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
  unsigned long end, int psize)
 {
unsigned long pid;
unsigned int page_shift = mmu_psize_defs[psize].shift;
unsigned long page_size = 1UL << page_shift;
+   bool full = false;
 
pid = mm ? mm->context.id : 0;
if (unlikely(pid == MMU_NO_CONTEXT))
-   return;
+   return full;
 
preempt_disable();
if (mm_is_thread_local(mm)) {
if (end == TLB_FLUSH_ALL || ((end - start) >> page_shift) >
-   tlb_local_single_page_flush_ceiling)
+   tlb_local_single_page_flush_ceiling) {
+   full = true;
_tlbiel_pid(pid, RIC_FLUSH_TLB);
-   else
+   } else {
_tlbiel_va_range(start, end, pid, 

[RFC PATCH 5/8] powerpc/64s/radix: Introduce local single page ceiling for TLB range flush

2017-09-07 Thread Nicholas Piggin
The single page flush ceiling is the cut-off point at which we switch
from invalidating individual pages, to invalidating the entire process
address space in response to a range flush.

Introduce a local variant of this heuristic because local and global
tlbie have significantly different properties:
- Local tlbiel requires 128 instructions to invalidate a PID, global
  tlbie only 1 instruction.
- Global tlbie instructions are expensive broadcast operations.

The local ceiling has been made much higher, 2x the number of
instructions required to invalidate the entire PID (this has not
yet been benchmarked in detail).
---
 arch/powerpc/mm/tlb-radix.c | 49 +++--
 1 file changed, 25 insertions(+), 24 deletions(-)

diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index 1d3cbc01596d..8ec59b57d46c 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -348,35 +348,41 @@ void radix__tlb_flush(struct mmu_gather *tlb)
 }
 
 #define TLB_FLUSH_ALL -1UL
+
 /*
- * Number of pages above which we will do a bcast tlbie. Just a
- * number at this point copied from x86
+ * Number of pages above which we invalidate the entire PID rather than
+ * flush individual pages, for local and global flushes respectively.
+ *
+ * tlbie goes out to the interconnect and individual ops are more costly.
+ * It also does not iterate over sets like the local tlbiel variant when
+ * invalidating a full PID, so it has a far lower threshold to change from
+ * individual page flushes to full-pid flushes.
  */
 static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
+static unsigned long tlb_local_single_page_flush_ceiling __read_mostly = 
POWER9_TLB_SETS_RADIX * 2;
 
 void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
  unsigned long end, int psize)
 {
unsigned long pid;
-   bool local;
-   unsigned long page_size = 1UL << mmu_psize_defs[psize].shift;
+   unsigned int page_shift = mmu_psize_defs[psize].shift;
+   unsigned long page_size = 1UL << page_shift;
 
pid = mm ? mm->context.id : 0;
if (unlikely(pid == MMU_NO_CONTEXT))
return;
 
preempt_disable();
-   local = mm_is_thread_local(mm);
-   if (end == TLB_FLUSH_ALL ||
-   (end - start) > tlb_single_page_flush_ceiling * page_size) {
-   if (local)
+   if (mm_is_thread_local(mm)) {
+   if (end == TLB_FLUSH_ALL || ((end - start) >> page_shift) >
+   tlb_local_single_page_flush_ceiling)
_tlbiel_pid(pid, RIC_FLUSH_TLB);
else
-   _tlbie_pid(pid, RIC_FLUSH_TLB);
-
-   } else {
-   if (local)
_tlbiel_va_range(start, end, pid, page_size, psize);
+   } else {
+   if (end == TLB_FLUSH_ALL || ((end - start) >> page_shift) >
+   tlb_single_page_flush_ceiling)
+   _tlbie_pid(pid, RIC_FLUSH_TLB);
else
_tlbie_va_range(start, end, pid, page_size, psize);
}
@@ -387,7 +393,6 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, 
unsigned long start,
 void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
 {
unsigned long pid, end;
-   bool local;
 
pid = mm ? mm->context.id : 0;
if (unlikely(pid == MMU_NO_CONTEXT))
@@ -399,21 +404,17 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, 
unsigned long addr)
return;
}
 
-   preempt_disable();
-   local = mm_is_thread_local(mm);
-   /* Otherwise first do the PWC */
-   if (local)
-   _tlbiel_pid(pid, RIC_FLUSH_PWC);
-   else
-   _tlbie_pid(pid, RIC_FLUSH_PWC);
-
-   /* Then iterate the pages */
end = addr + HPAGE_PMD_SIZE;
 
-   if (local)
+   /* Otherwise first do the PWC, then iterate the pages. */
+   preempt_disable();
+   if (mm_is_thread_local(mm)) {
+   _tlbiel_pid(pid, RIC_FLUSH_PWC);
_tlbiel_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize);
-   else
+   } else {
+   _tlbie_pid(pid, RIC_FLUSH_PWC);
_tlbie_va_range(addr, end, pid, PAGE_SIZE, mmu_virtual_psize);
+   }
preempt_enable();
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-- 
2.13.3



[RFC PATCH 4/8] powerpc/64s/radix: Implement _tlbie(l)_va_range flush functions

2017-09-07 Thread Nicholas Piggin
Move the barriers and range iteration down into the _tlbie* level,
which improves readability.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/mm/tlb-radix.c | 70 ++---
 1 file changed, 40 insertions(+), 30 deletions(-)

diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index c30f3faf5356..1d3cbc01596d 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -85,7 +85,7 @@ static inline void _tlbie_pid(unsigned long pid, unsigned 
long ric)
 }
 
 static inline void __tlbiel_va(unsigned long va, unsigned long pid,
- unsigned long ap, unsigned long ric)
+  unsigned long ap, unsigned long ric)
 {
unsigned long rb,rs,prs,r;
 
@@ -101,13 +101,28 @@ static inline void __tlbiel_va(unsigned long va, unsigned 
long pid,
 }
 
 static inline void _tlbiel_va(unsigned long va, unsigned long pid,
- unsigned long ap, unsigned long ric)
+ unsigned long psize, unsigned long ric)
 {
+   unsigned long ap = mmu_get_ap(psize);
+
asm volatile("ptesync": : :"memory");
__tlbiel_va(va, pid, ap, ric);
asm volatile("ptesync": : :"memory");
 }
 
+static inline void _tlbiel_va_range(unsigned long start, unsigned long end,
+   unsigned long pid, unsigned long page_size,
+   unsigned long psize)
+{
+   unsigned long addr;
+   unsigned long ap = mmu_get_ap(psize);
+
+   asm volatile("ptesync": : :"memory");
+   for (addr = start; addr < end; addr += page_size)
+   __tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
+   asm volatile("ptesync": : :"memory");
+}
+
 static inline void __tlbie_va(unsigned long va, unsigned long pid,
 unsigned long ap, unsigned long ric)
 {
@@ -125,13 +140,27 @@ static inline void __tlbie_va(unsigned long va, unsigned 
long pid,
 }
 
 static inline void _tlbie_va(unsigned long va, unsigned long pid,
-unsigned long ap, unsigned long ric)
+ unsigned long psize, unsigned long ric)
 {
+   unsigned long ap = mmu_get_ap(psize);
+
asm volatile("ptesync": : :"memory");
__tlbie_va(va, pid, ap, ric);
asm volatile("eieio; tlbsync; ptesync": : :"memory");
 }
 
+static inline void _tlbie_va_range(unsigned long start, unsigned long end,
+   unsigned long pid, unsigned long page_size,
+   unsigned long psize)
+{
+   unsigned long addr;
+   unsigned long ap = mmu_get_ap(psize);
+
+   asm volatile("ptesync": : :"memory");
+   for (addr = start; addr < end; addr += page_size)
+   __tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
+   asm volatile("eieio; tlbsync; ptesync": : :"memory");
+}
 
 /*
  * Base TLB flushing operations:
@@ -173,12 +202,11 @@ void radix__local_flush_tlb_page_psize(struct mm_struct 
*mm, unsigned long vmadd
   int psize)
 {
unsigned long pid;
-   unsigned long ap = mmu_get_ap(psize);
 
preempt_disable();
pid = mm ? mm->context.id : 0;
if (pid != MMU_NO_CONTEXT)
-   _tlbiel_va(vmaddr, pid, ap, RIC_FLUSH_TLB);
+   _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
preempt_enable();
 }
 
@@ -238,16 +266,15 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, 
unsigned long vmaddr,
 int psize)
 {
unsigned long pid;
-   unsigned long ap = mmu_get_ap(psize);
 
pid = mm ? mm->context.id : 0;
if (unlikely(pid == MMU_NO_CONTEXT))
return;
preempt_disable();
if (!mm_is_thread_local(mm))
-   _tlbie_va(vmaddr, pid, ap, RIC_FLUSH_TLB);
+   _tlbie_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
else
-   _tlbiel_va(vmaddr, pid, ap, RIC_FLUSH_TLB);
+   _tlbiel_va(vmaddr, pid, psize, RIC_FLUSH_TLB);
preempt_enable();
 }
 
@@ -331,9 +358,7 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, 
unsigned long start,
  unsigned long end, int psize)
 {
unsigned long pid;
-   unsigned long addr;
bool local;
-   unsigned long ap = mmu_get_ap(psize);
unsigned long page_size = 1UL << mmu_psize_defs[psize].shift;
 
pid = mm ? mm->context.id : 0;
@@ -350,18 +375,10 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, 
unsigned long start,
_tlbie_pid(pid, RIC_FLUSH_TLB);
 
} else {
-   asm volatile("ptesync": : :"memory");
-   for (addr = start; addr < end; addr += page_size) {
-
-   if (local)
-   __tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
-   else
- 

[RFC PATCH 3/8] powerpc/64s/radix: optimize TLB range flush barriers

2017-09-07 Thread Nicholas Piggin
Short range flushes issue a sequences of tlbie(l) instructions for
individual effective addresses. These do not all require individual
barrier sequences, only one set around all instructions.

Commit f7327e0ba3 ("powerpc/mm/radix: Remove unnecessary ptesync")
made a similar optimization for tlbiel for PID flushing.

For tlbie, the ISA says:

The tlbsync instruction provides an ordering function for the
effects of all tlbie instructions executed by the thread executing
the tlbsync instruction, with respect to the memory barrier
created by a subsequent ptesync instruction executed by the same
thread.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/mm/tlb-radix.c | 41 -
 1 file changed, 32 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index 1ed61baf58da..c30f3faf5356 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -84,7 +84,7 @@ static inline void _tlbie_pid(unsigned long pid, unsigned 
long ric)
trace_tlbie(0, 0, rb, rs, ric, prs, r);
 }
 
-static inline void _tlbiel_va(unsigned long va, unsigned long pid,
+static inline void __tlbiel_va(unsigned long va, unsigned long pid,
  unsigned long ap, unsigned long ric)
 {
unsigned long rb,rs,prs,r;
@@ -95,14 +95,20 @@ static inline void _tlbiel_va(unsigned long va, unsigned 
long pid,
prs = 1; /* process scoped */
r = 1;   /* raidx format */
 
-   asm volatile("ptesync": : :"memory");
asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
 : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : 
"memory");
-   asm volatile("ptesync": : :"memory");
trace_tlbie(0, 1, rb, rs, ric, prs, r);
 }
 
-static inline void _tlbie_va(unsigned long va, unsigned long pid,
+static inline void _tlbiel_va(unsigned long va, unsigned long pid,
+ unsigned long ap, unsigned long ric)
+{
+   asm volatile("ptesync": : :"memory");
+   __tlbiel_va(va, pid, ap, ric);
+   asm volatile("ptesync": : :"memory");
+}
+
+static inline void __tlbie_va(unsigned long va, unsigned long pid,
 unsigned long ap, unsigned long ric)
 {
unsigned long rb,rs,prs,r;
@@ -113,13 +119,20 @@ static inline void _tlbie_va(unsigned long va, unsigned 
long pid,
prs = 1; /* process scoped */
r = 1;   /* raidx format */
 
-   asm volatile("ptesync": : :"memory");
asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
 : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : 
"memory");
-   asm volatile("eieio; tlbsync; ptesync": : :"memory");
trace_tlbie(0, 0, rb, rs, ric, prs, r);
 }
 
+static inline void _tlbie_va(unsigned long va, unsigned long pid,
+unsigned long ap, unsigned long ric)
+{
+   asm volatile("ptesync": : :"memory");
+   __tlbie_va(va, pid, ap, ric);
+   asm volatile("eieio; tlbsync; ptesync": : :"memory");
+}
+
+
 /*
  * Base TLB flushing operations:
  *
@@ -335,14 +348,20 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, 
unsigned long start,
_tlbiel_pid(pid, RIC_FLUSH_TLB);
else
_tlbie_pid(pid, RIC_FLUSH_TLB);
+
} else {
+   asm volatile("ptesync": : :"memory");
for (addr = start; addr < end; addr += page_size) {
 
if (local)
-   _tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
+   __tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
else
-   _tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
+   __tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
}
+   if (local)
+   asm volatile("ptesync": : :"memory");
+   else
+   asm volatile("eieio; tlbsync; ptesync": : :"memory");
}
preempt_enable();
 }
@@ -373,6 +392,7 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, 
unsigned long addr)
_tlbie_pid(pid, RIC_FLUSH_PWC);
 
/* Then iterate the pages */
+   asm volatile("ptesync": : :"memory");
end = addr + HPAGE_PMD_SIZE;
for (; addr < end; addr += PAGE_SIZE) {
if (local)
@@ -380,7 +400,10 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, 
unsigned long addr)
else
_tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
}
-
+   if (local)
+   asm volatile("ptesync": : :"memory");
+   else
+   asm volatile("eieio; tlbsync; ptesync": : :"memory");
preempt_enable();
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-- 
2.13.3



[RFC PATCH 2/8] powerpc/64s/radix: tlbie improve preempt handling

2017-09-07 Thread Nicholas Piggin
Preempt should be consistently disabled for mm_is_thread_local tests,
so bring the rest of these under preempt_disable().

Preempt does not need to be disabled for the mm->context.id tests, which
allows simplification and removal of gotos.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/mm/tlb-radix.c | 47 +
 1 file changed, 22 insertions(+), 25 deletions(-)

diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index b3e849c4886e..1ed61baf58da 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -186,16 +186,15 @@ void radix__flush_tlb_mm(struct mm_struct *mm)
 {
unsigned long pid;
 
-   preempt_disable();
pid = mm->context.id;
if (unlikely(pid == MMU_NO_CONTEXT))
-   goto no_context;
+   return;
 
+   preempt_disable();
if (!mm_is_thread_local(mm))
_tlbie_pid(pid, RIC_FLUSH_TLB);
else
_tlbiel_pid(pid, RIC_FLUSH_TLB);
-no_context:
preempt_enable();
 }
 EXPORT_SYMBOL(radix__flush_tlb_mm);
@@ -204,16 +203,15 @@ static void radix__flush_all_mm(struct mm_struct *mm)
 {
unsigned long pid;
 
-   preempt_disable();
pid = mm->context.id;
if (unlikely(pid == MMU_NO_CONTEXT))
-   goto no_context;
+   return;
 
+   preempt_disable();
if (!mm_is_thread_local(mm))
_tlbie_pid(pid, RIC_FLUSH_ALL);
else
_tlbiel_pid(pid, RIC_FLUSH_ALL);
-no_context:
preempt_enable();
 }
 
@@ -229,15 +227,14 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, 
unsigned long vmaddr,
unsigned long pid;
unsigned long ap = mmu_get_ap(psize);
 
-   preempt_disable();
pid = mm ? mm->context.id : 0;
if (unlikely(pid == MMU_NO_CONTEXT))
-   goto bail;
+   return;
+   preempt_disable();
if (!mm_is_thread_local(mm))
_tlbie_va(vmaddr, pid, ap, RIC_FLUSH_TLB);
else
_tlbiel_va(vmaddr, pid, ap, RIC_FLUSH_TLB);
-bail:
preempt_enable();
 }
 
@@ -322,46 +319,44 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, 
unsigned long start,
 {
unsigned long pid;
unsigned long addr;
-   int local = mm_is_thread_local(mm);
+   bool local;
unsigned long ap = mmu_get_ap(psize);
unsigned long page_size = 1UL << mmu_psize_defs[psize].shift;
 
-
-   preempt_disable();
pid = mm ? mm->context.id : 0;
if (unlikely(pid == MMU_NO_CONTEXT))
-   goto err_out;
+   return;
 
+   preempt_disable();
+   local = mm_is_thread_local(mm);
if (end == TLB_FLUSH_ALL ||
(end - start) > tlb_single_page_flush_ceiling * page_size) {
if (local)
_tlbiel_pid(pid, RIC_FLUSH_TLB);
else
_tlbie_pid(pid, RIC_FLUSH_TLB);
-   goto err_out;
-   }
-   for (addr = start; addr < end; addr += page_size) {
+   } else {
+   for (addr = start; addr < end; addr += page_size) {
 
-   if (local)
-   _tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
-   else
-   _tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
+   if (local)
+   _tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
+   else
+   _tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
+   }
}
-err_out:
preempt_enable();
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
 {
-   int local = mm_is_thread_local(mm);
unsigned long ap = mmu_get_ap(mmu_virtual_psize);
unsigned long pid, end;
-
+   bool local;
 
pid = mm ? mm->context.id : 0;
if (unlikely(pid == MMU_NO_CONTEXT))
-   goto no_context;
+   return;
 
/* 4k page size, just blow the world */
if (PAGE_SIZE == 0x1000) {
@@ -369,6 +364,8 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, 
unsigned long addr)
return;
}
 
+   preempt_disable();
+   local = mm_is_thread_local(mm);
/* Otherwise first do the PWC */
if (local)
_tlbiel_pid(pid, RIC_FLUSH_PWC);
@@ -383,7 +380,7 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, 
unsigned long addr)
else
_tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
}
-no_context:
+
preempt_enable();
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-- 
2.13.3



[RFC PATCH 1/8] powerpc/64s/radix: Fix theoretical process table entry cache invalidation

2017-09-07 Thread Nicholas Piggin
According to the architecture, the process table entry cache must be
flushed with RIC=2 tlbies. This problem doesn't hit in existing
implementations that do not cache process table entries over mtpid. The
PID is only destroyed and re-used after all CPUs have switched away from
the mm, guaranteeing its entry is not cached anywhere. But this is not
generally safe according to the ISA.

Fix this by clearing the process table entry before the final flush
(which is always a RIC=2 flush that invalidates the process table entry
cache).

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/mmu_context.h |  4 
 arch/powerpc/mm/mmu_context_book3s64.c | 23 ++-
 2 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/mmu_context.h 
b/arch/powerpc/include/asm/mmu_context.h
index 309592589e30..0a70221adcf7 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -118,9 +118,13 @@ static inline void arch_dup_mmap(struct mm_struct *oldmm,
 {
 }
 
+#ifndef CONFIG_PPC_BOOK3S_64
 static inline void arch_exit_mmap(struct mm_struct *mm)
 {
 }
+#else
+extern void arch_exit_mmap(struct mm_struct *mm);
+#endif
 
 static inline void arch_unmap(struct mm_struct *mm,
  struct vm_area_struct *vma,
diff --git a/arch/powerpc/mm/mmu_context_book3s64.c 
b/arch/powerpc/mm/mmu_context_book3s64.c
index 05e15386d4cb..feb3f43195c2 100644
--- a/arch/powerpc/mm/mmu_context_book3s64.c
+++ b/arch/powerpc/mm/mmu_context_book3s64.c
@@ -216,19 +216,32 @@ void destroy_context(struct mm_struct *mm)
 #ifdef CONFIG_SPAPR_TCE_IOMMU
	WARN_ON_ONCE(!list_empty(&mm->context.iommu_group_mem_list));
 #endif
+   if (radix_enabled())
+   WARN_ON(process_tb[mm->context.id].prtb0 != 0);
+   else
+   subpage_prot_free(mm);
+   destroy_pagetable_page(mm);
+   __destroy_context(mm->context.id);
+   mm->context.id = MMU_NO_CONTEXT;
+}
+
+void arch_exit_mmap(struct mm_struct *mm)
+{
if (radix_enabled()) {
/*
 * Radix doesn't have a valid bit in the process table
 * entries. However we know that at least P9 implementation
 * will avoid caching an entry with an invalid RTS field,
 * and 0 is invalid. So this will do.
+*
+* This runs before the "fullmm" tlb flush in exit_mmap,
+* which does a RIC_FLUSH_ALL to clear the process table
+* entry. No barrier required here after the store because
+* this process will do the invalidate, which starts with
+* ptesync.
 */
process_tb[mm->context.id].prtb0 = 0;
-   } else
-   subpage_prot_free(mm);
-   destroy_pagetable_page(mm);
-   __destroy_context(mm->context.id);
-   mm->context.id = MMU_NO_CONTEXT;
+   }
 }
 
 #ifdef CONFIG_PPC_RADIX_MMU
-- 
2.13.3



[RFC PATCH 0/8] Further radix TLB flush optimisations

2017-09-07 Thread Nicholas Piggin
Here is a bit more TLB flush work that mostly attempt to
improve range flushes by reducing barriers, and reducing
the cases we resort to flushing the entire PID.

I haven't done much benchmarking to get good numbers yet
for the exact heuristics settings, just interested in
comments for the overall idea.

Thanks,
Nick

Nicholas Piggin (8):
  powerpc/64s/radix: Fix theoretical process table entry cache
invalidation
  powerpc/64s/radix: tlbie improve preempt handling
  powerpc/64s/radix: optimize TLB range flush barriers
  powerpc/64s/radix: Implement _tlbie(l)_va_range flush functions
  powerpc/64s/radix: Introduce local single page ceiling for TLB range
flush
  powerpc/64s/radix: Optimize flush_tlb_range
  powerpc/64s/radix: Improve TLB flushing for unmaps that free a page
table
  powerpc/64s/radix: Only flush local TLB for spurious fault flushes

 .../powerpc/include/asm/book3s/64/tlbflush-radix.h |   7 +-
 arch/powerpc/include/asm/book3s/64/tlbflush.h  |  11 +
 arch/powerpc/include/asm/mmu_context.h |   4 +
 arch/powerpc/mm/mmu_context_book3s64.c |  23 +-
 arch/powerpc/mm/pgtable-book3s64.c |   5 +-
 arch/powerpc/mm/pgtable.c  |   2 +-
 arch/powerpc/mm/tlb-radix.c| 263 +++--
 7 files changed, 234 insertions(+), 81 deletions(-)

-- 
2.13.3



Re: [PATCH] sound: soc: fsl: Do not set DAI sysclk when it is equal to system freq

2017-09-07 Thread Mark Brown
On Tue, Sep 05, 2017 at 10:45:29AM -0700, Nicolin Chen wrote:

> The ipg clock is merely used to access registers, and has nothing
> (directly) to do with external clock outputs. The driver shall not
> change the ipg clock as the system ipg clock (its parent clock)
> might be messed and even system time would get weird -- happened
> once when the fsl_spdif driver used to call clk_set_rate() on its
> ipg clock. Although the clock controller should have some kind of
> protection in my opinion, we just avoid IP clock rate change in all
> audio drivers as well.

Yes, the clock API needs constraints code.

> On the other hand, the sys clock (baudclk in the driver) should be
> configured whenever it's related to external clock outputs. When I
> implemented this set_sysclk() for fsl_ssi.c, I used it to set this
> sys clock (baudclk) by a machine driver, in order to set bit clock.
> Then someone patched the driver by moving all the code to set_bclk()
> to make machine drivers simpler. Now the set_sysclk() is remained
> to give machine drivers a chance to override clock configurations
> in the hw_params(). This could be used in TDM or some other special
> cases (It could also have a purpose for backwards compatibility).

> So here, we should set baudclk (BCLK generator).

No, that's just going to cause confusion - if all the other drivers are
using set_sysclk() to set an input clock rate to the IP rather than an
output clock but your driver does something else then sooner or later
someone will run into trouble with that.  


signature.asc
Description: PGP signature


Re: [PATCH V13 4/4] powerpc/vphn: Fix numa update end-loop bug

2017-09-07 Thread Nathan Fontenot
On 09/06/2017 05:03 PM, Michael Bringmann wrote:
> 
> 
> On 09/06/2017 09:45 AM, Nathan Fontenot wrote:
>> On 09/01/2017 10:48 AM, Michael Bringmann wrote:
>>> powerpc/vphn: On Power systems with shared configurations of CPUs
>>> and memory, there are some issues with the association of additional
>>> CPUs and memory to nodes when hot-adding resources.  This patch
>>> fixes an end-of-updates processing problem observed occasionally
>>> in numa_update_cpu_topology().
>>>
>>> Signed-off-by: Michael Bringmann 
>>> ---
>>>  arch/powerpc/mm/numa.c |7 +++
>>>  1 file changed, 7 insertions(+)
>>>
>>> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
>>> index 3a5b334..fccf23f 100644
>>> --- a/arch/powerpc/mm/numa.c
>>> +++ b/arch/powerpc/mm/numa.c
>>> @@ -1410,6 +1410,13 @@ int numa_update_cpu_topology(bool cpus_locked)
>>> cpu = cpu_last_thread_sibling(cpu);
>>> }
>>>
>>> +   /*
>>> +* Prevent processing of 'updates' from overflowing array
>>> +* in cases where last entry filled in a 'next' pointer.
>>> +*/
>>> +   if (i)
>>> +   updates[i-1].next = NULL;
>>> +
>>
>> This really looks like the bug is in the code above this where we
>> fill in the updates array for each of the sibling cpus. The code
>> there assumes that if the current update entry is not the end that
>> there will be more updates and blindly sets the next pointer.
>>
>> Perhaps correcting the logic in that code to next pointers. Set the
>> ud pointer to NULL before the outer for_each_cpu() loop. Then in the
>> inner for_each_cpu(sibling,...) loop update the ud-> next pointer as
>> the first operation.
>>
>>  for_each_cpu(sibling, cpu_sibling_mask(cpu)) {
>>  if (ud)
>>  ud->next = &updates[i];
>>  ...
>>  }
>>
>> Obviously untested, but I think this would prevent setting the next
>> pointer in the last update entry that is filled out erroneously.
> 
> The above fragment looks to skip initialization of the 'next' pointer
> in the first element of the the 'updates'.  That would abort subsequent
> evaluation of the array too soon, I believe.  I would like to take another 
> look
> to see whether the current check 'if (i < weight) ud->next = &updates[i];'
> is having problems due to i being 0-relative and weight being 1-relative.

Another thing to keep in mind is that cpus can be skipped by checks earlier
in the loop. There is not guarantee that we will add 'weight' elements to
the ud list.

-Nathan
 
> 
>>   
>> -Nathan
> 
> Michael
> 
>>
>>> pr_debug("Topology update for the following CPUs:\n");
>>> if (cpumask_weight(&updated_cpus)) {
>>> for (ud = &updates[0]; ud; ud = ud->next) {
>>>
>>
> 



[PATCH] cxl: Dump PSL_FIR1/2 registers on PSL9 error irq

2017-09-07 Thread Vaibhav Jain
For PSL9 currently we aren't dumping the PSL FIR1/2 registers when a
PSL error interrupt is triggered. Contents of these registers are
useful in debugging AFU issues.

This patch fixes issue by updating the cxl_native_err_irq_dump_regs()
to dump these regs on PSL error interrupt thereby bringing the
behavior in line with PSL on POWER-8.

Signed-off-by: Vaibhav Jain 
---
 drivers/misc/cxl/native.c | 13 +++--
 drivers/misc/cxl/pci.c|  1 +
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/misc/cxl/native.c b/drivers/misc/cxl/native.c
index 4a82c313cf71..60b91e95821d 100644
--- a/drivers/misc/cxl/native.c
+++ b/drivers/misc/cxl/native.c
@@ -1261,8 +1261,17 @@ void cxl_native_err_irq_dump_regs(struct cxl *adapter)
 {
u64 fir1, fir2;
 
-   fir1 = cxl_p1_read(adapter, CXL_PSL_FIR1);
-   fir2 = cxl_p1_read(adapter, CXL_PSL_FIR2);
+   if (cxl_is_power8()) {
+   fir1 = cxl_p1_read(adapter, CXL_PSL_FIR1);
+   fir2 = cxl_p1_read(adapter, CXL_PSL_FIR2);
+   } else if (cxl_is_power9()) {
+   fir1 = cxl_p1_read(adapter, CXL_PSL9_FIR1);
+   fir2 = cxl_p1_read(adapter, CXL_PSL9_FIR2);
+   } else {
+   /* Dont report garbage */
+   fir1 = fir2 = 0;
+   WARN_ON(1);
+   }
 
	dev_crit(&adapter->dev, "PSL_FIR1: 0x%016llx\nPSL_FIR2: 0x%016llx\n", 
fir1, fir2);
 }
diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c
index d18b3d9292fd..597e145f38e3 100644
--- a/drivers/misc/cxl/pci.c
+++ b/drivers/misc/cxl/pci.c
@@ -1762,6 +1762,7 @@ static const struct cxl_service_layer_ops psl9_ops = {
.debugfs_add_adapter_regs = cxl_debugfs_add_adapter_regs_psl9,
.debugfs_add_afu_regs = cxl_debugfs_add_afu_regs_psl9,
.psl_irq_dump_registers = cxl_native_irq_dump_regs_psl9,
+   .err_irq_dump_registers = cxl_native_err_irq_dump_regs,
.debugfs_stop_trace = cxl_stop_trace_psl9,
.write_timebase_ctrl = write_timebase_ctrl_psl9,
.timebase_read = timebase_read_psl9,
-- 
2.13.5



Re: UIO memmap of PCi devices not working?

2017-09-07 Thread Joakim Tjernlund
On Thu, 2017-09-07 at 10:59 +0200, Joakim Tjernlund wrote:
> On Thu, 2017-09-07 at 18:33 +1000, Benjamin Herrenschmidt wrote:
> > On Thu, 2017-09-07 at 07:22 +, Joakim Tjernlund wrote:
> > > On Thu, 2017-09-07 at 17:16 +1000, Benjamin Herrenschmidt wrote:
> > > > On Wed, 2017-09-06 at 15:20 +, Joakim Tjernlund wrote:
> > > > > Having problems to mmap PCI UIO devices and stumbeled over this page:
> > > > >  
> > > > > http://billfarrow.blogspot.se/2010/09/userspace-access-to-pci-memory.html
> > > > > it claims some adjustments are needed for UIO mmap over PCI to work.
> > > > > These are #if 0 ATM and trying to enable them fails build.
> > > > > 
> > > > > Can this be fixed to at least build again ?
> > > > > The reason for having #if 0 in the first place appears to be old X 
> > > > > servers,
> > > > > is that still true? Can the special casing be removed now?
> > > > 
> > > > This article seems out of date... I *think* things should work without
> > > > change by just mmap'ing the appropriate sysfs files. I'm not sure why
> > > > the author thought that had to be ifdef'ed out...
> > > 
> > > Isn't that what the article is doing(mmaping sysfs files)?
> > > And the article author is #ifdefing it back, not out.
> > 
> > Yes sorry that's what I meant. It should work as-is.
> > 
> > > > 
> > > > Let me know if you have problems.
> > > 
> > > Sure, we still are looking 
> > > 
> > > > 
> > > > As far as I know, the generic code will call pci_resource_to_user()
> > > > which on powerpc will return a physical address that already includes
> > > > the offset, which is why we don't later add it.
> > > > 
> > > > Now we could probably tear all that out and use the new generic code
> > > > instead as I *think* X has (very) long been fixed but I'd have to spend
> > > > some time triple checking and testing on old HW which I don't have the
> > > > bandwidth for right now. 
> > > 
> > > Could you fixup the code which is now #if 0 ? I wan't to test the
> > > difference and I not sure how to fix the build problem after changing
> > > those two #if 0 to #if 1
> > > Even better if they could be a CONFIG option instead.
> > 
> > Hrm it's tricky, you shouldn't just turn that ifdef back on without
> > also changing pci_resource_to_user().
> 
> There are two ifdef to change:
> __pci_mmap_make_offset():
> #if 0 /* See comment in pci_resource_to_user() for why this is disabled */
>   *offset += hose->pci_mem_offset;
> #endif
> 
> and
> 
> pci_resource_to_user()
>   /* We pass a fully fixed up address to userland for MMIO instead of
>* a BAR value because X is lame and expects to be able to use that
>* to pass to /dev/mem !
>*
>* That means that we'll have potentially 64 bits values where some
>* userland apps only expect 32 (like X itself since it thinks only
>* Sparc has 64 bits MMIO) but if we don't do that, we break it on
>* 32 bits CHRPs :-(
>*
>* Hopefully, the sysfs insterface is immune to that gunk. Once X
>* has been fixed (and the fix spread enough), we can re-enable the
>* 2 lines below and pass down a BAR value to userland. In that case
>* we'll also have to re-enable the matching code in
>* __pci_mmap_make_offset().
>*
>* BenH.
>*/
> #if 0
>   else if (rsrc->flags & IORESOURCE_MEM)
>   offset = hose->pci_mem_offset;
> #endif

This seems to work, just a hack though:
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -314,8 +314,8 @@ static struct resource *__pci_mmap_make_offset(struct 
pci_dev *dev,
 
/* If memory, add on the PCI bridge address offset */
if (mmap_state == pci_mmap_mem) {
-#if 0 /* See comment in pci_resource_to_user() for why this is disabled */
-   *offset += hose->pci_mem_offset;
+#if 1  /* See comment in pci_resource_to_user() for why this is disabled */
+   *offset += hose->mem_offset[0];
 #endif
res_bit = IORESOURCE_MEM;
} else {
@@ -634,9 +634,9 @@ void pci_resource_to_user(const struct pci_dev *dev, int 
bar,
 *
 * BenH.
 */
-#if 0
+#if 1
else if (rsrc->flags & IORESOURCE_MEM)
-   offset = hose->pci_mem_offset;
+   offset = hose->mem_offset[0];
 #endif
 
*start = rsrc->start - offset;

> 
> Problem is that pci_mem_offset is gone, the closest I can find is mem_offset
> but that is an array, maybe just mem_offset[0] ?
> 
> > I'm not sure exactly what's going
> > on in your case, if you have a problem can you add printk to instrument
> > ?
> 
> Seems to be something else going on in our board. Anyhow, the mem_offset
> should
> be fixed to compile, nice to have it behind a CONFIG option. Then
> one can start the process to remove the special casing easier.

After sorting the bugs in our app, it works with and without above patch.

 Jocke

Re: [PATCH resend] powerpc/vdso64: Add support for CLOCK_{REALTIME/MONOTONIC}_COARSE

2017-09-07 Thread Santosh Sivaraj
Hi all,

Any comments on the below patch?

Thanks,
Santosh
* Santosh Sivaraj  wrote (on 2017-08-28 13:14:40 +0530):

> Current vDSO64 implementation does not have support for coarse clocks
> (CLOCK_MONOTONIC_COARSE, CLOCK_REALTIME_COARSE), for which it falls back
> to system call, increasing the response time, vDSO implementation reduces
> the cycle time. Below is a benchmark of the difference in execution time
> with and without vDSO support.
> 
> (Non-coarse clocks are also included just for completion)
> 
> Without vDSO support:
> 
> clock-gettime-realtime: syscall: 172 nsec/call
> clock-gettime-realtime:libc: 26 nsec/call
> clock-gettime-realtime:vdso: 21 nsec/call
> clock-gettime-monotonic: syscall: 170 nsec/call
> clock-gettime-monotonic:libc: 30 nsec/call
> clock-gettime-monotonic:vdso: 24 nsec/call
> clock-gettime-realtime-coarse: syscall: 153 nsec/call
> clock-gettime-realtime-coarse:libc: 15 nsec/call
> clock-gettime-realtime-coarse:vdso: 9 nsec/call
> clock-gettime-monotonic-coarse: syscall: 167 nsec/call
> clock-gettime-monotonic-coarse:libc: 15 nsec/call
> clock-gettime-monotonic-coarse:vdso: 11 nsec/call
> 
> CC: Benjamin Herrenschmidt 
> Signed-off-by: Santosh Sivaraj 
> ---
>  arch/powerpc/kernel/asm-offsets.c |  2 +
>  arch/powerpc/kernel/vdso64/gettimeofday.S | 70 
> ---
>  2 files changed, 66 insertions(+), 6 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/asm-offsets.c 
> b/arch/powerpc/kernel/asm-offsets.c
> index 6e95c2c19a7e..c6acaa5edd16 100644
> --- a/arch/powerpc/kernel/asm-offsets.c
> +++ b/arch/powerpc/kernel/asm-offsets.c
> @@ -396,6 +396,8 @@ int main(void)
>   /* Other bits used by the vdso */
>   DEFINE(CLOCK_REALTIME, CLOCK_REALTIME);
>   DEFINE(CLOCK_MONOTONIC, CLOCK_MONOTONIC);
> + DEFINE(CLOCK_REALTIME_COARSE, CLOCK_REALTIME_COARSE);
> + DEFINE(CLOCK_MONOTONIC_COARSE, CLOCK_MONOTONIC_COARSE);
>   DEFINE(NSEC_PER_SEC, NSEC_PER_SEC);
>   DEFINE(CLOCK_REALTIME_RES, MONOTONIC_RES_NSEC);
>  
> diff --git a/arch/powerpc/kernel/vdso64/gettimeofday.S 
> b/arch/powerpc/kernel/vdso64/gettimeofday.S
> index 382021324883..bae197a81add 100644
> --- a/arch/powerpc/kernel/vdso64/gettimeofday.S
> +++ b/arch/powerpc/kernel/vdso64/gettimeofday.S
> @@ -60,18 +60,25 @@ V_FUNCTION_END(__kernel_gettimeofday)
>   */
>  V_FUNCTION_BEGIN(__kernel_clock_gettime)
>.cfi_startproc
> + mr  r11,r4  /* r11 saves tp */
> + mflrr12 /* r12 saves lr */
> + lis r7,NSEC_PER_SEC@h   /* want nanoseconds */
> + ori r7,r7,NSEC_PER_SEC@l
> +
>   /* Check for supported clock IDs */
>   cmpwi   cr0,r3,CLOCK_REALTIME
>   cmpwi   cr1,r3,CLOCK_MONOTONIC
>   crorcr0*4+eq,cr0*4+eq,cr1*4+eq
> - bne cr0,99f
> + beq cr0,49f
>  
> - mflrr12 /* r12 saves lr */
> + cmpwi   cr0,r3,CLOCK_REALTIME_COARSE
> + cmpwi   cr1,r3,CLOCK_MONOTONIC_COARSE
> + crorcr0*4+eq,cr0*4+eq,cr1*4+eq
> + beq cr0,65f
> +
> + b   99f /* Fallback to syscall */
>.cfi_register lr,r12
> - mr  r11,r4  /* r11 saves tp */
> - bl  V_LOCAL_FUNC(__get_datapage)/* get data page */
> - lis r7,NSEC_PER_SEC@h   /* want nanoseconds */
> - ori r7,r7,NSEC_PER_SEC@l
> +49:  bl  V_LOCAL_FUNC(__get_datapage)/* get data page */
>  50:  bl  V_LOCAL_FUNC(__do_get_tspec)/* get time from tb & kernel */
>   bne cr1,80f /* if not monotonic, all done */
>  
> @@ -110,6 +117,57 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime)
>  1:   bge cr1,80f
>   addir4,r4,-1
>   add r5,r5,r7
> + b   80f
> +
> + /*
> +  * For coarse clocks we get data directly from the vdso data page, so
> +  * we don't need to call __do_get_tspec, but we still need to do the
> +  * counter trick.
> +  */
> +65:  bl  V_LOCAL_FUNC(__get_datapage)/* get data page */
> +70:  ld  r8,CFG_TB_UPDATE_COUNT(r3)
> + andi.   r0,r8,1 /* pending update ? loop */
> + bne-70b
> + xor r0,r8,r8/* create dependency */
> + add r3,r3,r0
> +
> + /*
> +  * CLOCK_REALTIME_COARSE, below values are needed for MONOTONIC_COARSE
> +  * too
> +  */
> + ld  r4,STAMP_XTIME+TSPC64_TV_SEC(r3)
> + ld  r5,STAMP_XTIME+TSPC64_TV_NSEC(r3)
> + bne cr1,78f
> +
> + /* CLOCK_MONOTONIC_COARSE */
> + lwa r6,WTOM_CLOCK_SEC(r3)
> + lwa r9,WTOM_CLOCK_NSEC(r3)
> +
> + /* check if counter has updated */
> +78:  or  r0,r6,r9
> + xor r0,r0,r0
> + add r3,r3,r0
> + ld  r0,CFG_TB_UPDATE_COUNT(r3)
> + cmpld   cr0,r0,r8   /* check if updated */
> + bne-70b
> +
> + /* 

Re: UIO memmap of PCi devices not working?

2017-09-07 Thread Joakim Tjernlund
On Thu, 2017-09-07 at 18:33 +1000, Benjamin Herrenschmidt wrote:
> On Thu, 2017-09-07 at 07:22 +, Joakim Tjernlund wrote:
> > On Thu, 2017-09-07 at 17:16 +1000, Benjamin Herrenschmidt wrote:
> > > On Wed, 2017-09-06 at 15:20 +, Joakim Tjernlund wrote:
> > > > Having problems to mmap PCI UIO devices and stumbeled over this page:
> > > >  
> > > > http://billfarrow.blogspot.se/2010/09/userspace-access-to-pci-memory.html
> > > > it claims some adjustments are needed for UIO mmap over PCI to work.
> > > > These are #if 0 ATM and trying to enable them fails build.
> > > > 
> > > > Can this be fixed to at least build again ?
> > > > The reason for having #if 0 in the first place appears to be old X 
> > > > servers,
> > > > is that still true? Can the special casing be removed now?
> > > 
> > > This article seems out of date... I *think* things should work without
> > > change by just mmap'ing the appropriate sysfs files. I'm not sure why
> > > the author thought that had to be ifdef'ed out...
> > 
> > Isn't that what the article is doing(mmaping sysfs files)?
> > And the article author is #ifdefing it back, not out.
> 
> Yes sorry that's what I meant. It should work as-is.
> 
> > > 
> > > Let me know if you have problems.
> > 
> > Sure, we still are looking 
> > 
> > > 
> > > As far as I know, the generic code will call pci_resource_to_user()
> > > which on powerpc will return a physical address that already includes
> > > the offset, which is why we don't later add it.
> > > 
> > > Now we could probably tear all that out and use the new generic code
> > > instead as I *think* X has (very) long been fixed but I'd have to spend
> > > some time triple checking and testing on old HW which I don't have the
> > > bandwidth for right now. 
> > 
> > Could you fixup the code which is now #if 0 ? I wan't to test the
> > difference and I not sure how to fix the build problem after changing
> > those two #if 0 to #if 1
> > Even better if they could be a CONFIG option instead.
> 
> Hrm it's tricky, you shouldn't just turn that ifdef back on without
> also changing pci_resource_to_user().

There are two ifdef to change:
__pci_mmap_make_offset():
#if 0 /* See comment in pci_resource_to_user() for why this is disabled */
*offset += hose->pci_mem_offset;
#endif

and

pci_resource_to_user()
/* We pass a fully fixed up address to userland for MMIO instead of
 * a BAR value because X is lame and expects to be able to use that
 * to pass to /dev/mem !
 *
 * That means that we'll have potentially 64 bits values where some
 * userland apps only expect 32 (like X itself since it thinks only
 * Sparc has 64 bits MMIO) but if we don't do that, we break it on
 * 32 bits CHRPs :-(
 *
 * Hopefully, the sysfs insterface is immune to that gunk. Once X
 * has been fixed (and the fix spread enough), we can re-enable the
 * 2 lines below and pass down a BAR value to userland. In that case
 * we'll also have to re-enable the matching code in
 * __pci_mmap_make_offset().
 *
 * BenH.
 */
#if 0
else if (rsrc->flags & IORESOURCE_MEM)
offset = hose->pci_mem_offset;
#endif

Problem is that pci_mem_offset is gone, the closest I can find is mem_offset
but that is an array, maybe just mem_offset[0] ?

> I'm not sure exactly what's going
> on in your case, if you have a problem can you add printk to instrument
> ?
Seems to be something else going on in our board. Anyhow, the mem_offset should
be fixed to compile, nice to have it behind a CONFIG option. Then
one can start the process to remove the special casing easier.

 Jocke

Re: Machine Check in P2010(e500v2)

2017-09-07 Thread Joakim Tjernlund
On Thu, 2017-09-07 at 00:50 +0200, Joakim Tjernlund wrote:
> On Wed, 2017-09-06 at 21:13 +, Leo Li wrote:
> > > -Original Message-
> > > From: Joakim Tjernlund [mailto:joakim.tjernl...@infinera.com]
> > > Sent: Wednesday, September 06, 2017 3:54 PM
> > > To: linuxppc-dev@lists.ozlabs.org; Leo Li ; York Sun
> > > 
> > > Subject: Re: Machine Check in P2010(e500v2)
> > > 
> > > On Wed, 2017-09-06 at 20:28 +, Leo Li wrote:
> > > > > -Original Message-
> > > > > From: Joakim Tjernlund [mailto:joakim.tjernl...@infinera.com]
> > > > > Sent: Wednesday, September 06, 2017 3:17 PM
> > > > > To: linuxppc-dev@lists.ozlabs.org; Leo Li ; York
> > > > > Sun 
> > > > > Subject: Re: Machine Check in P2010(e500v2)
> > > > > 
> > > > > On Wed, 2017-09-06 at 19:31 +, Leo Li wrote:
> > > > > > > -Original Message-
> > > > > > > From: York Sun
> > > > > > > Sent: Wednesday, September 06, 2017 10:38 AM
> > > > > > > To: Joakim Tjernlund ; linuxppc-
> > > > > > > d...@lists.ozlabs.org; Leo Li 
> > > > > > > Subject: Re: Machine Check in P2010(e500v2)
> > > > > > > 
> > > > > > > Scott is no longer with Freescale/NXP. Adding Leo.
> > > > > > > 
> > > > > > > On 09/05/2017 01:40 AM, Joakim Tjernlund wrote:
> > > > > > > > So after some debugging I found this bug:
> > > > > > > > @@ -996,7 +998,7 @@ int fsl_pci_mcheck_exception(struct pt_regs
> > > 
> > > *regs)
> > > > > > > >  if (is_in_pci_mem_space(addr)) {
> > > > > > > >  if (user_mode(regs)) {
> > > > > > > >  pagefault_disable();
> > > > > > > > -   ret = get_user(regs->nip, &inst);
> > > > > > > > +   ret = get_user(inst, (__u32 __user
> > > > > > > > + *)regs->nip);
> > > > > > > >  pagefault_enable();
> > > > > > > >  } else {
> > > > > > > >  ret = probe_kernel_address(regs->nip,
> > > > > > > > inst);
> > > > > > > > 
> > > > > > > > However, the kernel still locked up after fixing that.
> > > > > > > > Now I wonder why this fixup is there in the first place? The
> > > > > > > > routine will not really fixup the insn, just return 0x
> > > > > > > > for the failing read and then advance the process NIP.
> > > > > > 
> > > > > > You are right.  The code here only gives 0x to the load
> > > > > > instructions and
> > > > > 
> > > > > continue with the next instruction when the load instruction is
> > > > > causing the machine check.  This will prevent a system lockup when
> > > > > reading from PCI/RapidIO device which is link down.
> > > > > > 
> > > > > > I don't know what is actual problem in your case.  Maybe it is a
> > > > > > write
> > > > > 
> > > > > instruction instead of read?   Or the code is in a infinite loop 
> > > > > waiting for a
> > > 
> > > valid
> > > > > read result?  Are you able to do some further debugging with the NIP
> > > > > correctly printed?
> > > > > > 
> > > > > 
> > > > > According to the MC it is a Read and the NIP also leads to a read in 
> > > > > the
> > > 
> > > program.
> > > > > ATM, I have disabled the fixup but I will enable that again.
> > > > > Question, is it safe add a small printk when this MC happens(after
> > > > > fixing up)? I need to see that it has happened as the error is 
> > > > > somewhat
> > > 
> > > random.
> > > > 
> > > > I think it is safe to add printk as the current machine check handlers 
> > > > are also
> > > 
> > > using printk.
> > > 
> > > I hope so, but if the fixup fires there is no printk at all so I was a 
> > > bit unsure.
> > > Don't like this fixup though, is there not a better way than faking a 
> > > read to user
> > > space(or kernel for that matter) ?
> > 
> > I don't have a better idea.  Without the fixup, the offending load 
> > instruction will never finish if there is anything wrong with the backing 
> > device and freeze the whole system.  Do you have any suggestion in mind?
> > 
> 
> But it never finishes the load, it just fakes a load of 0xf, for user 
> space I rather have it signal
> a SIGBUS but that does not seem to work either, at least not for us but that 
> could be a bug in general MC code
>  maybe.
> This fixup might be valid for kernel only as it has never worked for user 
> space due to the bug I found.
> 
> Where can I read about this errata ?

I have looked high and low and cannot find an errata which maps to this fixup.
The closest I get is A-005125 which seems to have another workaround, I cannot 
find
any evidence that this workaround has been applied in Linux, can you?

 Jocke

Re: UIO memmap of PCi devices not working?

2017-09-07 Thread Benjamin Herrenschmidt
On Thu, 2017-09-07 at 07:22 +, Joakim Tjernlund wrote:
> On Thu, 2017-09-07 at 17:16 +1000, Benjamin Herrenschmidt wrote:
> > On Wed, 2017-09-06 at 15:20 +, Joakim Tjernlund wrote:
> > > Having problems to mmap PCI UIO devices and stumbeled over this page:
> > >  http://billfarrow.blogspot.se/2010/09/userspace-access-to-pci-memory.html
> > > it claims some adjustments are needed for UIO mmap over PCI to work.
> > > These are #if 0 ATM and trying to enable them fails build.
> > > 
> > > Can this be fixed to at least build again ?
> > > The reason for having #if 0 in the first place appears to be old X 
> > > servers,
> > > is that still true? Can the special casing be removed now?
> > 
> > This article seems out of date... I *think* things should work without
> > change by just mmap'ing the appropriate sysfs files. I'm not sure why
> > the author thought that had to be ifdef'ed out...
> 
> Isn't that what the article is doing(mmaping sysfs files)?
> And the article author is #ifdefing it back, not out.

Yes sorry that's what I meant. It should work as-is.

> > 
> > Let me know if you have problems.
> 
> Sure, we still are looking 
> 
> > 
> > As far as I know, the generic code will call pci_resource_to_user()
> > which on powerpc will return a physical address that already includes
> > the offset, which is why we don't later add it.
> > 
> > Now we could probably tear all that out and use the new generic code
> > instead as I *think* X has (very) long been fixed but I'd have to spend
> > some time triple checking and testing on old HW which I don't have the
> > bandwidth for right now. 
> 
> Could you fixup the code which is now #if 0 ? I wan't to test the
> difference and I not sure how to fix the build problem after changing
> those two #if 0 to #if 1
> Even better if they could be a CONFIG option instead.

Hrm it's tricky, you shouldn't just turn that ifdef back on without
also changing pci_resource_to_user(). I'm not sure exactly what's going
on in your case, if you have a problem can you add printk to instrument
?

Cheers,
Ben.





Re: UIO memmap of PCi devices not working?

2017-09-07 Thread Joakim Tjernlund
On Thu, 2017-09-07 at 17:16 +1000, Benjamin Herrenschmidt wrote:
> On Wed, 2017-09-06 at 15:20 +, Joakim Tjernlund wrote:
> > Having problems to mmap PCI UIO devices and stumbeled over this page:
> >  http://billfarrow.blogspot.se/2010/09/userspace-access-to-pci-memory.html
> > it claims some adjustments are needed for UIO mmap over PCI to work.
> > These are #if 0 ATM and trying to enable them fails build.
> > 
> > Can this be fixed to at least build again ?
> > The reason for having #if 0 in the first place appears to be old X servers,
> > is that still true? Can the special casing be removed now?
> 
> This article seems out of date... I *think* things should work without
> change by just mmap'ing the appropriate sysfs files. I'm not sure why
> the author thought that had to be ifdef'ed out...

Isn't that what the article is doing(mmaping sysfs files)?
And the article author is #ifdefing it back, not out.

> 
> Let me know if you have problems.

Sure, we still are looking 

> 
> As far as I know, the generic code will call pci_resource_to_user()
> which on powerpc will return a physical address that already includes
> the offset, which is why we don't later add it.
> 
> Now we could probably tear all that out and use the new generic code
> instead as I *think* X has (very) long been fixed but I'd have to spend
> some time triple checking and testing on old HW which I don't have the
> bandwidth for right now. 

> Could you fix up the code which is now #if 0 ? I want to test the
> difference and I'm not sure how to fix the build problem after changing
> those two #if 0 to #if 1
Even better if they could be a CONFIG option instead.

 Jocke

Re: [PATCH] powerpc/powernv: Increase memory block size to 1GB on radix

2017-09-07 Thread Benjamin Herrenschmidt
On Thu, 2017-09-07 at 15:17 +1000, Anton Blanchard wrote:
> Hi,
> 
> > There is a similar issue being worked on w.r.t pseries.
> > 
> > https://lkml.kernel.org/r/1502357028-27465-1-git-send-email-bhar...@linux.vnet.ibm.com
> > 
> > The question is should we map these regions ? ie, we need to tell the 
> > kernel memory region that we would like to hot unplug later so that
> > we avoid doing kernel allocations from that. If we do that, then we
> > can possibly map them via 2M size ?
> 
> But all of memory on PowerNV should be able to be hot unplugged, so
> there are two options as I see it - either increase the memory block
> size, or map everything with 2MB pages. 

Or be smarter and map with 1G when blocks of 1G are available and break
down to 2M where necessary, it shouldn't be too hard.

Cheers,
Ben.



Re: UIO memmap of PCi devices not working?

2017-09-07 Thread Benjamin Herrenschmidt
On Wed, 2017-09-06 at 15:20 +, Joakim Tjernlund wrote:
> Having problems to mmap PCI UIO devices and stumbled over this page:
>  http://billfarrow.blogspot.se/2010/09/userspace-access-to-pci-memory.html
> it claims some adjustments are needed for UIO mmap over PCI to work.
> These are #if 0 ATM and trying to enable them fails build.
> 
> Can this be fixed to at least build again ?
> The reason for having #if 0 in the first place appears to be old X servers,
> is that still true? Can the special casing be removed now?

This article seems out of date... I *think* things should work without
change by just mmap'ing the appropriate sysfs files. I'm not sure why
the author thought that had to be ifdef'ed out...

Let me know if you have problems.

As far as I know, the generic code will call pci_resource_to_user()
which on powerpc will return a physical address that already includes
the offset, which is why we don't later add it.

Now we could probably tear all that out and use the new generic code
instead as I *think* X has (very) long been fixed but I'd have to spend
some time triple checking and testing on old HW which I don't have the
bandwidth for right now. 

Cheers,
Ben.



Re: [PATCH v8 00/10] Enable VAS

2017-09-07 Thread Michael Neuling
So this is upstream now but it will cause a crash on boot with older skiboots
with: 

powernv-cpufreq: cpufreq pstate min 101 nominal 50 max 0
powernv-cpufreq: Workload Optimized Frequency is enabled in the platform
Disabling lock debugging due to kernel taint
Severe Machine check interrupt [Not recovered]
  NIP [c0098530]: reset_window_regs+0x20/0x220
  Initiator: CPU
  Error type: Unknown
opal: Machine check interrupt unrecoverable: MSR(RI=0)
opal: Hardware platform error: Unrecoverable Machine Check exception
CPU: 1 PID: 1 Comm: swapper/0 Tainted: G   M
4.13.0-rc7-00708-g8b680911e774-dirty #10
task: c00f2268 task.stack: c00f2270
NIP:  c0098530 LR: c0098758 CTR: 
REGS: c0003ffebd80 TRAP: 0200   Tainted: G   M 
(4.13.0-rc7-00708-g8b680911e774-dirty)
MSR:  98349031   CR: 24000224  XER: 
CFAR: c0098754 DAR: 100bef30 DSISR: 4000 SOFTE: 0 
GPR00: c0098f44 c00f22703a00 c0eff200 c00f1cf861e0 
GPR04: c00f22703a50 0001 0fff 0003 
GPR08: c00c842f 0001   
GPR12:  cfd40580 c0c03590 c0c1f428 
GPR16: c0c4a640 c0c31360 c0c92738 c0c92690 
GPR20: c0c926a0 c0c926f0 c00f1d672940  
GPR24: c0c92638  c0e888b0 c0dce428 
GPR28: 0002 c00f0e80 c00f22703a50 c00f1cf861e0 
NIP [c0098530] reset_window_regs+0x20/0x220
LR [c0098758] init_winctx_regs+0x28/0x6c0
Call Trace:
[c00f22703a00] [0002] 0x2 (unreliable)
[c00f22703a30] [c0098f44] vas_rx_win_open.part.11+0x154/0x210
[c00f22703ae0] [c0d668e8] nx842_powernv_init+0x6b4/0x824
[c00f22703c[   38.412765557,0] OPAL: Reboot requested due to Platform error.
[   38.412828287,3] OPAL: Reboot requested due to Platform error.40] 
[c000ca60] do_one_initcall+0x60/0x1c0

If you see this you need a new skiboot with at least these two patches:

b503dcf16d vas: Set mmio enable bits in DD2
a5c124072f vas: Set FIRs according to workbook

This is a community announcement brought to you by OzLabs. 
  OzLabs: Making Linux better since 1999

Mikey


On Mon, 2017-08-28 at 23:23 -0700, Sukadev Bhattiprolu wrote:
> Power9 introduces a hardware subsystem referred to as the Virtual
> Accelerator Switchboard (VAS). VAS allows kernel subsystems and user
> space processes to directly access the Nest Accelerator (NX) engines
> which implement compression and encryption algorithms in the hardware.
> 
> NX has been in Power processors since Power7+, but access to the NX
> engines was through the 'icswx' instruction which is only available
> to the kernel/hypervisor. Starting with Power9, access to the NX
> engines is provided to both kernel and user space processes through
> VAS.
> 
> The switchboard (i.e VAS) multiplexes accesses between "receivers" and
> "senders", where the "receivers" are typically the NX engines and
> "senders" are the kernel subsystems and user processors that wish to
> access the receivers (NX engines).  Once a sender is "connected" to
> a receiver through the switchboard, the senders can submit compression/
> encryption requests to the hardware using the new (PowerISA 3.0)
> "copy" and "paste" instructions.
> 
> In the initial OPAL and PowerNV kernel patchsets, the "senders" can
> only be kernel subsystems (eg NX-842 driver) and receivers can only
> be the NX-842 engine. Follow-on patch sets will allow senders/receivers
> to be user-space processes and receivers to be NX-GZIP engines.
> 
> Provides:
> 
>   This kernel patch set configures the VAS subsystems and provides
>   kernel interfaces to drivers like NX-842 to open receive and send
>   windows in VAS and to submit compression requests to the NX engine.
> 
> Requires:
> 
>   This patch set needs corresponding VAS/NX skiboot patches which
>   were merged into skiboot tree. i.e skiboot must include:
>   commit b503dcf ("vas: Set mmio enable bits in DD2")
> 
> Tests:
> In-kernel compression requests were tested on DD1 and DD2 POWER9
>   hardware using compression self-test module and the following
>   NX-842 patch set from Haren Myneni:
> 
> https://lists.ozlabs.org/pipermail/linuxppc-dev/2017-July/160620.html
> 
>   and by dropping the last parameters to both vas_copy_crb() and
>   vas_paste_crb() calls in drivers/crypto/nx/nx-842-powernv.c.
>   See also PATCH 10/10.
> 
> Git Tree:
> 
> https://github.com/sukadev/linux/ 
>   Branch: vas-kern-v8
> 
> Thanks to input from Ben Herrenschmidt, Michael Neuling, Michael Ellerman
> and Haren Myneni.
> 
> Changelog[v8]:
>   - [Michael Ellerman] Use kernel int types (u64, u32 etc); make VAS
>     a built-in rather than 

[PATCH 2/2] powerpc/powernv: Rework EEH initialization on powernv

2017-09-07 Thread Benjamin Herrenschmidt
Remove the post_init callback which is only used
by powernv, we can just call it explicitly from
the powernv code.

This partially kills the ability to "disable" eeh at
runtime via debugfs as this was calling that same
callback again, but this is both unused and broken
in several ways. If we want to revive it, we need
to create a dedicated enable/disable callback on the
backend that does the right thing.

Let the bulk of eeh initialize normally at
core_initcall() like it does on pseries by removing
the hack in eeh_init() that delays it.

Instead we make sure our eeh->probe cleanly bails
out of the PEs haven't been created yet and we force
a re-probe where we used to call eeh_init() again.

Signed-off-by: Benjamin Herrenschmidt 
---
 arch/powerpc/include/asm/eeh.h   |  8 ++---
 arch/powerpc/kernel/eeh.c| 46 +--
 arch/powerpc/platforms/powernv/eeh-powernv.c | 47 +++-
 arch/powerpc/platforms/powernv/pci-ioda.c|  3 +-
 arch/powerpc/platforms/powernv/pci.h |  1 +
 5 files changed, 43 insertions(+), 62 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 8e37b71674f4..f44271b3beaf 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -202,7 +202,6 @@ enum {
 struct eeh_ops {
char *name;
int (*init)(void);
-   int (*post_init)(void);
void* (*probe)(struct pci_dn *pdn, void *data);
int (*set_option)(struct eeh_pe *pe, int option);
int (*get_pe_addr)(struct eeh_pe *pe);
@@ -276,7 +275,7 @@ struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe);
 
 struct eeh_dev *eeh_dev_init(struct pci_dn *pdn);
 void eeh_dev_phb_init_dynamic(struct pci_controller *phb);
-int eeh_init(void);
+void eeh_probe_devices(void);
 int __init eeh_ops_register(struct eeh_ops *ops);
 int __exit eeh_ops_unregister(const char *name);
 int eeh_check_failure(const volatile void __iomem *token);
@@ -322,10 +321,7 @@ static inline bool eeh_enabled(void)
 return false;
 }
 
-static inline int eeh_init(void)
-{
-   return 0;
-}
+static inline void eeh_probe_devices(void) { }
 
 static inline void *eeh_dev_init(struct pci_dn *pdn, void *data)
 {
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index f27eecd5ec7f..5e8617a53a86 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -971,6 +971,18 @@ static struct notifier_block eeh_reboot_nb = {
.notifier_call = eeh_reboot_notifier,
 };
 
+void eeh_probe_devices(void)
+{
+   struct pci_controller *hose, *tmp;
+   struct pci_dn *pdn;
+
+   /* Enable EEH for all adapters */
	list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
+   pdn = hose->pci_data;
+   traverse_pci_dn(pdn, eeh_ops->probe, NULL);
+   }
+}
+
 /**
  * eeh_init - EEH initialization
  *
@@ -986,22 +998,11 @@ static struct notifier_block eeh_reboot_nb = {
  * Even if force-off is set, the EEH hardware is still enabled, so that
  * newer systems can boot.
  */
-int eeh_init(void)
+static int eeh_init(void)
 {
struct pci_controller *hose, *tmp;
-   struct pci_dn *pdn;
-   static int cnt = 0;
int ret = 0;
 
-   /*
-* We have to delay the initialization on PowerNV after
-* the PCI hierarchy tree has been built because the PEs
-* are figured out based on PCI devices instead of device
-* tree nodes
-*/
-   if (machine_is(powernv) && cnt++ <= 0)
-   return ret;
-
/* Register reboot notifier */
ret = register_reboot_notifier(_reboot_nb);
if (ret) {
@@ -1027,22 +1028,7 @@ int eeh_init(void)
if (ret)
return ret;
 
-   /* Enable EEH for all adapters */
-	list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
-   pdn = hose->pci_data;
-   traverse_pci_dn(pdn, eeh_ops->probe, NULL);
-   }
-
-   /*
-* Call platform post-initialization. Actually, It's good chance
-* to inform platform that EEH is ready to supply service if the
-* I/O cache stuff has been built up.
-*/
-   if (eeh_ops->post_init) {
-   ret = eeh_ops->post_init();
-   if (ret)
-   return ret;
-   }
+   eeh_probe_devices();
 
if (eeh_enabled())
pr_info("EEH: PCI Enhanced I/O Error Handling Enabled\n");
@@ -1757,10 +1743,6 @@ static int eeh_enable_dbgfs_set(void *data, u64 val)
else
eeh_add_flag(EEH_FORCE_DISABLED);
 
-   /* Notify the backend */
-   if (eeh_ops->post_init)
-   eeh_ops->post_init();
-
return 0;
 }
 
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c 
b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 3f48f6df1cf3..6bde8f0f78e3 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c

[PATCH 1/2] powerpc/eeh: Create PHB PEs after EEH is initialized

2017-09-07 Thread Benjamin Herrenschmidt
Otherwise we end up not yet having computed the right
diag data size on powernv where EEH initialization
is delayed, thus causing memory corruption later on
when calling OPAL.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/kernel/eeh.c     |  4 ++++
 arch/powerpc/kernel/eeh_dev.c | 18 ------------------
 2 files changed, 4 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 63992b2d8e15..f27eecd5ec7f 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -1018,6 +1018,10 @@ int eeh_init(void)
} else if ((ret = eeh_ops->init()))
return ret;
 
+   /* Initialize PHB PEs */
+	list_for_each_entry_safe(hose, tmp, &hose_list, list_node)
+   eeh_dev_phb_init_dynamic(hose);
+
/* Initialize EEH event */
ret = eeh_event_init();
if (ret)
diff --git a/arch/powerpc/kernel/eeh_dev.c b/arch/powerpc/kernel/eeh_dev.c
index d6b2ca70d14d..0820b73288c0 100644
--- a/arch/powerpc/kernel/eeh_dev.c
+++ b/arch/powerpc/kernel/eeh_dev.c
@@ -83,21 +83,3 @@ void eeh_dev_phb_init_dynamic(struct pci_controller *phb)
/* EEH PE for PHB */
eeh_phb_pe_create(phb);
 }
-
-/**
- * eeh_dev_phb_init - Create EEH devices for devices included in existing PHBs
- *
- * Scan all the existing PHBs and create EEH devices for their OF
- * nodes and their children OF nodes
- */
-static int __init eeh_dev_phb_init(void)
-{
-   struct pci_controller *phb, *tmp;
-
-	list_for_each_entry_safe(phb, tmp, &hose_list, list_node)
-   eeh_dev_phb_init_dynamic(phb);
-
-   return 0;
-}
-
-core_initcall(eeh_dev_phb_init);