[PATCH v3 18/17] powerpc: Emulate load/store floating point as integer word instructions

2017-08-29 Thread Paul Mackerras
This adds emulation for the lfiwax, lfiwzx and stfiwx instructions.
This necessitated adding a new flag to indicate whether a floating
point or an integer conversion was needed for LOAD_FP and STORE_FP,
so this moves the size field in op->type up 4 bits.
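
As a rough illustration of the new op->type layout (a sketch, not code from
the patch; the lfiwax encoding is assumed from the description above):

/* Sketch only: FPCONV (0x100) sits at bit 8, where the size field used
 * to start, so SIZE()/GETSIZE() move up to bit 12 and the flag and size
 * bits no longer overlap. */
static unsigned int example_op_types(void)
{
	unsigned int lfsx   = MKOP(LOAD_FP, FPCONV, 4);  /* FP single, needs SP->DP conversion */
	unsigned int lfiwax = MKOP(LOAD_FP, SIGNEXT, 4); /* integer word, sign-extended (assumed) */

	return GETSIZE(lfsx) + (lfiwax & FPCONV);	/* == 4 + 0 */
}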

Signed-off-by: Paul Mackerras 
---
 arch/powerpc/include/asm/sstep.h |  5 ++--
 arch/powerpc/lib/sstep.c | 60 ++--
 2 files changed, 48 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/include/asm/sstep.h b/arch/powerpc/include/asm/sstep.h
index 309d1c5..ab9d849 100644
--- a/arch/powerpc/include/asm/sstep.h
+++ b/arch/powerpc/include/asm/sstep.h
@@ -68,6 +68,7 @@ enum instruction_type {
#define SIGNEXT	0x20
#define UPDATE	0x40	/* matches bit in opcode 31 instructions */
#define BYTEREV	0x80
+#define FPCONV	0x100
 
 /* Barrier type field, ORed in with type */
 #define BARRIER_MASK   0xe0
@@ -93,8 +94,8 @@ enum instruction_type {
 #define VSX_CHECK_VEC  8   /* check MSR_VEC not MSR_VSX for reg >= 32 */
 
 /* Size field in type word */
-#define SIZE(n)	((n) << 8)
-#define GETSIZE(w)	((w) >> 8)
+#define SIZE(n)	((n) << 12)
+#define GETSIZE(w)	((w) >> 12)
 
 #define MKOP(t, f, s)  ((t) | (f) | SIZE(s))
 
diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index 24031ca..2f6897c 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -457,19 +457,23 @@ NOKPROBE_SYMBOL(write_mem);
  * These access either the real FP register or the image in the
  * thread_struct, depending on regs->msr & MSR_FP.
  */
-static int do_fp_load(int rn, unsigned long ea, int nb, struct pt_regs *regs,
- bool cross_endian)
+static int do_fp_load(struct instruction_op *op, unsigned long ea,
+ struct pt_regs *regs, bool cross_endian)
 {
-   int err;
+   int err, rn, nb;
union {
+   int i;
+   unsigned int u;
float f;
double d[2];
unsigned long l[2];
u8 b[2 * sizeof(double)];
} u;
 
+   nb = GETSIZE(op->type);
if (!address_ok(regs, ea, nb))
return -EFAULT;
+   rn = op->reg;
err = copy_mem_in(u.b, ea, nb, regs);
if (err)
return err;
@@ -479,8 +483,14 @@ static int do_fp_load(int rn, unsigned long ea, int nb, 
struct pt_regs *regs,
do_byte_reverse(&u.b[8], 8);
}
preempt_disable();
-   if (nb == 4)
-   conv_sp_to_dp(&u.f, &u.d[0]);
+   if (nb == 4) {
+   if (op->type & FPCONV)
+   conv_sp_to_dp(&u.f, &u.d[0]);
+   else if (op->type & SIGNEXT)
+   u.l[0] = u.i;
+   else
+   u.l[0] = u.u;
+   }
if (regs->msr & MSR_FP)
put_fpr(rn, &u.d[0]);
else
@@ -498,25 +508,33 @@ static int do_fp_load(int rn, unsigned long ea, int nb, 
struct pt_regs *regs,
 }
 NOKPROBE_SYMBOL(do_fp_load);
 
-static int do_fp_store(int rn, unsigned long ea, int nb, struct pt_regs *regs,
-  bool cross_endian)
+static int do_fp_store(struct instruction_op *op, unsigned long ea,
+  struct pt_regs *regs, bool cross_endian)
 {
+   int rn, nb;
union {
+   unsigned int u;
float f;
double d[2];
unsigned long l[2];
u8 b[2 * sizeof(double)];
} u;
 
+   nb = GETSIZE(op->type);
if (!address_ok(regs, ea, nb))
return -EFAULT;
+   rn = op->reg;
preempt_disable();
if (regs->msr & MSR_FP)
get_fpr(rn, &u.d[0]);
else
u.l[0] = current->thread.TS_FPR(rn);
-   if (nb == 4)
-   conv_dp_to_sp(&u.d[0], &u.f);
+   if (nb == 4) {
+   if (op->type & FPCONV)
+   conv_dp_to_sp(&u.d[0], &u.f);
+   else
+   u.u = u.l[0];
+   }
if (nb == 16) {
rn |= 1;
if (regs->msr & MSR_FP)
@@ -2049,7 +2067,7 @@ int analyse_instr(struct instruction_op *op, const struct 
pt_regs *regs,
 #ifdef CONFIG_PPC_FPU
case 535:   /* lfsx */
case 567:   /* lfsux */
-   op->type = MKOP(LOAD_FP, u, 4);
+   op->type = MKOP(LOAD_FP, u | FPCONV, 4);
break;
 
case 599:   /* lfdx */
@@ -2059,7 +2077,7 @@ int analyse_instr(struct instruction_op *op, const struct 
pt_regs *regs,
 
case 663:   /* stfsx */
case 695:   /* stfsux */
-   op->type = MKOP(STORE_FP, u, 4);
+   op->type = MKOP(STORE_FP, u | FPCONV, 4);
break;
 
case 727:   

Re: [PATCH v2 14/20] mm: Provide speculative fault infrastructure

2017-08-29 Thread Peter Zijlstra
On Wed, Aug 30, 2017 at 07:19:30AM +1000, Benjamin Herrenschmidt wrote:
> On Tue, 2017-08-29 at 13:27 +0200, Peter Zijlstra wrote:
> > mpe helped me out and explained that it is the PWC hint to TLBIE.
> > 
> > So, you set need_flush_all when you unhook pud/pmd/pte which you then
> > use to set PWC. So free_pgtables() will do the PWC when it unhooks
> > higher level pages.
> > 
> > But you're right that there's some issues, free_pgtables() itself
> > doesn't seem to use mm->page_table_lock,pmd->lock _AT_ALL_ to unhook the
> > pages.
> > 
> > If it were to do that, things should work fine since those locks would
> > then serialize against the speculative faults, we would never install a
> > page if the VMA would be under tear-down and it would thus not be
> > visible to your caches either.
> 
> That's one case. I don't remember of *all* the cases to be honest, but
> I do remember several times over the past few years thinking "ah we are
> fine because the mm sem taken for writing protects us from any
> concurrent tree structure change" :-)

Well, installing always seems to use the locks (it needs to, because it's
always done with down_read()), that only leaves removal, and the only
place I know that removes stuff is free_pgtables().

But I think I found another fun place, copy_page_range(). While it
(pointlessly) takes all the PTLs on the dst mm, it walks the src page
tables without any PTLs.

This means that if we have a multi-threaded process doing fork(), a
thread of the src mm could instantiate page-tables that will not be
copied over.

Of course, this is highly dubious behaviour to begin with, and I don't
think there's anything fundamentally wrong with missing those pages but
we should document this stuff.


Re: Question: handling early hotplug interrupts

2017-08-29 Thread Michael Ellerman
Daniel Henrique Barboza  writes:

> Hi Ben,
>
> On 08/29/2017 06:55 PM, Benjamin Herrenschmidt wrote:
>> On Tue, 2017-08-29 at 17:43 -0300, Daniel Henrique Barboza wrote:
>>> Hi,
>>>
>>> This is a scenario I've been facing when working in early device
> >>> hotplugs in QEMU. When a device is added, an IRQ pulse is fired to warn
>>> the guest of the event, then the kernel fetches it by calling
>>> 'check_exception' and handles it. If the hotplug is done too early
>>> (before SLOF, for example), the pulse is ignored and the hotplug event
>>> is left unchecked in the events queue.
>>>
>>> One solution would be to pulse the hotplug queue interrupt after CAS,
>>> when we are sure that the hotplug queue is negotiated. However, this
>>> panics the kernel with sig 11 kernel access of bad area, which suggests
>>> that the kernel wasn't quite ready to handle it.
>> That's not right. This is a bug that needs fixing. The interrupt should
>> be masked anyway but still.
>>
>> Tell us more about the crash (backtrace etc...)  this definitely needs
>> fixing.
>
> This is the backtrace using a 4.13.0-rc3 guest:
>
> -
> [0.008913] Unable to handle kernel paging request for data at address 
> 0x0100
> [0.008989] Faulting instruction address: 0xc012c318
> [0.009046] Oops: Kernel access of bad area, sig: 11 [#1]
> [0.009092] SMP NR_CPUS=1024
> [0.009092] NUMA
> [0.009128] pSeries
> [0.009173] Modules linked in:
> [0.009210] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.13.0-rc3+ #1
> [0.009268] task: c000feb02580 task.stack: c000fe108000
> [0.009325] NIP: c012c318 LR: c012c9c4 CTR: 
> 
> [0.009394] REGS: c000fffef910 TRAP: 0380   Not tainted (4.13.0-rc3+)
> [0.009450] MSR: 82009033 
> [0.009454]   CR: 28000822  XER: 2000
> [0.009554] CFAR: c012c9c0 SOFTE: 0
> [0.009554] GPR00: c012c9c4 c000fffefb90 c141f100 
> 0400
> [0.009554] GPR04:  c000fe1851c0  
> fee6
> [0.009554] GPR08: 000fffe1  0001 
> 02001001
> [0.009554] GPR12: 0040 cfd8 c000db58 
> 
> [0.009554] GPR16:    
> 
> [0.009554] GPR20:    
> 0001
> [0.009554] GPR24: 0002 0013 c000fe14bc00 
> 0400
> [0.009554] GPR28: 0400  c000fe1851c0 
> 0001
> [0.010121] NIP [c012c318] __queue_work+0x48/0x640
> [0.010168] LR [c012c9c4] queue_work_on+0xb4/0xf0
> [0.010213] Call Trace:
> [0.010239] [c000fffefb90] [c000db58] kernel_init+0x8/0x160 
> (unreliable)
> [0.010308] [c000fffefc70] [c012c9c4] queue_work_on+0xb4/0xf0
> [0.010368] [c000fffefcb0] [c00c4608] 
> queue_hotplug_event+0xd8/0x150
> [0.010435] [c000fffefd00] [c00c30d0] 
> ras_hotplug_interrupt+0x140/0x190
> [0.010505] [c000fffefd90] [c018c8b0] 
> __handle_irq_event_percpu+0x90/0x310
> [0.010573] [c000fffefe50] [c018cb6c] 
> handle_irq_event_percpu+0x3c/0x90
> [0.010642] [c000fffefe90] [c018cc24] 
> handle_irq_event+0x64/0xc0
> [0.010710] [c000fffefec0] [c01928b0] 
> handle_fasteoi_irq+0xc0/0x230
> [0.010779] [c000fffefef0] [c018ae14] 
> generic_handle_irq+0x54/0x80
> [0.010847] [c000fffeff20] [c00189f0] __do_irq+0x90/0x210
> [0.010904] [c000fffeff90] [c002e730] call_do_irq+0x14/0x24
> [0.010961] [c000fe10b640] [c0018c10] do_IRQ+0xa0/0x130
> [0.011021] [c000fe10b6a0] [c0008c58] 
> hardware_interrupt_common+0x158/0x160
> [0.011090] --- interrupt: 501 at __replay_interrupt+0x38/0x3c
> [0.011090] LR = arch_local_irq_restore+0x74/0x90
> [0.011179] [c000fe10b990] [c000fe10b9e0] 0xc000fe10b9e0 
> (unreliable)
> [0.011249] [c000fe10b9b0] [c0b967fc] 
> _raw_spin_unlock_irqrestore+0x4c/0xb0
> [0.011316] [c000fe10b9e0] [c018ff50] __setup_irq+0x630/0x9e0
> [0.011374] [c000fe10ba90] [c019054c] 
> request_threaded_irq+0x13c/0x250
> [0.011441] [c000fe10baf0] [c00c2cd0] 
> request_event_sources_irqs+0x100/0x180
> [0.011511] [c000fe10bc10] [c0eceda8] 
> __machine_initcall_pseries_init_ras_IRQ+0xc4/0x12c
> [0.011591] [c000fe10bc40] [c000d8c8] 
> do_one_initcall+0x68/0x1e0
> [0.011659] [c000fe10bd00] [c0eb4484] 
> kernel_init_freeable+0x284/0x370
> [0.011725] [c000fe10bdc0] [c000db7c] kernel_init+0x2c/0x160
> [0.011782] [c000fe10be30] [c000bc9c] 
> ret_from_kernel_thread+0x5c/0xc0
> [0.011848] Instructio

Re: Question: handling early hotplug interrupts

2017-08-29 Thread Michael Ellerman
Daniel Henrique Barboza  writes:

> Hi,
>
> This is a scenario I've been facing when working in early device 
> hotplugs in QEMU. When a device is added, an IRQ pulse is fired to warn 
> the guest of the event, then the kernel fetches it by calling 
> 'check_exception' and handles it. If the hotplug is done too early 
> (before SLOF, for example), the pulse is ignored and the hotplug event 
> is left unchecked in the events queue.
>
> One solution would be to pulse the hotplug queue interrupt after CAS, 
> when we are sure that the hotplug queue is negotiated. However, this 
> panics the kernel with sig 11 kernel access of bad area, which suggests 
> that the kernel wasn't quite ready to handle it.
>
> In my experiments using upstream 4.13 I saw that there is a 'safe time' 
> to pulse the queue, sometime after CAS and before mounting the root fs, 
> but I wasn't able to pinpoint it. From QEMU perspective, the last hcall 
> done (an h_set_mode) is still too early to pulse it and the kernel 
> panics. Looking at the kernel source I saw that the IRQ handling is 
> initiated quite early in the init process.
>
> So my question (ok, actually 2 questions):
>
> - Is my analysis correct? Is there an unsafe time to fire a IRQ pulse 
> before CAS that can break the kernel or am I overlooking/doing something 
> wrong?
> - is there a reliable way to know when the kernel can safely handle the 
> hotplug interrupt?

In addition to Ben's comments, you need to think about this differently.

The operating system you're booting may not be Linux.

Whatever Qemu does needs to make sense without reference to the exact
details or ordering of the Linux code. Qemu needs to provide a mechanism
that any operating system could use, and then we can make it work with
Linux.

cheers


Re: [PATCH v2 14/20] mm: Provide speculative fault infrastructure

2017-08-29 Thread Peter Zijlstra
On Wed, Aug 30, 2017 at 10:33:50AM +0530, Anshuman Khandual wrote:
> diff --git a/mm/filemap.c b/mm/filemap.c
> index a497024..08f3042 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -1181,6 +1181,18 @@ int __lock_page_killable(struct page *__page)
>  int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
>  unsigned int flags)
>  {
> +   if (flags & FAULT_FLAG_SPECULATIVE) {
> +   if (flags & FAULT_FLAG_KILLABLE) {
> +   int ret;
> +
> +   ret = __lock_page_killable(page);
> +   if (ret)
> +   return 0;
> +   } else
> +   __lock_page(page);
> +   return 1;
> +   }
> +
> if (flags & FAULT_FLAG_ALLOW_RETRY) {
> /*
>  * CAUTION! In this case, mmap_sem is not released

Yeah, that looks right.

> @@ -4012,17 +4010,7 @@ int handle_speculative_fault(struct mm_struct *mm, 
> unsigned long address,
> goto unlock;
> }
> 
> +   if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
> trace_spf_vma_notsup(_RET_IP_, vma, address);
> goto unlock;
> }

As riel pointed out on IRC slightly later, private file maps also need
->anon_vma and those actually have ->vm_ops IIRC so the condition needs
to be slightly more complicated.


Re: [PATCH v2 14/20] mm: Provide speculative fault infrastructure

2017-08-29 Thread Anshuman Khandual
On 08/27/2017 05:48 AM, Kirill A. Shutemov wrote:
>> +/* Transparent huge pages are not supported. */
>> +if (unlikely(pmd_trans_huge(*pmd)))
>> +goto out_walk;
> That's looks like a blocker to me.
> 
> Is there any problem with making it supported (besides plain coding)?

IIUC we would have to reattempt once for each PMD-level fault because
of the lack of a page table entry there. Besides, do we want to support
huge pages in general as part of the speculative page fault path? The
number of faults will be much lower (256 times lower on POWER and 512
times lower on x86). So is it worth it? BTW, calling hugetlb_fault()
after figuring out the VMA works correctly inside
handle_speculative_fault(), last time I checked.



Re: [PATCH v2 14/20] mm: Provide speculative fault infrastructure

2017-08-29 Thread Anshuman Khandual
On 08/29/2017 07:15 PM, Peter Zijlstra wrote:
> On Tue, Aug 29, 2017 at 03:18:25PM +0200, Laurent Dufour wrote:
>> On 29/08/2017 14:04, Peter Zijlstra wrote:
>>> On Tue, Aug 29, 2017 at 09:59:30AM +0200, Laurent Dufour wrote:
 On 27/08/2017 02:18, Kirill A. Shutemov wrote:
>> +
>> +if (unlikely(!vma->anon_vma))
>> +goto unlock;
>
> It deserves a comment.

 You're right I'll add it in the next version.
 For the record, the root cause is that __anon_vma_prepare() requires the
 mmap_sem to be held because vm_next and vm_prev must be safe.
>>>
>>> But should that test not be:
>>>
>>> if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma))
>>> goto unlock;
>>>
>>> Because !anon vmas will never have ->anon_vma set and you don't want to
>>> exclude those.
>>
>> Yes in the case we later allow non anonymous vmas to be handled.
>> Currently only anonymous vmas are supported so the check is good enough,
>> isn't it ?
> 
> That wasn't at all clear from reading the code. This makes it clear
> ->anon_vma is only ever looked at for anonymous.
> 
> And like Kirill says, we _really_ should start allowing some (if not
> all) vm_ops. Large file based mappings aren't particularly rare.
> 
> I'm not sure we want to introduce a white-list or just bite the bullet
> and audit all ->fault() implementations. But either works and isn't
> terribly difficult, auditing all is more work though.

filemap_fault() is used as vma->vm_ops->fault() for most of the file
systems. Changing it can enable speculative fault support for all of
them. It will still exclude other driver-based vma->vm_ops->fault()
implementations. AFAICS, the __lock_page_or_retry() function can drop
mm->mmap_sem if the page could not be locked right away. As suggested
by Peterz, making it understand FAULT_FLAG_SPECULATIVE should be good
enough. The patch is lightly tested for file mappings on top of this
series.

diff --git a/mm/filemap.c b/mm/filemap.c
index a497024..08f3042 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1181,6 +1181,18 @@ int __lock_page_killable(struct page *__page)
 int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
 unsigned int flags)
 {
+   if (flags & FAULT_FLAG_SPECULATIVE) {
+   if (flags & FAULT_FLAG_KILLABLE) {
+   int ret;
+
+   ret = __lock_page_killable(page);
+   if (ret)
+   return 0;
+   } else
+   __lock_page(page);
+   return 1;
+   }
+
if (flags & FAULT_FLAG_ALLOW_RETRY) {
/*
 * CAUTION! In this case, mmap_sem is not released
diff --git a/mm/memory.c b/mm/memory.c
index 549d235..02347f3 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3836,8 +3836,6 @@ static int handle_pte_fault(struct vm_fault *vmf)
if (!vmf->pte) {
if (vma_is_anonymous(vmf->vma))
return do_anonymous_page(vmf);
-   else if (vmf->flags & FAULT_FLAG_SPECULATIVE)
-   return VM_FAULT_RETRY;
else
return do_fault(vmf);
}
@@ -4012,17 +4010,7 @@ int handle_speculative_fault(struct mm_struct *mm, 
unsigned long address,
goto unlock;
}

-   /*
-* Can't call vm_ops service has we don't know what they would do
-* with the VMA.
-* This include huge page from hugetlbfs.
-*/
-   if (vma->vm_ops) {
-   trace_spf_vma_notsup(_RET_IP_, vma, address);
-   goto unlock;
-   }
-
-   if (unlikely(!vma->anon_vma)) {
+   if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
trace_spf_vma_notsup(_RET_IP_, vma, address);
goto unlock;
}



[PATCH v3 17/17] powerpc: Use instruction emulation infrastructure to handle alignment faults

2017-08-29 Thread Paul Mackerras
This replaces almost all of the instruction emulation code in
fix_alignment() with calls to analyse_instr(), emulate_loadstore()
and emulate_dcbz().  The only emulation code left is the SPE
emulation code; analyse_instr() etc. do not handle SPE instructions
at present.

One result of this is that we can now handle alignment faults on
all the new VSX load and store instructions that were added in POWER9.
VSX loads/stores will take alignment faults for unaligned accesses
to cache-inhibited memory.

Another effect is that we no longer rely on the DAR and DSISR values
set by the processor.

With this, we now need to include the instruction emulation code
unconditionally.
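
For context, the resulting fix_alignment() shape is roughly the following
(a condensed sketch of how the pieces fit together, not the literal code in
the diff; the instruction fetch is shown for the user-mode case only):

/* Sketch of the new alignment-fault path. */
static int fix_alignment_sketch(struct pt_regs *regs)
{
	struct instruction_op op;
	unsigned int instr;
	int r;

	if (__get_user(instr, (unsigned int __user *)regs->nip))
		return -EFAULT;

	r = analyse_instr(&op, regs, instr);
	if (r < 0)
		return -EINVAL;

	if ((op.type & INSTR_TYPE_MASK) == CACHEOP)
		return emulate_dcbz(op.ea, regs);	/* dcbz on cache-inhibited memory */

	return emulate_loadstore(regs, &op);		/* 0, -EFAULT, -EACCES or -EINVAL */
}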

Signed-off-by: Paul Mackerras 
---
 arch/powerpc/Kconfig|   4 -
 arch/powerpc/kernel/align.c | 774 ++--
 arch/powerpc/lib/Makefile   |   4 +-
 3 files changed, 34 insertions(+), 748 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index bf6abab..9fc3c0b 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -367,10 +367,6 @@ config PPC_ADV_DEBUG_DAC_RANGE
depends on PPC_ADV_DEBUG_REGS && 44x
default y
 
-config PPC_EMULATE_SSTEP
-   bool
-   default y if KPROBES || UPROBES || XMON || HAVE_HW_BREAKPOINT
-
 config ZONE_DMA32
bool
default y if PPC64
diff --git a/arch/powerpc/kernel/align.c b/arch/powerpc/kernel/align.c
index ec7a8b0..26b9994 100644
--- a/arch/powerpc/kernel/align.c
+++ b/arch/powerpc/kernel/align.c
@@ -27,6 +27,7 @@
 #include 
 #include 
 #include 
+#include 
 
 struct aligninfo {
unsigned char len;
@@ -40,364 +41,9 @@ struct aligninfo {
 #define LD 0   /* load */
 #define ST 1   /* store */
 #define SE 2   /* sign-extend value, or FP ld/st as word */
-#define F  4   /* to/from fp regs */
-#define U  8   /* update index register */
-#define M  0x10/* multiple load/store */
 #define SW 0x20/* byte swap */
-#define S  0x40/* single-precision fp or... */
-#define SX 0x40/* ... byte count in XER */
-#define HARD   0x80/* string, stwcx. */
 #define E4 0x40/* SPE endianness is word */
 #define E8 0x80/* SPE endianness is double word */
-#define SPLT   0x80/* VSX SPLAT load */
-
-/* DSISR bits reported for a DCBZ instruction: */
-#define DCBZ   0x5f/* 8xx/82xx dcbz faults when cache not enabled */
-
-/*
- * The PowerPC stores certain bits of the instruction that caused the
- * alignment exception in the DSISR register.  This array maps those
- * bits to information about the operand length and what the
- * instruction would do.
- */
-static struct aligninfo aligninfo[128] = {
-   { 4, LD },  /* 00 0 : lwz / lwarx */
-   INVALID,/* 00 0 0001 */
-   { 4, ST },  /* 00 0 0010: stw */
-   INVALID,/* 00 0 0011 */
-   { 2, LD },  /* 00 0 0100: lhz */
-   { 2, LD+SE },   /* 00 0 0101: lha */
-   { 2, ST },  /* 00 0 0110: sth */
-   { 4, LD+M },/* 00 0 0111: lmw */
-   { 4, LD+F+S },  /* 00 0 1000: lfs */
-   { 8, LD+F },/* 00 0 1001: lfd */
-   { 4, ST+F+S },  /* 00 0 1010: stfs */
-   { 8, ST+F },/* 00 0 1011: stfd */
-   { 16, LD }, /* 00 0 1100: lq */
-   { 8, LD },  /* 00 0 1101: ld/ldu/lwa */
-   INVALID,/* 00 0 1110 */
-   { 8, ST },  /* 00 0 : std/stdu */
-   { 4, LD+U },/* 00 1 : lwzu */
-   INVALID,/* 00 1 0001 */
-   { 4, ST+U },/* 00 1 0010: stwu */
-   INVALID,/* 00 1 0011 */
-   { 2, LD+U },/* 00 1 0100: lhzu */
-   { 2, LD+SE+U }, /* 00 1 0101: lhau */
-   { 2, ST+U },/* 00 1 0110: sthu */
-   { 4, ST+M },/* 00 1 0111: stmw */
-   { 4, LD+F+S+U },/* 00 1 1000: lfsu */
-   { 8, LD+F+U },  /* 00 1 1001: lfdu */
-   { 4, ST+F+S+U },/* 00 1 1010: stfsu */
-   { 8, ST+F+U },  /* 00 1 1011: stfdu */
-   { 16, LD+F },   /* 00 1 1100: lfdp */
-   INVALID,/* 00 1 1101 */
-   { 16, ST+F },   /* 00 1 1110: stfdp */
-   INVALID,/* 00 1  */
-   { 8, LD },  /* 01 0 : ldx */
-   INVALID,/* 01 0 0001 */
-   { 8, ST },  /* 01 0 0010: stdx */
-   INVALID,/* 01 0 0011 */
-   INVALID,/* 01 0 0100 */
-   { 4, LD+SE },   /* 01 0 0101: lwax */
-   INVALID,/* 01 0 0110 */
-   INVALID,/* 01 0 0111 */
-   { 4, LD+M+HARD+SX },/* 01 0 1000: lswx */
-   { 4, LD+M+HARD },   /* 01 0 1001: lswi */
-   { 4, ST+M+HARD+SX },/* 01 0 1010: stswx */
-   { 4, ST+M+HA

[PATCH v3 16/17] powerpc: Separate out load/store emulation into its own function

2017-08-29 Thread Paul Mackerras
This moves the parts of emulate_step() that deal with emulating
load and store instructions into a new function called
emulate_loadstore().  This is to make it possible to reuse this
code in the alignment handler.

Signed-off-by: Paul Mackerras 
---
 arch/powerpc/include/asm/sstep.h |   9 ++
 arch/powerpc/lib/sstep.c | 258 ++-
 2 files changed, 154 insertions(+), 113 deletions(-)

diff --git a/arch/powerpc/include/asm/sstep.h b/arch/powerpc/include/asm/sstep.h
index 958c2c5..309d1c5 100644
--- a/arch/powerpc/include/asm/sstep.h
+++ b/arch/powerpc/include/asm/sstep.h
@@ -152,6 +152,15 @@ void emulate_update_regs(struct pt_regs *reg, struct 
instruction_op *op);
  */
 extern int emulate_step(struct pt_regs *regs, unsigned int instr);
 
+/*
+ * Emulate a load or store instruction by reading/writing the
+ * memory of the current process.  FP/VMX/VSX registers are assumed
+ * to hold live values if the appropriate enable bit in regs->msr is
+ * set; otherwise this will use the saved values in the thread struct
+ * for user-mode accesses.
+ */
+extern int emulate_loadstore(struct pt_regs *regs, struct instruction_op *op);
+
 extern void emulate_vsx_load(struct instruction_op *op, union vsx_reg *reg,
 const void *mem, bool cross_endian);
 extern void emulate_vsx_store(struct instruction_op *op,
diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index 810b5f2..24031ca 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -2667,76 +2667,35 @@ void emulate_update_regs(struct pt_regs *regs, struct 
instruction_op *op)
 }
 
 /*
- * Emulate instructions that cause a transfer of control,
- * loads and stores, and a few other instructions.
- * Returns 1 if the step was emulated, 0 if not,
- * or -1 if the instruction is one that should not be stepped,
- * such as an rfid, or a mtmsrd that would clear MSR_RI.
+ * Emulate a previously-analysed load or store instruction.
+ * Return values are:
+ * 0 = instruction emulated successfully
+ * -EFAULT = address out of range or access faulted (regs->dar
+ *  contains the faulting address)
+ * -EACCES = misaligned access, instruction requires alignment
+ * -EINVAL = unknown operation in *op
  */
-int emulate_step(struct pt_regs *regs, unsigned int instr)
+int emulate_loadstore(struct pt_regs *regs, struct instruction_op *op)
 {
-   struct instruction_op op;
-   int r, err, size, type;
-   unsigned long val;
-   unsigned int cr;
+   int err, size, type;
int i, rd, nb;
+   unsigned int cr;
+   unsigned long val;
unsigned long ea;
bool cross_endian;
 
-   r = analyse_instr(&op, regs, instr);
-   if (r < 0)
-   return r;
-   if (r > 0) {
-   emulate_update_regs(regs, &op);
-   return 0;
-   }
-
err = 0;
-   size = GETSIZE(op.type);
-   type = op.type & INSTR_TYPE_MASK;
+   size = GETSIZE(op->type);
+   type = op->type & INSTR_TYPE_MASK;
cross_endian = (regs->msr & MSR_LE) != (MSR_KERNEL & MSR_LE);
-
-   ea = op.ea;
-   if (OP_IS_LOAD_STORE(type) || type == CACHEOP)
-   ea = truncate_if_32bit(regs->msr, op.ea);
+   ea = truncate_if_32bit(regs->msr, op->ea);
 
switch (type) {
-   case CACHEOP:
-   if (!address_ok(regs, ea, 8))
-   return 0;
-   switch (op.type & CACHEOP_MASK) {
-   case DCBST:
-   __cacheop_user_asmx(ea, err, "dcbst");
-   break;
-   case DCBF:
-   __cacheop_user_asmx(ea, err, "dcbf");
-   break;
-   case DCBTST:
-   if (op.reg == 0)
-   prefetchw((void *) ea);
-   break;
-   case DCBT:
-   if (op.reg == 0)
-   prefetch((void *) ea);
-   break;
-   case ICBI:
-   __cacheop_user_asmx(ea, err, "icbi");
-   break;
-   case DCBZ:
-   err = emulate_dcbz(ea, regs);
-   break;
-   }
-   if (err) {
-   regs->dar = ea;
-   return 0;
-   }
-   goto instr_done;
-
case LARX:
if (ea & (size - 1))
-   break;  /* can't handle misaligned */
+   return -EACCES; /* can't handle misaligned */
if (!address_ok(regs, ea, size))
-   return 0;
+   return -EFAULT;
err = 0;
switch (size) {
 #ifdef __powerpc64__
@@ -2755,49 +2714,49 @@ int emulate_step(struct pt_regs *regs, unsigned int 
instr)
__get_user_asmx(val, ea, err, "ldarx");
 

[PATCH v3 15/17] powerpc: Handle opposite-endian processes in emulation code

2017-08-29 Thread Paul Mackerras
This adds code to the load and store emulation code to byte-swap
the data appropriately when the process being emulated is set to
the opposite endianness to that of the kernel.

This also enables the emulation for the multiple-register loads
and stores (lmw, stmw, lswi, stswi, lswx, stswx) to work for
little-endian.  In little-endian mode, the partial word at the
end of a transfer for lsw*/stsw* (when the byte count is not a
multiple of 4) is loaded/stored at the least-significant end of
the register.  Additionally, this fixes a bug in the previous
code in that it could call read_mem/write_mem with a byte count
that was not 1, 2, 4 or 8.

Note that this only works correctly on processors with "true"
little-endian mode, such as IBM POWER processors from POWER6 on, not
the so-called "PowerPC" little-endian mode that uses address swizzling
as implemented on the old 32-bit 603, 604, 740/750, 74xx CPUs.
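
The decision of when to swap is a single comparison of the emulated
context's endianness against the kernel's (sketch of the check used later
in the series; the actual swapping is done with the do_byte_reverse()
helper added in the hunks below):

/* Sketch only: a load/store needs byte reversal when the MSR_LE setting
 * of the context being emulated differs from the kernel's own endianness. */
static bool need_byte_swap(struct pt_regs *regs)
{
	return (regs->msr & MSR_LE) != (MSR_KERNEL & MSR_LE);
}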

Signed-off-by: Paul Mackerras 
---
 arch/powerpc/include/asm/sstep.h |   7 +-
 arch/powerpc/lib/sstep.c | 184 +++
 2 files changed, 131 insertions(+), 60 deletions(-)

diff --git a/arch/powerpc/include/asm/sstep.h b/arch/powerpc/include/asm/sstep.h
index 793639a..958c2c5 100644
--- a/arch/powerpc/include/asm/sstep.h
+++ b/arch/powerpc/include/asm/sstep.h
@@ -153,7 +153,8 @@ void emulate_update_regs(struct pt_regs *reg, struct 
instruction_op *op);
 extern int emulate_step(struct pt_regs *regs, unsigned int instr);
 
 extern void emulate_vsx_load(struct instruction_op *op, union vsx_reg *reg,
-const void *mem);
-extern void emulate_vsx_store(struct instruction_op *op, const union vsx_reg 
*reg,
- void *mem);
+const void *mem, bool cross_endian);
+extern void emulate_vsx_store(struct instruction_op *op,
+ const union vsx_reg *reg, void *mem,
+ bool cross_endian);
 extern int emulate_dcbz(unsigned long ea, struct pt_regs *regs);
diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index 5c0f50b..810b5f2 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -217,6 +217,33 @@ static nokprobe_inline unsigned long byterev_8(unsigned 
long x)
 }
 #endif
 
+static nokprobe_inline void do_byte_reverse(void *ptr, int nb)
+{
+   switch (nb) {
+   case 2:
+   *(u16 *)ptr = byterev_2(*(u16 *)ptr);
+   break;
+   case 4:
+   *(u32 *)ptr = byterev_4(*(u32 *)ptr);
+   break;
+#ifdef __powerpc64__
+   case 8:
+   *(unsigned long *)ptr = byterev_8(*(unsigned long *)ptr);
+   break;
+   case 16: {
+   unsigned long *up = (unsigned long *)ptr;
+   unsigned long tmp;
+   tmp = byterev_8(up[0]);
+   up[0] = byterev_8(up[1]);
+   up[1] = tmp;
+   break;
+   }
+#endif
+   default:
+   WARN_ON_ONCE(1);
+   }
+}
+
 static nokprobe_inline int read_mem_aligned(unsigned long *dest,
unsigned long ea, int nb,
struct pt_regs *regs)
@@ -430,7 +457,8 @@ NOKPROBE_SYMBOL(write_mem);
  * These access either the real FP register or the image in the
  * thread_struct, depending on regs->msr & MSR_FP.
  */
-static int do_fp_load(int rn, unsigned long ea, int nb, struct pt_regs *regs)
+static int do_fp_load(int rn, unsigned long ea, int nb, struct pt_regs *regs,
+ bool cross_endian)
 {
int err;
union {
@@ -445,6 +473,11 @@ static int do_fp_load(int rn, unsigned long ea, int nb, 
struct pt_regs *regs)
err = copy_mem_in(u.b, ea, nb, regs);
if (err)
return err;
+   if (unlikely(cross_endian)) {
+   do_byte_reverse(u.b, min(nb, 8));
+   if (nb == 16)
+   do_byte_reverse(&u.b[8], 8);
+   }
preempt_disable();
if (nb == 4)
conv_sp_to_dp(&u.f, &u.d[0]);
@@ -465,7 +498,8 @@ static int do_fp_load(int rn, unsigned long ea, int nb, 
struct pt_regs *regs)
 }
 NOKPROBE_SYMBOL(do_fp_load);
 
-static int do_fp_store(int rn, unsigned long ea, int nb, struct pt_regs *regs)
+static int do_fp_store(int rn, unsigned long ea, int nb, struct pt_regs *regs,
+  bool cross_endian)
 {
union {
float f;
@@ -491,6 +525,11 @@ static int do_fp_store(int rn, unsigned long ea, int nb, 
struct pt_regs *regs)
u.l[1] = current->thread.TS_FPR(rn);
}
preempt_enable();
+   if (unlikely(cross_endian)) {
+   do_byte_reverse(u.b, min(nb, 8));
+   if (nb == 16)
+   do_byte_reverse(&u.b[8], 8);
+   }
return copy_mem_out(u.b, ea, nb, regs);
 }
 NOKPROBE_SYMBOL(do_fp_store);
@@ -499,7 +538,8 @@ NOKPROBE_SYMBOL(do_fp_store);
 #i

[PATCH v3 14/17] powerpc: Set regs->dar if memory access fails in emulate_step()

2017-08-29 Thread Paul Mackerras
This adds code to the instruction emulation code to set regs->dar
to the address of any memory access that fails.  This address is
not necessarily the same as the effective address of the instruction,
because if the memory access is unaligned, it might cross a page
boundary and fault on the second page.

Signed-off-by: Paul Mackerras 
---
 arch/powerpc/lib/sstep.c | 74 ++--
 1 file changed, 52 insertions(+), 22 deletions(-)

diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index fa20f3a..5c0f50b 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -103,11 +103,19 @@ static nokprobe_inline int branch_taken(unsigned int 
instr,
return 1;
 }
 
-static nokprobe_inline long address_ok(struct pt_regs *regs, unsigned long ea, 
int nb)
+static nokprobe_inline long address_ok(struct pt_regs *regs,
+  unsigned long ea, int nb)
 {
if (!user_mode(regs))
return 1;
-   return __access_ok(ea, nb, USER_DS);
+   if (__access_ok(ea, nb, USER_DS))
+   return 1;
+   if (__access_ok(ea, 1, USER_DS))
+   /* Access overlaps the end of the user region */
+   regs->dar = USER_DS.seg;
+   else
+   regs->dar = ea;
+   return 0;
 }
 
 /*
@@ -210,7 +218,8 @@ static nokprobe_inline unsigned long byterev_8(unsigned 
long x)
 #endif
 
 static nokprobe_inline int read_mem_aligned(unsigned long *dest,
-   unsigned long ea, int nb)
+   unsigned long ea, int nb,
+   struct pt_regs *regs)
 {
int err = 0;
unsigned long x = 0;
@@ -233,6 +242,8 @@ static nokprobe_inline int read_mem_aligned(unsigned long 
*dest,
}
if (!err)
*dest = x;
+   else
+   regs->dar = ea;
return err;
 }
 
@@ -240,7 +251,8 @@ static nokprobe_inline int read_mem_aligned(unsigned long 
*dest,
  * Copy from userspace to a buffer, using the largest possible
  * aligned accesses, up to sizeof(long).
  */
-static int nokprobe_inline copy_mem_in(u8 *dest, unsigned long ea, int nb)
+static int nokprobe_inline copy_mem_in(u8 *dest, unsigned long ea, int nb,
+  struct pt_regs *regs)
 {
int err = 0;
int c;
@@ -268,8 +280,10 @@ static int nokprobe_inline copy_mem_in(u8 *dest, unsigned 
long ea, int nb)
break;
 #endif
}
-   if (err)
+   if (err) {
+   regs->dar = ea;
return err;
+   }
dest += c;
ea += c;
}
@@ -289,7 +303,7 @@ static nokprobe_inline int read_mem_unaligned(unsigned long 
*dest,
 
u.ul = 0;
i = IS_BE ? sizeof(unsigned long) - nb : 0;
-   err = copy_mem_in(&u.b[i], ea, nb);
+   err = copy_mem_in(&u.b[i], ea, nb, regs);
if (!err)
*dest = u.ul;
return err;
@@ -306,13 +320,14 @@ static int read_mem(unsigned long *dest, unsigned long 
ea, int nb,
if (!address_ok(regs, ea, nb))
return -EFAULT;
if ((ea & (nb - 1)) == 0)
-   return read_mem_aligned(dest, ea, nb);
+   return read_mem_aligned(dest, ea, nb, regs);
return read_mem_unaligned(dest, ea, nb, regs);
 }
 NOKPROBE_SYMBOL(read_mem);
 
 static nokprobe_inline int write_mem_aligned(unsigned long val,
-   unsigned long ea, int nb)
+unsigned long ea, int nb,
+struct pt_regs *regs)
 {
int err = 0;
 
@@ -332,6 +347,8 @@ static nokprobe_inline int write_mem_aligned(unsigned long 
val,
break;
 #endif
}
+   if (err)
+   regs->dar = ea;
return err;
 }
 
@@ -339,7 +356,8 @@ static nokprobe_inline int write_mem_aligned(unsigned long 
val,
  * Copy from a buffer to userspace, using the largest possible
  * aligned accesses, up to sizeof(long).
  */
-static int nokprobe_inline copy_mem_out(u8 *dest, unsigned long ea, int nb)
+static int nokprobe_inline copy_mem_out(u8 *dest, unsigned long ea, int nb,
+   struct pt_regs *regs)
 {
int err = 0;
int c;
@@ -367,8 +385,10 @@ static int nokprobe_inline copy_mem_out(u8 *dest, unsigned 
long ea, int nb)
break;
 #endif
}
-   if (err)
+   if (err) {
+   regs->dar = ea;
return err;
+   }
dest += c;
ea += c;
}
@@ -387,7 +407,7 @@ static nokprobe_inline int write_mem_unaligned(unsigned 
long val,
 
u.ul = val;
i = IS_BE ? sizeof(unsigned long) - nb : 0;
-   return copy_mem_out

[PATCH v3 13/17] powerpc: Emulate the dcbz instruction

2017-08-29 Thread Paul Mackerras
This adds code to analyse_instr() and emulate_step() to understand the
dcbz (data cache block zero) instruction.  The emulate_dcbz() function
is made public so it can be used by the alignment handler in future.
(The apparently unnecessary cropping of the address to 32 bits is
there because it will be needed in that situation.)

Signed-off-by: Paul Mackerras 
---
 arch/powerpc/include/asm/sstep.h |  2 ++
 arch/powerpc/lib/sstep.c | 32 
 2 files changed, 34 insertions(+)

diff --git a/arch/powerpc/include/asm/sstep.h b/arch/powerpc/include/asm/sstep.h
index 474a992..793639a 100644
--- a/arch/powerpc/include/asm/sstep.h
+++ b/arch/powerpc/include/asm/sstep.h
@@ -84,6 +84,7 @@ enum instruction_type {
 #define DCBTST 0x200
 #define DCBT   0x300
 #define ICBI   0x400
+#define DCBZ   0x500
 
 /* VSX flags values */
 #define VSX_FPCONV 1   /* do floating point SP/DP conversion */
@@ -155,3 +156,4 @@ extern void emulate_vsx_load(struct instruction_op *op, 
union vsx_reg *reg,
 const void *mem);
 extern void emulate_vsx_store(struct instruction_op *op, const union vsx_reg 
*reg,
  void *mem);
+extern int emulate_dcbz(unsigned long ea, struct pt_regs *regs);
diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index 817cdc9..fa20f3a 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -780,6 +780,30 @@ static nokprobe_inline int do_vsx_store(struct 
instruction_op *op,
 }
 #endif /* CONFIG_VSX */
 
+int emulate_dcbz(unsigned long ea, struct pt_regs *regs)
+{
+   int err;
+   unsigned long i, size;
+
+#ifdef __powerpc64__
+   size = ppc64_caches.l1d.block_size;
+   if (!(regs->msr & MSR_64BIT))
+   ea &= 0xUL;
+#else
+   size = L1_CACHE_BYTES;
+#endif
+   ea &= ~(size - 1);
+   if (!address_ok(regs, ea, size))
+   return -EFAULT;
+   for (i = 0; i < size; i += sizeof(long)) {
+   err = __put_user(0, (unsigned long __user *) (ea + i));
+   if (err)
+   return err;
+   }
+   return 0;
+}
+NOKPROBE_SYMBOL(emulate_dcbz);
+
 #define __put_user_asmx(x, addr, err, op, cr)  \
__asm__ __volatile__(   \
"1: " op " %2,0,%3\n"   \
@@ -1748,6 +1772,11 @@ int analyse_instr(struct instruction_op *op, const 
struct pt_regs *regs,
op->type = MKOP(CACHEOP, ICBI, 0);
op->ea = xform_ea(instr, regs);
return 0;
+
+   case 1014:  /* dcbz */
+   op->type = MKOP(CACHEOP, DCBZ, 0);
+   op->ea = xform_ea(instr, regs);
+   return 0;
}
break;
}
@@ -2607,6 +2636,9 @@ int emulate_step(struct pt_regs *regs, unsigned int instr)
case ICBI:
__cacheop_user_asmx(ea, err, "icbi");
break;
+   case DCBZ:
+   err = emulate_dcbz(ea, regs);
+   break;
}
if (err)
return 0;
-- 
2.7.4



[PATCH v3 12/17] powerpc: Emulate load/store floating double pair instructions

2017-08-29 Thread Paul Mackerras
This adds lfdp[x] and stfdp[x] to the set of instructions that
analyse_instr() and emulate_step() understand.

Signed-off-by: Paul Mackerras 
---
 arch/powerpc/lib/sstep.c | 68 
 1 file changed, 52 insertions(+), 16 deletions(-)

diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index 167d40d..817cdc9 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -415,9 +415,9 @@ static int do_fp_load(int rn, unsigned long ea, int nb, 
struct pt_regs *regs)
int err;
union {
float f;
-   double d;
-   unsigned long l;
-   u8 b[sizeof(double)];
+   double d[2];
+   unsigned long l[2];
+   u8 b[2 * sizeof(double)];
} u;
 
if (!address_ok(regs, ea, nb))
@@ -427,11 +427,19 @@ static int do_fp_load(int rn, unsigned long ea, int nb, 
struct pt_regs *regs)
return err;
preempt_disable();
if (nb == 4)
-   conv_sp_to_dp(&u.f, &u.d);
+   conv_sp_to_dp(&u.f, &u.d[0]);
if (regs->msr & MSR_FP)
-   put_fpr(rn, &u.d);
+   put_fpr(rn, &u.d[0]);
else
-   current->thread.TS_FPR(rn) = u.l;
+   current->thread.TS_FPR(rn) = u.l[0];
+   if (nb == 16) {
+   /* lfdp */
+   rn |= 1;
+   if (regs->msr & MSR_FP)
+   put_fpr(rn, &u.d[1]);
+   else
+   current->thread.TS_FPR(rn) = u.l[1];
+   }
preempt_enable();
return 0;
 }
@@ -441,20 +449,27 @@ static int do_fp_store(int rn, unsigned long ea, int nb, 
struct pt_regs *regs)
 {
union {
float f;
-   double d;
-   unsigned long l;
-   u8 b[sizeof(double)];
+   double d[2];
+   unsigned long l[2];
+   u8 b[2 * sizeof(double)];
} u;
 
if (!address_ok(regs, ea, nb))
return -EFAULT;
preempt_disable();
if (regs->msr & MSR_FP)
-   get_fpr(rn, &u.d);
+   get_fpr(rn, &u.d[0]);
else
-   u.l = current->thread.TS_FPR(rn);
+   u.l[0] = current->thread.TS_FPR(rn);
if (nb == 4)
-   conv_dp_to_sp(&u.d, &u.f);
+   conv_dp_to_sp(&u.d[0], &u.f);
+   if (nb == 16) {
+   rn |= 1;
+   if (regs->msr & MSR_FP)
+   get_fpr(rn, &u.d[1]);
+   else
+   u.l[1] = current->thread.TS_FPR(rn);
+   }
preempt_enable();
return copy_mem_out(u.b, ea, nb);
 }
@@ -1938,7 +1953,17 @@ int analyse_instr(struct instruction_op *op, const 
struct pt_regs *regs,
case 759:   /* stfdux */
op->type = MKOP(STORE_FP, u, 8);
break;
-#endif
+
+#ifdef __powerpc64__
+   case 791:   /* lfdpx */
+   op->type = MKOP(LOAD_FP, 0, 16);
+   break;
+
+   case 919:   /* stfdpx */
+   op->type = MKOP(STORE_FP, 0, 16);
+   break;
+#endif /* __powerpc64 */
+#endif /* CONFIG_PPC_FPU */
 
 #ifdef __powerpc64__
case 660:   /* stdbrx */
@@ -1956,7 +1981,7 @@ int analyse_instr(struct instruction_op *op, const struct 
pt_regs *regs,
op->val = byterev_4(regs->gpr[rd]);
break;
 
-   case 725:
+   case 725:   /* stswi */
if (rb == 0)
rb = 32;/* # bytes to store */
op->type = MKOP(STORE_MULTI, 0, rb);
@@ -2246,9 +2271,14 @@ int analyse_instr(struct instruction_op *op, const 
struct pt_regs *regs,
 #endif
 
 #ifdef CONFIG_VSX
-   case 57:/* lxsd, lxssp */
+   case 57:/* lfdp, lxsd, lxssp */
op->ea = dsform_ea(instr, regs);
switch (instr & 3) {
+   case 0: /* lfdp */
+   if (rd & 1)
+   break;  /* reg must be even */
+   op->type = MKOP(LOAD_FP, 0, 16);
+   break;
case 2: /* lxsd */
op->reg = rd + 32;
op->type = MKOP(LOAD_VSX, 0, 8);
@@ -2283,8 +2313,14 @@ int analyse_instr(struct instruction_op *op, const 
struct pt_regs *regs,
 #endif
 
 #ifdef CONFIG_VSX
-   case 61:/* lxv, stxsd, stxssp, stxv */
+   case 61:/* stfdp, lxv, stxsd, stxssp, stxv */
switch (instr & 7) {
+   case 0: /* stfdp with LSB of DS field = 0 */
+   case 4: /* stfdp with LSB of DS field = 1 */
+   op->ea = dsform_ea(instr, regs);
+   op->t

[PATCH v3 11/17] powerpc: Emulate vector element load/store instructions

2017-08-29 Thread Paul Mackerras
This adds code to analyse_instr() and emulate_step() to handle the
vector element loads and stores:

lvebx, lvehx, lvewx, stvebx, stvehx, stvewx.

Signed-off-by: Paul Mackerras 
---
 arch/powerpc/lib/sstep.c | 38 --
 1 file changed, 36 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index 91ae031..167d40d 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -476,7 +476,7 @@ static nokprobe_inline int do_vec_load(int rn, unsigned 
long ea,
return -EFAULT;
/* align to multiple of size */
ea &= ~(size - 1);
-   err = copy_mem_in(u.b, ea, size);
+   err = copy_mem_in(&u.b[ea & 0xf], ea, size);
if (err)
return err;
 
@@ -508,7 +508,7 @@ static nokprobe_inline int do_vec_store(int rn, unsigned 
long ea,
else
u.v = current->thread.vr_state.vr[rn];
preempt_enable();
-   return copy_mem_out(u.b, ea, size);
+   return copy_mem_out(&u.b[ea & 0xf], ea, size);
 }
 #endif /* CONFIG_ALTIVEC */
 
@@ -1807,12 +1807,46 @@ int analyse_instr(struct instruction_op *op, const 
struct pt_regs *regs,
break;
 
 #ifdef CONFIG_ALTIVEC
+   /*
+* Note: for the load/store vector element instructions,
+* bits of the EA say which field of the VMX register to use.
+*/
+   case 7: /* lvebx */
+   op->type = MKOP(LOAD_VMX, 0, 1);
+   op->element_size = 1;
+   break;
+
+   case 39:/* lvehx */
+   op->type = MKOP(LOAD_VMX, 0, 2);
+   op->element_size = 2;
+   break;
+
+   case 71:/* lvewx */
+   op->type = MKOP(LOAD_VMX, 0, 4);
+   op->element_size = 4;
+   break;
+
case 103:   /* lvx */
case 359:   /* lvxl */
op->type = MKOP(LOAD_VMX, 0, 16);
op->element_size = 16;
break;
 
+   case 135:   /* stvebx */
+   op->type = MKOP(STORE_VMX, 0, 1);
+   op->element_size = 1;
+   break;
+
+   case 167:   /* stvehx */
+   op->type = MKOP(STORE_VMX, 0, 2);
+   op->element_size = 2;
+   break;
+
+   case 199:   /* stvewx */
+   op->type = MKOP(STORE_VMX, 0, 4);
+   op->element_size = 4;
+   break;
+
case 231:   /* stvx */
case 487:   /* stvxl */
op->type = MKOP(STORE_VMX, 0, 16);
-- 
2.7.4



[PATCH v3 10/17] powerpc: Emulate FP/vector/VSX loads/stores correctly when regs not live

2017-08-29 Thread Paul Mackerras
At present, the analyse_instr/emulate_step code checks for the
relevant MSR_FP/VEC/VSX bit being set when a FP/VMX/VSX load
or store is decoded, but doesn't recheck the bit before reading or
writing the relevant FP/VMX/VSX register in emulate_step().

Since we don't have preemption disabled, it is possible that we get
preempted between checking the MSR bit and doing the register access.
If that happened, then the registers would have been saved to the
thread_struct for the current process.  Accesses to the CPU registers
would then potentially read stale values, or write values that would
never be seen by the user process.

Another way that the registers can become non-live is if a page
fault occurs when accessing user memory, and the page fault code
calls a copy routine that wants to use the VMX or VSX registers.

To fix this, the code for all the FP/VMX/VSX loads gets restructured
so that it forms an image in a local variable of the desired register
contents, then disables preemption, checks the MSR bit and either
sets the CPU register or writes the value to the thread struct.
Similarly, the code for stores checks the MSR bit, copies either the
CPU register or the thread struct to a local variable, then reenables
preemption and then copies the register image to memory.

If the instruction being emulated is in the kernel, then we must not
use the register values in the thread_struct.  In this case, if the
relevant MSR enable bit is not set, then emulate_step refuses to
emulate the instruction.
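
The restructured load path therefore follows this shape (a condensed
sketch of the pattern, not the exact code in the diff; copy_mem_in() is
the buffer-copy helper introduced earlier in this series):

/* Condensed sketch of the load-side pattern described above. */
static int fp_load_sketch(int rn, unsigned long ea, int nb, struct pt_regs *regs)
{
	union {
		double d;
		unsigned long l;
		u8 b[sizeof(double)];
	} u;
	int err;

	err = copy_mem_in(u.b, ea, nb);		/* 1. form the register image locally */
	if (err)
		return err;

	preempt_disable();			/* 2. MSR check and register access stay atomic */
	if (regs->msr & MSR_FP)
		put_fpr(rn, &u.d);		/* 3a. registers live: write the CPU register */
	else
		current->thread.TS_FPR(rn) = u.l; /* 3b. otherwise update the thread_struct image */
	preempt_enable();
	return 0;
}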

Signed-off-by: Paul Mackerras 
---
 arch/powerpc/include/asm/sstep.h |   1 +
 arch/powerpc/lib/ldstfp.S| 241 +++
 arch/powerpc/lib/sstep.c | 228 +---
 3 files changed, 203 insertions(+), 267 deletions(-)

diff --git a/arch/powerpc/include/asm/sstep.h b/arch/powerpc/include/asm/sstep.h
index 4fcc2c9..474a992 100644
--- a/arch/powerpc/include/asm/sstep.h
+++ b/arch/powerpc/include/asm/sstep.h
@@ -119,6 +119,7 @@ union vsx_reg {
unsigned long d[2];
float   fp[4];
double  dp[2];
+   __vector128 v;
 };
 
 /*
diff --git a/arch/powerpc/lib/ldstfp.S b/arch/powerpc/lib/ldstfp.S
index 6840911..7b5cf5e 100644
--- a/arch/powerpc/lib/ldstfp.S
+++ b/arch/powerpc/lib/ldstfp.S
@@ -21,27 +21,19 @@
 
 #define STKFRM (PPC_MIN_STKFRM + 16)
 
-   .macro  inst32  op
-reg = 0
-   .rept   32
-20:\op reg,0,r4
-   b   3f
-   EX_TABLE(20b,99f)
-reg = reg + 1
-   .endr
-   .endm
-
-/* Get the contents of frN into fr0; N is in r3. */
+/* Get the contents of frN into *p; N is in r3 and p is in r4. */
 _GLOBAL(get_fpr)
mflrr0
+   mfmsr   r6
+   ori r7, r6, MSR_FP
+   MTMSRD(r7)
+   isync
rlwinm  r3,r3,3,0xf8
bcl 20,31,1f
-   blr /* fr0 is already in fr0 */
-   nop
-reg = 1
-   .rept   31
-   fmr fr0,reg
-   blr
+reg = 0
+   .rept   32
+   stfdreg, 0(r4)
+   b   2f
 reg = reg + 1
.endr
 1: mflrr5
@@ -49,18 +41,23 @@ reg = reg + 1
mtctr   r5
mtlrr0
bctr
+2: MTMSRD(r6)
+   isync
+   blr
 
-/* Put the contents of fr0 into frN; N is in r3. */
+/* Put the contents of *p into frN; N is in r3 and p is in r4. */
 _GLOBAL(put_fpr)
mflrr0
+   mfmsr   r6
+   ori r7, r6, MSR_FP
+   MTMSRD(r7)
+   isync
rlwinm  r3,r3,3,0xf8
bcl 20,31,1f
-   blr /* fr0 is already in fr0 */
-   nop
-reg = 1
-   .rept   31
-   fmr reg,fr0
-   blr
+reg = 0
+   .rept   32
+   lfd reg, 0(r4)
+   b   2f
 reg = reg + 1
.endr
 1: mflrr5
@@ -68,127 +65,24 @@ reg = reg + 1
mtctr   r5
mtlrr0
bctr
-
-/* Load FP reg N from float at *p.  N is in r3, p in r4. */
-_GLOBAL(do_lfs)
-   PPC_STLU r1,-STKFRM(r1)
-   mflrr0
-   PPC_STL r0,STKFRM+PPC_LR_STKOFF(r1)
-   mfmsr   r6
-   ori r7,r6,MSR_FP
-   cmpwi   cr7,r3,0
-   MTMSRD(r7)
-   isync
-   beq cr7,1f
-   stfdfr0,STKFRM-16(r1)
-1: li  r9,-EFAULT
-2: lfs fr0,0(r4)
-   li  r9,0
-3: bl  put_fpr
-   beq cr7,4f
-   lfd fr0,STKFRM-16(r1)
-4: PPC_LL  r0,STKFRM+PPC_LR_STKOFF(r1)
-   mtlrr0
-   MTMSRD(r6)
-   isync
-   mr  r3,r9
-   addir1,r1,STKFRM
-   blr
-   EX_TABLE(2b,3b)
-
-/* Load FP reg N from double at *p.  N is in r3, p in r4. */
-_GLOBAL(do_lfd)
-   PPC_STLU r1,-STKFRM(r1)
-   mflrr0
-   PPC_STL r0,STKFRM+PPC_LR_STKOFF(r1)
-   mfmsr   r6
-   ori r7,r6,MSR_FP
-   cmpwi   cr7,r3,0
-   MTMSRD(r7)
-   isync
-   beq cr7,1f
-   stfdfr0,STKFRM-16(r1)
-1: li  r9,-EFAULT
-2: lfd fr0,0(r4)
-   li  r9,0
-3: beq cr7,4f
-   bl  put_fpr
-  

[PATCH v3 09/17] powerpc: Make load/store emulation use larger memory accesses

2017-08-29 Thread Paul Mackerras
At the moment, emulation of loads and stores of up to 8 bytes to
unaligned addresses on a little-endian system uses a sequence of
single-byte loads or stores to memory.  This is rather inefficient,
and the code is hard to follow because it has many ifdefs.
In addition, the Power ISA has requirements on how unaligned accesses
are performed, which are not met by doing all accesses as
sequences of single-byte accesses.

Emulation of VSX loads and stores uses __copy_{to,from}_user,
which means the emulation code has no control on the size of
accesses.

To simplify this, we add new copy_mem_in() and copy_mem_out()
functions for accessing memory.  These use a sequence of the largest
possible aligned accesses, up to 8 bytes (or 4 on 32-bit systems),
to copy memory between a local buffer and user memory.  We then
rewrite {read,write}_mem_unaligned and the VSX load/store
emulation using these new functions.

These new functions also simplify the code in do_fp_load() and
do_fp_store() for the unaligned cases.

Signed-off-by: Paul Mackerras 
---
 arch/powerpc/lib/sstep.c | 235 +--
 1 file changed, 106 insertions(+), 129 deletions(-)

diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index ed2bc4c..6cc2911 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -193,7 +193,6 @@ static nokprobe_inline unsigned long max_align(unsigned 
long x)
return x & -x;  /* isolates rightmost bit */
 }
 
-
 static nokprobe_inline unsigned long byterev_2(unsigned long x)
 {
return ((x >> 8) & 0xff) | ((x & 0xff) << 8);
@@ -239,56 +238,69 @@ static nokprobe_inline int read_mem_aligned(unsigned long 
*dest,
return err;
 }
 
-static nokprobe_inline int read_mem_unaligned(unsigned long *dest,
-   unsigned long ea, int nb, struct pt_regs *regs)
+/*
+ * Copy from userspace to a buffer, using the largest possible
+ * aligned accesses, up to sizeof(long).
+ */
+static int nokprobe_inline copy_mem_in(u8 *dest, unsigned long ea, int nb)
 {
-   int err;
-   unsigned long x, b, c;
-#ifdef __LITTLE_ENDIAN__
-   int len = nb; /* save a copy of the length for byte reversal */
-#endif
+   int err = 0;
+   int c;
 
-   /* unaligned, do this in pieces */
-   x = 0;
for (; nb > 0; nb -= c) {
-#ifdef __LITTLE_ENDIAN__
-   c = 1;
-#endif
-#ifdef __BIG_ENDIAN__
c = max_align(ea);
-#endif
if (c > nb)
c = max_align(nb);
-   err = read_mem_aligned(&b, ea, c);
+   switch (c) {
+   case 1:
+   err = __get_user(*dest, (unsigned char __user *) ea);
+   break;
+   case 2:
+   err = __get_user(*(u16 *)dest,
+(unsigned short __user *) ea);
+   break;
+   case 4:
+   err = __get_user(*(u32 *)dest,
+(unsigned int __user *) ea);
+   break;
+#ifdef __powerpc64__
+   case 8:
+   err = __get_user(*(unsigned long *)dest,
+(unsigned long __user *) ea);
+   break;
+#endif
+   }
if (err)
return err;
-   x = (x << (8 * c)) + b;
+   dest += c;
ea += c;
}
-#ifdef __LITTLE_ENDIAN__
-   switch (len) {
-   case 2:
-   *dest = byterev_2(x);
-   break;
-   case 4:
-   *dest = byterev_4(x);
-   break;
-#ifdef __powerpc64__
-   case 8:
-   *dest = byterev_8(x);
-   break;
-#endif
-   }
-#endif
-#ifdef __BIG_ENDIAN__
-   *dest = x;
-#endif
return 0;
 }
 
+static nokprobe_inline int read_mem_unaligned(unsigned long *dest,
+ unsigned long ea, int nb,
+ struct pt_regs *regs)
+{
+   union {
+   unsigned long ul;
+   u8 b[sizeof(unsigned long)];
+   } u;
+   int i;
+   int err;
+
+   u.ul = 0;
+   i = IS_BE ? sizeof(unsigned long) - nb : 0;
+   err = copy_mem_in(&u.b[i], ea, nb);
+   if (!err)
+   *dest = u.ul;
+   return err;
+}
+
 /*
  * Read memory at address ea for nb bytes, return 0 for success
- * or -EFAULT if an error occurred.
+ * or -EFAULT if an error occurred.  N.B. nb must be 1, 2, 4 or 8.
+ * If nb < sizeof(long), the result is right-justified on BE systems.
  */
 static int read_mem(unsigned long *dest, unsigned long ea, int nb,
  struct pt_regs *regs)
@@ -325,48 +337,64 @@ static nokprobe_inline int write_mem_aligned(unsigned 
long val,
return err;
 }
 
-static nokprobe_inline int write_mem_unaligned(u

[PATCH v3 08/17] powerpc: Add emulation for the addpcis instruction

2017-08-29 Thread Paul Mackerras
The addpcis instruction puts the sum of the next instruction address
and a constant into a register.  Since the result depends on the
address of the instruction, it will give an incorrect result if it
is single-stepped out of line, which is what the *probes subsystem
will currently do if a probe is placed on an addpcis instruction.
This fixes the problem by adding emulation of it to analyse_instr().
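
Concretely (illustrative only; the immediate is the D field assembled from
the d0/d1/d2 fields as in the hunk below):

/*
 * "addpcis r3, 1" at address A architecturally computes
 *     r3 = (A + 4) + (1 << 16)
 * Single-stepped out of line from a buffer at address B it would instead
 * produce r3 = (B + 4) + (1 << 16), which is why analyse_instr() now
 * computes regs->nip + (imm << 16) + 4 itself.
 */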

Signed-off-by: Paul Mackerras 
---
 arch/powerpc/lib/sstep.c | 14 +++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index 114e597..ed2bc4c 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -1021,9 +1021,6 @@ int analyse_instr(struct instruction_op *op, const struct 
pt_regs *regs,
op->ccval = (regs->ccr & ~(1UL << (31 - rd))) |
(val << (31 - rd));
return 1;
-   default:
-   op->type = UNKNOWN;
-   return 0;
}
break;
case 31:
@@ -1123,6 +1120,17 @@ int analyse_instr(struct instruction_op *op, const 
struct pt_regs *regs,
op->val = imm;
goto compute_done;
 
+   case 19:
+   if (((instr >> 1) & 0x1f) == 2) {
+   /* addpcis */
+   imm = (short) (instr & 0xffc1); /* d0 + d2 fields */
+   imm |= (instr >> 15) & 0x3e;/* d1 field */
+   op->val = regs->nip + (imm << 16) + 4;
+   goto compute_done;
+   }
+   op->type = UNKNOWN;
+   return 0;
+
case 20:/* rlwimi */
mb = (instr >> 6) & 0x1f;
me = (instr >> 1) & 0x1f;
-- 
2.7.4



[PATCH v3 07/17] powerpc: Don't update CR0 in emulation of popcnt, prty, bpermd instructions

2017-08-29 Thread Paul Mackerras
The architecture shows the least-significant bit of the instruction
word as reserved for the popcnt[bwd], prty[wd] and bpermd
instructions, that is, these instructions never update CR0.
Therefore this changes the emulation of these instructions to
skip the CR0 update.

Signed-off-by: Paul Mackerras 
---
 arch/powerpc/lib/sstep.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index 522bc7b..114e597 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -1469,7 +1469,7 @@ int analyse_instr(struct instruction_op *op, const struct 
pt_regs *regs,
 
case 122:   /* popcntb */
do_popcnt(regs, op, regs->gpr[rd], 8);
-   goto logical_done;
+   goto logical_done_nocc;
 
case 124:   /* nor */
op->val = ~(regs->gpr[rd] | regs->gpr[rb]);
@@ -1477,15 +1477,15 @@ int analyse_instr(struct instruction_op *op, const 
struct pt_regs *regs,
 
case 154:   /* prtyw */
do_prty(regs, op, regs->gpr[rd], 32);
-   goto logical_done;
+   goto logical_done_nocc;
 
case 186:   /* prtyd */
do_prty(regs, op, regs->gpr[rd], 64);
-   goto logical_done;
+   goto logical_done_nocc;
 #ifdef CONFIG_PPC64
case 252:   /* bpermd */
do_bpermd(regs, op, regs->gpr[rd], regs->gpr[rb]);
-   goto logical_done;
+   goto logical_done_nocc;
 #endif
case 284:   /* xor */
op->val = ~(regs->gpr[rd] ^ regs->gpr[rb]);
@@ -1497,7 +1497,7 @@ int analyse_instr(struct instruction_op *op, const struct 
pt_regs *regs,
 
case 378:   /* popcntw */
do_popcnt(regs, op, regs->gpr[rd], 32);
-   goto logical_done;
+   goto logical_done_nocc;
 
case 412:   /* orc */
op->val = regs->gpr[rd] | ~regs->gpr[rb];
@@ -1513,7 +1513,7 @@ int analyse_instr(struct instruction_op *op, const struct 
pt_regs *regs,
 #ifdef CONFIG_PPC64
case 506:   /* popcntd */
do_popcnt(regs, op, regs->gpr[rd], 64);
-   goto logical_done;
+   goto logical_done_nocc;
 #endif
case 922:   /* extsh */
op->val = (signed short) regs->gpr[rd];
-- 
2.7.4



[PATCH v3 06/17] powerpc: Fix emulation of the isel instruction

2017-08-29 Thread Paul Mackerras
The case added for the isel instruction was added inside a switch
statement which uses the 10-bit minor opcode field in the 0x7fe
bits of the instruction word.  However, for the isel instruction,
the minor opcode field is only the 0x3e bits, and the 0x7c0 bits
are used for the "BC" field, which indicates which CR bit to use
to select the result.

Therefore, for the isel emulation to work correctly when BC != 0,
we need to match on ((instr >> 1) & 0x1f) == 15.  To do this, we
pull the isel case out of the switch statement and put it in an
if statement of its own.
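
In code form, the two fields in question are (sketch matching the decode in
the hunk below):

/* Only bits 1-5 of the instruction word are the minor opcode for isel. */
unsigned int minor = (instr >> 1) & 0x1f;	/* == 15 for every isel variant */
unsigned int bc    = (instr >> 6) & 0x1f;	/* BC field: which CR bit picks the result */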

Fixes: e27f71e5ff3c ("powerpc/lib/sstep: Add isel instruction emulation")
Signed-off-by: Paul Mackerras 
---
 arch/powerpc/lib/sstep.c | 18 ++
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index e20f2b4..522bc7b 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -1216,6 +1216,16 @@ int analyse_instr(struct instruction_op *op, const 
struct pt_regs *regs,
return 0;
 
case 31:
+   /* isel occupies 32 minor opcodes */
+   if (((instr >> 1) & 0x1f) == 15) {
+   mb = (instr >> 6) & 0x1f; /* bc field */
+   val = (regs->ccr >> (31 - mb)) & 1;
+   val2 = (ra) ? regs->gpr[ra] : 0;
+
+   op->val = (val) ? val2 : regs->gpr[rb];
+   goto compute_done;
+   }
+
switch ((instr >> 1) & 0x3ff) {
case 4: /* tw */
if (rd == 0x1f ||
@@ -1441,14 +1451,6 @@ int analyse_instr(struct instruction_op *op, const 
struct pt_regs *regs,
 /*
  * Logical instructions
  */
-   case 15:/* isel */
-   mb = (instr >> 6) & 0x1f; /* bc */
-   val = (regs->ccr >> (31 - mb)) & 1;
-   val2 = (ra) ? regs->gpr[ra] : 0;
-
-   op->val = (val) ? val2 : regs->gpr[rb];
-   goto compute_done;
-
case 26:/* cntlzw */
op->val = __builtin_clz((unsigned int) regs->gpr[rd]);
goto logical_done;
-- 
2.7.4



[PATCH v3 05/17] powerpc/64: Fix update forms of loads and stores to write 64-bit EA

2017-08-29 Thread Paul Mackerras
When a 64-bit processor is executing in 32-bit mode, the update forms
of load and store instructions are required by the architecture to
write the full 64-bit effective address into the RA register, though
only the bottom 32 bits are used to address memory.  Currently,
the instruction emulation code writes the truncated address to the
RA register.  This fixes it by keeping the full 64-bit EA in the
instruction_op structure, truncating the address in emulate_step()
where it is used to address memory, rather than in the address
computations in analyse_instr().
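
To make the intended flow concrete, here is a minimal sketch (simplified from
the hunks below; 'offset' stands for the instruction's sign-extended
displacement and ra/regs come from the surrounding emulation context):

	/* in analyse_instr(): keep the full 64-bit EA */
	op->ea = offset + (ra ? regs->gpr[ra] : 0);

	/* in emulate_step(): truncate only the copy used to access memory */
	ea = truncate_if_32bit(regs->msr, op.ea);
	/* ... the load/store uses 'ea'; the RA update (if any) writes back op.ea ... */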

Signed-off-by: Paul Mackerras 
---
 arch/powerpc/include/asm/sstep.h |   4 +-
 arch/powerpc/lib/sstep.c | 109 ---
 2 files changed, 58 insertions(+), 55 deletions(-)

diff --git a/arch/powerpc/include/asm/sstep.h b/arch/powerpc/include/asm/sstep.h
index 9801970..4fcc2c9 100644
--- a/arch/powerpc/include/asm/sstep.h
+++ b/arch/powerpc/include/asm/sstep.h
@@ -25,7 +25,7 @@ struct pt_regs;
 
 enum instruction_type {
COMPUTE,/* arith/logical/CR op, etc. */
-   LOAD,
+   LOAD,   /* load and store types need to be contiguous */
LOAD_MULTI,
LOAD_FP,
LOAD_VMX,
@@ -52,6 +52,8 @@ enum instruction_type {
 
 #define INSTR_TYPE_MASK0x1f
 
+#define OP_IS_LOAD_STORE(type) (LOAD <= (type) && (type) <= STCX)
+
 /* Compute flags, ORed in with type */
 #define SETREG 0x20
 #define SETCC  0x40
diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index 88c7487..e20f2b4 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -126,7 +126,7 @@ static nokprobe_inline unsigned long dform_ea(unsigned int 
instr,
if (ra)
ea += regs->gpr[ra];
 
-   return truncate_if_32bit(regs->msr, ea);
+   return ea;
 }
 
 #ifdef __powerpc64__
@@ -144,7 +144,7 @@ static nokprobe_inline unsigned long dsform_ea(unsigned int 
instr,
if (ra)
ea += regs->gpr[ra];
 
-   return truncate_if_32bit(regs->msr, ea);
+   return ea;
 }
 
 /*
@@ -161,7 +161,7 @@ static nokprobe_inline unsigned long dqform_ea(unsigned int 
instr,
if (ra)
ea += regs->gpr[ra];
 
-   return truncate_if_32bit(regs->msr, ea);
+   return ea;
 }
 #endif /* __powerpc64 */
 
@@ -180,7 +180,7 @@ static nokprobe_inline unsigned long xform_ea(unsigned int 
instr,
if (ra)
ea += regs->gpr[ra];
 
-   return truncate_if_32bit(regs->msr, ea);
+   return ea;
 }
 
 /*
@@ -1789,10 +1789,7 @@ int analyse_instr(struct instruction_op *op, const 
struct pt_regs *regs,
if (rb == 0)
rb = 32;/* # bytes to load */
op->type = MKOP(LOAD_MULTI, 0, rb);
-   op->ea = 0;
-   if (ra)
-   op->ea = truncate_if_32bit(regs->msr,
-  regs->gpr[ra]);
+   op->ea = ra ? regs->gpr[ra] : 0;
break;
 
 #ifdef CONFIG_PPC_FPU
@@ -1837,10 +1834,7 @@ int analyse_instr(struct instruction_op *op, const 
struct pt_regs *regs,
if (rb == 0)
rb = 32;/* # bytes to store */
op->type = MKOP(STORE_MULTI, 0, rb);
-   op->ea = 0;
-   if (ra)
-   op->ea = truncate_if_32bit(regs->msr,
-  regs->gpr[ra]);
+   op->ea = ra ? regs->gpr[ra] : 0;
break;
 
case 790:   /* lhbrx */
@@ -2407,10 +2401,11 @@ void emulate_update_regs(struct pt_regs *regs, struct 
instruction_op *op)
 int emulate_step(struct pt_regs *regs, unsigned int instr)
 {
struct instruction_op op;
-   int r, err, size;
+   int r, err, size, type;
unsigned long val;
unsigned int cr;
int i, rd, nb;
+   unsigned long ea;
 
r = analyse_instr(&op, regs, instr);
if (r < 0)
@@ -2422,27 +2417,33 @@ int emulate_step(struct pt_regs *regs, unsigned int 
instr)
 
err = 0;
size = GETSIZE(op.type);
-   switch (op.type & INSTR_TYPE_MASK) {
+   type = op.type & INSTR_TYPE_MASK;
+
+   ea = op.ea;
+   if (OP_IS_LOAD_STORE(type) || type == CACHEOP)
+   ea = truncate_if_32bit(regs->msr, op.ea);
+
+   switch (type) {
case CACHEOP:
-   if (!address_ok(regs, op.ea, 8))
+   if (!address_ok(regs, ea, 8))
return 0;
switch (op.type & CACHEOP_MASK) {
case DCBST:
-   __cacheop_user_asmx(op.ea, err, "dcbst");
+   __cacheop_user_asmx(ea, err, "dcbst");
break;
case DCBF:
-

[PATCH v3 04/17] powerpc: Handle most loads and stores in instruction emulation code

2017-08-29 Thread Paul Mackerras
This extends the instruction emulation infrastructure in sstep.c to
handle all the load and store instructions defined in the Power ISA
v3.0, except for the atomic memory operations, ldmx (which was never
implemented), lfdp/stfdp, and the vector element load/stores.

The instructions added are:

Integer loads and stores: lbarx, lharx, lqarx, stbcx., sthcx., stqcx.,
lq, stq.

VSX loads and stores: lxsiwzx, lxsiwax, stxsiwx, lxvx, lxvl, lxvll,
lxvdsx, lxvwsx, stxvx, stxvl, stxvll, lxsspx, lxsdx, stxsspx, stxsdx,
lxvw4x, lxsibzx, lxvh8x, lxsihzx, lxvb16x, stxvw4x, stxsibx, stxvh8x,
stxsihx, stxvb16x, lxsd, lxssp, lxv, stxsd, stxssp, stxv.

These instructions are handled both in the analyse_instr phase and in
the emulate_step phase.

The code for lxvd2ux and stxvd2ux has been taken out, as those
instructions were never implemented in any processor and have been
taken out of the architecture, and their opcodes have been reused for
other instructions in POWER9 (lxvb16x and stxvb16x).

The emulation for the VSX loads and stores uses helper functions
which don't access registers or memory directly, which can hopefully
be reused by KVM later.

Signed-off-by: Paul Mackerras 
---
 arch/powerpc/include/asm/ppc-opcode.h |   8 +
 arch/powerpc/include/asm/sstep.h  |  21 ++
 arch/powerpc/lib/Makefile |   1 +
 arch/powerpc/lib/ldstfp.S |  70 ++--
 arch/powerpc/lib/quad.S   |  62 
 arch/powerpc/lib/sstep.c  | 610 +++---
 6 files changed, 710 insertions(+), 62 deletions(-)
 create mode 100644 arch/powerpc/lib/quad.S

diff --git a/arch/powerpc/include/asm/ppc-opcode.h 
b/arch/powerpc/include/asm/ppc-opcode.h
index 8861289..46f3b26 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -205,6 +205,8 @@
 #define PPC_INST_ISEL_MASK 0xfc3e
 #define PPC_INST_LDARX 0x7ca8
 #define PPC_INST_STDCX 0x7c0001ad
+#define PPC_INST_LQARX 0x7c000228
+#define PPC_INST_STQCX 0x7c00016d
 #define PPC_INST_LSWI  0x7c0004aa
 #define PPC_INST_LSWX  0x7c00042a
 #define PPC_INST_LWARX 0x7c28
@@ -403,12 +405,18 @@
__PPC_RA(a) | __PPC_RB(b))
 #definePPC_DCBZL(a, b) stringify_in_c(.long PPC_INST_DCBZL | \
__PPC_RA(a) | __PPC_RB(b))
+#define PPC_LQARX(t, a, b, eh) stringify_in_c(.long PPC_INST_LQARX | \
+   ___PPC_RT(t) | ___PPC_RA(a) | \
+   ___PPC_RB(b) | __PPC_EH(eh))
 #define PPC_LDARX(t, a, b, eh) stringify_in_c(.long PPC_INST_LDARX | \
___PPC_RT(t) | ___PPC_RA(a) | \
___PPC_RB(b) | __PPC_EH(eh))
 #define PPC_LWARX(t, a, b, eh) stringify_in_c(.long PPC_INST_LWARX | \
___PPC_RT(t) | ___PPC_RA(a) | \
___PPC_RB(b) | __PPC_EH(eh))
+#define PPC_STQCX(t, a, b) stringify_in_c(.long PPC_INST_STQCX | \
+   ___PPC_RT(t) | ___PPC_RA(a) | \
+   ___PPC_RB(b))
 #define PPC_MSGSND(b)  stringify_in_c(.long PPC_INST_MSGSND | \
___PPC_RB(b))
 #define PPC_MSGSYNCstringify_in_c(.long PPC_INST_MSGSYNC)
diff --git a/arch/powerpc/include/asm/sstep.h b/arch/powerpc/include/asm/sstep.h
index 442e636..9801970 100644
--- a/arch/powerpc/include/asm/sstep.h
+++ b/arch/powerpc/include/asm/sstep.h
@@ -83,6 +83,12 @@ enum instruction_type {
 #define DCBT   0x300
 #define ICBI   0x400
 
+/* VSX flags values */
+#define VSX_FPCONV 1   /* do floating point SP/DP conversion */
+#define VSX_SPLAT  2   /* store loaded value into all elements */
+#define VSX_LDLEFT 4   /* load VSX register from left */
+#define VSX_CHECK_VEC  8   /* check MSR_VEC not MSR_VSX for reg >= 32 */
+
 /* Size field in type word */
 #define SIZE(n)((n) << 8)
 #define GETSIZE(w) ((w) >> 8)
@@ -100,6 +106,17 @@ struct instruction_op {
int spr;
u32 ccval;
u32 xerval;
+   u8 element_size;/* for VSX/VMX loads/stores */
+   u8 vsx_flags;
+};
+
+union vsx_reg {
+   u8  b[16];
+   u16 h[8];
+   u32 w[4];
+   unsigned long d[2];
+   float   fp[4];
+   double  dp[2];
 };
 
 /*
@@ -131,3 +148,7 @@ void emulate_update_regs(struct pt_regs *reg, struct 
instruction_op *op);
  */
 extern int emulate_step(struct pt_regs *regs, unsigned int instr);
 
+extern void emulate_vsx_load(struct instruction_op *op, union vsx_reg *reg,
+const void *mem);
+extern void emulate_vsx_store(struct instruction_op *op, const union vsx_reg 
*reg,
+ 

[PATCH v3 01/17] powerpc: Correct instruction code for xxlor instruction

2017-08-29 Thread Paul Mackerras
The instruction code for xxlor that commit 0016a4cf5582 ("powerpc:
Emulate most Book I instructions in emulate_step()", 2010-06-15)
added is actually the code for xxlnor.  It is used in get_vsr()
and put_vsr() and the effect of the error is that if emulate_step
is used to emulate a VSX load or store from any register other
than vsr0, the bitwise complement of the correct value will be
loaded or stored.  This corrects the error.
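
A quick userspace sanity check of the two constants, offered only as a sketch;
the XO values (146 for xxlor, 162 for xxlnor) are my reading of the XX3
instruction form and are not stated in this patch:

	#include <stdio.h>

	int main(void)
	{
		unsigned int base = 60u << 26;	/* primary opcode 60 (VSX XX3 form) */

		/* the XX3 extended opcode sits 3 bits up from the bottom of the word */
		printf("xxlor  = 0x%08x\n", base | (146u << 3));
		printf("xxlnor = 0x%08x\n", base | (162u << 3));
		return 0;
	}

With those assumptions this prints 0xf0000490 and 0xf0000510, i.e. the
corrected xxlor encoding and the xxlnor encoding that was there before.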

Fixes: 0016a4cf5582 ("powerpc: Emulate most Book I instructions in 
emulate_step()")
Signed-off-by: Paul Mackerras 
---
 arch/powerpc/include/asm/ppc-opcode.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/ppc-opcode.h 
b/arch/powerpc/include/asm/ppc-opcode.h
index 041ba15..8861289 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -262,7 +262,7 @@
 #define PPC_INST_TLBSRX_DOT0x7c0006a5
 #define PPC_INST_VPMSUMW   0x1488
 #define PPC_INST_VPMSUMD   0x14c8
-#define PPC_INST_XXLOR 0xf510
+#define PPC_INST_XXLOR 0xf490
 #define PPC_INST_XXSWAPD   0xf250
 #define PPC_INST_XVCPSGNDP 0xf780
 #define PPC_INST_TRECHKPT  0x7c0007dd
-- 
2.7.4



[PATCH v3 00/17] powerpc: Do alignment fixups using analyse_instr etc.

2017-08-29 Thread Paul Mackerras
This series extends the instruction emulation infrastructure in
arch/powerpc/lib/sstep.c and uses it for emulating instructions when
we get an alignment interrupt.  The advantage of this is that we only
have to add the new POWER9 instructions in one place, and it fixes
several bugs in alignment interrupt handling that have been identified
recently.

With this, analyse_instr() and emulate_step() handle almost all load
and store instructions in Power ISA v3.00 -- all except the atomic
memory operations (lwat, stwat, etc.).  We now always use the largest
possible aligned memory accesses (up to 8 bytes) to emulate unaligned
accesses.  If we get a fault, the faulting address is accurately
recorded in regs->dar.  We also can now access FP/VMX/VSX registers
directly if they are live, without having to spill them all to the
thread_struct and then reload them all later.  There are also various
other fixes in the series.

This version is based on the current powerpc next branch.

Paul.

 arch/powerpc/Kconfig  |4 -
 arch/powerpc/include/asm/ppc-opcode.h |   10 +-
 arch/powerpc/include/asm/sstep.h  |   90 +-
 arch/powerpc/kernel/align.c   |  774 +---
 arch/powerpc/lib/Makefile |3 +-
 arch/powerpc/lib/ldstfp.S |  307 ++---
 arch/powerpc/lib/quad.S   |   62 +
 arch/powerpc/lib/sstep.c  | 2139 +++--
 8 files changed, 1802 insertions(+), 1587 deletions(-)



[PATCH v3 03/17] powerpc: Don't check MSR FP/VMX/VSX enable bits in analyse_instr()

2017-08-29 Thread Paul Mackerras
This removes the checks for the FP/VMX/VSX enable bits in the MSR
from analyse_instr() and adds them to emulate_step() instead.

The reason for this is that we may want to use analyse_instr() in
a situation where the FP/VMX/VSX register values are stored in the
current thread_struct and the FP/VMX/VSX enable bits in the MSR
image in the pt_regs are zero.  Since analyse_instr() doesn't make
any changes to register state, it is reasonable for it to indicate
what the effect of an instruction would be even though the relevant
enable bit is off.
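
Illustrative sketch only (the corresponding emulate_step() hunk is not shown in
this excerpt): after this change the enable-bit test lives at emulation time,
along the lines of

	switch (op.type & INSTR_TYPE_MASK) {
	case LOAD_FP:
		/* FP regs not live: let the caller take the FP unavailable path */
		if (!(regs->msr & MSR_FP))
			return 0;
		/* ... perform the FP load as before ... */
		break;
	}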

Signed-off-by: Paul Mackerras 
---
 arch/powerpc/lib/sstep.c | 54 +++-
 1 file changed, 12 insertions(+), 42 deletions(-)

diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index 8e581c6..13733b7 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -1505,15 +1505,11 @@ int analyse_instr(struct instruction_op *op, const 
struct pt_regs *regs,
 #ifdef CONFIG_ALTIVEC
case 103:   /* lvx */
case 359:   /* lvxl */
-   if (!(regs->msr & MSR_VEC))
-   goto vecunavail;
op->type = MKOP(LOAD_VMX, 0, 16);
break;
 
case 231:   /* stvx */
case 487:   /* stvxl */
-   if (!(regs->msr & MSR_VEC))
-   goto vecunavail;
op->type = MKOP(STORE_VMX, 0, 16);
break;
 #endif /* CONFIG_ALTIVEC */
@@ -1584,29 +1580,21 @@ int analyse_instr(struct instruction_op *op, const 
struct pt_regs *regs,
 #ifdef CONFIG_PPC_FPU
case 535:   /* lfsx */
case 567:   /* lfsux */
-   if (!(regs->msr & MSR_FP))
-   goto fpunavail;
op->type = MKOP(LOAD_FP, u, 4);
break;
 
case 599:   /* lfdx */
case 631:   /* lfdux */
-   if (!(regs->msr & MSR_FP))
-   goto fpunavail;
op->type = MKOP(LOAD_FP, u, 8);
break;
 
case 663:   /* stfsx */
case 695:   /* stfsux */
-   if (!(regs->msr & MSR_FP))
-   goto fpunavail;
op->type = MKOP(STORE_FP, u, 4);
break;
 
case 727:   /* stfdx */
case 759:   /* stfdux */
-   if (!(regs->msr & MSR_FP))
-   goto fpunavail;
op->type = MKOP(STORE_FP, u, 8);
break;
 #endif
@@ -1649,16 +1637,12 @@ int analyse_instr(struct instruction_op *op, const 
struct pt_regs *regs,
 #ifdef CONFIG_VSX
case 844:   /* lxvd2x */
case 876:   /* lxvd2ux */
-   if (!(regs->msr & MSR_VSX))
-   goto vsxunavail;
op->reg = rd | ((instr & 1) << 5);
op->type = MKOP(LOAD_VSX, u, 16);
break;
 
case 972:   /* stxvd2x */
case 1004:  /* stxvd2ux */
-   if (!(regs->msr & MSR_VSX))
-   goto vsxunavail;
op->reg = rd | ((instr & 1) << 5);
op->type = MKOP(STORE_VSX, u, 16);
break;
@@ -1724,32 +1708,24 @@ int analyse_instr(struct instruction_op *op, const 
struct pt_regs *regs,
 #ifdef CONFIG_PPC_FPU
case 48:/* lfs */
case 49:/* lfsu */
-   if (!(regs->msr & MSR_FP))
-   goto fpunavail;
op->type = MKOP(LOAD_FP, u, 4);
op->ea = dform_ea(instr, regs);
break;
 
case 50:/* lfd */
case 51:/* lfdu */
-   if (!(regs->msr & MSR_FP))
-   goto fpunavail;
op->type = MKOP(LOAD_FP, u, 8);
op->ea = dform_ea(instr, regs);
break;
 
case 52:/* stfs */
case 53:/* stfsu */
-   if (!(regs->msr & MSR_FP))
-   goto fpunavail;
op->type = MKOP(STORE_FP, u, 4);
op->ea = dform_ea(instr, regs);
break;
 
case 54:/* stfd */
case 55:/* stfdu */
-   if (!(regs->msr & MSR_FP))
-   goto fpunavail;
op->type = MKOP(STORE_FP, u, 8);
op->ea = dform_ea(instr, regs);
break;
@@ -1812,24 +1788,6 @@ int analyse_instr(struct instruction_op *op, const 
struct pt_regs *regs,
op->type = INTERRUPT | 0x700;
op->val = SRR1_PROGTRAP;
  

[PATCH v3 02/17] powerpc: Change analyse_instr so it doesn't modify *regs

2017-08-29 Thread Paul Mackerras
The analyse_instr function currently doesn't just work out what an
instruction does, it also executes those instructions whose effect
is only to update CPU registers that are stored in struct pt_regs.
This is undesirable because optprobes uses analyse_instr to work out
if an instruction could be successfully emulated in future.

This changes analyse_instr so it doesn't modify *regs; instead it
stores information in the instruction_op structure to indicate what
registers (GPRs, CR, XER, LR) would be set and what value they would
be set to.  A companion function called emulate_update_regs() can
then use that information to update a pt_regs struct appropriately.

As a minor cleanup, this replaces inline asm using the cntlzw and
cntlzd instructions with calls to __builtin_clz() and __builtin_clzl().
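
Putting the two together, the intended calling pattern (consistent with the
header comments added below; error handling elided) is roughly:

	struct instruction_op op;
	int r;

	r = analyse_instr(&op, regs, instr);
	if (r == 1) {
		/* purely a register update: commit it to *regs */
		emulate_update_regs(regs, &op);
	} else if (r == 0) {
		/* load/store, cacheop, barrier, ...: needs emulate_step() */
	}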

Signed-off-by: Paul Mackerras 
---
 arch/powerpc/include/asm/sstep.h |  52 +++-
 arch/powerpc/lib/sstep.c | 601 +++
 2 files changed, 396 insertions(+), 257 deletions(-)

diff --git a/arch/powerpc/include/asm/sstep.h b/arch/powerpc/include/asm/sstep.h
index d3a42cc..442e636 100644
--- a/arch/powerpc/include/asm/sstep.h
+++ b/arch/powerpc/include/asm/sstep.h
@@ -23,9 +23,6 @@ struct pt_regs;
 #define IS_RFID(instr) (((instr) & 0xfc0007fe) == 0x4c24)
 #define IS_RFI(instr)  (((instr) & 0xfc0007fe) == 0x4c64)
 
-/* Emulate instructions that cause a transfer of control. */
-extern int emulate_step(struct pt_regs *regs, unsigned int instr);
-
 enum instruction_type {
COMPUTE,/* arith/logical/CR op, etc. */
LOAD,
@@ -55,11 +52,29 @@ enum instruction_type {
 
 #define INSTR_TYPE_MASK0x1f
 
+/* Compute flags, ORed in with type */
+#define SETREG 0x20
+#define SETCC  0x40
+#define SETXER 0x80
+
+/* Branch flags, ORed in with type */
+#define SETLK  0x20
+#define BRTAKEN0x40
+#define DECCTR 0x80
+
 /* Load/store flags, ORed in with type */
 #define SIGNEXT0x20
 #define UPDATE 0x40/* matches bit in opcode 31 instructions */
 #define BYTEREV0x80
 
+/* Barrier type field, ORed in with type */
+#define BARRIER_MASK   0xe0
+#define BARRIER_SYNC   0x00
+#define BARRIER_ISYNC  0x20
+#define BARRIER_EIEIO  0x40
+#define BARRIER_LWSYNC 0x60
+#define BARRIER_PTESYNC0x80
+
 /* Cacheop values, ORed in with type */
 #define CACHEOP_MASK   0x700
 #define DCBST  0
@@ -83,7 +98,36 @@ struct instruction_op {
int update_reg;
/* For MFSPR */
int spr;
+   u32 ccval;
+   u32 xerval;
 };
 
-extern int analyse_instr(struct instruction_op *op, struct pt_regs *regs,
+/*
+ * Decode an instruction, and return information about it in *op
+ * without changing *regs.
+ *
+ * Return value is 1 if the instruction can be emulated just by
+ * updating *regs with the information in *op, -1 if we need the
+ * GPRs but *regs doesn't contain the full register set, or 0
+ * otherwise.
+ */
+extern int analyse_instr(struct instruction_op *op, const struct pt_regs *regs,
 unsigned int instr);
+
+/*
+ * Emulate an instruction that can be executed just by updating
+ * fields in *regs.
+ */
+void emulate_update_regs(struct pt_regs *reg, struct instruction_op *op);
+
+/*
+ * Emulate instructions that cause a transfer of control,
+ * arithmetic/logical instructions, loads and stores,
+ * cache operations and barriers.
+ *
+ * Returns 1 if the instruction was emulated successfully,
+ * 0 if it could not be emulated, or -1 for an instruction that
+ * should not be emulated (rfid, mtmsrd clearing MSR_RI, etc.).
+ */
+extern int emulate_step(struct pt_regs *regs, unsigned int instr);
+
diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index a85b82c..8e581c6 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -62,15 +62,17 @@ static nokprobe_inline unsigned long 
truncate_if_32bit(unsigned long msr,
 /*
  * Determine whether a conditional branch instruction would branch.
  */
-static nokprobe_inline int branch_taken(unsigned int instr, struct pt_regs 
*regs)
+static nokprobe_inline int branch_taken(unsigned int instr,
+   const struct pt_regs *regs,
+   struct instruction_op *op)
 {
unsigned int bo = (instr >> 21) & 0x1f;
unsigned int bi;
 
if ((bo & 4) == 0) {
/* decrement counter */
-   --regs->ctr;
-   if (((bo >> 1) & 1) ^ (regs->ctr == 0))
+   op->type |= DECCTR;
+   if (((bo >> 1) & 1) ^ (regs->ctr == 1))
return 0;
}
if ((bo & 0x10) == 0) {
@@ -92,7 +94,8 @@ static nokprobe_inline long address_ok(struct pt_regs *regs, 
unsigned long ea, i
 /*
  * Calculate effective address for a D-form instruction
  */
-static nokprobe_inline unsigned long dform_ea(unsigned int 

Re: [PATCH v2 14/20] mm: Provide speculative fault infrastructure

2017-08-29 Thread Anshuman Khandual
On 08/29/2017 05:34 PM, Peter Zijlstra wrote:
> On Tue, Aug 29, 2017 at 09:59:30AM +0200, Laurent Dufour wrote:
>> On 27/08/2017 02:18, Kirill A. Shutemov wrote:
 +
 +  if (unlikely(!vma->anon_vma))
 +  goto unlock;
>>> It deserves a comment.
>> You're right I'll add it in the next version.
>> For the record, the root cause is that __anon_vma_prepare() requires the
>> mmap_sem to be held because vm_next and vm_prev must be safe.
> But should that test not be:
> 
>   if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma))
>   goto unlock;

This makes more sense. We are backing off from the speculative path
because the struct anon_vma has not been created for this anonymous
vma, and we cannot do that without holding mmap_sem. This should
have nothing to do with vma->vm_ops availability.



[PATCH RFC] Interface to set SPRN_TIDR

2017-08-29 Thread Sukadev Bhattiprolu

We need the SPRN_TIDR to be set for use with fast thread-wakeup
(core-to-core wakeup) in VAS. Each user thread that has a receive
window setup and expects to be notified when a sender issues a paste
needs to have a unique SPRN_TIDR value.

The SPRN_TIDR value only needs to be unique within the process, but for
now we use a globally unique thread id as described below.
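
A hypothetical usage sketch (not part of this patch; the VAS window steps are
placeholders): a driver setting up a receive window for the calling thread
would assign the TIDR first so paste notifications can target that thread.

	set_thread_tidr(current);	/* allocates an id and writes SPRN_TIDR */
	/* ... open the VAS receive window using current->thread.tidr ... */

	/* and on teardown: */
	clear_thread_tidr(current);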

Signed-off-by: Sukadev Bhattiprolu 
---
Changelog[v2]
- Michael Ellerman: Use an interface to assign TIDR so it is
  assigned to only threads that need it; move assignment to
  restore_sprs(). Drop lint from rebase;


 arch/powerpc/include/asm/processor.h |  4 ++
 arch/powerpc/include/asm/switch_to.h |  3 ++
 arch/powerpc/kernel/process.c| 97 
 3 files changed, 104 insertions(+)

diff --git a/arch/powerpc/include/asm/processor.h 
b/arch/powerpc/include/asm/processor.h
index fab7ff8..bf6ba63 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -232,6 +232,10 @@ struct debug_reg {
 struct thread_struct {
unsigned long   ksp;/* Kernel stack pointer */
 
+#ifdef CONFIG_PPC_VAS
+   unsigned long   tidr;
+#endif
+
 #ifdef CONFIG_PPC64
unsigned long   ksp_vsid;
 #endif
diff --git a/arch/powerpc/include/asm/switch_to.h 
b/arch/powerpc/include/asm/switch_to.h
index 17c8380..4962455 100644
--- a/arch/powerpc/include/asm/switch_to.h
+++ b/arch/powerpc/include/asm/switch_to.h
@@ -91,4 +91,7 @@ static inline void clear_task_ebb(struct task_struct *t)
 #endif
 }
 
+extern void set_thread_tidr(struct task_struct *t);
+extern void clear_thread_tidr(struct task_struct *t);
+
 #endif /* _ASM_POWERPC_SWITCH_TO_H */
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 1f0fd36..13abb22 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1132,6 +1132,10 @@ static inline void restore_sprs(struct thread_struct 
*old_thread,
mtspr(SPRN_TAR, new_thread->tar);
}
 #endif
+#ifdef CONFIG_PPC_VAS
+   if (old_thread->tidr != new_thread->tidr)
+   mtspr(SPRN_TIDR, new_thread->tidr);
+#endif
 }
 
 #ifdef CONFIG_PPC_BOOK3S_64
@@ -1446,9 +1450,97 @@ void flush_thread(void)
 #endif /* CONFIG_HAVE_HW_BREAKPOINT */
 }
 
+#ifdef CONFIG_PPC_VAS
+static DEFINE_SPINLOCK(vas_thread_id_lock);
+static DEFINE_IDA(vas_thread_ida);
+
+/*
+ * We need to assign a unique thread id to each thread in a process. This
+ * thread id is intended to be used with the Fast Thread-wakeup (aka Core-
+ * to-core wakeup) mechanism being implemented on top of Virtual Accelerator
+ * Switchboard (VAS).
+ *
+ * To get a unique thread-id per process we could simply use task_pid_nr()
+ * but the problem is that task_pid_nr() is not yet available for the thread
+ * when copy_thread() is called. Fixing that would require more intrusive
+ * changes to arch-neutral code in the copy_process() code path.
+ *
+ * Further, to assign unique thread ids within each process, we need an
+ * atomic field (or an IDR) in task_struct, which again intrudes into the
+ * arch-neutral code.
+ *
+ * So try to assign globally unique thread ids for now.
+ *
+ * NOTE: TIDR 0 indicates that the thread does not need a TIDR value.
+ *  For now, only threads that expect to be notified by the VAS
+ *  hardware need a TIDR value and we assign values > 0 for those.
+ */
+#define MAX_THREAD_CONTEXT ((1 << 15) - 2)
+static int assign_thread_tidr(void)
+{
+   int index;
+   int err;
+
+again:
+   if (!ida_pre_get(&vas_thread_ida, GFP_KERNEL))
+   return -ENOMEM;
+
+   spin_lock(&vas_thread_id_lock);
+   err = ida_get_new_above(&vas_thread_ida, 1, &index);
+   spin_unlock(&vas_thread_id_lock);
+
+   if (err == -EAGAIN)
+   goto again;
+   else if (err)
+   return err;
+
+   if (index > MAX_THREAD_CONTEXT) {
+   spin_lock(&vas_thread_id_lock);
+   ida_remove(&vas_thread_ida, index);
+   spin_unlock(&vas_thread_id_lock);
+   return -ENOMEM;
+   }
+
+   return index;
+}
+
+static void free_thread_tidr(int id)
+{
+   spin_lock(&vas_thread_id_lock);
+   ida_remove(&vas_thread_ida, id);
+   spin_unlock(&vas_thread_id_lock);
+}
+
+void clear_thread_tidr(struct task_struct *t)
+{
+   if (t->thread.tidr) {
+   free_thread_tidr(t->thread.tidr);
+   t->thread.tidr = 0;
+   mtspr(SPRN_TIDR, 0);
+   }
+}
+
+/*
+ * Assign a unique thread id for this thread and set it in the
+ * thread structure. For now, we need this interface only for
+ * the current task.
+ */
+void set_thread_tidr(struct task_struct *t)
+{
+   WARN_ON(t != current);
+   t->thread.tidr = assign_thread_tidr();
+   mtspr(SPRN_TIDR, t->thread.tidr);
+}
+
+#endif /* CONFIG_PPC_VAS */
+
+
 void
 release_thread(struct task_struct *t)
 {
+#ifdef CONFI

Re: [PATCH] powerpc/pseries: Don't attempt to acquire drc during memory hot add for assigned lmbs

2017-08-29 Thread Michael Ellerman
John Allen  writes:

> Check if an LMB is assigned before attempting to call dlpar_acquire_drc in
> order to avoid any unnecessary rtas calls. This substantially reduces the
> running time of memory hot add on lpars with large amounts of memory.
>
> Signed-off-by: John Allen 
> ---
> diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
> b/arch/powerpc/platforms/pseries/hotplug-memory.c
> index ca9b2f4..95cf2ff 100644
> --- a/arch/powerpc/platforms/pseries/hotplug-memory.c
> +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
> @@ -817,6 +817,9 @@ static int dlpar_memory_add_by_count(u32 lmbs_to_add, 
> struct property *prop)
>   return -EINVAL;
>
>   for (i = 0; i < num_lmbs && lmbs_to_add != lmbs_added; i++) {
> + if (lmbs[i].flags & DRCONF_MEM_ASSIGNED)
> + continue;
> +
>   rc = dlpar_acquire_drc(lmbs[i].drc_index);
>   if (rc)
>   continue;

This doesn't build for me, see below. What compiler are you using to
test this?

  arch/powerpc/platforms/pseries/hotplug-memory.c: In function 'dlpar_memory':
  arch/powerpc/platforms/pseries/hotplug-memory.c:1081:2: error: 'rc' may be 
used uninitialized in this function [-Werror=maybe-uninitialized]
return rc;
^

It's a bit confusing because you didn't modify that function, but the
function you did modify has been inlined into there.

Possibly the compiler is wrong and we do always initialise rc, but it's
not clear at all.

And it raises a bigger question, how is this supposed to actually work?

If we go around the loop and find that something is already assigned:

for (i = 0; i < num_lmbs && lmbs_to_add != lmbs_added; i++) {
if (lmbs[i].flags & DRCONF_MEM_ASSIGNED)
continue;
...
lmbs_added++;
...
}

We don't increment lmbs_added, so at the end of the loop we will see
that lmbs_added is not equal to lmbs_to_add, and so we remove
everything:

if (lmbs_added != lmbs_to_add) {
pr_err("Memory hot-add failed, removing any added LMBs\n");

for (i = 0; i < num_lmbs; i++) {
if (!lmbs[i].reserved)
continue;

rc = dlpar_remove_lmb(&lmbs[i]);


So it seems like if we ever hit that continue you added, the whole
operation will fail anyway. So I'm confused.

cheers


Re: [PATCH v7 07/11] sparc64: optimized struct page zeroing

2017-08-29 Thread David Miller
From: Pavel Tatashin 
Date: Mon, 28 Aug 2017 22:02:18 -0400

> Add an optimized mm_zero_struct_page(), so struct pages are zeroed without
> calling memset(). We do eight to ten regular stores based on the size of
> struct page. Compiler optimizes out the conditions of switch() statement.
> 
> SPARC-M6 with 15T of memory, single thread performance:
> 
> >                         BASE            FIX   OPTIMIZED_FIX
> bootmem_init   28.440467985s   2.305674818s   2.305161615s
> free_area_init_nodes  202.845901673s 225.343084508s 172.556506560s
>   
> Total 231.286369658s 227.648759326s 174.861668175s
> 
> BASE:  current linux
> FIX:   This patch series without "optimized struct page zeroing"
> OPTIMIZED_FIX: This patch series including the current patch.
> 
> bootmem_init() is where memory for struct pages is zeroed during
> allocation. Note, about two seconds in this function is a fixed time: it
> does not increase as memory is increased.
> 
> Signed-off-by: Pavel Tatashin 
> Reviewed-by: Steven Sistare 
> Reviewed-by: Daniel Jordan 
> Reviewed-by: Bob Picco 

You should probably use initializing stores when you are doing 8
stores and we thus know the page struct is cache line aligned.

But other than that:

Acked-by: David S. Miller 


Re: [PATCH v7 02/11] sparc64/mm: setting fields in deferred pages

2017-08-29 Thread David Miller
From: Pavel Tatashin 
Date: Mon, 28 Aug 2017 22:02:13 -0400

> Without deferred struct page feature (CONFIG_DEFERRED_STRUCT_PAGE_INIT),
> flags and other fields in "struct page"es are never changed prior to first
> initializing struct pages by going through __init_single_page().
> 
> With deferred struct page feature enabled there is a case where we set some
> fields prior to initializing:
> 
> mem_init() {
>  register_page_bootmem_info();
>  free_all_bootmem();
>  ...
> }
> 
> When register_page_bootmem_info() is called only non-deferred struct pages
> are initialized. But, this function goes through some reserved pages which
> might be part of the deferred, and thus are not yet initialized.
> 
> mem_init
> register_page_bootmem_info
> register_page_bootmem_info_node
>  get_page_bootmem
>   .. setting fields here ..
>   such as: page->freelist = (void *)type;
> 
> free_all_bootmem()
> free_low_memory_core_early()
>  for_each_reserved_mem_region()
>   reserve_bootmem_region()
>init_reserved_page() <- Only if this is deferred reserved page
> __init_single_pfn()
>  __init_single_page()
>   memset(0) <-- Loose the set fields here
> 
> We end up with a similar issue as in the previous patch, where currently we
> do not observe a problem as memory is zeroed. But, if flag asserts are
> changed we can start hitting issues.
> 
> Also, because in this patch series we will stop zeroing struct page memory
> during allocation, we must make sure that struct pages are properly
> initialized prior to using them.
> 
> The deferred-reserved pages are initialized in free_all_bootmem().
> Therefore, the fix is to switch the above calls.
> 
> Signed-off-by: Pavel Tatashin 
> Reviewed-by: Steven Sistare 
> Reviewed-by: Daniel Jordan 
> Reviewed-by: Bob Picco 

Acked-by: David S. Miller 


Re: [PATCH v7 04/11] sparc64: simplify vmemmap_populate

2017-08-29 Thread David Miller
From: Pavel Tatashin 
Date: Mon, 28 Aug 2017 22:02:15 -0400

> Remove duplicating code by using common functions
> vmemmap_pud_populate and vmemmap_pgd_populate.
> 
> Signed-off-by: Pavel Tatashin 
> Reviewed-by: Steven Sistare 
> Reviewed-by: Daniel Jordan 
> Reviewed-by: Bob Picco 

Acked-by: David S. Miller 


Re: [PATCH V3 6/6] crypto/nx: Add P9 NX support for 842 compression engine

2017-08-29 Thread Haren Myneni
On 08/29/2017 02:57 PM, Benjamin Herrenschmidt wrote:
> On Tue, 2017-08-29 at 14:54 -0700, Haren Myneni wrote:
>> Opening send window for each crypto transform (crypto_alloc,
>> compression/decompression, ..., crypto_free) so that does not have to
>> wait for the previous copy/paste complete. VAS will map send and
>> receive windows, and can cache in send windows (up to 128). So I
>> thought using the same send window (per chip) for more requests (say
>> 1000) may be adding overhead.
>>
>> I will make changes if you prefer using 1 send window per chip.  
> 
> Did you check the cost of opening/closing a window ?

No, not yet. Opening/closing happens only during alloc/free, not for each
compression/decompression; hence the separate send windows.

Thanks
Haren  
 
> 
> Cheers,
> Ben.
> 



Re: [PATCH 00/13] mmu_notifier kill invalidate_page callback

2017-08-29 Thread Jerome Glisse
On Tue, Aug 29, 2017 at 05:11:24PM -0700, Linus Torvalds wrote:
> On Tue, Aug 29, 2017 at 4:54 PM, Jérôme Glisse  wrote:
> >
> > Note this is barely tested. I intend to do more testing over the next few days,
> > but I do not have access to all the hardware that makes use of the mmu_notifier
> > API.
> 
> Thanks for doing this.
> 
> > The first 2 patches convert existing calls of mmu_notifier_invalidate_page()
> > to mmu_notifier_invalidate_range() and bracket those calls with calls to
> > mmu_notifier_invalidate_range_start()/end().
> 
> Ok, those two patches are a bit more complex than I was hoping for,
> but not *too* bad.
> 
> And the final end result certainly looks nice:
> 
> >  16 files changed, 74 insertions(+), 214 deletions(-)
> 
> Yeah, removing all those invalidate_page() notifiers certainly makes
> for a nice patch.
> 
> And I actually think you missed some more lines that can now be
> removed: kvm_arch_mmu_notifier_invalidate_page() should no longer be
> needed either, so you can remove all of those too (most of them are
> empty inline functions, but x86 has one that actually does something.
> 
> So there's an added 30 or so dead lines that should be removed in the
> kvm patch, I think.

Yes, I missed that. I will wait for people to test and for the results of my
own testing before reposting if need be; otherwise I will post it as a
separate patch.

> 
> But from a _very_ quick read-through this looks fine. But it obviously
> needs testing.
> 
> People - *especially* the people who saw issues under KVM - can you
> try out Jérôme's patch-series? I aded some people to the cc, the full
> series is on lkml. Jérôme - do you have a git branch for people to
> test that they could easily pull and try out?

https://cgit.freedesktop.org/~glisse/linux mmu-notifier branch
git://people.freedesktop.org/~glisse/linux

(Sorry if that tree is bit big it has a lot of dead thing i need
 to push a clean and slim one)

Jérôme


[PATCH 2/2] powerpc/xmon: revisit SPR support

2017-08-29 Thread Balbir Singh
This patch readjusts the SPRs and adds support for IAMR/AMR and
UAMOR/AMOR based on their supported ISA revisions.

There is also support for printing the PIDR/TIDR for
ISA 300 and PSSCR and PTCR in ISA 300 hypervisor mode.
SPRN_PSSCR_PR is the privileged mode access and is used
when we are not in hypervisor mode.

Signed-off-by: Balbir Singh 
---
 arch/powerpc/include/asm/reg.h |  1 +
 arch/powerpc/xmon/xmon.c   | 34 ++
 2 files changed, 31 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index c36823d..2c4366a 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -356,6 +356,7 @@
 #define SPRN_PMSR  0x355   /* Power Management Status Reg */
 #define SPRN_PMMAR 0x356   /* Power Management Memory Activity Register */
 #define SPRN_PSSCR 0x357   /* Processor Stop Status and Control Register 
(ISA 3.0) */
+#define SPRN_PSSCR_PR  0x337   /* PSSCR ISA 3.0, privileged mode access */
 #define SPRN_PMCR  0x374   /* Power Management Control Register */
 
 /* HFSCR and FSCR bit numbers are the same */
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 1b26d53..33351c6 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -1743,18 +1743,20 @@ static void dump_206_sprs(void)
mfspr(SPRN_SRR0), mfspr(SPRN_SRR1), mfspr(SPRN_DSISR));
printf("dscr   = %.16lx  ppr   = %.16lx pir= %.8x\n",
mfspr(SPRN_DSCR), mfspr(SPRN_PPR), mfspr(SPRN_PIR));
+   printf("amr= %.16lx  uamor = %.16lx\n",
+   mfspr(SPRN_AMR), mfspr(SPRN_UAMOR));
 
if (!(mfmsr() & MSR_HV))
return;
 
printf("sdr1   = %.16lx  hdar  = %.16lx hdsisr = %.8x\n",
mfspr(SPRN_SDR1), mfspr(SPRN_HDAR), mfspr(SPRN_HDSISR));
-   printf("hsrr0  = %.16lx hsrr1  = %.16lx hdec = %.16lx\n",
+   printf("hsrr0  = %.16lx hsrr1  = %.16lx hdec   = %.16lx\n",
mfspr(SPRN_HSRR0), mfspr(SPRN_HSRR1), mfspr(SPRN_HDEC));
-   printf("lpcr   = %.16lx  pcr   = %.16lx lpidr = %.8x\n",
+   printf("lpcr   = %.16lx  pcr   = %.16lx lpidr  = %.8x\n",
mfspr(SPRN_LPCR), mfspr(SPRN_PCR), mfspr(SPRN_LPID));
-   printf("hsprg0 = %.16lx hsprg1 = %.16lx\n",
-   mfspr(SPRN_HSPRG0), mfspr(SPRN_HSPRG1));
+   printf("hsprg0 = %.16lx hsprg1 = %.16lx amor   = %.16lx\n",
+   mfspr(SPRN_HSPRG0), mfspr(SPRN_HSPRG1), mfspr(SPRN_AMOR));
printf("dabr   = %.16lx dabrx  = %.16lx\n",
mfspr(SPRN_DABR), mfspr(SPRN_DABRX));
 #endif
@@ -1793,6 +1795,7 @@ static void dump_207_sprs(void)
mfspr(SPRN_SDAR), mfspr(SPRN_SIER), mfspr(SPRN_PMC6));
printf("ebbhr  = %.16lx  ebbrr = %.16lx bescr  = %.16lx\n",
mfspr(SPRN_EBBHR), mfspr(SPRN_EBBRR), mfspr(SPRN_BESCR));
+   printf("iamr   = %.16lx\n", mfspr(SPRN_IAMR));
 
if (!(msr & MSR_HV))
return;
@@ -1804,6 +1807,28 @@ static void dump_207_sprs(void)
 #endif
 }
 
+static void dump_300_sprs(void)
+{
+#ifdef CONFIG_PPC64
+   bool hv = mfmsr() & MSR_HV;
+
+   if (!cpu_has_feature(CPU_FTR_ARCH_300))
+   return;
+
+   printf("pidr   = %.16lx  tidr  = %.16lx\n",
+   mfspr(SPRN_PID), mfspr(SPRN_TIDR));
+   printf("asdr   = %.16lx  psscr = %.16lx\n",
+   mfspr(SPRN_ASDR), hv ? mfspr(SPRN_PSSCR)
+   : mfspr(SPRN_PSSCR_PR));
+
+   if (!hv)
+   return;
+
+   printf("ptcr   = %.16lx\n",
+   mfspr(SPRN_PTCR));
+#endif
+}
+
 static void dump_one_spr(int spr, bool show_unimplemented)
 {
unsigned long val;
@@ -1857,6 +1882,7 @@ static void super_regs(void)
 
dump_206_sprs();
dump_207_sprs();
+   dump_300_sprs();
 
return;
}
-- 
2.9.4



[PATCH 1/2] powerpc/xmon: hdec is now 64bits

2017-08-29 Thread Balbir Singh
ISA 300 defines the hypervisor decrementer to be 64 bits in length.
This patch extends the print format to 64 bits for all platforms.

Signed-off-by: Balbir Singh 
---
 arch/powerpc/xmon/xmon.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 9e68f1d..1b26d53 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -1749,7 +1749,7 @@ static void dump_206_sprs(void)
 
printf("sdr1   = %.16lx  hdar  = %.16lx hdsisr = %.8x\n",
mfspr(SPRN_SDR1), mfspr(SPRN_HDAR), mfspr(SPRN_HDSISR));
-   printf("hsrr0  = %.16lx hsrr1  = %.16lx hdec = %.8x\n",
+   printf("hsrr0  = %.16lx hsrr1  = %.16lx hdec = %.16lx\n",
mfspr(SPRN_HSRR0), mfspr(SPRN_HSRR1), mfspr(SPRN_HDEC));
printf("lpcr   = %.16lx  pcr   = %.16lx lpidr = %.8x\n",
mfspr(SPRN_LPCR), mfspr(SPRN_PCR), mfspr(SPRN_LPID));
-- 
2.9.4



[PATCH 03/13] powerpc/powernv: update to new mmu_notifier semantic

2017-08-29 Thread Jérôme Glisse
Calls to mmu_notifier_invalidate_page() are replaced by calls to
mmu_notifier_invalidate_range(), and those calls are bracketed by
calls to mmu_notifier_invalidate_range_start()/end().

Remove the now-useless invalidate_page callback.

Signed-off-by: Jérôme Glisse 
Cc: linuxppc-dev@lists.ozlabs.org
Cc: Alistair Popple 
Cc: Michael Ellerman 
Cc: Kirill A. Shutemov 
Cc: Andrew Morton 
Cc: Linus Torvalds 
Cc: Andrea Arcangeli 
---
 arch/powerpc/platforms/powernv/npu-dma.c | 10 --
 1 file changed, 10 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/npu-dma.c 
b/arch/powerpc/platforms/powernv/npu-dma.c
index b5d960d6db3d..4c7b8591f737 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -614,15 +614,6 @@ static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn,
mmio_invalidate(npu_context, 1, address, true);
 }
 
-static void pnv_npu2_mn_invalidate_page(struct mmu_notifier *mn,
-   struct mm_struct *mm,
-   unsigned long address)
-{
-   struct npu_context *npu_context = mn_to_npu_context(mn);
-
-   mmio_invalidate(npu_context, 1, address, true);
-}
-
 static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long start, unsigned long end)
@@ -640,7 +631,6 @@ static void pnv_npu2_mn_invalidate_range(struct 
mmu_notifier *mn,
 static const struct mmu_notifier_ops nv_nmmu_notifier_ops = {
.release = pnv_npu2_mn_release,
.change_pte = pnv_npu2_mn_change_pte,
-   .invalidate_page = pnv_npu2_mn_invalidate_page,
.invalidate_range = pnv_npu2_mn_invalidate_range,
 };
 
-- 
2.13.5



[PATCH 00/13] mmu_notifier kill invalidate_page callback

2017-08-29 Thread Jérôme Glisse
(Sorry for so many list cross-posting and big cc)

Please help testing !

The invalidate_page callback suffered from 2 pitfalls. First, it used to
happen after the page table lock was released, and thus a new page might have
been set up for the virtual address before the call to invalidate_page().

This is in a weird way fixed by c7ab0d2fdc840266b39db94538f74207ec2afbf6,
which moved the callback under the page table lock. That also broke
several existing users of the mmu_notifier API that assumed they could
sleep inside this callback.

The second pitfall was that invalidate_page was the only callback that did
not take an address range for the invalidation; it was given a single address
and a page. Many of the callback implementers assumed this could never be
THP and thus failed to invalidate the appropriate range for THP pages.

By killing this callback we unify the mmu_notifier callback API to always
take a virtual address range as input.

There are now 2 clear APIs (I am not mentioning the young-ness API, which is
seldom used):
  - invalidate_range_start()/end() callbacks (which allow you to sleep)
  - invalidate_range(), where you cannot sleep, but which happens right after
    the page table update under the page table lock


Note that a lot of existing users look broken with respect to range_start/
range_end. Many users only have a range_start() callback, but there is nothing
preventing them from undoing what was invalidated in their range_start()
callback after it returns but before the CPU page table update takes place.

The code pattern used in kvm or umem_odp is an example of how to properly
avoid such a race. In a nutshell, use some kind of sequence number and an
active range invalidation counter to block anything that might undo what the
range_start() callback did.
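
A rough sketch of that pattern (the field and function names here are
illustrative, not the actual kvm or umem_odp code):

#include <linux/spinlock.h>

struct mirror {
	spinlock_t lock;
	unsigned long invalidate_seq;	/* bumped by every range_start() */
	int nr_invalidations;		/* > 0 while a start()..end() pair is in flight */
};

static void mirror_range_start(struct mirror *m)
{
	spin_lock(&m->lock);
	m->invalidate_seq++;
	m->nr_invalidations++;
	spin_unlock(&m->lock);
	/* ... tear down secondary mappings covering the range ... */
}

static void mirror_range_end(struct mirror *m)
{
	spin_lock(&m->lock);
	m->nr_invalidations--;
	spin_unlock(&m->lock);
}

/* Consumer side: only publish a translation if no invalidation raced with us. */
static bool mirror_can_publish(struct mirror *m, unsigned long seq_seen)
{
	bool ok;

	spin_lock(&m->lock);
	ok = (m->nr_invalidations == 0 && m->invalidate_seq == seq_seen);
	spin_unlock(&m->lock);
	return ok;
}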

If you do not care about keeping fully in sync with the CPU page table (ie
you can live with the CPU page table pointing to a new, different page for a
given virtual address) then you can take a reference on the pages inside
the range_start() callback and drop it in range_end() or when your driver
is done with those pages.

The last alternative is to use invalidate_range() if you can do the
invalidation without sleeping, as the invalidate_range() callback happens
under the CPU page table spinlock right after the page table is updated.


Note this is barely tested. I intend to do more testing over the next few days,
but I do not have access to all the hardware that makes use of the mmu_notifier
API.


The first 2 patches convert existing calls of mmu_notifier_invalidate_page()
to mmu_notifier_invalidate_range() and bracket those calls with calls to
mmu_notifier_invalidate_range_start()/end().

The next 10 patches remove the existing invalidate_page() callbacks as they
can no longer be called.

Finally, the last patch removes it completely so it can RIP.

Jérôme Glisse (13):
  dax: update to new mmu_notifier semantic
  mm/rmap: update to new mmu_notifier semantic
  powerpc/powernv: update to new mmu_notifier semantic
  drm/amdgpu: update to new mmu_notifier semantic
  IB/umem: update to new mmu_notifier semantic
  IB/hfi1: update to new mmu_notifier semantic
  iommu/amd: update to new mmu_notifier semantic
  iommu/intel: update to new mmu_notifier semantic
  misc/mic/scif: update to new mmu_notifier semantic
  sgi-gru: update to new mmu_notifier semantic
  xen/gntdev: update to new mmu_notifier semantic
  KVM: update to new mmu_notifier semantic
  mm/mmu_notifier: kill invalidate_page

Cc: Kirill A. Shutemov 
Cc: Linus Torvalds 
Cc: Andrew Morton 
Cc: Andrea Arcangeli 
Cc: Joerg Roedel 
Cc: Dan Williams 
Cc: Sudeep Dutt 
Cc: Ashutosh Dixit 
Cc: Dimitri Sivanich 
Cc: Jack Steiner 
Cc: Paolo Bonzini 
Cc: Radim Krčmář 

Cc: linuxppc-dev@lists.ozlabs.org
Cc: dri-de...@lists.freedesktop.org
Cc: amd-...@lists.freedesktop.org
Cc: linux-r...@vger.kernel.org
Cc: io...@lists.linux-foundation.org
Cc: xen-de...@lists.xenproject.org
Cc: k...@vger.kernel.org


 arch/powerpc/platforms/powernv/npu-dma.c | 10 
 drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c   | 31 --
 drivers/infiniband/core/umem_odp.c   | 19 --
 drivers/infiniband/hw/hfi1/mmu_rb.c  |  9 ---
 drivers/iommu/amd_iommu_v2.c |  8 --
 drivers/iommu/intel-svm.c|  9 ---
 drivers/misc/mic/scif/scif_dma.c | 11 
 drivers/misc/sgi-gru/grutlbpurge.c   | 12 -
 drivers/xen/gntdev.c |  8 --
 fs/dax.c | 19 --
 include/linux/mm.h   |  1 +
 include/linux/mmu_notifier.h | 25 --
 mm/memory.c  | 26 +++
 mm/mmu_notifier.c| 14 --
 mm/rmap.c| 44 +---
 virt/kvm/kvm_main.c  | 42 --
 16 files changed, 74 insertions(+), 214 deletions(-)

-- 
2.13.5



Re: [PATCH 00/13] mmu_notifier kill invalidate_page callback

2017-08-29 Thread Linus Torvalds
On Tue, Aug 29, 2017 at 4:54 PM, Jérôme Glisse  wrote:
>
> Note this is barely tested. I intend to do more testing over the next few days,
> but I do not have access to all the hardware that makes use of the mmu_notifier
> API.

Thanks for doing this.

> The first 2 patches convert existing calls of mmu_notifier_invalidate_page()
> to mmu_notifier_invalidate_range() and bracket those calls with calls to
> mmu_notifier_invalidate_range_start()/end().

Ok, those two patches are a bit more complex than I was hoping for,
but not *too* bad.

And the final end result certainly looks nice:

>  16 files changed, 74 insertions(+), 214 deletions(-)

Yeah, removing all those invalidate_page() notifiers certainly makes
for a nice patch.

And I actually think you missed some more lines that can now be
removed: kvm_arch_mmu_notifier_invalidate_page() should no longer be
needed either, so you can remove all of those too (most of them are
empty inline functions, but x86 has one that actually does something.

So there's an added 30 or so dead lines that should be removed in the
kvm patch, I think.

But from a _very_ quick read-through this looks fine. But it obviously
needs testing.

People - *especially* the people who saw issues under KVM - can you
try out Jérôme's patch-series? I aded some people to the cc, the full
series is on lkml. Jérôme - do you have a git branch for people to
test that they could easily pull and try out?

Linus


Re: Question: handling early hotplug interrupts

2017-08-29 Thread Daniel Henrique Barboza

Hi Ben,

On 08/29/2017 06:55 PM, Benjamin Herrenschmidt wrote:

On Tue, 2017-08-29 at 17:43 -0300, Daniel Henrique Barboza wrote:

Hi,

This is a scenario I've been facing when working on early device
hotplug in QEMU. When a device is added, an IRQ pulse is fired to warn
the guest of the event, then the kernel fetches it by calling
'check_exception' and handles it. If the hotplug is done too early
(before SLOF, for example), the pulse is ignored and the hotplug event
is left unchecked in the events queue.

One solution would be to pulse the hotplug queue interrupt after CAS,
when we are sure that the hotplug queue is negotiated. However, this
panics the kernel with sig 11 kernel access of bad area, which suggests
that the kernel wasn't quite ready to handle it.

That's not right. This is a bug that needs fixing. The interrupt should
be masked anyway but still.

Tell us more about the crash (backtrace etc...)  this definitely needs
fixing.


This is the backtrace using a 4.13.0-rc3 guest:

-
[0.008913] Unable to handle kernel paging request for data at 
address 0x0100

[0.008989] Faulting instruction address: 0xc012c318
[0.009046] Oops: Kernel access of bad area, sig: 11 [#1]
[0.009092] SMP NR_CPUS=1024
[0.009092] NUMA
[0.009128] pSeries
[0.009173] Modules linked in:
[0.009210] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.13.0-rc3+ #1
[0.009268] task: c000feb02580 task.stack: c000fe108000
[0.009325] NIP: c012c318 LR: c012c9c4 CTR: 


[0.009394] REGS: c000fffef910 TRAP: 0380   Not tainted (4.13.0-rc3+)
[0.009450] MSR: 82009033 
[0.009454]   CR: 28000822  XER: 2000
[0.009554] CFAR: c012c9c0 SOFTE: 0
[0.009554] GPR00: c012c9c4 c000fffefb90 c141f100 
0400
[0.009554] GPR04:  c000fe1851c0  
fee6
[0.009554] GPR08: 000fffe1  0001 
02001001
[0.009554] GPR12: 0040 cfd8 c000db58 

[0.009554] GPR16:    

[0.009554] GPR20:    
0001
[0.009554] GPR24: 0002 0013 c000fe14bc00 
0400
[0.009554] GPR28: 0400  c000fe1851c0 
0001

[0.010121] NIP [c012c318] __queue_work+0x48/0x640
[0.010168] LR [c012c9c4] queue_work_on+0xb4/0xf0
[0.010213] Call Trace:
[0.010239] [c000fffefb90] [c000db58] 
kernel_init+0x8/0x160 (unreliable)

[0.010308] [c000fffefc70] [c012c9c4] queue_work_on+0xb4/0xf0
[0.010368] [c000fffefcb0] [c00c4608] 
queue_hotplug_event+0xd8/0x150
[0.010435] [c000fffefd00] [c00c30d0] 
ras_hotplug_interrupt+0x140/0x190
[0.010505] [c000fffefd90] [c018c8b0] 
__handle_irq_event_percpu+0x90/0x310
[0.010573] [c000fffefe50] [c018cb6c] 
handle_irq_event_percpu+0x3c/0x90
[0.010642] [c000fffefe90] [c018cc24] 
handle_irq_event+0x64/0xc0
[0.010710] [c000fffefec0] [c01928b0] 
handle_fasteoi_irq+0xc0/0x230
[0.010779] [c000fffefef0] [c018ae14] 
generic_handle_irq+0x54/0x80

[0.010847] [c000fffeff20] [c00189f0] __do_irq+0x90/0x210
[0.010904] [c000fffeff90] [c002e730] call_do_irq+0x14/0x24
[0.010961] [c000fe10b640] [c0018c10] do_IRQ+0xa0/0x130
[0.011021] [c000fe10b6a0] [c0008c58] 
hardware_interrupt_common+0x158/0x160

[0.011090] --- interrupt: 501 at __replay_interrupt+0x38/0x3c
[0.011090] LR = arch_local_irq_restore+0x74/0x90
[0.011179] [c000fe10b990] [c000fe10b9e0] 0xc000fe10b9e0 
(unreliable)
[0.011249] [c000fe10b9b0] [c0b967fc] 
_raw_spin_unlock_irqrestore+0x4c/0xb0

[0.011316] [c000fe10b9e0] [c018ff50] __setup_irq+0x630/0x9e0
[0.011374] [c000fe10ba90] [c019054c] 
request_threaded_irq+0x13c/0x250
[0.011441] [c000fe10baf0] [c00c2cd0] 
request_event_sources_irqs+0x100/0x180
[0.011511] [c000fe10bc10] [c0eceda8] 
__machine_initcall_pseries_init_ras_IRQ+0xc4/0x12c
[0.011591] [c000fe10bc40] [c000d8c8] 
do_one_initcall+0x68/0x1e0
[0.011659] [c000fe10bd00] [c0eb4484] 
kernel_init_freeable+0x284/0x370

[0.011725] [c000fe10bdc0] [c000db7c] kernel_init+0x2c/0x160
[0.011782] [c000fe10be30] [c000bc9c] 
ret_from_kernel_thread+0x5c/0xc0

[0.011848] Instruction dump:
[0.011885] fbc1fff0 f8010010 f821ff21 7c7c1b78 7c9d2378 7cbe2b78 
787b0020 6000
[0.011955] 6000 892d028a 2fa9 409e04bc <813d0100> 75290001 
408204c0 3d2061c8

[0.012026] ---[ end trace e0b4d36daf3f8b2a ]---
[0.013850]

Applied "ASoC: fsl_dma: remove dma_object path member" to the asoc tree

2017-08-29 Thread Mark Brown
The patch

   ASoC: fsl_dma: remove dma_object path member

has been applied to the asoc tree at

   git://git.kernel.org/pub/scm/linux/kernel/git/broonie/sound.git 

All being well this means that it will be integrated into the linux-next
tree (usually sometime in the next 24 hours) and sent to Linus during
the next merge window (or sooner if it is a bug fix), however if
problems are discovered then the patch may be dropped or reverted.  

You may get further e-mails resulting from automated or manual testing
and review of the tree, please engage with people reporting problems and
send followup patches addressing any issues that are reported if needed.

If any updates are required or you are submitting further changes they
should be sent as incremental updates against current git, existing
patches will not be replaced.

Please add any relevant lists and maintainers to the CCs when replying
to this mail.

Thanks,
Mark

>From b1dc00abcf18d27b36de500c150be88022c82270 Mon Sep 17 00:00:00 2001
From: Rob Herring 
Date: Tue, 29 Aug 2017 07:37:55 -0500
Subject: [PATCH] ASoC: fsl_dma: remove dma_object path member

dma_object.path is unused, so rather than fix it to work with DT
full_name changes, just remove it.

Signed-off-by: Rob Herring 
Cc: Timur Tabi 
Cc: Nicolin Chen 
Cc: Xiubo Li 
Cc: Fabio Estevam 
Cc: Liam Girdwood 
Cc: Mark Brown 
Cc: Jaroslav Kysela 
Cc: Takashi Iwai 
Cc: alsa-de...@alsa-project.org
Cc: linuxppc-dev@lists.ozlabs.org
Reviewed-by: Fabio Estevam 
Signed-off-by: Mark Brown 
---
 sound/soc/fsl/fsl_dma.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/sound/soc/fsl/fsl_dma.c b/sound/soc/fsl/fsl_dma.c
index c17359648915..0c11f434a374 100644
--- a/sound/soc/fsl/fsl_dma.c
+++ b/sound/soc/fsl/fsl_dma.c
@@ -63,7 +63,6 @@ struct dma_object {
struct ccsr_dma_channel __iomem *channel;
unsigned int irq;
bool assigned;
-   char path[1];
 };
 
 /*
@@ -903,13 +902,12 @@ static int fsl_soc_dma_probe(struct platform_device *pdev)
return ret;
}
 
-   dma = kzalloc(sizeof(*dma) + strlen(np->full_name), GFP_KERNEL);
+   dma = kzalloc(sizeof(*dma), GFP_KERNEL);
if (!dma) {
of_node_put(ssi_np);
return -ENOMEM;
}
 
-   strcpy(dma->path, np->full_name);
dma->dai.ops = &fsl_dma_ops;
dma->dai.pcm_new = fsl_dma_new;
dma->dai.pcm_free = fsl_dma_free_dma_buffers;
-- 
2.13.2



Re: [PATCH V3 6/6] crypto/nx: Add P9 NX support for 842 compression engine

2017-08-29 Thread Benjamin Herrenschmidt
On Tue, 2017-08-29 at 14:54 -0700, Haren Myneni wrote:
> We open a send window for each crypto transform (crypto_alloc,
> compression/decompression, ..., crypto_free) so that it does not have to
> wait for the previous copy/paste to complete. VAS will map send and
> receive windows, and can cache in send windows (up to 128). So I
> thought using the same send window (per chip) for more requests (say
> 1000) may be adding overhead.
> 
> I will make changes if you prefer using 1 send window per chip.  

Did you check the cost of opening/closing a window ?

Cheers,
Ben.



[PATCH RFC] KVM: PPC: Book3S: Add MMIO emulation for VMX instructions

2017-08-29 Thread Jose Ricardo Ziviani
This patch provides the MMIO load/store vector indexed
X-Form emulation.

Instructions implemented: lvx, stvx

Signed-off-by: Jose Ricardo Ziviani 
---
 arch/powerpc/include/asm/kvm_host.h   |   2 +
 arch/powerpc/include/asm/kvm_ppc.h|   4 +
 arch/powerpc/include/asm/ppc-opcode.h |   6 ++
 arch/powerpc/kvm/emulate_loadstore.c  |  32 +++
 arch/powerpc/kvm/powerpc.c| 162 ++
 5 files changed, 189 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 8b3f123..5835163 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -697,6 +697,7 @@ struct kvm_vcpu_arch {
u8 mmio_vsx_offset;
u8 mmio_vsx_copy_type;
u8 mmio_vsx_tx_sx_enabled;
+   u8 mmio_vmx_copy_nums;
u8 osi_needed;
u8 osi_enabled;
u8 papr_enabled;
@@ -807,6 +808,7 @@ struct kvm_vcpu_arch {
 #define KVM_MMIO_REG_QPR   0x0040
 #define KVM_MMIO_REG_FQPR  0x0060
 #define KVM_MMIO_REG_VSX   0x0080
+#define KVM_MMIO_REG_VMX   0x00a0
 
 #define __KVM_HAVE_ARCH_WQP
 #define __KVM_HAVE_CREATE_DEVICE
diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index ba5fadd..c444d16 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -81,6 +81,10 @@ extern int kvmppc_handle_loads(struct kvm_run *run, struct 
kvm_vcpu *vcpu,
 extern int kvmppc_handle_vsx_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
unsigned int rt, unsigned int bytes,
int is_default_endian, int mmio_sign_extend);
+extern int kvmppc_handle_load128_by2x64(struct kvm_run *run,
+   struct kvm_vcpu *vcpu, unsigned int rt, int is_default_endian);
+extern int kvmppc_handle_store128_by2x64(struct kvm_run *run,
+   struct kvm_vcpu *vcpu, unsigned int rs, int is_default_endian);
 extern int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
   u64 val, unsigned int bytes,
   int is_default_endian);
diff --git a/arch/powerpc/include/asm/ppc-opcode.h 
b/arch/powerpc/include/asm/ppc-opcode.h
index fa9ebae..ea9bf37 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -156,6 +156,12 @@
 #define OP_31_XOP_LFDX  599
 #define OP_31_XOP_LFDUX631
 
+/* VMX Vector Load Instructions */
+#define OP_31_XOP_LVX   103
+
+/* VMX Vector Store Instructions */
+#define OP_31_XOP_STVX  231
+
 #define OP_LWZ  32
 #define OP_STFS 52
 #define OP_STFSU 53
diff --git a/arch/powerpc/kvm/emulate_loadstore.c 
b/arch/powerpc/kvm/emulate_loadstore.c
index af83353..40fbc14 100644
--- a/arch/powerpc/kvm/emulate_loadstore.c
+++ b/arch/powerpc/kvm/emulate_loadstore.c
@@ -58,6 +58,18 @@ static bool kvmppc_check_vsx_disabled(struct kvm_vcpu *vcpu)
 }
 #endif /* CONFIG_VSX */
 
+#ifdef CONFIG_ALTIVEC
+static bool kvmppc_check_altivec_disabled(struct kvm_vcpu *vcpu)
+{
+   if (!(kvmppc_get_msr(vcpu) & MSR_VEC)) {
+   kvmppc_core_queue_vec_unavail(vcpu);
+   return true;
+   }
+
+   return false;
+}
+#endif /* CONFIG_ALTIVEC */
+
 /*
  * XXX to do:
  * lfiwax, lfiwzx
@@ -98,6 +110,7 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
vcpu->arch.mmio_vsx_copy_type = KVMPPC_VSX_COPY_NONE;
vcpu->arch.mmio_sp64_extend = 0;
vcpu->arch.mmio_sign_extend = 0;
+   vcpu->arch.mmio_vmx_copy_nums = 0;
 
switch (get_op(inst)) {
case 31:
@@ -459,6 +472,25 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
 rs, 4, 1);
break;
 #endif /* CONFIG_VSX */
+
+#ifdef CONFIG_ALTIVEC
+   case OP_31_XOP_LVX:
+   if (kvmppc_check_altivec_disabled(vcpu))
+   return EMULATE_DONE;
+   vcpu->arch.mmio_vmx_copy_nums = 2;
+   emulated = kvmppc_handle_load128_by2x64(run, vcpu,
+   KVM_MMIO_REG_VMX|rt, 1);
+   break;
+
+   case OP_31_XOP_STVX:
+   if (kvmppc_check_altivec_disabled(vcpu))
+   return EMULATE_DONE;
+   vcpu->arch.mmio_vmx_copy_nums = 2;
+   emulated = kvmppc_handle_store128_by2x64(run, vcpu,
+   rs, 1);
+   break;
+#endif /* CONFIG_ALTIVEC */
+
default:
emulated = EMULATE_FAIL;
break;
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 1a75c0b..dc3611b 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -828,23 +828,7 @@ void kvm_arch_irq_bypass_del_producer(struct 

[PATCH RFC] powerpc: Implements MMIO emulation for lvx/stvx instructions

2017-08-29 Thread Jose Ricardo Ziviani
Hello!

This patch implements MMIO emulation for two instructions: lvx and stvx. I 
started to implement other instructions but I'd like to have this reviewed 
beforehand because this is my first patch here and I'll certainly have some 
rework/fixes :-).

Note: stvx is only storing 8 bytes. For some reason the code 
"vcpu->arch.paddr_accessed += run->mmio.len;", which adds the 8-byte offset 
after the first write, is not making any difference (interestingly, it works 
for load operations). I'm still investigating, but any ideas about it would be 
appreciated.
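
A minimal sketch of how the 128-bit access can be split into two 64-bit
MMIO passes (the actual powerpc.c hunk is truncated in the diff above);
kvmppc_handle_store(), mmio_vmx_copy_nums and paddr_accessed are from the
patch, while kvmppc_get_vmx_dword() is a hypothetical accessor used only
for illustration:

int kvmppc_handle_store128_by2x64(struct kvm_run *run, struct kvm_vcpu *vcpu,
				  unsigned int rs, int is_default_endian)
{
	enum emulation_result emulated = EMULATE_DONE;
	u64 val;

	while (vcpu->arch.mmio_vmx_copy_nums) {
		/* Fetch the next doubleword of VR(rs); helper is illustrative. */
		val = kvmppc_get_vmx_dword(vcpu, rs,
					   2 - vcpu->arch.mmio_vmx_copy_nums);

		emulated = kvmppc_handle_store(run, vcpu, val, 8,
					       is_default_endian);
		if (emulated != EMULATE_DONE)
			break;

		/* Advance to the second 64-bit half of the 128-bit access. */
		vcpu->arch.paddr_accessed += 8;
		vcpu->arch.mmio_vmx_copy_nums--;
	}

	return emulated;
}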

Thank you very much,

Jose Ricardo Ziviani (1):
  KVM: PPC: Book3S: Add MMIO emulation for VMX instructions

 arch/powerpc/include/asm/kvm_host.h   |   2 +
 arch/powerpc/include/asm/kvm_ppc.h|   4 +
 arch/powerpc/include/asm/ppc-opcode.h |   6 ++
 arch/powerpc/kvm/emulate_loadstore.c  |  32 +++
 arch/powerpc/kvm/powerpc.c| 162 ++
 5 files changed, 189 insertions(+), 17 deletions(-)

-- 
2.7.4



Re: Question: handling early hotplug interrupts

2017-08-29 Thread Benjamin Herrenschmidt
On Tue, 2017-08-29 at 17:43 -0300, Daniel Henrique Barboza wrote:
> Hi,
> 
> This is a scenario I've been facing when working on early device 
> hotplugs in QEMU. When a device is added, an IRQ pulse is fired to warn 
> the guest of the event, then the kernel fetches it by calling 
> 'check_exception' and handles it. If the hotplug is done too early 
> (before SLOF, for example), the pulse is ignored and the hotplug event 
> is left unchecked in the events queue.
> 
> One solution would be to pulse the hotplug queue interrupt after CAS, 
> when we are sure that the hotplug queue is negotiated. However, this 
> panics the kernel with sig 11 kernel access of bad area, which suggests 
> that the kernel wasn't quite ready to handle it.

That's not right. This is a bug that needs fixing. The interrupt should
be masked anyway but still.

Tell us more about the crash (backtrace etc.); this definitely needs
fixing.

> In my experiments using upstream 4.13 I saw that there is a 'safe time' 
> to pulse the queue, sometime after CAS and before mounting the root fs, 
> but I wasn't able to pinpoint it. From QEMU perspective, the last hcall 
> done (an h_set_mode) is still too early to pulse it and the kernel 
> panics. Looking at the kernel source I saw that the IRQ handling is 
> initiated quite early in the init process.
> 
> So my question (ok, actually 2 questions):
> 
> - Is my analysis correct? Is there an unsafe time to fire an IRQ pulse 
> before CAS that can break the kernel or am I overlooking/doing something 
> wrong?
> - is there a reliable way to know when the kernel can safely handle the 
> hotplug interrupt?

So I don't think that's the right approach. Virtual interrupts are edge
sensitive and we will potentially lose them if they occur early. I
think what needs to happen is:

 - Fix whatever's causing the above crash

and

 - The hotplug code should check for pending events (check_exception ?)
at boot time to enqueue whatever's there. It needs to do that after
unmasking the interrupt and in a way that is protected from races with
said interrupt.
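
A rough sketch of that drain-at-boot ordering, purely illustrative (the
lock, the virq and drain_hotplug_event_queue() are made-up names, not
existing kernel symbols), just to show the unmask-then-drain sequence
protected against the interrupt handler:

static DEFINE_SPINLOCK(hotplug_queue_lock);

static irqreturn_t hotplug_event_interrupt(int irq, void *data)
{
	spin_lock(&hotplug_queue_lock);
	drain_hotplug_event_queue();	/* pull events via check-exception */
	spin_unlock(&hotplug_queue_lock);
	return IRQ_HANDLED;
}

static int __init hotplug_event_init(void)
{
	/* Unmask first, so that a new event raises the (edge) interrupt... */
	enable_irq(hotplug_event_virq);

	/*
	 * ...then drain anything that was queued before boot, under the
	 * same lock as the handler so a racing interrupt cannot interleave.
	 */
	spin_lock_irq(&hotplug_queue_lock);
	drain_hotplug_event_queue();
	spin_unlock_irq(&hotplug_queue_lock);

	return 0;
}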

Cheers,
Ben.
 

> 
> Thanks,
> 
> 
> Daniel



Re: [PATCH V3 6/6] crypto/nx: Add P9 NX support for 842 compression engine

2017-08-29 Thread Haren Myneni
On 08/29/2017 02:23 PM, Benjamin Herrenschmidt wrote:
> On Tue, 2017-08-29 at 09:58 -0400, Dan Streetman wrote:
>>> +
>>> +   ret = -EINVAL;
>>> +   if (coproc && coproc->vas.rxwin) {
>>> +   wmem->txwin = nx842_alloc_txwin(coproc);
>>
>> this is wrong.  the workmem is scratch memory that's valid only for
>> the duration of a single operation.

Correct, workmem is used until crypto_free is called. 
>>
>> do you actually need a txwin per crypto transform?  or do you need a
>> txwin per coprocessor?  or txwin per processor?  either per-coproc or
>> per-cpu should be created at driver init and held separately
>> (globally) instead of a per-transform txwin.  I really don't see why
>> you would need a txwin per transform, because the coproc should not
>> care how many different transforms there are.
> 
> We should only need a single window for the whole kernel really, plus
> one per user process who wants direct access but that's not relevant
> here.

We open a send window for each crypto transform (crypto_alloc, 
compression/decompression, ..., crypto_free) so that a request does not have 
to wait for the previous copy/paste to complete. VAS maps send and receive 
windows and can cache send windows (up to 128), so I thought that using the 
same send window (per chip) for many requests (say 1000) might add overhead.

I will make the change if you prefer using 1 send window per chip.
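
For reference, a rough sketch of the per-chip variant being suggested,
opening one send window per coprocessor at init time and reusing it for
every request; nx842_coproc, nx842_coprocs and nx842_alloc_txwin() are
from the patch, while the vas.txwin member is an assumption of this
sketch:

static int __init nx842_open_chip_txwins(void)
{
	struct nx842_coproc *coproc;

	list_for_each_entry(coproc, &nx842_coprocs, list) {
		coproc->vas.txwin = nx842_alloc_txwin(coproc);
		if (!coproc->vas.txwin)
			return -EINVAL;
	}

	return 0;
}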

> 
> Cheers,
> Ben.
> 



Re: [PATCH V3 6/6] crypto/nx: Add P9 NX support for 842 compression engine

2017-08-29 Thread Benjamin Herrenschmidt
On Tue, 2017-08-29 at 09:58 -0400, Dan Streetman wrote:
> > +
> > +   ret = -EINVAL;
> > +   if (coproc && coproc->vas.rxwin) {
> > +   wmem->txwin = nx842_alloc_txwin(coproc);
> 
> this is wrong.  the workmem is scratch memory that's valid only for
> the duration of a single operation.
> 
> do you actually need a txwin per crypto transform?  or do you need a
> txwin per coprocessor?  or txwin per processor?  either per-coproc or
> per-cpu should be created at driver init and held separately
> (globally) instead of a per-transform txwin.  I really don't see why
> you would need a txwin per transform, because the coproc should not
> care how many different transforms there are.

We should only need a single window for the whole kernel really, plus
one per user process who wants direct access but that's not relevant
here.

Cheers,
Ben.


Re: [PATCH v2 14/20] mm: Provide speculative fault infrastructure

2017-08-29 Thread Benjamin Herrenschmidt
On Tue, 2017-08-29 at 13:27 +0200, Peter Zijlstra wrote:
> mpe helped me out and explained that it is the PWC hint to TLBIE.
> 
> So, you set need_flush_all when you unhook pud/pmd/pte which you then
> use to set PWC. So free_pgtables() will do the PWC when it unhooks
> higher level pages.
> 
> But you're right that there's some issues, free_pgtables() itself
> doesn't seem to use mm->page_table_lock,pmd->lock _AT_ALL_ to unhook the
> pages.
> 
> If it were to do that, things should work fine since those locks would
> then serialize against the speculative faults, we would never install a
> page if the VMA would be under tear-down and it would thus not be
> visible to your caches either.

That's one case. I don't remember *all* the cases to be honest, but
I do remember several times over the past few years thinking "ah we are
fine because the mm sem taken for writing protects us from any
concurrent tree structure change" :-)

Cheers,
Ben.



Question: handling early hotplug interrupts

2017-08-29 Thread Daniel Henrique Barboza

Hi,

This is a scenario I've been facing when working on early device 
hotplugs in QEMU. When a device is added, an IRQ pulse is fired to warn 
the guest of the event, then the kernel fetches it by calling 
'check_exception' and handles it. If the hotplug is done too early 
(before SLOF, for example), the pulse is ignored and the hotplug event 
is left unchecked in the events queue.


One solution would be to pulse the hotplug queue interrupt after CAS, 
when we are sure that the hotplug queue is negotiated. However, this 
panics the kernel with sig 11 kernel access of bad area, which suggests 
that the kernel wasn't quite ready to handle it.


In my experiments using upstream 4.13 I saw that there is a 'safe time' 
to pulse the queue, sometime after CAS and before mounting the root fs, 
but I wasn't able to pinpoint it. From QEMU perspective, the last hcall 
done (an h_set_mode) is still too early to pulse it and the kernel 
panics. Looking at the kernel source I saw that the IRQ handling is 
initiated quite early in the init process.


So my question (ok, actually 2 questions):

- Is my analysis correct? Is there an unsafe time to fire an IRQ pulse 
before CAS that can break the kernel or am I overlooking/doing something 
wrong?
- is there a reliable way to know when the kernel can safely handle the 
hotplug interrupt?



Thanks,


Daniel



Re: [PATCH] i2c: busses: make i2c_adapter_quirks const

2017-08-29 Thread Wolfram Sang
On Mon, Aug 21, 2017 at 05:42:04PM +0530, Bhumika Goyal wrote:
> Make these const as they are only stored as a reference in the quirks
> field of an i2c_adapter structure, which is const.
> 
> Done using Coccinelle:
> @match disable optional_qualifier@
> identifier s;
> @@
> static struct i2c_adapter_quirks s = {...};
> 
> @ref@
> position p;
> identifier match.s;
> @@
> s@p
> 
> @good1@
> identifier y;
> position ref.p;
> identifier match.s;
> @@
> struct i2c_adapter y = {...,.quirks=&s@p,...};
> 
> @good2@
> struct i2c_adapter y;
> identifier match.s;
> position ref.p;
> @@
> y.quirks = &s@p
> 
> @bad depends on  !good1 && !good2@
> position ref.p;
> identifier match.s;
> @@
> s@p
> 
> @depends on forall !bad disable optional_qualifier@
> identifier match.s;
> @@
> static
> + const
> struct i2c_adapter_quirks s;
> 
> Signed-off-by: Bhumika Goyal 

Removed the cocci script from the commit message and applied to
for-next, thanks!





Re: [PATCH v2 20/20] powerpc/mm: Add speculative page fault

2017-08-29 Thread Laurent Dufour
On 21/08/2017 08:58, Anshuman Khandual wrote:
> On 08/18/2017 03:35 AM, Laurent Dufour wrote:
>> This patch enable the speculative page fault on the PowerPC
>> architecture.
>>
>> This will try a speculative page fault without holding the mmap_sem,
>> if it returns with WM_FAULT_RETRY, the mmap_sem is acquired and the
> 
> s/WM_FAULT_RETRY/VM_FAULT_RETRY/

Good catch ;)

>> traditional page fault processing is done.
>>
>> Support is only provide for BOOK3S_64 currently because:
>> - require CONFIG_PPC_STD_MMU because checks done in
>>   set_access_flags_filter()
> 
> What checks are done in set_access_flags_filter() ? We are just
> adding the code block in do_page_fault().

set_access_flags_filter() is checking for vm_flags & VM_EXEC, which may be
changed behind our back, leading to a spurious WARN being displayed.
This being said, I focused on BOOK3S as this is meaningful for large
systems, and I didn't get time to check embedded systems.

> 
>> - require BOOK3S because we can't support for book3e_hugetlb_preload()
>>   called by update_mmu_cache()
>>
>> Signed-off-by: Laurent Dufour 
>> ---
>>  arch/powerpc/include/asm/book3s/64/pgtable.h |  5 +
>>  arch/powerpc/mm/fault.c  | 30 
>> +++-
>>  2 files changed, 34 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
>> b/arch/powerpc/include/asm/book3s/64/pgtable.h
>> index 818a58fc3f4f..897f8b9f67e6 100644
>> --- a/arch/powerpc/include/asm/book3s/64/pgtable.h
>> +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
>> @@ -313,6 +313,11 @@ extern unsigned long pci_io_base;
>>  /* Advertise support for _PAGE_SPECIAL */
>>  #define __HAVE_ARCH_PTE_SPECIAL
>>  
>> +/* Advertise that we call the Speculative Page Fault handler */
>> +#if defined(CONFIG_PPC_BOOK3S_64)
>> +#define __HAVE_ARCH_CALL_SPF
>> +#endif
>> +
>>  #ifndef __ASSEMBLY__
>>  
>>  /*
>> diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
>> index 4c422632047b..7b3cc4c30eab 100644
>> --- a/arch/powerpc/mm/fault.c
>> +++ b/arch/powerpc/mm/fault.c
>> @@ -291,9 +291,36 @@ int do_page_fault(struct pt_regs *regs, unsigned long 
>> address,
>>  if (is_write && is_user)
>>  store_update_sp = store_updates_sp(regs);
>>  
>> -if (is_user)
>> +if (is_user) {
>>  flags |= FAULT_FLAG_USER;
>>  
>> +#if defined(__HAVE_ARCH_CALL_SPF)
>> +/* let's try a speculative page fault without grabbing the
>> + * mmap_sem.
>> + */
>> +
>> +/*
>> + * flags is set later based on the VMA's flags, for the common
>> + * speculative service, we need some flags to be set.
>> + */
>> +if (is_write)
>> +flags |= FAULT_FLAG_WRITE;
>> +
>> +fault = handle_speculative_fault(mm, address, flags);
>> +if (!(fault & VM_FAULT_RETRY || fault & VM_FAULT_ERROR)) {
>> +perf_sw_event(PERF_COUNT_SW_SPF_DONE, 1,
>> +  regs, address);
>> +goto done;
> 
> Why we should retry with classical page fault on VM_FAULT_ERROR ?
> We should always return VM_FAULT_RETRY in case there is a clear
> collision some where which requires retry with classical method
> and return VM_FAULT_ERROR in cases where we know that it cannot
> be retried and fail for good. Should not handle_speculative_fault()
> be changed to accommodate this ?

There is no need to change handle_speculative_fault(); it should return
VM_FAULT_RETRY when a retry is required. If VM_FAULT_ERROR is returned, we
should be able to jump to the block dealing with VM_FAULT_ERROR and calling
vm_fault_error().
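
A condensed sketch of that flow (the label names are illustrative, not
the actual ones in arch/powerpc/mm/fault.c):

	fault = handle_speculative_fault(mm, address, flags);
	if (!(fault & VM_FAULT_RETRY)) {
		if (fault & VM_FAULT_ERROR)
			/* jump to the existing error block that raises the signal */
			goto out_fault_error;

		perf_sw_event(PERF_COUNT_SW_SPF_DONE, 1, regs, address);
		goto done;
	}
	/* VM_FAULT_RETRY: fall back to the classical path under mmap_sem */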


> 
>> +}
>> +
>> +/*
>> + * Resetting flags since the following code assumes
>> + * FAULT_FLAG_WRITE is not set.
>> + */
>> +flags &= ~FAULT_FLAG_WRITE;
>> +#endif /* defined(__HAVE_ARCH_CALL_SPF) */
> 
> Setting and resetting of FAULT_FLAG_WRITE seems confusing. Why you
> say that some flags need to be set for handle_speculative_fault()
> function. Could you elaborate on this ?

FAULT_FLAG_WRITE is required to handle write accesses. If we retry
with the classical path, the flag is reset and will be set again later if
!is_exec and is_write.





Re: [PATCH v2 19/20] x86/mm: Add speculative pagefault handling

2017-08-29 Thread Laurent Dufour
On 29/08/2017 16:50, Laurent Dufour wrote:
> On 21/08/2017 09:29, Anshuman Khandual wrote:
>> On 08/18/2017 03:35 AM, Laurent Dufour wrote:
>>> From: Peter Zijlstra 
>>>
>>> Try a speculative fault before acquiring mmap_sem, if it returns with
>>> VM_FAULT_RETRY continue with the mmap_sem acquisition and do the
>>> traditional fault.
>>>
>>> Signed-off-by: Peter Zijlstra (Intel) 
>>>
>>> [Clearing of FAULT_FLAG_ALLOW_RETRY is now done in
>>>  handle_speculative_fault()]
>>> [Retry with usual fault path in the case VM_ERROR is returned by
>>>  handle_speculative_fault(). This allows signal to be delivered]
>>> Signed-off-by: Laurent Dufour 
>>> ---
>>>  arch/x86/include/asm/pgtable_types.h |  7 +++
>>>  arch/x86/mm/fault.c  | 19 +++
>>>  2 files changed, 26 insertions(+)
>>>
>>> diff --git a/arch/x86/include/asm/pgtable_types.h 
>>> b/arch/x86/include/asm/pgtable_types.h
>>> index bf9638e1ee42..4fd2693a037e 100644
>>> --- a/arch/x86/include/asm/pgtable_types.h
>>> +++ b/arch/x86/include/asm/pgtable_types.h
>>> @@ -234,6 +234,13 @@ enum page_cache_mode {
>>>  #define PGD_IDENT_ATTR  0x001  /* PRESENT (no other 
>>> attributes) */
>>>  #endif
>>>  
>>> +/*
>>> + * Advertise that we call the Speculative Page Fault handler.
>>> + */
>>> +#ifdef CONFIG_X86_64
>>> +#define __HAVE_ARCH_CALL_SPF
>>> +#endif
>>> +
>>>  #ifdef CONFIG_X86_32
>>>  # include 
>>>  #else
>>> diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
>>> index 2a1fa10c6a98..4c070b9a4362 100644
>>> --- a/arch/x86/mm/fault.c
>>> +++ b/arch/x86/mm/fault.c
>>> @@ -1365,6 +1365,24 @@ __do_page_fault(struct pt_regs *regs, unsigned long 
>>> error_code,
>>> if (error_code & PF_INSTR)
>>> flags |= FAULT_FLAG_INSTRUCTION;
>>>  
>>> +#ifdef __HAVE_ARCH_CALL_SPF
>>> +   if (error_code & PF_USER) {
>>> +   fault = handle_speculative_fault(mm, address, flags);
>>> +
>>> +   /*
>>> +* We also check against VM_FAULT_ERROR because we have to
>>> +* raise a signal by calling later mm_fault_error() which
>>> +* requires the vma pointer to be set. So in that case,
>>> +* we fall through the normal path.
>>
>> Cant mm_fault_error() be called inside handle_speculative_fault() ?
>> Falling through the normal page fault path again just to raise a
>> signal seems overkill. Looking into mm_fault_error(), it seems they
>> are different for x86 and powerpc.
>>
>> X86:
>>
>> mm_fault_error(struct pt_regs *regs, unsigned long error_code,
>>unsigned long address, struct vm_area_struct *vma,
>>unsigned int fault)
>>
>> powerpc:
>>
>> mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault)
>>
>> Even in case of X86, I guess we would have reference to the faulting
>> VMA (after the SRCU search) which can be used to call this function
>> directly.
> 
> Yes I think this is doable in the case of x86.

Indeed this is not doable, as the vma pointer is not returned by
handle_speculative_fault(), and it is not possible to return it because
once srcu_read_unlock() is called, the pointer is no longer safe.



Re: [PATCH v2 19/20] x86/mm: Add speculative pagefault handling

2017-08-29 Thread Laurent Dufour
On 21/08/2017 09:29, Anshuman Khandual wrote:
> On 08/18/2017 03:35 AM, Laurent Dufour wrote:
>> From: Peter Zijlstra 
>>
>> Try a speculative fault before acquiring mmap_sem, if it returns with
>> VM_FAULT_RETRY continue with the mmap_sem acquisition and do the
>> traditional fault.
>>
>> Signed-off-by: Peter Zijlstra (Intel) 
>>
>> [Clearing of FAULT_FLAG_ALLOW_RETRY is now done in
>>  handle_speculative_fault()]
>> [Retry with usual fault path in the case VM_ERROR is returned by
>>  handle_speculative_fault(). This allows signal to be delivered]
>> Signed-off-by: Laurent Dufour 
>> ---
>>  arch/x86/include/asm/pgtable_types.h |  7 +++
>>  arch/x86/mm/fault.c  | 19 +++
>>  2 files changed, 26 insertions(+)
>>
>> diff --git a/arch/x86/include/asm/pgtable_types.h 
>> b/arch/x86/include/asm/pgtable_types.h
>> index bf9638e1ee42..4fd2693a037e 100644
>> --- a/arch/x86/include/asm/pgtable_types.h
>> +++ b/arch/x86/include/asm/pgtable_types.h
>> @@ -234,6 +234,13 @@ enum page_cache_mode {
>>  #define PGD_IDENT_ATTR   0x001  /* PRESENT (no other 
>> attributes) */
>>  #endif
>>  
>> +/*
>> + * Advertise that we call the Speculative Page Fault handler.
>> + */
>> +#ifdef CONFIG_X86_64
>> +#define __HAVE_ARCH_CALL_SPF
>> +#endif
>> +
>>  #ifdef CONFIG_X86_32
>>  # include 
>>  #else
>> diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
>> index 2a1fa10c6a98..4c070b9a4362 100644
>> --- a/arch/x86/mm/fault.c
>> +++ b/arch/x86/mm/fault.c
>> @@ -1365,6 +1365,24 @@ __do_page_fault(struct pt_regs *regs, unsigned long 
>> error_code,
>>  if (error_code & PF_INSTR)
>>  flags |= FAULT_FLAG_INSTRUCTION;
>>  
>> +#ifdef __HAVE_ARCH_CALL_SPF
>> +if (error_code & PF_USER) {
>> +fault = handle_speculative_fault(mm, address, flags);
>> +
>> +/*
>> + * We also check against VM_FAULT_ERROR because we have to
>> + * raise a signal by calling later mm_fault_error() which
>> + * requires the vma pointer to be set. So in that case,
>> + * we fall through the normal path.
> 
> Cant mm_fault_error() be called inside handle_speculative_fault() ?
> Falling through the normal page fault path again just to raise a
> signal seems overkill. Looking into mm_fault_error(), it seems they
> are different for x86 and powerpc.
> 
> X86:
> 
> mm_fault_error(struct pt_regs *regs, unsigned long error_code,
>unsigned long address, struct vm_area_struct *vma,
>unsigned int fault)
> 
> powerpc:
> 
> mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault)
> 
> Even in case of X86, I guess we would have reference to the faulting
> VMA (after the SRCU search) which can be used to call this function
> directly.

Yes I think this is doable in the case of x86.



Re: [PATCH V3 6/6] crypto/nx: Add P9 NX support for 842 compression engine

2017-08-29 Thread Dan Streetman
On Sat, Jul 22, 2017 at 1:01 AM, Haren Myneni  wrote:
>
> This patch adds P9 NX support for 842 compression engine. Virtual
> Accelerator Switchboard (VAS) is used to access 842 engine on P9.
>
> For each NX engine per chip, setup receive window using
> vas_rx_win_open() which configures RxFIFo with FIFO address, lpid,
> pid and tid values. This unique (lpid, pid, tid) combination will
> be used to identify the target engine.
>
> For crypto open request, open send window on the NX engine for
> the corresponding chip / cpu where the open request is executed.
> This send window will be closed upon crypto close request.
>
> NX provides high and normal priority FIFOs. For compression /
> decompression requests, we use only hight priority FIFOs in kernel.
>
> Each NX request will be communicated to VAS using copy/paste
> instructions with vas_copy_crb() / vas_paste_crb() functions.
>
> Signed-off-by: Haren Myneni 
> ---
>  drivers/crypto/nx/Kconfig  |   1 +
>  drivers/crypto/nx/nx-842-powernv.c | 375 
> -
>  drivers/crypto/nx/nx-842.c |   2 +-
>  3 files changed, 371 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/crypto/nx/Kconfig b/drivers/crypto/nx/Kconfig
> index ad7552a6998c..cd5dda9c48f4 100644
> --- a/drivers/crypto/nx/Kconfig
> +++ b/drivers/crypto/nx/Kconfig
> @@ -38,6 +38,7 @@ config CRYPTO_DEV_NX_COMPRESS_PSERIES
>  config CRYPTO_DEV_NX_COMPRESS_POWERNV
> tristate "Compression acceleration support on PowerNV platform"
> depends on PPC_POWERNV
> +   depends on PPC_VAS
> default y
> help
>   Support for PowerPC Nest (NX) compression acceleration. This
> diff --git a/drivers/crypto/nx/nx-842-powernv.c 
> b/drivers/crypto/nx/nx-842-powernv.c
> index c0dd4c7e17d3..13089a0b9dfa 100644
> --- a/drivers/crypto/nx/nx-842-powernv.c
> +++ b/drivers/crypto/nx/nx-842-powernv.c
> @@ -23,6 +23,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>
>  MODULE_LICENSE("GPL");
>  MODULE_AUTHOR("Dan Streetman ");
> @@ -32,6 +33,9 @@ MODULE_ALIAS_CRYPTO("842-nx");
>
>  #define WORKMEM_ALIGN  (CRB_ALIGN)
>  #define CSB_WAIT_MAX   (5000) /* ms */
> +#define VAS_RETRIES(10)
> +/* # of requests allowed per RxFIFO at a time. 0 for unlimited */
> +#define MAX_CREDITS_PER_RXFIFO (1024)
>
>  struct nx842_workmem {
> /* Below fields must be properly aligned */
> @@ -42,16 +46,27 @@ struct nx842_workmem {
>
> ktime_t start;
>
> +   struct vas_window *txwin;   /* Used with VAS function */
> char padding[WORKMEM_ALIGN]; /* unused, to allow alignment */
>  } __packed __aligned(WORKMEM_ALIGN);
>
>  struct nx842_coproc {
> unsigned int chip_id;
> unsigned int ct;
> -   unsigned int ci;
> +   unsigned int ci;/* Coprocessor instance, used with icswx */
> +   struct {
> +   struct vas_window *rxwin;
> +   int id;
> +   } vas;
> struct list_head list;
>  };
>
> +/*
> + * Send the request to NX engine on the chip for the corresponding CPU
> + * where the process is executing. Use with VAS function.
> + */
> +static DEFINE_PER_CPU(struct nx842_coproc *, coproc_inst);
> +
>  /* no cpu hotplug on powernv, so this list never changes after init */
>  static LIST_HEAD(nx842_coprocs);
>  static unsigned int nx842_ct;  /* used in icswx function */
> @@ -513,6 +528,105 @@ static int nx842_exec_icswx(const unsigned char *in, 
> unsigned int inlen,
>  }
>
>  /**
> + * nx842_exec_vas - compress/decompress data using the 842 algorithm
> + *
> + * (De)compression provided by the NX842 coprocessor on IBM PowerNV systems.
> + * This compresses or decompresses the provided input buffer into the 
> provided
> + * output buffer.
> + *
> + * Upon return from this function @outlen contains the length of the
> + * output data.  If there is an error then @outlen will be 0 and an
> + * error will be specified by the return code from this function.
> + *
> + * The @workmem buffer should only be used by one function call at a time.
> + *
> + * @in: input buffer pointer
> + * @inlen: input buffer size
> + * @out: output buffer pointer
> + * @outlenp: output buffer size pointer
> + * @workmem: working memory buffer pointer, size determined by
> + *   nx842_powernv_driver.workmem_size
> + * @fc: function code, see CCW Function Codes in nx-842.h
> + *
> + * Returns:
> + *   0 Success, output of length @outlenp stored in the buffer
> + * at @out
> + *   -ENODEV   Hardware unavailable
> + *   -ENOSPC   Output buffer is to small
> + *   -EMSGSIZE Input buffer too large
> + *   -EINVAL   buffer constraints do not fix nx842_constraints
> + *   -EPROTO   hardware error during operation
> + *   -ETIMEDOUThardware did not complete operation in reasonable time
> + *   -EINTRoperation was aborted
> + */
> +static int nx842_exec_vas(const unsigned char *in, unsigned int inlen,
> + unsigned char *out,

Re: [PATCH v2 14/20] mm: Provide speculative fault infrastructure

2017-08-29 Thread Peter Zijlstra
On Tue, Aug 29, 2017 at 03:18:25PM +0200, Laurent Dufour wrote:
> On 29/08/2017 14:04, Peter Zijlstra wrote:
> > On Tue, Aug 29, 2017 at 09:59:30AM +0200, Laurent Dufour wrote:
> >> On 27/08/2017 02:18, Kirill A. Shutemov wrote:
>  +
>  +if (unlikely(!vma->anon_vma))
>  +goto unlock;
> >>>
> >>> It deserves a comment.
> >>
> >> You're right I'll add it in the next version.
> >> For the record, the root cause is that __anon_vma_prepare() requires the
> >> mmap_sem to be held because vm_next and vm_prev must be safe.
> > 
> > But should that test not be:
> > 
> > if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma))
> > goto unlock;
> > 
> > Because !anon vmas will never have ->anon_vma set and you don't want to
> > exclude those.
> 
> Yes, in the case where we later allow non-anonymous vmas to be handled.
> Currently only anonymous vmas are supported, so the check is good enough,
> isn't it?

That wasn't at all clear from reading the code. This makes it clear
->anon_vma is only ever looked at for anonymous.

And like Kirill says, we _really_ should start allowing some (if not
all) vm_ops. Large file based mappings aren't particularly rare.

I'm not sure we want to introduce a white-list or just bite the bullet
and audit all ->fault() implementations. But either works and isn't
terribly difficult, auditing all is more work though.


Re: [PATCH V3 6/6] crypto/nx: Add P9 NX support for 842 compression engine

2017-08-29 Thread Dan Streetman
On Mon, Aug 28, 2017 at 7:25 PM, Michael Ellerman  wrote:
> Hi Haren,
>
> Some comments inline ...
>
> Haren Myneni  writes:
>
>> diff --git a/drivers/crypto/nx/nx-842-powernv.c 
>> b/drivers/crypto/nx/nx-842-powernv.c
>> index c0dd4c7e17d3..13089a0b9dfa 100644
>> --- a/drivers/crypto/nx/nx-842-powernv.c
>> +++ b/drivers/crypto/nx/nx-842-powernv.c
>> @@ -32,6 +33,9 @@ MODULE_ALIAS_CRYPTO("842-nx");
>>
>>  #define WORKMEM_ALIGN(CRB_ALIGN)
>>  #define CSB_WAIT_MAX (5000) /* ms */
>> +#define VAS_RETRIES  (10)
>
> Where does that number come from?
>
> Do we have any idea what the trade off is between retrying vs just
> falling back to doing the request in software?
>
>> +/* # of requests allowed per RxFIFO at a time. 0 for unlimited */
>> +#define MAX_CREDITS_PER_RXFIFO   (1024)
>>
>>  struct nx842_workmem {
>>   /* Below fields must be properly aligned */
>> @@ -42,16 +46,27 @@ struct nx842_workmem {
>>
>>   ktime_t start;
>>
>> + struct vas_window *txwin;   /* Used with VAS function */
>
> I don't understand how it makes sense to put txwin and start between the
> fields above, and the padding.

workmem is a scratch buffer and shouldn't be used for something
persistent like this.

>
> If the workmem pointer you receive is not aligned, then PTR_ALIGN() will
> advance it and mean you end up writing over start and txwin.
>
> That's probably not your bug, the code is already like that.

no, it's a bug in this patch, because workmem is scratch whose
contents are only valid for the duration of each operation (compress
or decompress).

>
>>   char padding[WORKMEM_ALIGN]; /* unused, to allow alignment */
>>  } __packed __aligned(WORKMEM_ALIGN);
>
>> @@ -576,6 +690,198 @@ static inline void nx842_add_coprocs_list(struct 
>> nx842_coproc *coproc,
>>   list_add(&coproc->list, &nx842_coprocs);
>>  }
>>
>> +/*
>> + * Identify chip ID for each CPU and save coprocesor adddress for the
>> + * corresponding NX engine in percpu coproc_inst.
>> + * coproc_inst is used in crypto_init to open send window on the NX instance
>> + * for the corresponding CPU / chip where the open request is executed.
>> + */
>> +static void nx842_set_per_cpu_coproc(struct nx842_coproc *coproc)
>> +{
>> + unsigned int i, chip_id;
>> +
>> + for_each_possible_cpu(i) {
>> + chip_id = cpu_to_chip_id(i);
>> +
>> + if (coproc->chip_id == chip_id)
>> + per_cpu(coproc_inst, i) = coproc;
>> + }
>> +}
>> +
>> +
>> +static struct vas_window *nx842_alloc_txwin(struct nx842_coproc *coproc)
>> +{
>> + struct vas_window *txwin = NULL;
>> + struct vas_tx_win_attr txattr;
>> +
>> + /*
>> +  * Kernel requests will be high priority. So open send
>> +  * windows only for high priority RxFIFO entries.
>> +  */
>> + vas_init_tx_win_attr(&txattr, coproc->ct);
>> + txattr.lpid = 0;/* lpid is 0 for kernel requests */
>> + txattr.pid = mfspr(SPRN_PID);
>
> Should we be using SPRN_PID here? That makes it appear as if it comes
> from the current user process, which seems fishy.
>
>> + /*
>> +  * Open a VAS send window which is used to send request to NX.
>> +  */
>> + txwin = vas_tx_win_open(coproc->vas.id, coproc->ct, &txattr);
>> + if (IS_ERR(txwin)) {
>> + pr_err("ibm,nx-842: Can not open TX window: %ld\n",
>> + PTR_ERR(txwin));
>> + return NULL;
>> + }
>> +
>> + return txwin;
>> +}
>> +
>> +static int __init vas_cfg_coproc_info(struct device_node *dn, int chip_id,
>> + int vasid)
>> +{
>> + struct vas_window *rxwin = NULL;
>> + struct vas_rx_win_attr rxattr;
>> + struct nx842_coproc *coproc;
>> + u32 lpid, pid, tid, fifo_size;
>> + u64 rx_fifo;
>> + const char *priority;
>> + int ret;
>> +
>> + ret = of_property_read_u64(dn, "rx-fifo-address", (void *)&rx_fifo);
>   
>   Unnecessary cast.
>
>> + if (ret) {
>> + pr_err("Missing rx-fifo-address property\n");
>> + return ret;
>> + }
>> +
>> + ret = of_property_read_u32(dn, "rx-fifo-size", &fifo_size);
>> + if (ret) {
>> + pr_err("Missing rx-fifo-size property\n");
>> + return ret;
>> + }
>> +
>> + ret = of_property_read_u32(dn, "lpid", &lpid);
>> + if (ret) {
>> + pr_err("Missing lpid property\n");
>> + return ret;
>> + }
>> +
>> + ret = of_property_read_u32(dn, "pid", &pid);
>> + if (ret) {
>> + pr_err("Missing pid property\n");
>> + return ret;
>> + }
>> +
>> + ret = of_property_read_u32(dn, "tid", &tid);
>> + if (ret) {
>> + pr_err("Missing tid property\n");
>> + return ret;
>> + }
>> +
>> + ret = of_property_read_string(dn, "priority", &priority);
>

Re: [PATCH] powerpc/kernel: Change retrieval of pci_dn

2017-08-29 Thread Bryant G. Ly

On 8/29/17 1:20 AM, Sam Bobroff wrote:


On Mon, Aug 28, 2017 at 11:05:03AM -0500, Bryant G. Ly wrote:

For a PCI device its pci_dn can be retrieved from
pdev->dev.archdata.firmware_data, PCI_DN(devnode), or parent's list.
Thus, we should just use the generic function pci_get_pdn_by_devfn
to get the pci_dn.

Signed-off-by: Bryant G. Ly 

Reviewed-by: Sam Bobroff 

I don't know this area but I tested it using a patched kernel with the
old and new code together. My test kernel booted fine (in QEMU+KVM) and
I saw 26 reads and 4 writes, all of which got the same value with either
code block.

I also checked that the error result in the "not found" case is the same
as well, which it is, because rtas_{read,write}_config() will return
PCIBIOS_DEVICE_NOT_FOUND if given a NULL pdn.

So, looks good to me.

Cheers,
Sam.


Thanks for the review Sam!



Re: [PATCH] powerpc/kernel: Change retrieval of pci_dn

2017-08-29 Thread Bryant G. Ly

On 8/29/17 1:33 AM, Michael Ellerman wrote:


"Bryant G. Ly"  writes:


For a PCI device its pci_dn can be retrieved from
pdev->dev.archdata.firmware_data, PCI_DN(devnode), or parent's list.
Thus, we should just use the generic function pci_get_pdn_by_devfn
to get the pci_dn.

Signed-off-by: Bryant G. Ly 

Minor issue, it's preferable if the email in your Signed-off-by matches
the email you send patches from.

cheers


Hi Michael,

Thanks for the review. I apologize for the emails not matching; I switch 
between the two frequently throughout the day for internal Gerrit commits and 
Linux patches. I have addressed all your comments in the new patch that I just 
put up. Also, I have tested it with Mellanox CX4 cards on P8 systems.

I'd also like to let you know that I am working on patches to enable SRIOV on 
power and would like your feedback on the design, which I will send in a 
private email.

-Bryant



Re: [PATCH v2 14/20] mm: Provide speculative fault infrastructure

2017-08-29 Thread Laurent Dufour
On 29/08/2017 14:04, Peter Zijlstra wrote:
> On Tue, Aug 29, 2017 at 09:59:30AM +0200, Laurent Dufour wrote:
>> On 27/08/2017 02:18, Kirill A. Shutemov wrote:
 +
 +  if (unlikely(!vma->anon_vma))
 +  goto unlock;
>>>
>>> It deserves a comment.
>>
>> You're right I'll add it in the next version.
>> For the record, the root cause is that __anon_vma_prepare() requires the
>> mmap_sem to be held because vm_next and vm_prev must be safe.
> 
> But should that test not be:
> 
>   if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma))
>   goto unlock;
> 
> Because !anon vmas will never have ->anon_vma set and you don't want to
> exclude those.

Yes, in the case where we later allow non-anonymous vmas to be handled.
Currently only anonymous vmas are supported, so the check is good enough,
isn't it?



[PATCH] powerpc/kernel: Change retrieval of pci_dn

2017-08-29 Thread Bryant G. Ly
For a PCI device its pci_dn can be retrieved from
pdev->dev.archdata.firmware_data, PCI_DN(devnode), or parent's list.
Thus, we should just use the existing function pci_get_pdn_by_devfn
to get the pci_dn.

Signed-off-by: Bryant G. Ly 
Reviewed-by: Sam Bobroff 
---
 arch/powerpc/kernel/rtas_pci.c | 33 -
 1 file changed, 4 insertions(+), 29 deletions(-)

diff --git a/arch/powerpc/kernel/rtas_pci.c b/arch/powerpc/kernel/rtas_pci.c
index 73f1934..c2b148b 100644
--- a/arch/powerpc/kernel/rtas_pci.c
+++ b/arch/powerpc/kernel/rtas_pci.c
@@ -91,26 +91,14 @@ static int rtas_pci_read_config(struct pci_bus *bus,
unsigned int devfn,
int where, int size, u32 *val)
 {
-   struct device_node *busdn, *dn;
struct pci_dn *pdn;
-   bool found = false;
int ret;
 
-   /* Search only direct children of the bus */
*val = 0x;
-   busdn = pci_bus_to_OF_node(bus);
-   for (dn = busdn->child; dn; dn = dn->sibling) {
-   pdn = PCI_DN(dn);
-   if (pdn && pdn->devfn == devfn
-   && of_device_is_available(dn)) {
-   found = true;
-   break;
-   }
-   }
 
-   if (!found)
-   return PCIBIOS_DEVICE_NOT_FOUND;
+   pdn = pci_get_pdn_by_devfn(bus, devfn);
 
+   /* Validity of pdn is checked in here */
ret = rtas_read_config(pdn, where, size, val);
if (*val == EEH_IO_ERROR_VALUE(size) &&
eeh_dev_check_failure(pdn_to_eeh_dev(pdn)))
@@ -153,24 +141,11 @@ static int rtas_pci_write_config(struct pci_bus *bus,
 unsigned int devfn,
 int where, int size, u32 val)
 {
-   struct device_node *busdn, *dn;
struct pci_dn *pdn;
-   bool found = false;
-
-   /* Search only direct children of the bus */
-   busdn = pci_bus_to_OF_node(bus);
-   for (dn = busdn->child; dn; dn = dn->sibling) {
-   pdn = PCI_DN(dn);
-   if (pdn && pdn->devfn == devfn
-   && of_device_is_available(dn)) {
-   found = true;
-   break;
-   }
-   }
 
-   if (!found)
-   return PCIBIOS_DEVICE_NOT_FOUND;
+   pdn = pci_get_pdn_by_devfn(bus, devfn);
 
+   /* Validity of pdn is checked in here. */
return rtas_write_config(pdn, where, size, val);
 }
 
-- 
2.5.4 (Apple Git-61)



Re: [PATCH] powerpc/powernv: Turn on SCSI_AACRAID in powernv_defconfig

2017-08-29 Thread Guilherme G. Piccoli
On 08/29/2017 08:22 AM, Michael Ellerman wrote:
> "Guilherme G. Piccoli"  writes:
> 
>> On 08/28/2017 02:56 AM, Michael Ellerman wrote:
>>> Some Power9 boxes will have this adapter installed, so add it to the
>>> defconfig so we can boot on those machines without an initrd.
>>
>> Michael, not sure if this affects Petitboot (I know it has its own
>> default config files), but in the past we had some issues with this
>> driver and it's nice to have the possibility to blacklist it.
> 
> Why were we blacklisting it?

Basically for the same reason you want it built-in - debug purposes heheh

In fact, this driver takes some time in its initialization process, so
if my rootfs is not under it, I could blacklist it to speed up my boot
time; also, to debug the aacraid driver we could blacklist it in Petitboot
and debug on the distro (easier to build kernels there), since the issue
could be in the first probe only (or be affected by the kexec out of Petitboot).

Anyway, since making it a module would harm your workflow, I agree built-in
is the best option!

Thanks for your detailed explanation.
Cheers,


Guilherme


> 
>> What is the rationale for this change? I mean, how being able to boot
>> without initrd is interesting here? Sorry if it's silly question!
> 
> It is a silly question! :)
> 
> When you boot as many kernels as me it's preferable to be able to build
> a kernel completely remote from the box, throw the kernel on a tftp
> server and have the machine boot from that kernel all the way to
> userspace.
> 
> That requires not using an initrd, and therefore having all the drivers
> built-in that are needed to get to the boot disk.
> 
> (Yeah it is "possible" to build an initrd remotely but it's a pain).
> 
> And the one P9 box I have here uses that driver for its root disk, so I
> do really want it to be =y.
> 
> cheers
> 



Re: [PATCH v3] sound: aoa: Convert to using %pOF instead of full_name

2017-08-29 Thread Takashi Iwai
On Tue, 29 Aug 2017 14:35:03 +0200,
Rob Herring wrote:
> 
> On Mon, Aug 7, 2017 at 6:29 PM, Rob Herring  wrote:
> > Now that we have a custom printf format specifier, convert users of
> > full_name to use %pOF instead. This is preparation to remove storing
> > of the full path string for each node.
> >
> > Signed-off-by: Rob Herring 
> > Cc: Johannes Berg 
> > Cc: Jaroslav Kysela 
> > Cc: Takashi Iwai 
> > Cc: linuxppc-dev@lists.ozlabs.org
> > Cc: alsa-de...@alsa-project.org
> > ---
> > v3: Split aoa to separate patch
> 
> Ping.

Sorry, overlooked this one.  Now applied.

Thanks!


Takashi


Re: [alsa-devel] [PATCH v3] ASoC: fsl_dma: remove dma_object path member

2017-08-29 Thread Fabio Estevam
On Tue, Aug 29, 2017 at 9:37 AM, Rob Herring  wrote:
> dma_object.path is unused, so rather than fix it to work with DT
> full_name changes, just remove it.
>
> Signed-off-by: Rob Herring 

Reviewed-by: Fabio Estevam 


[PATCH v3] ASoC: fsl_dma: remove dma_object path member

2017-08-29 Thread Rob Herring
dma_object.path is unused, so rather than fix it to work with DT
full_name changes, just remove it.

Signed-off-by: Rob Herring 
Cc: Timur Tabi 
Cc: Nicolin Chen 
Cc: Xiubo Li 
Cc: Fabio Estevam 
Cc: Liam Girdwood 
Cc: Mark Brown 
Cc: Jaroslav Kysela 
Cc: Takashi Iwai 
Cc: alsa-de...@alsa-project.org
Cc: linuxppc-dev@lists.ozlabs.org
---
v3:
- Update subject: s/sound/ASoC/
v2:
- Move printf specifier change to correct patch

 sound/soc/fsl/fsl_dma.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/sound/soc/fsl/fsl_dma.c b/sound/soc/fsl/fsl_dma.c
index c17359648915..0c11f434a374 100644
--- a/sound/soc/fsl/fsl_dma.c
+++ b/sound/soc/fsl/fsl_dma.c
@@ -63,7 +63,6 @@ struct dma_object {
struct ccsr_dma_channel __iomem *channel;
unsigned int irq;
bool assigned;
-   char path[1];
 };
 
 /*
@@ -903,13 +902,12 @@ static int fsl_soc_dma_probe(struct platform_device *pdev)
return ret;
}
 
-   dma = kzalloc(sizeof(*dma) + strlen(np->full_name), GFP_KERNEL);
+   dma = kzalloc(sizeof(*dma), GFP_KERNEL);
if (!dma) {
of_node_put(ssi_np);
return -ENOMEM;
}
 
-   strcpy(dma->path, np->full_name);
dma->dai.ops = &fsl_dma_ops;
dma->dai.pcm_new = fsl_dma_new;
dma->dai.pcm_free = fsl_dma_free_dma_buffers;
-- 
2.11.0



Re: [PATCH v3] sound: aoa: Convert to using %pOF instead of full_name

2017-08-29 Thread Rob Herring
On Mon, Aug 7, 2017 at 6:29 PM, Rob Herring  wrote:
> Now that we have a custom printf format specifier, convert users of
> full_name to use %pOF instead. This is preparation to remove storing
> of the full path string for each node.
>
> Signed-off-by: Rob Herring 
> Cc: Johannes Berg 
> Cc: Jaroslav Kysela 
> Cc: Takashi Iwai 
> Cc: linuxppc-dev@lists.ozlabs.org
> Cc: alsa-de...@alsa-project.org
> ---
> v3: Split aoa to separate patch

Ping.

>
>  sound/aoa/codecs/tas.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/sound/aoa/codecs/tas.c b/sound/aoa/codecs/tas.c
> index 733b6365dad6..15c05755d270 100644
> --- a/sound/aoa/codecs/tas.c
> +++ b/sound/aoa/codecs/tas.c
> @@ -905,8 +905,8 @@ static int tas_i2c_probe(struct i2c_client *client,
> goto fail;
> }
> printk(KERN_DEBUG
> -  "snd-aoa-codec-tas: tas found, addr 0x%02x on %s\n",
> -  (unsigned int)client->addr, node->full_name);
> +  "snd-aoa-codec-tas: tas found, addr 0x%02x on %pOF\n",
> +  (unsigned int)client->addr, node);
> return 0;
>   fail:
> mutex_destroy(&tas->mtx);
> --
> 2.11.0
>


Re: [PATCH v2 14/20] mm: Provide speculative fault infrastructure

2017-08-29 Thread Peter Zijlstra
On Tue, Aug 29, 2017 at 09:59:30AM +0200, Laurent Dufour wrote:
> On 27/08/2017 02:18, Kirill A. Shutemov wrote:
> >> +
> >> +  if (unlikely(!vma->anon_vma))
> >> +  goto unlock;
> > 
> > It deserves a comment.
> 
> You're right I'll add it in the next version.
> For the record, the root cause is that __anon_vma_prepare() requires the
> mmap_sem to be held because vm_next and vm_prev must be safe.

But should that test not be:

if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma))
goto unlock;

Because !anon vmas will never have ->anon_vma set and you don't want to
exclude those.


Re: [PATCH v2 14/20] mm: Provide speculative fault infrastructure

2017-08-29 Thread Peter Zijlstra
On Tue, Aug 29, 2017 at 10:33:52AM +0200, Peter Zijlstra wrote:
> On Tue, Aug 29, 2017 at 07:14:37AM +1000, Benjamin Herrenschmidt wrote:
> > We'd have to audit archs closely. Things like the page walk cache
> > flushing on power etc...
> 
> If you point me where to look, I'll have a poke around. I'm not
> quite sure what you mean with pagewalk cache flushing. Your hash thing
> flushes everything inside the PTL IIRC and the radix code appears fairly
> 'normal'.

mpe helped me out and explained that it is the PWC hint to TLBIE.

So, you set need_flush_all when you unhook pud/pmd/pte which you then
use to set PWC. So free_pgtables() will do the PWC when it unhooks
higher level pages.

But you're right that there's some issues, free_pgtables() itself
doesn't seem to use mm->page_table_lock,pmd->lock _AT_ALL_ to unhook the
pages.

If it were to do that, things should work fine since those locks would
then serialize against the speculative faults, we would never install a
page if the VMA would be under tear-down and it would thus not be
visible to your caches either.


Re: [PATCH] powerpc/powernv: Turn on SCSI_AACRAID in powernv_defconfig

2017-08-29 Thread Michael Ellerman
"Guilherme G. Piccoli"  writes:

> On 08/28/2017 02:56 AM, Michael Ellerman wrote:
>> Some Power9 boxes will have this adapter installed, so add it to the
>> defconfig so we can boot on those machines without an initrd.
>
> Michael, not sure if this affects Petitboot (I know it has its own
> default config files), but in the past we had some issues with this
> driver and it's nice to have the possibility to blacklist it.

Why were we blacklisting it?

> What is the rationale for this change? I mean, how being able to boot
> without initrd is interesting here? Sorry if it's silly question!

It is a silly question! :)

When you boot as many kernels as me it's preferable to be able to build
a kernel completely remote from the box, throw the kernel on a tftp
server and have the machine boot from that kernel all the way to
userspace.

That requires not using an initrd, and therefore having all the drivers
built-in that are needed to get to the boot disk.

(Yeah it is "possible" to build an initrd remotely but it's a pain).

And the one P9 box I have here uses that driver for its root disk, so I
do really want it to be =y.

cheers


Re: [PATCH v3 2/4] powerpc/64s: idle POWER9 can execute stop without a sync sequence

2017-08-29 Thread Michael Ellerman
Paul Mackerras  writes:
> On Fri, Aug 25, 2017 at 02:30:34PM +1000, Nicholas Piggin wrote:
>> diff --git a/arch/powerpc/kernel/idle_book3s.S 
>> b/arch/powerpc/kernel/idle_book3s.S
>> index 4924647d964d..14e97f442167 100644
>> --- a/arch/powerpc/kernel/idle_book3s.S
>> +++ b/arch/powerpc/kernel/idle_book3s.S
>> @@ -205,6 +205,19 @@ pnv_powersave_common:
>>  mtmsrd  r7,0
>>  bctr
>>  
>> +/*
>> + * This is the sequence required to execute idle instructions, as
>> + * specified in ISA v2.07. MSR[IR] and MSR[DR] must be 0.
>> + */
>> +#define ARCH207_IDLE_STATE_ENTER_SEQ_NORET(IDLE_INST)   \
>
> We had to do this sequence on POWER7 also, which is architecture
> v2.06.  Thus the comments and the naming (ARCH207_*) are a bit
> misleading here.  The actual code change looks OK.

I'll just drop the name change, I don't think it's crucial. That makes
P9 the special case.

We can come up with a better name or something in future.

Unless Nick objects?

cheers


Re: [RFC Part1 PATCH v3 16/17] X86/KVM: Provide support to create Guest and HV shared per-CPU variables

2017-08-29 Thread Borislav Petkov
On Mon, Jul 24, 2017 at 02:07:56PM -0500, Brijesh Singh wrote:
> Some KVM specific MSR's (steal-time, asyncpf, avic_eio) allocates per-CPU

   MSRs

> variable at compile time and share its physical address with hypervisor.

That sentence needs changing - the MSRs don't allocate anything; memory for
them gets allocated.

> It presents a challege when SEV is active in guest OS, when SEV is active,
> the guest memory is encrypted with guest key hence hypervisor will not
> able to modify the guest memory. When SEV is active, we need to clear the
> encryption attribute (aka C-bit) of shared physical addresses so that both
> guest and hypervisor can access the data.

This whole paragraph needs rewriting.

> To solve this problem, I have tried these three options:
> 
> 1) Convert the static per-CPU to dynamic per-CPU allocation and when SEV
> is detected clear the C-bit from the page table. But while doing so I
> found that per-CPU dynamic allocator was not ready when kvm_guest_cpu_init
> was called.
> 
> 2) Since the C-bit works on PAGE_SIZE hence add some extra padding to
> 'struct kvm-steal-time' to make it PAGE_SIZE and then at runtime

"to make it PAGE_SIZE"?

I know what it means but it reads strange and needs more restraint when
rewriting it. :)

> clear the encryption attribute of the full PAGE. The downside of this -
> we need to modify structure which may break the compatibility.
> 
> 3) Define a new per-CPU section (.data..percpu.hv_shared) which will be
> used to hold the compile time shared per-CPU variables. When SEV is
> detected we map this section without C-bit.
> 
> This patch implements #3.

From Documentation/process/submitting-patches.rst:

 "Describe your changes in imperative mood, e.g. "make xyzzy do frotz"
  instead of "[This patch] makes xyzzy do frotz" or "[I] changed xyzzy
  to do frotz", as if you are giving orders to the codebase to change
  its behaviour."

Also, never say "This patch" in a commit message of a patch. It is
tautologically useless.

> It introduces a new DEFINE_PER_CPU_HV_SHAHRED

There's no DEFINE_PER_CPU_HV_SHAHRED. Typo.

> macro to create a compile time per-CPU variable. When SEV is detected we
> clear the C-bit from the shared per-CPU variable.
> 
> Signed-off-by: Brijesh Singh 
> ---
>  arch/x86/kernel/kvm.c | 46 
> ---
>  include/asm-generic/vmlinux.lds.h |  3 +++
>  include/linux/percpu-defs.h   | 12 ++
>  3 files changed, 58 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
> index 71c17a5..1f6fec8 100644
> --- a/arch/x86/kernel/kvm.c
> +++ b/arch/x86/kernel/kvm.c
> @@ -75,8 +75,8 @@ static int parse_no_kvmclock_vsyscall(char *arg)
>  
>  early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);
>  
> -static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
> -static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
> +static DEFINE_PER_CPU_HV_SHARED(struct kvm_vcpu_pv_apf_data, apf_reason) 
> __aligned(64);
> +static DEFINE_PER_CPU_HV_SHARED(struct kvm_steal_time, steal_time) 
> __aligned(64);
>  static int has_steal_clock = 0;
>  
>  /*
> @@ -303,7 +303,7 @@ static void kvm_register_steal_time(void)
>   cpu, (unsigned long long) slow_virt_to_phys(st));
>  }
>  
> -static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;
> +static DEFINE_PER_CPU_HV_SHARED(unsigned long, kvm_apic_eoi) = 
> KVM_PV_EOI_DISABLED;
>  
>  static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val)
>  {
> @@ -319,11 +319,51 @@ static notrace void kvm_guest_apic_eoi_write(u32 reg, 
> u32 val)
>   apic->native_eoi_write(APIC_EOI, APIC_EOI_ACK);
>  }
>  
> +/* NOTE: function is marked as __ref because it is used by __init functions 
> */

No need for that comment.

What should you look into is why do you need to call the early versions:

" * producing a warning (of course, no warning does not mean code is
 * correct, so optimally document why the __ref is needed and why it's OK)."

And we do have the normal set_memory_decrypted() etc helpers so why
aren't we using those?

If you need to use the early ones too, then you probably need to
differentiate this in the callers by passing a "bool early", which calls
the proper flavor.
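
A small sketch of that dispatch; set_memory_decrypted() is the existing
helper, while early_set_memory_decrypted() and its size-based signature
are assumptions made here for illustration:

static int kvm_map_shared_decrypted(unsigned long vaddr, unsigned int npages,
				    bool early)
{
	if (early)
		return early_set_memory_decrypted(vaddr, npages * PAGE_SIZE);

	return set_memory_decrypted(vaddr, npages);
}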

> +static int __ref kvm_map_hv_shared_decrypted(void)
> +{
> + static int once, ret;
> + int cpu;
> +
> + if (once)
> + return ret;

So this function gets called per-CPU but you need to do this ugly "once"
thing - i.e., global function called in a per-CPU context.

Why can't you do that mapping only on the current CPU and then
when that function is called on the next CPU, it will do the same thing
on that next CPU?

> + /*
> +  * Iterate through all possible CPU's and clear the C-bit from
> +  * percpu variables.
> +  */
> + for_each_possible_cpu(cpu) {
> + struct kvm_vcpu_pv_apf_data *apf;
> + unsigned long pa;

Re: [RESEND PATCH v5 00/16] eeprom: at24: Add OF device ID table

2017-08-29 Thread Wolfram Sang

> I don't have a DT based system at hand now, but I'll test it again and
> let you know probably tomorrow.

I will try again today, too. Thanks!





Re: [RESEND PATCH v5 00/16] eeprom: at24: Add OF device ID table

2017-08-29 Thread Javier Martinez Canillas
Hello Wolfram,

On Mon, Aug 28, 2017 at 6:01 PM, Wolfram Sang  wrote:
>
>> > But there is a dependency, no? If I apply the driver patch,
>> > non-converted device trees will not find their eeproms anymore. So, I
>>
>> I don't think that's correct. If you apply this patch before the DTS
>> changes, the driver will still match using the I2C device ID table
>> like it has been doing it until today.
>
> My tests do not confirm this. If I add a node with a "renesas,24c01"
> compatible to my board, it works before your patch, but not after. If I
> change it to "atmel,24c01" it works even after your patch. I haven't
> looked into it, though, maybe i2c_of_match_device_sysfs() is stepping on
> our foots here?
>
> Did you test and did it work for you?
>

I would swear that I tested both combinations (driver patch without DT
changes and DTS changes without driver patch), but it was months ago
when I first posted the patches, so I may be misremembering.

I don't have a DT based system at hand now, but I'll test it again and
let you know probably tomorrow.

Best regards,
Javier


Re: [PATCH v2 14/20] mm: Provide speculative fault infrastructure

2017-08-29 Thread Peter Zijlstra
On Tue, Aug 29, 2017 at 07:14:37AM +1000, Benjamin Herrenschmidt wrote:
> On Mon, 2017-08-28 at 11:37 +0200, Peter Zijlstra wrote:
> > > Doing all this job and just give up because we cannot allocate page tables
> > > looks very wasteful to me.
> > > 
> > > Have you considered to look how we can hand over from speculative to
> > > non-speculative path without starting from scratch (when possible)?
> > 
> > So we _can_ in fact allocate and install page-tables, but we have to be
> > very careful about it. The interesting case is where we race with
> > free_pgtables() and install a page that was just taken out.
> > 
> > But since we already have the VMA I think we can do something like:
> 
> That makes me extremely nervous... there could be all sort of
> assumptions esp. in arch code about the fact that we never populate the
> tree without the mm sem.

That _would_ be somewhat dodgy, because that means it needs to rely on
taking mmap_sem for _writing_ to undo things and arch/powerpc/ doesn't
have many down_write.*mmap_sem:

$ git grep "down_write.*mmap_sem" arch/powerpc/
arch/powerpc/kernel/vdso.c: if (down_write_killable(&mm->mmap_sem))
arch/powerpc/kvm/book3s_64_vio.c:   down_write(¤t->mm->mmap_sem);
arch/powerpc/mm/mmu_context_iommu.c:down_write(&mm->mmap_sem);
arch/powerpc/mm/subpage-prot.c: down_write(&mm->mmap_sem);
arch/powerpc/mm/subpage-prot.c: down_write(&mm->mmap_sem);
arch/powerpc/mm/subpage-prot.c: down_write(&mm->mmap_sem);

Then again, I suppose it could be relying on the implicit down_write
from things like munmap() and the like..

And things _ought_ to be ordered by the various PTLs
(mm->page_table_lock and pmd->lock) which of course doesn't mean
something accidentally snuck through.

> We'd have to audit archs closely. Things like the page walk cache
> flushing on power etc...

If you point me where to look, I'll have a poke around. I'm not
quite sure what you mean with pagewalk cache flushing. Your hash thing
flushes everything inside the PTL IIRC and the radix code appears fairly
'normal'.

> I don't mind the "retry" .. .we've brought stuff in the L1 cache
> already which I would expect to be the bulk of the overhead, and the
> allocation case isn't that common. Do we have numbers to show how
> destrimental this is today ?

No numbers, afaik. And like I said, I didn't consider this an actual
problem when I did these patches. But since Kirill asked ;-)


Re: [PATCH v2 14/20] mm: Provide speculative fault infrastructure

2017-08-29 Thread Peter Zijlstra
On Mon, Aug 28, 2017 at 03:35:11PM -0700, Andi Kleen wrote:
> Yes the whole thing is quite risky. Probably will need some
> kind of per architecture opt-in scheme?

See patch 19/20, is that not enough for you?


Re: [PATCH v2 14/20] mm: Provide speculative fault infrastructure

2017-08-29 Thread Laurent Dufour
On 27/08/2017 02:18, Kirill A. Shutemov wrote:
> On Fri, Aug 18, 2017 at 12:05:13AM +0200, Laurent Dufour wrote:
>> +/*
>> + * vm_normal_page() adds some processing which should be done while
>> + * hodling the mmap_sem.
>> + */
>> +int handle_speculative_fault(struct mm_struct *mm, unsigned long address,
>> + unsigned int flags)
>> +{
>> +struct vm_fault vmf = {
>> +.address = address,
>> +};
>> +pgd_t *pgd;
>> +p4d_t *p4d;
>> +pud_t *pud;
>> +pmd_t *pmd;
>> +int dead, seq, idx, ret = VM_FAULT_RETRY;
>> +struct vm_area_struct *vma;
>> +struct mempolicy *pol;
>> +
>> +/* Clear flags that may lead to release the mmap_sem to retry */
>> +flags &= ~(FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_KILLABLE);
>> +flags |= FAULT_FLAG_SPECULATIVE;
>> +
>> +idx = srcu_read_lock(&vma_srcu);
>> +vma = find_vma_srcu(mm, address);
>> +if (!vma)
>> +goto unlock;
>> +
>> +/*
>> + * Validate the VMA found by the lockless lookup.
>> + */
>> +dead = RB_EMPTY_NODE(&vma->vm_rb);
>> +seq = raw_read_seqcount(&vma->vm_sequence); /* rmb <-> 
>> seqlock,vma_rb_erase() */
>> +if ((seq & 1) || dead)
>> +goto unlock;
>> +
>> +/*
>> + * Can't call vm_ops service has we don't know what they would do
>> + * with the VMA.
>> + * This include huge page from hugetlbfs.
>> + */
>> +if (vma->vm_ops)
>> +goto unlock;
> 
> I think we need to have a way to white-list safe ->vm_ops.

Hi Kirill,
Yes this would be a good optimization done in a next step.

>> +
>> +if (unlikely(!vma->anon_vma))
>> +goto unlock;
> 
> It deserves a comment.

You're right I'll add it in the next version.
For the record, the root cause is that __anon_vma_prepare() requires the
mmap_sem to be held because vm_next and vm_prev must be safe.


>> +
>> +vmf.vma_flags = READ_ONCE(vma->vm_flags);
>> +vmf.vma_page_prot = READ_ONCE(vma->vm_page_prot);
>> +
>> +/* Can't call userland page fault handler in the speculative path */
>> +if (unlikely(vmf.vma_flags & VM_UFFD_MISSING))
>> +goto unlock;
>> +
>> +/*
>> + * MPOL_INTERLEAVE implies additional check in mpol_misplaced() which
>> + * are not compatible with the speculative page fault processing.
>> + */
>> +pol = __get_vma_policy(vma, address);
>> +if (!pol)
>> +pol = get_task_policy(current);
>> +if (pol && pol->mode == MPOL_INTERLEAVE)
>> +goto unlock;
>> +
>> +if (vmf.vma_flags & VM_GROWSDOWN || vmf.vma_flags & VM_GROWSUP)
>> +/*
>> + * This could be detected by the check address against VMA's
>> + * boundaries but we want to trace it as not supported instead
>> + * of changed.
>> + */
>> +goto unlock;
>> +
>> +if (address < READ_ONCE(vma->vm_start)
>> +|| READ_ONCE(vma->vm_end) <= address)
>> +goto unlock;
>> +
>> +/*
>> + * The three following checks are copied from access_error from
>> + * arch/x86/mm/fault.c
>> + */
>> +if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
>> +   flags & FAULT_FLAG_INSTRUCTION,
>> +   flags & FAULT_FLAG_REMOTE))
>> +goto unlock;
>> +
>> +/* This one is required to check that the VMA has write access set */
>> +if (flags & FAULT_FLAG_WRITE) {
>> +if (unlikely(!(vmf.vma_flags & VM_WRITE)))
>> +goto unlock;
>> +} else {
>> +if (unlikely(!(vmf.vma_flags & (VM_READ | VM_EXEC | VM_WRITE
>> +goto unlock;
>> +}
>> +
>> +/*
>> + * Do a speculative lookup of the PTE entry.
>> + */
>> +local_irq_disable();
>> +pgd = pgd_offset(mm, address);
>> +if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
>> +goto out_walk;
>> +
>> +p4d = p4d_alloc(mm, pgd, address);
>> +if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d)))
>> +goto out_walk;
>> +
>> +pud = pud_alloc(mm, p4d, address);
>> +if (pud_none(*pud) || unlikely(pud_bad(*pud)))
>> +goto out_walk;
>> +
>> +pmd = pmd_offset(pud, address);
>> +if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
>> +goto out_walk;
>> +
>> +/*
>> + * The above does not allocate/instantiate page-tables because doing so
>> + * would lead to the possibility of instantiating page-tables after
>> + * free_pgtables() -- and consequently leaking them.
>> + *
>> + * The result is that we take at least one !speculative fault per PMD
>> + * in order to instantiate it.
>> + */
> 
> 
> Doing all this work and just giving up because we cannot allocate page tables
> looks very wasteful to me.
> 
> Have you considered to look how we can hand over from speculative to
> non-speculative path without starting from scratch (when possible)?

[PATCH kernel v2 5/6] powerpc/eeh: Reduce use of pci_dn::node

2017-08-29 Thread Alexey Kardashevskiy
The pci_dn struct caches an OF device node pointer in order to access
the "ibm,loc-code" property when EEH is recovering.

However, when this happens in eeh_dev_check_failure(), we also have
a pci_dev pointer which should hold a valid pointer to the device node
whenever pci_dn has one (both pointers are non-NULL for physical functions
and NULL for virtual functions).

This changes pci_remove_device_node_info() to look for a parent of
the node being removed, just like pci_add_device_node_info() does when it
references the parent node.

This is the first step to get rid of pci_dn::node.

Signed-off-by: Alexey Kardashevskiy 
---
Changelog:
v2:
* fixed pseries platform
* this does not remove pci_dn::node yet, just prepares
---
 arch/powerpc/kernel/eeh.c| 9 +
 arch/powerpc/kernel/pci_dn.c | 7 +--
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 66c98c158ee3..9e816787c0d4 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -435,7 +435,7 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
int ret;
int active_flags = (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE);
unsigned long flags;
-   struct pci_dn *pdn;
+   struct device_node *dn;
struct pci_dev *dev;
struct eeh_pe *pe, *parent_pe, *phb_pe;
int rc = 0;
@@ -493,9 +493,10 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
if (pe->state & EEH_PE_ISOLATED) {
pe->check_count++;
if (pe->check_count % EEH_MAX_FAILS == 0) {
-   pdn = eeh_dev_to_pdn(edev);
-   if (pdn->node)
-   location = of_get_property(pdn->node, 
"ibm,loc-code", NULL);
+   dn = pci_device_to_OF_node(dev);
+   if (dn)
+   location = of_get_property(dn, "ibm,loc-code",
+   NULL);
printk(KERN_ERR "EEH: %d reads ignored for recovering 
device at "
"location=%s driver=%s pci addr=%s\n",
pe->check_count,
diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c
index 0256372b72de..dfb107631116 100644
--- a/arch/powerpc/kernel/pci_dn.c
+++ b/arch/powerpc/kernel/pci_dn.c
@@ -342,6 +342,7 @@ EXPORT_SYMBOL_GPL(pci_add_device_node_info);
 void pci_remove_device_node_info(struct device_node *dn)
 {
struct pci_dn *pdn = dn ? PCI_DN(dn) : NULL;
+   struct device_node *parent;
 #ifdef CONFIG_EEH
struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
 
@@ -354,8 +355,10 @@ void pci_remove_device_node_info(struct device_node *dn)
 
WARN_ON(!list_empty(&pdn->child_list));
list_del(&pdn->list);
-   if (pdn->parent)
-   of_node_put(pdn->parent->node);
+
+   parent = of_get_parent(dn);
+   if (parent)
+   of_node_put(parent);
 
dn->data = NULL;
kfree(pdn);
-- 
2.11.0



[PATCH kernel v2 2/6] powerpc/eeh: Reduce to one the number of places where edev is allocated

2017-08-29 Thread Alexey Kardashevskiy
arch/powerpc/kernel/eeh_dev.c:57 is the only legitimate place where edev
is allocated; the other 2 places allocate it on the stack or on the heap
for a very short period of time just so it can be passed to eeh_pe_get(),
which takes an edev.

This changes eeh_pe_get() to receive required parameters explicitly.

This removes unnecessary temporary allocation of edev.

This uses the "pe_no" name instead of the "pe_config_addr" name as
it actually is a PE number and not a config space address as it seemed.
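
To illustrate the interface change for a caller (a sketch only; the local
variable names are made up and the real call sites are in eeh_pe.c and
eeh-powernv.c):

	/* before: a throw-away edev existed just to carry the addresses */
	struct eeh_dev edev = { 0 };
	edev.phb = phb;
	edev.pe_config_addr = pe_no;
	edev.config_addr = config_addr;
	pe = eeh_pe_get(&edev);

	/* after: the PHB and the addresses are passed explicitly */
	pe = eeh_pe_get(phb, pe_no, config_addr);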

Signed-off-by: Alexey Kardashevskiy 
Reviewed-by: Andrew Donnellan 
Acked-by: Russell Currey 
---
 arch/powerpc/include/asm/eeh.h   |  3 ++-
 arch/powerpc/kernel/eeh_pe.c | 32 ++--
 arch/powerpc/platforms/powernv/eeh-powernv.c | 15 ++---
 3 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 8e37b71674f4..26a6a43f8799 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -262,7 +262,8 @@ typedef void *(*eeh_traverse_func)(void *data, void *flag);
 void eeh_set_pe_aux_size(int size);
 int eeh_phb_pe_create(struct pci_controller *phb);
 struct eeh_pe *eeh_phb_pe_get(struct pci_controller *phb);
-struct eeh_pe *eeh_pe_get(struct eeh_dev *edev);
+struct eeh_pe *eeh_pe_get(struct pci_controller *phb,
+ int pe_no, int config_addr);
 int eeh_add_to_parent_pe(struct eeh_dev *edev);
 int eeh_rmv_from_parent_pe(struct eeh_dev *edev);
 void eeh_pe_update_time_stamp(struct eeh_pe *pe);
diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index cc4b206f77e4..84d79f3da7d6 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -230,10 +230,15 @@ void *eeh_pe_dev_traverse(struct eeh_pe *root,
  * Bus/Device/Function number. The extra data referred by flag
  * indicates which type of address should be used.
  */
+struct eeh_pe_get_flag {
+   int pe_no;
+   int config_addr;
+};
+
 static void *__eeh_pe_get(void *data, void *flag)
 {
struct eeh_pe *pe = (struct eeh_pe *)data;
-   struct eeh_dev *edev = (struct eeh_dev *)flag;
+   struct eeh_pe_get_flag *tmp = (struct eeh_pe_get_flag *) flag;
 
/* Unexpected PHB PE */
if (pe->type & EEH_PE_PHB)
@@ -244,17 +249,17 @@ static void *__eeh_pe_get(void *data, void *flag)
 * have non-zero PE address
 */
if (eeh_has_flag(EEH_VALID_PE_ZERO)) {
-   if (edev->pe_config_addr == pe->addr)
+   if (tmp->pe_no == pe->addr)
return pe;
} else {
-   if (edev->pe_config_addr &&
-   (edev->pe_config_addr == pe->addr))
+   if (tmp->pe_no &&
+   (tmp->pe_no == pe->addr))
return pe;
}
 
/* Try BDF address */
-   if (edev->config_addr &&
-  (edev->config_addr == pe->config_addr))
+   if (tmp->config_addr &&
+  (tmp->config_addr == pe->config_addr))
return pe;
 
return NULL;
@@ -262,7 +267,9 @@ static void *__eeh_pe_get(void *data, void *flag)
 
 /**
  * eeh_pe_get - Search PE based on the given address
- * @edev: EEH device
+ * @phb: PCI controller
+ * @pe_no: PE number
+ * @config_addr: Config address
  *
  * Search the corresponding PE based on the specified address which
  * is included in the eeh device. The function is used to check if
@@ -271,12 +278,14 @@ static void *__eeh_pe_get(void *data, void *flag)
  * which is composed of PCI bus/device/function number, or unified
  * PE address.
  */
-struct eeh_pe *eeh_pe_get(struct eeh_dev *edev)
+struct eeh_pe *eeh_pe_get(struct pci_controller *phb,
+   int pe_no, int config_addr)
 {
-   struct eeh_pe *root = eeh_phb_pe_get(edev->phb);
+   struct eeh_pe *root = eeh_phb_pe_get(phb);
+   struct eeh_pe_get_flag tmp = { pe_no, config_addr };
struct eeh_pe *pe;
 
-   pe = eeh_pe_traverse(root, __eeh_pe_get, edev);
+   pe = eeh_pe_traverse(root, __eeh_pe_get, &tmp);
 
return pe;
 }
@@ -344,7 +353,8 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
 * PE should be composed of PCI bus and its subordinate
 * components.
 */
-   pe = eeh_pe_get(edev);
+   pe = eeh_pe_get(edev->pdn->phb, edev->pe_config_addr,
+   edev->config_addr);
if (pe && !(pe->type & EEH_PE_INVALID)) {
/* Mark the PE as type of PCI bus */
pe->type = EEH_PE_BUS;
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c 
b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 3f48f6df1cf3..ac8c01cd251c 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -113,7 +113,6 @@ static ssize_t pnv_eeh_ei_write(struct file *filp,
size_t count, loff_t *ppos)
 {
struct pci_controller *hose = filp->private_data;
-

[PATCH kernel v2 1/6] powerpc/pci: Remove unused parameter from add_one_dev_pci_data()

2017-08-29 Thread Alexey Kardashevskiy
pdev is always NULL, remove it.

To make checkpatch.pl happy, this also removes the "out of memory"
message.

Signed-off-by: Alexey Kardashevskiy 
Reviewed-by: Andrew Donnellan 
Acked-by: Russell Currey 
---
 arch/powerpc/kernel/pci_dn.c | 14 ++
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c
index 592693437070..0256372b72de 100644
--- a/arch/powerpc/kernel/pci_dn.c
+++ b/arch/powerpc/kernel/pci_dn.c
@@ -139,7 +139,6 @@ struct pci_dn *pci_get_pdn(struct pci_dev *pdev)
 
 #ifdef CONFIG_PCI_IOV
 static struct pci_dn *add_one_dev_pci_data(struct pci_dn *parent,
-  struct pci_dev *pdev,
   int vf_index,
   int busno, int devfn)
 {
@@ -150,10 +149,8 @@ static struct pci_dn *add_one_dev_pci_data(struct pci_dn 
*parent,
return NULL;
 
pdn = kzalloc(sizeof(*pdn), GFP_KERNEL);
-   if (!pdn) {
-   dev_warn(&pdev->dev, "%s: Out of memory!\n", __func__);
+   if (!pdn)
return NULL;
-   }
 
pdn->phb = parent->phb;
pdn->parent = parent;
@@ -167,13 +164,6 @@ static struct pci_dn *add_one_dev_pci_data(struct pci_dn 
*parent,
INIT_LIST_HEAD(&pdn->list);
list_add_tail(&pdn->list, &parent->child_list);
 
-   /*
-* If we already have PCI device instance, lets
-* bind them.
-*/
-   if (pdev)
-   pdev->dev.archdata.pci_data = pdn;
-
return pdn;
 }
 #endif
@@ -201,7 +191,7 @@ struct pci_dn *add_dev_pci_data(struct pci_dev *pdev)
for (i = 0; i < pci_sriov_get_totalvfs(pdev); i++) {
struct eeh_dev *edev __maybe_unused;
 
-   pdn = add_one_dev_pci_data(parent, NULL, i,
+   pdn = add_one_dev_pci_data(parent, i,
   pci_iov_virtfn_bus(pdev, i),
   pci_iov_virtfn_devfn(pdev, i));
if (!pdn) {
-- 
2.11.0



[PATCH kernel v2 0/6] powerpc/eeh: Some cleanups

2017-08-29 Thread Alexey Kardashevskiy
Here are a few patches to get rid of some cached pointers across the EEH and
powernv code, as I was struggling to figure out the lifetime of the
structures and so on.


This is based on sha1
98b9f8a45499 Linus Torvalds Merge tag 'ext4_for_linus_stable' of 
git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4


Please comment. Thanks.

Changelog:
v2:
* added sof/ab to 1/6, 2/6
* fixed compile issue with pseries platform in the rest of v1
* split 5/5 from v1 into 2 separate patches (for easier bisect if/when needed)



Alexey Kardashevskiy (6):
  powerpc/pci: Remove unused parameter from add_one_dev_pci_data()
  powerpc/eeh: Reduce to one the number of places where edev is
allocated
  powerpc/eeh: Remove unnecessary pointer to phb from eeh_dev
  powerpc/eeh: Remove unnecessary config_addr from eeh_dev
  powerpc/eeh: Reduce use of pci_dn::node
  powerpc/pci: Remove OF node back pointer from pci_dn

 arch/powerpc/include/asm/eeh.h   |  5 +-
 arch/powerpc/include/asm/pci-bridge.h|  1 -
 arch/powerpc/kernel/eeh.c| 16 ++---
 arch/powerpc/kernel/eeh_dev.c|  2 -
 arch/powerpc/kernel/eeh_driver.c |  2 +-
 arch/powerpc/kernel/eeh_pe.c | 90 
 arch/powerpc/kernel/eeh_sysfs.c  |  3 -
 arch/powerpc/kernel/pci_dn.c | 22 +++
 arch/powerpc/platforms/powernv/eeh-powernv.c | 29 +++--
 arch/powerpc/platforms/pseries/eeh_pseries.c |  4 +-
 arch/powerpc/platforms/pseries/msi.c | 11 +---
 11 files changed, 81 insertions(+), 104 deletions(-)

-- 
2.11.0



[PATCH kernel v2 6/6] powerpc/pci: Remove OF node back pointer from pci_dn

2017-08-29 Thread Alexey Kardashevskiy
The check_req() helper uses pci_get_pdn() to get an OF node pointer.
pci_get_pdn() returns a pci_dn pointer which comes either:
1) from the OF node returned by pci_device_to_OF_node(); or
2) from the parent's child_list, where entries don't have OF node pointers.
Since check_req() does not care about 2), it can call
pci_device_to_OF_node() directly, hence the change.

The find_pe_dn() helper uses the embedded pci_dn to get an OF node, which is
also reachable via edev->pdev, so let's take a shortcut and call
pci_device_to_OF_node() directly.

With these 2 changes, we can finally get rid of the OF node back pointer.

Signed-off-by: Alexey Kardashevskiy 
---
Changelog:
v2:
* fixed pseries platform
---
 arch/powerpc/include/asm/pci-bridge.h |  1 -
 arch/powerpc/kernel/pci_dn.c  |  1 -
 arch/powerpc/platforms/pseries/msi.c  | 11 ++-
 3 files changed, 2 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/include/asm/pci-bridge.h 
b/arch/powerpc/include/asm/pci-bridge.h
index 56c67d3f0108..0b8aa1fe2d5f 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -195,7 +195,6 @@ struct pci_dn {
struct  pci_dn *parent;
struct  pci_controller *phb;/* for pci devices */
struct  iommu_table_group *table_group; /* for phb's or bridges */
-   struct  device_node *node;  /* back-pointer to the device_node */
 
int pci_ext_config_space;   /* for pci devices */
 
diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c
index dfb107631116..0e395afbf0f4 100644
--- a/arch/powerpc/kernel/pci_dn.c
+++ b/arch/powerpc/kernel/pci_dn.c
@@ -293,7 +293,6 @@ struct pci_dn *pci_add_device_node_info(struct 
pci_controller *hose,
if (pdn == NULL)
return NULL;
dn->data = pdn;
-   pdn->node = dn;
pdn->phb = hose;
 #ifdef CONFIG_PPC_POWERNV
pdn->pe_number = IODA_INVALID_PE;
diff --git a/arch/powerpc/platforms/pseries/msi.c 
b/arch/powerpc/platforms/pseries/msi.c
index 326ef0dd6038..70b7aeff8139 100644
--- a/arch/powerpc/platforms/pseries/msi.c
+++ b/arch/powerpc/platforms/pseries/msi.c
@@ -132,15 +132,10 @@ static void rtas_teardown_msi_irqs(struct pci_dev *pdev)
 static int check_req(struct pci_dev *pdev, int nvec, char *prop_name)
 {
struct device_node *dn;
-   struct pci_dn *pdn;
const __be32 *p;
u32 req_msi;
 
-   pdn = pci_get_pdn(pdev);
-   if (!pdn)
-   return -ENODEV;
-
-   dn = pdn->node;
+   dn = pci_device_to_OF_node(pdev);
 
p = of_get_property(dn, prop_name, NULL);
if (!p) {
@@ -197,7 +192,6 @@ static struct device_node *find_pe_total_msi(struct pci_dev 
*dev, int *total)
 static struct device_node *find_pe_dn(struct pci_dev *dev, int *total)
 {
struct device_node *dn;
-   struct pci_dn *pdn;
struct eeh_dev *edev;
 
/* Found our PE and assume 8 at that point. */
@@ -210,8 +204,7 @@ static struct device_node *find_pe_dn(struct pci_dev *dev, 
int *total)
edev = pdn_to_eeh_dev(PCI_DN(dn));
if (edev->pe)
edev = list_first_entry(&edev->pe->edevs, struct eeh_dev, list);
-   pdn = eeh_dev_to_pdn(edev);
-   dn = pdn ? pdn->node : NULL;
+   dn = pci_device_to_OF_node(edev->pdev);
if (!dn)
return NULL;
 
-- 
2.11.0



[PATCH kernel v2 4/6] powerpc/eeh: Remove unnecessary config_addr from eeh_dev

2017-08-29 Thread Alexey Kardashevskiy
The eeh_dev struct holds a config space address of the associated node,
and the very same address is also stored in the pci_dn struct, which
is always present during the eeh_dev's lifetime.

This uses bus:devfn directly from pci_dn instead of cached and packed
config_addr.

Since config_addr is made from the device's bus:dev.fn, there is no point
in keeping it in debugfs either, so remove that too.
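
For reference, a small sketch of how the packed value relates to
bus/devfn (illustrative values only, not from the patch):

	/* for a device at 0000:01:02.3 (bus 0x01, slot 0x02, function 3) */
	unsigned int devfn = PCI_DEVFN(0x02, 3);		/* == 0x13   */
	unsigned int config_addr = (0x01 << 8) | devfn;		/* == 0x0113 */

	/* decoding, as the pr_debug()s now do straight from pci_dn:	*/
	/*   bus  = config_addr >> 8   == 0x01				*/
	/*   slot = PCI_SLOT(devfn)    == 0x02				*/
	/*   func = PCI_FUNC(devfn)    == 0x03				*/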

Signed-off-by: Alexey Kardashevskiy 
---
Changelog:
v2:
* fixed pseries platform
---
 arch/powerpc/include/asm/eeh.h   |  1 -
 arch/powerpc/kernel/eeh_pe.c | 42 ++--
 arch/powerpc/kernel/eeh_sysfs.c  |  3 --
 arch/powerpc/platforms/powernv/eeh-powernv.c |  9 +++---
 arch/powerpc/platforms/pseries/eeh_pseries.c |  2 --
 5 files changed, 26 insertions(+), 31 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 777d37aa0a7f..9847ae3a12d1 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -131,7 +131,6 @@ static inline bool eeh_pe_passed(struct eeh_pe *pe)
 struct eeh_dev {
int mode;   /* EEH mode */
int class_code; /* Class code of the device */
-   int config_addr;/* Config address   */
int pe_config_addr; /* PE config address*/
u32 config_space[16];   /* Saved PCI config space   */
int pcix_cap;   /* Saved PCIx capability*/
diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index 419c3f07afd5..2e8d1b2b5af4 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -340,11 +340,12 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
 {
struct eeh_pe *pe, *parent;
struct pci_dn *pdn = eeh_dev_to_pdn(edev);
+   int config_addr = (pdn->busno << 8) | (pdn->devfn);
 
/* Check if the PE number is valid */
if (!eeh_has_flag(EEH_VALID_PE_ZERO) && !edev->pe_config_addr) {
pr_err("%s: Invalid PE#0 for edev 0x%x on PHB#%x\n",
-  __func__, edev->config_addr, pdn->phb->global_number);
+  __func__, config_addr, pdn->phb->global_number);
return -EINVAL;
}
 
@@ -354,8 +355,7 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
 * PE should be composed of PCI bus and its subordinate
 * components.
 */
-   pe = eeh_pe_get(pdn->phb, edev->pe_config_addr,
-   edev->config_addr);
+   pe = eeh_pe_get(pdn->phb, edev->pe_config_addr, config_addr);
if (pe && !(pe->type & EEH_PE_INVALID)) {
/* Mark the PE as type of PCI bus */
pe->type = EEH_PE_BUS;
@@ -365,10 +365,10 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
list_add_tail(&edev->list, &pe->edevs);
pr_debug("EEH: Add %04x:%02x:%02x.%01x to Bus PE#%x\n",
 pdn->phb->global_number,
-   edev->config_addr >> 8,
-   PCI_SLOT(edev->config_addr & 0xFF),
-   PCI_FUNC(edev->config_addr & 0xFF),
-   pe->addr);
+pdn->busno,
+PCI_SLOT(pdn->devfn),
+PCI_FUNC(pdn->devfn),
+pe->addr);
return 0;
} else if (pe && (pe->type & EEH_PE_INVALID)) {
list_add_tail(&edev->list, &pe->edevs);
@@ -388,10 +388,10 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
pr_debug("EEH: Add %04x:%02x:%02x.%01x to Device "
 "PE#%x, Parent PE#%x\n",
 pdn->phb->global_number,
-   edev->config_addr >> 8,
-PCI_SLOT(edev->config_addr & 0xFF),
-PCI_FUNC(edev->config_addr & 0xFF),
-   pe->addr, pe->parent->addr);
+pdn->busno,
+PCI_SLOT(pdn->devfn),
+PCI_FUNC(pdn->devfn),
+pe->addr, pe->parent->addr);
return 0;
}
 
@@ -405,7 +405,7 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
return -ENOMEM;
}
pe->addr= edev->pe_config_addr;
-   pe->config_addr = edev->config_addr;
+   pe->config_addr = config_addr;
 
/*
 * Put the new EEH PE into hierarchy tree. If the parent
@@ -436,9 +436,9 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
pr_debug("EEH: Add %04x:%02x:%02x.%01x to "
 "Device PE#%x, Parent PE#%x\n",
 pdn->phb->global_number,
-edev->config_addr >> 8,
-PCI_SLOT(edev->config_addr & 0xFF),
-PCI_FUNC(edev->config_addr & 0xFF),
+pdn->busno,
+PCI_SLOT(pdn

[PATCH kernel v2 3/6] powerpc/eeh: Remove unnecessary pointer to phb from eeh_dev

2017-08-29 Thread Alexey Kardashevskiy
The eeh_dev struct already holds a pointer to pci_dn, without which it does
not exist, and pci_dn itself holds the very same phb pointer, so just
use that.

Signed-off-by: Alexey Kardashevskiy 
---
Changelog:
v2:
* fixed pseries platform
---
 arch/powerpc/include/asm/eeh.h   |  1 -
 arch/powerpc/kernel/eeh.c|  7 +++
 arch/powerpc/kernel/eeh_dev.c|  2 --
 arch/powerpc/kernel/eeh_driver.c |  2 +-
 arch/powerpc/kernel/eeh_pe.c | 24 +---
 arch/powerpc/platforms/powernv/eeh-powernv.c |  5 ++---
 arch/powerpc/platforms/pseries/eeh_pseries.c |  2 +-
 7 files changed, 20 insertions(+), 23 deletions(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 26a6a43f8799..777d37aa0a7f 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -141,7 +141,6 @@ struct eeh_dev {
struct eeh_pe *pe;  /* Associated PE*/
struct list_head list;  /* Form link list in the PE */
struct list_head rmv_list;  /* Record the removed edevs */
-   struct pci_controller *phb; /* Associated PHB   */
struct pci_dn *pdn; /* Associated PCI device node   */
struct pci_dev *pdev;   /* Associated PCI device*/
bool in_error;  /* Error flag for edev  */
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 5e6887c40528..66c98c158ee3 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -170,10 +170,10 @@ static size_t eeh_dump_dev_log(struct eeh_dev *edev, char 
*buf, size_t len)
char buffer[128];
 
n += scnprintf(buf+n, len-n, "%04x:%02x:%02x.%01x\n",
-  edev->phb->global_number, pdn->busno,
+  pdn->phb->global_number, pdn->busno,
   PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn));
pr_warn("EEH: of node=%04x:%02x:%02x.%01x\n",
-   edev->phb->global_number, pdn->busno,
+   pdn->phb->global_number, pdn->busno,
PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn));
 
eeh_ops->read_config(pdn, PCI_VENDOR_ID, 4, &cfg);
@@ -1064,7 +1064,7 @@ core_initcall_sync(eeh_init);
  */
 void eeh_add_device_early(struct pci_dn *pdn)
 {
-   struct pci_controller *phb;
+   struct pci_controller *phb = pdn ? pdn->phb : NULL;
struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
 
if (!edev)
@@ -1074,7 +1074,6 @@ void eeh_add_device_early(struct pci_dn *pdn)
return;
 
/* USB Bus children of PCI devices will not have BUID's */
-   phb = edev->phb;
if (NULL == phb ||
(eeh_has_flag(EEH_PROBE_MODE_DEVTREE) && 0 == phb->buid))
return;
diff --git a/arch/powerpc/kernel/eeh_dev.c b/arch/powerpc/kernel/eeh_dev.c
index d6b2ca70d14d..bdf4a3698a35 100644
--- a/arch/powerpc/kernel/eeh_dev.c
+++ b/arch/powerpc/kernel/eeh_dev.c
@@ -50,7 +50,6 @@
  */
 struct eeh_dev *eeh_dev_init(struct pci_dn *pdn)
 {
-   struct pci_controller *phb = pdn->phb;
struct eeh_dev *edev;
 
/* Allocate EEH device */
@@ -64,7 +63,6 @@ struct eeh_dev *eeh_dev_init(struct pci_dn *pdn)
/* Associate EEH device with OF node */
pdn->edev = edev;
edev->pdn = pdn;
-   edev->phb = phb;
INIT_LIST_HEAD(&edev->list);
INIT_LIST_HEAD(&edev->rmv_list);
 
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index c405c79e50cd..8b840191df59 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -428,7 +428,7 @@ static void *eeh_add_virt_device(void *data, void *userdata)
 
if (!(edev->physfn)) {
pr_warn("%s: EEH dev %04x:%02x:%02x.%01x not for VF\n",
-   __func__, edev->phb->global_number, pdn->busno,
+   __func__, pdn->phb->global_number, pdn->busno,
PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn));
return NULL;
}
diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index 84d79f3da7d6..419c3f07afd5 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -339,11 +339,12 @@ static struct eeh_pe *eeh_pe_get_parent(struct eeh_dev 
*edev)
 int eeh_add_to_parent_pe(struct eeh_dev *edev)
 {
struct eeh_pe *pe, *parent;
+   struct pci_dn *pdn = eeh_dev_to_pdn(edev);
 
/* Check if the PE number is valid */
if (!eeh_has_flag(EEH_VALID_PE_ZERO) && !edev->pe_config_addr) {
pr_err("%s: Invalid PE#0 for edev 0x%x on PHB#%x\n",
-  __func__, edev->config_addr, edev->phb->global_number);
+  __func__, edev->config_addr, pdn->phb->global_number);
return -EINVAL;
}
 
@@ -353,7 +354,7 @@ int eeh_add_to_parent_pe(struct eeh_de

[PATCH] powerpc/xmon: convert 0.16x to 0.16lx

2017-08-29 Thread Balbir Singh
Otherwise we lose the top 8 nibbles and effectively print only
the last 32 bits.
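
A userspace illustration of the truncation (a sketch; the cast is made
explicit here to keep the example well-defined, whereas the old xmon code
relied on varargs behaviour and on ppc64le ended up showing only the low
32 bits):

	#include <stdio.h>

	int main(void)
	{
		unsigned long spr = 0x123456789abcdef0UL;

		/* old format: only the low 32 bits, zero-padded to 16 digits */
		printf("%.16x\n", (unsigned int)spr);	/* 000000009abcdef0 */

		/* with the 'l' modifier the full 64-bit value is printed */
		printf("%.16lx\n", spr);		/* 123456789abcdef0 */
		return 0;
	}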

Fixes: e0ddf7a ("powerpc/xmon: Dump ISA 2.07 SPRs")
Fixes: 1846193 ("powerpc/xmon: Dump ISA 2.06 SPRs")

Signed-off-by: Balbir Singh 
---
 arch/powerpc/xmon/xmon.c | 32 
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index d038e7d..9e68f1d 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -1739,23 +1739,23 @@ static void dump_206_sprs(void)
 
/* Actually some of these pre-date 2.06, but whatevs */
 
-   printf("srr0   = %.16x  srr1  = %.16x dsisr  = %.8x\n",
+   printf("srr0   = %.16lx  srr1  = %.16lx dsisr  = %.8x\n",
mfspr(SPRN_SRR0), mfspr(SPRN_SRR1), mfspr(SPRN_DSISR));
-   printf("dscr   = %.16x  ppr   = %.16x pir= %.8x\n",
+   printf("dscr   = %.16lx  ppr   = %.16lx pir= %.8x\n",
mfspr(SPRN_DSCR), mfspr(SPRN_PPR), mfspr(SPRN_PIR));
 
if (!(mfmsr() & MSR_HV))
return;
 
-   printf("sdr1   = %.16x  hdar  = %.16x hdsisr = %.8x\n",
+   printf("sdr1   = %.16lx  hdar  = %.16lx hdsisr = %.8x\n",
mfspr(SPRN_SDR1), mfspr(SPRN_HDAR), mfspr(SPRN_HDSISR));
-   printf("hsrr0  = %.16x hsrr1  = %.16x hdec = %.8x\n",
+   printf("hsrr0  = %.16lx hsrr1  = %.16lx hdec = %.8x\n",
mfspr(SPRN_HSRR0), mfspr(SPRN_HSRR1), mfspr(SPRN_HDEC));
-   printf("lpcr   = %.16x  pcr   = %.16x lpidr = %.8x\n",
+   printf("lpcr   = %.16lx  pcr   = %.16lx lpidr = %.8x\n",
mfspr(SPRN_LPCR), mfspr(SPRN_PCR), mfspr(SPRN_LPID));
-   printf("hsprg0 = %.16x hsprg1 = %.16x\n",
+   printf("hsprg0 = %.16lx hsprg1 = %.16lx\n",
mfspr(SPRN_HSPRG0), mfspr(SPRN_HSPRG1));
-   printf("dabr   = %.16x dabrx  = %.16x\n",
+   printf("dabr   = %.16lx dabrx  = %.16lx\n",
mfspr(SPRN_DABR), mfspr(SPRN_DABRX));
 #endif
 }
@@ -1768,38 +1768,38 @@ static void dump_207_sprs(void)
if (!cpu_has_feature(CPU_FTR_ARCH_207S))
return;
 
-   printf("dpdes  = %.16x  tir   = %.16x cir= %.8x\n",
+   printf("dpdes  = %.16lx  tir   = %.16lx cir= %.8x\n",
mfspr(SPRN_DPDES), mfspr(SPRN_TIR), mfspr(SPRN_CIR));
 
-   printf("fscr   = %.16x  tar   = %.16x pspb   = %.8x\n",
+   printf("fscr   = %.16lx  tar   = %.16lx pspb   = %.8x\n",
mfspr(SPRN_FSCR), mfspr(SPRN_TAR), mfspr(SPRN_PSPB));
 
msr = mfmsr();
if (msr & MSR_TM) {
/* Only if TM has been enabled in the kernel */
-   printf("tfhar  = %.16x  tfiar = %.16x texasr = %.16x\n",
+   printf("tfhar  = %.16lx  tfiar = %.16lx texasr = %.16lx\n",
mfspr(SPRN_TFHAR), mfspr(SPRN_TFIAR),
mfspr(SPRN_TEXASR));
}
 
-   printf("mmcr0  = %.16x  mmcr1 = %.16x mmcr2  = %.16x\n",
+   printf("mmcr0  = %.16lx  mmcr1 = %.16lx mmcr2  = %.16lx\n",
mfspr(SPRN_MMCR0), mfspr(SPRN_MMCR1), mfspr(SPRN_MMCR2));
printf("pmc1   = %.8x pmc2 = %.8x  pmc3 = %.8x  pmc4   = %.8x\n",
mfspr(SPRN_PMC1), mfspr(SPRN_PMC2),
mfspr(SPRN_PMC3), mfspr(SPRN_PMC4));
-   printf("mmcra  = %.16x   siar = %.16x pmc5   = %.8x\n",
+   printf("mmcra  = %.16lx   siar = %.16lx pmc5   = %.8x\n",
mfspr(SPRN_MMCRA), mfspr(SPRN_SIAR), mfspr(SPRN_PMC5));
-   printf("sdar   = %.16x   sier = %.16x pmc6   = %.8x\n",
+   printf("sdar   = %.16lx   sier = %.16lx pmc6   = %.8x\n",
mfspr(SPRN_SDAR), mfspr(SPRN_SIER), mfspr(SPRN_PMC6));
-   printf("ebbhr  = %.16x  ebbrr = %.16x bescr  = %.16x\n",
+   printf("ebbhr  = %.16lx  ebbrr = %.16lx bescr  = %.16lx\n",
mfspr(SPRN_EBBHR), mfspr(SPRN_EBBRR), mfspr(SPRN_BESCR));
 
if (!(msr & MSR_HV))
return;
 
-   printf("hfscr  = %.16x  dhdes = %.16x rpr= %.16x\n",
+   printf("hfscr  = %.16lx  dhdes = %.16lx rpr= %.16lx\n",
mfspr(SPRN_HFSCR), mfspr(SPRN_DHDES), mfspr(SPRN_RPR));
-   printf("dawr   = %.16x  dawrx = %.16x ciabr  = %.16x\n",
+   printf("dawr   = %.16lx  dawrx = %.16lx ciabr  = %.16lx\n",
mfspr(SPRN_DAWR), mfspr(SPRN_DAWRX), mfspr(SPRN_CIABR));
 #endif
 }
-- 
2.9.4