[PATCH v2 2/7] powerpc/64s/radix: Fix process table entry cache invalidation

2017-11-06 Thread Nicholas Piggin
According to the architecture, the process table entry cache must be
flushed with tlbie RIC=2.

Currently the process table entry is set to invalid right before the
PID is returned to the allocator, with no invalidation. This works on
existing implementations that are known to not cache the process table
entry for any except the current PIDR.

It is architecturally correct and cleaner to invalidate with RIC=2
after clearing the process table entry and before the PID is returned
to the allocator. This can be done in arch_exit_mmap, which runs before
the final flush, and by ensuring the final flush (fullmm) is always a
RIC=2 variant.

Reviewed-by: Aneesh Kumar K.V 
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/mmu_context.h |  4 
 arch/powerpc/mm/mmu_context_book3s64.c | 25 -
 arch/powerpc/mm/tlb-radix.c|  6 +-
 3 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/mmu_context.h 
b/arch/powerpc/include/asm/mmu_context.h
index b4cdf574cf61..6177d43f0ce8 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -165,9 +165,13 @@ static inline void arch_dup_mmap(struct mm_struct *oldmm,
 {
 }
 
+#ifndef CONFIG_PPC_BOOK3S_64
 static inline void arch_exit_mmap(struct mm_struct *mm)
 {
 }
+#else
+extern void arch_exit_mmap(struct mm_struct *mm);
+#endif
 
 static inline void arch_unmap(struct mm_struct *mm,
  struct vm_area_struct *vma,
diff --git a/arch/powerpc/mm/mmu_context_book3s64.c 
b/arch/powerpc/mm/mmu_context_book3s64.c
index 05e15386d4cb..6d724dab27c2 100644
--- a/arch/powerpc/mm/mmu_context_book3s64.c
+++ b/arch/powerpc/mm/mmu_context_book3s64.c
@@ -216,19 +216,34 @@ void destroy_context(struct mm_struct *mm)
 #ifdef CONFIG_SPAPR_TCE_IOMMU
WARN_ON_ONCE(!list_empty(&mm->context.iommu_group_mem_list));
 #endif
+   if (radix_enabled())
+   WARN_ON(process_tb[mm->context.id].prtb0 != 0);
+   else
+   subpage_prot_free(mm);
+   destroy_pagetable_page(mm);
+   __destroy_context(mm->context.id);
+   mm->context.id = MMU_NO_CONTEXT;
+}
+
+void arch_exit_mmap(struct mm_struct *mm)
+{
if (radix_enabled()) {
/*
 * Radix doesn't have a valid bit in the process table
 * entries. However we know that at least P9 implementation
 * will avoid caching an entry with an invalid RTS field,
 * and 0 is invalid. So this will do.
+*
+* This runs before the "fullmm" tlb flush in exit_mmap,
+* which does a RIC=2 tlbie to clear the process table
+* entry. See the "fullmm" comments in tlb-radix.c.
+*
+* No barrier required here after the store because
+* this process will do the invalidate, which starts with
+* ptesync.
 */
process_tb[mm->context.id].prtb0 = 0;
-   } else
-   subpage_prot_free(mm);
-   destroy_pagetable_page(mm);
-   __destroy_context(mm->context.id);
-   mm->context.id = MMU_NO_CONTEXT;
+   }
 }
 
 #ifdef CONFIG_PPC_RADIX_MMU
diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index feeb96693aeb..6e77ed2d7c6c 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -297,10 +297,14 @@ void radix__tlb_flush(struct mmu_gather *tlb)
psize = radix_get_mmu_psize(page_size);
/*
 * if page size is not something we understand, do a full mm flush
+*
+* A "fullmm" flush must always do a flush_all_mm (RIC=2) flush
+* that flushes the process table entry cache upon process teardown.
+* See the comment for radix in arch_exit_mmap().
 */
if (psize != -1 && !tlb->fullmm && !tlb->need_flush_all)
radix__flush_tlb_range_psize(mm, tlb->start, tlb->end, psize);
-   else if (tlb->need_flush_all) {
+   else if (tlb->fullmm || tlb->need_flush_all) {
tlb->need_flush_all = 0;
radix__flush_all_mm(mm);
} else
-- 
2.15.0



[PATCH v2 1/7] powerpc/64s/radix: tlbie improve preempt handling

2017-11-06 Thread Nicholas Piggin
Preempt should be consistently disabled for mm_is_thread_local tests,
so bring the rest of these under preempt_disable().

Preempt does not need to be disabled for the mm->context.id tests,
which allows simplification and removal of gotos.

Reviewed-by: Aneesh Kumar K.V 
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/mm/tlb-radix.c | 50 ++---
 1 file changed, 24 insertions(+), 26 deletions(-)

diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index e2f15810b9c0..feeb96693aeb 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -186,16 +186,15 @@ void radix__flush_tlb_mm(struct mm_struct *mm)
 {
unsigned long pid;
 
-   preempt_disable();
pid = mm->context.id;
if (unlikely(pid == MMU_NO_CONTEXT))
-   goto no_context;
+   return;
 
+   preempt_disable();
if (!mm_is_thread_local(mm))
_tlbie_pid(pid, RIC_FLUSH_TLB);
else
_tlbiel_pid(pid, RIC_FLUSH_TLB);
-no_context:
preempt_enable();
 }
 EXPORT_SYMBOL(radix__flush_tlb_mm);
@@ -204,16 +203,15 @@ void radix__flush_all_mm(struct mm_struct *mm)
 {
unsigned long pid;
 
-   preempt_disable();
pid = mm->context.id;
if (unlikely(pid == MMU_NO_CONTEXT))
-   goto no_context;
+   return;
 
+   preempt_disable();
if (!mm_is_thread_local(mm))
_tlbie_pid(pid, RIC_FLUSH_ALL);
else
_tlbiel_pid(pid, RIC_FLUSH_ALL);
-no_context:
preempt_enable();
 }
 EXPORT_SYMBOL(radix__flush_all_mm);
@@ -230,15 +228,14 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, 
unsigned long vmaddr,
unsigned long pid;
unsigned long ap = mmu_get_ap(psize);
 
-   preempt_disable();
pid = mm->context.id;
if (unlikely(pid == MMU_NO_CONTEXT))
-   goto bail;
+   return;
+   preempt_disable();
if (!mm_is_thread_local(mm))
_tlbie_va(vmaddr, pid, ap, RIC_FLUSH_TLB);
else
_tlbiel_va(vmaddr, pid, ap, RIC_FLUSH_TLB);
-bail:
preempt_enable();
 }
 
@@ -322,15 +319,17 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, 
unsigned long start,
 {
unsigned long pid;
unsigned long addr;
-   int local = mm_is_thread_local(mm);
+   bool local;
unsigned long ap = mmu_get_ap(psize);
unsigned long page_size = 1UL << mmu_psize_defs[psize].shift;
 
 
-   preempt_disable();
pid = mm->context.id;
if (unlikely(pid == MMU_NO_CONTEXT))
-   goto err_out;
+   return;
+
+   preempt_disable();
+   local = mm_is_thread_local(mm);
 
if (end == TLB_FLUSH_ALL ||
(end - start) > tlb_single_page_flush_ceiling * page_size) {
@@ -338,39 +337,38 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, 
unsigned long start,
_tlbiel_pid(pid, RIC_FLUSH_TLB);
else
_tlbie_pid(pid, RIC_FLUSH_TLB);
-   goto err_out;
+   } else {
+   for (addr = start; addr < end; addr += page_size) {
+   if (local)
+   _tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
+   else
+   _tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
+   }
}
-   for (addr = start; addr < end; addr += page_size) {
 
-   if (local)
-   _tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
-   else
-   _tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
-   }
-err_out:
preempt_enable();
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
 {
-   int local = mm_is_thread_local(mm);
+   bool local;
unsigned long ap = mmu_get_ap(mmu_virtual_psize);
unsigned long pid, end;
 
-
pid = mm->context.id;
-   preempt_disable();
if (unlikely(pid == MMU_NO_CONTEXT))
-   goto no_context;
+   return;
 
/* 4k page size, just blow the world */
if (PAGE_SIZE == 0x1000) {
radix__flush_all_mm(mm);
-   preempt_enable();
return;
}
 
+   preempt_disable();
+   local = mm_is_thread_local(mm);
+
/* Otherwise first do the PWC */
if (local)
_tlbiel_pid(pid, RIC_FLUSH_PWC);
@@ -385,7 +383,7 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, 
unsigned long addr)
else
_tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
}
-no_context:
+
preempt_enable();
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-- 
2.15.0



[PATCH v2 0/7] powerpc/64s/radix TLB flush fixes and performance improvements

2017-11-06 Thread Nicholas Piggin
Resending, sorry for the noise.

Since the v1/RFC, I pulled the first 2 fix patches into this series,
and rediffed to powerpc merge branch. Dropped the final 2 patches
which were not completely agreed upon and baked.

Thanks,
Nick

Nicholas Piggin (7):
  powerpc/64s/radix: tlbie improve preempt handling
  powerpc/64s/radix: Fix process table entry cache invalidation
  powerpc/64s/radix: optimize TLB range flush barriers
  powerpc/64s/radix: Implement _tlbie(l)_va_range flush functions
  powerpc/64s/radix: Optimize flush_tlb_range
  powerpc/64s/radix: Introduce local single page ceiling for TLB range
flush
  powerpc/64s/radix: Improve TLB flushing for page table freeing

 arch/powerpc/include/asm/mmu_context.h |   4 +
 arch/powerpc/mm/mmu_context_book3s64.c |  25 ++-
 arch/powerpc/mm/tlb-radix.c| 318 -
 3 files changed, 256 insertions(+), 91 deletions(-)

-- 
2.15.0



Re: [PATCH v9 00/51] powerpc, mm: Memory Protection Keys

2017-11-06 Thread Florian Weimer
* Ram Pai:

> On Mon, Nov 06, 2017 at 10:28:41PM +0100, Florian Weimer wrote:
>> * Ram Pai:
>> 
>> > Testing:
>> > ---
>> > This patch series has passed all the protection key
>> > tests available in the selftest directory.The
>> > tests are updated to work on both x86 and powerpc.
>> > The selftests have passed on x86 and powerpc hardware.
>> 
>> How do you deal with the key reuse problem?  Is it the same as x86-64,
>> where it's quite easy to accidentally grant existing threads access to
>> a just-allocated key, either due to key reuse or a changed init_pkru
>> parameter?
>
> I am not sure how on x86-64, two threads get allocated the same key
> at the same time? the key allocation is guarded under the mmap_sem
> semaphore. So there cannot be a race where two threads get allocated
> the same key.

The problem is a pkey_alloc/pthread_create/pkey_free/pkey_alloc
sequence.  The pthread_create call makes the new thread inherit the
access rights of the current thread, but then the key is deallocated.
Reallocation of the same key will have that thread retain its access
rights, which is IMHO not correct.
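
For illustration, a minimal sketch of that sequence using the glibc pkey
wrappers (pkey_alloc/pkey_free); the waiter thread and the rights values are
assumptions made for the sketch, not the test case attached to that thread:

/* A thread created while key K is allocated keeps its access rights for K
 * even after K is freed and handed out again by a later pkey_alloc(). */
#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

static void *waiter(void *arg)
{
	(void)arg;
	pause();	/* holds the rights it inherited at pthread_create() time */
	return NULL;
}

int main(void)
{
	pthread_t t;
	int k1 = pkey_alloc(0, 0);			/* key K, all access allowed */
	pthread_create(&t, NULL, waiter, NULL);		/* new thread snapshots current rights */
	pkey_free(k1);					/* K returned to the allocator */
	int k2 = pkey_alloc(0, PKEY_DISABLE_ACCESS);	/* may hand K out again */
	printf("first key %d, reallocated key %d\n", k1, k2);
	/* If k2 == k1, the waiter thread still has its old, permissive rights
	 * for k2, contrary to what the second pkey_alloc() asked for. */
	return 0;
}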

> Can you point me to the issue, if it is already discussed somewhere?

See ‘MPK: pkey_free and key reuse’ on various lists (including
linux-mm and linux-arch).

It has a test case attached which demonstrates the behavior.

> As far as the semantics is concerned, a key allocated in one thread's
> context has no meaning if used in some other thread's context within the
> same process.  The app should not try to re-use a key allocated in a
> thread's context in some other thread's context.

Uh-oh, that's not how this feature works on x86-64 at all.  There, the
keys are a process-global resource.  Treating them per-thread
seriously reduces their usefulness.

>> What about siglongjmp from a signal handler?
>
> On powerpc there is some relief.  the permissions on a key can be
> modified from anywhere, including from the signal handler, and the
> effect will be immediate.  You don't have to wait till the
> signal handler returns for the key permissions to be restored.

My concern is that the signal handler knows nothing about protection
keys, but the current x86-64 semantics will cause it to clobber the
access rights of the current thread.

> also after return from the sigsetjmp();
> possibly caused by siglongjmp(), the program can restore the permission
> on any key.

So that's not really an option.

> At least that is my theory. Can you give me a testcase; if you have one
> handy.

The glibc patch I posted under the ‘MPK: pkey_free and key reuse’
thread covers this, too.


[PATCH v2 0/7] powerpc/64s/radix TLB flush fixes and performance improvements

2017-11-06 Thread Nicholas Piggin
Since the v1/RFC, I pulled the first 2 fix patches into this series,
and rediffed to powerpc merge branch. Dropped the final 2 patches
which were not completely agreed upon and baked.

Thanks,
Nick

Nicholas Piggin (7):
  powerpc/64s/radix: tlbie improve preempt handling
  powerpc/64s/radix: Fix process table entry cache invalidation
  powerpc/64s/radix: optimize TLB range flush barriers
  powerpc/64s/radix: Implement _tlbie(l)_va_range flush functions
  powerpc/64s/radix: Optimize flush_tlb_range
  powerpc/64s/radix: Introduce local single page ceiling for TLB range
flush
  powerpc/64s/radix: Improve TLB flushing for page table freeing

 arch/powerpc/include/asm/mmu_context.h |   4 +
 arch/powerpc/mm/mmu_context_book3s64.c |  25 ++-
 arch/powerpc/mm/tlb-radix.c| 318 -
 3 files changed, 256 insertions(+), 91 deletions(-)

-- 
2.15.0



Re: POWER: Unexpected fault when writing to brk-allocated memory

2017-11-06 Thread Nicholas Piggin
Cc'ing everyone who was on the x86 56-bit user virtual address patch.

I think we need more time to discuss this behaviour, in light of the
regression Florian uncovered. I would propose we turn off the 56-bit
user virtual address support for x86 for 4.14, and powerpc would
follow and turn off its 512T support until we can get a better handle
on the problems. (Actually Florian initially hit a couple of bugs in
powerpc implementation, but pulling that string uncovers a whole lot
of difficulties.)

The bi-modal behavior switched based on a combination of mmap address
hint and MAP_FIXED just sucks. It's segregating our VA space with
some non-standard heuristics, and it doesn't seem to work very well.

What are we trying to do? Allow SAP HANA etc use huge address spaces
by coding to these specific mmap heuristics we're going to add,
rather than solving it properly in a way that requires adding a new
syscall or personality or prctl or sysctl. Okay, but the cost is that
despite best efforts, it still changes ABI behaviour for existing
applications and these heuristics will become baked into the ABI that
we will have to support. Not a good tradeoff IMO.

First of all, using addr and MAP_FIXED to develop our heuristic can
never really give unchanged ABI. It's an in-band signal. brk() is a
good example that steadily keeps incrementing address, so depending
on malloc usage and address space randomization, you will get a brk()
that ends exactly at 128T, then the next one will be >
DEFAULT_MAP_WINDOW, and it will switch you to 56 bit address space.
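
A rough userspace probe of that switch-over, as a sketch only; where the
break ends up depends on ASLR, overcommit settings and the kernel under test:

/* Keep extending the break and report whether it crosses the 128 TiB line. */
#include <stdio.h>
#include <stdint.h>
#include <unistd.h>

int main(void)
{
	const uintptr_t boundary = 128ULL << 40;	/* 128 TiB */
	uintptr_t cur = (uintptr_t)sbrk(0);

	printf("initial brk value: %p\n", (void *)cur);
	while (cur < boundary) {
		if (sbrk(1L << 30) == (void *)-1) {	/* grow 1 GiB at a time */
			printf("brk stopped below 128 TiB at %p\n", (void *)cur);
			return 0;
		}
		cur = (uintptr_t)sbrk(0);
	}
	printf("brk crossed 128 TiB: %p\n", (void *)cur);
	return 0;
}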

Second, the kernel can never completely solve the problem this way.
How do we know a malloc library will not ask for > 128TB addresses
and pass them to an unknowing application?

And lastly, there are a fair few bugs and places where description
in changelogs and mailing lists does not match code. You don't want
to know the mess in powerpc, but even x86 has two I can see:
MAP_FIXED succeeds even when crossing 128TB addresses (where changelog
indicated it should not), arch_get_unmapped_area_topdown() with an
address hint is checking against TASK_SIZE rather than the limited
128TB address, so it looks like it won't follow the heuristics.

So unless everyone else thinks I'm crazy and disagrees, I'd ask for
a bit more time to make sure we get this interface right. I would
hope for something like prctl PR_SET_MM which can be used to set
our user virtual address bits on a fine-grained basis. Maybe a
sysctl, maybe a personality. Something out-of-band. I don't want to
get too far into that discussion yet. First we need to agree whether
or not the code in the tree today is a problem.
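
Purely to illustrate the out-of-band idea: a hypothetical opt-in where
PR_SET_VA_BITS and its value are invented for the sketch, not an existing
interface:

/* Opt in to a >128 TiB address space up front, instead of encoding the
 * request in mmap() address hints.  PR_SET_VA_BITS does not exist. */
#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>
#include <sys/prctl.h>

#define PR_SET_VA_BITS 0x56414254	/* invented constant */

int main(void)
{
	if (prctl(PR_SET_VA_BITS, 56, 0, 0, 0) != 0)
		perror("prctl (expected to fail: hypothetical interface)");

	/* With such a knob, a hint-less mmap() could legitimately return
	 * addresses above 128 TiB without any in-band heuristics. */
	void *p = mmap(NULL, 1UL << 30, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	printf("mmap with no hint: %p\n", p);
	return 0;
}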

Thanks,
Nick

On Mon, 6 Nov 2017 09:32:25 +0100
Florian Weimer  wrote:

> On 11/06/2017 09:30 AM, Aneesh Kumar K.V wrote:
> > On 11/06/2017 01:55 PM, Nicholas Piggin wrote:  
> >> On Mon, 6 Nov 2017 09:11:37 +0100
> >> Florian Weimer  wrote:
> >>  
> >>> On 11/06/2017 07:47 AM, Nicholas Piggin wrote:  
>  "You get < 128TB unless explicitly requested."
> 
>  Simple, reasonable, obvious rule. Avoids breaking apps that store
>  some bits in the top of pointers (provided that memory allocator
>  userspace libraries also do the right thing).  
> >>>
> >>> So brk would simplify fail instead of crossing the 128 TiB threshold?  
> >>
> >> Yes, that was the intention and that's what x86 seems to do.
> >>  
> >>>
> >>> glibc malloc should cope with that and switch to malloc, but this code
> >>> path is obviously less well-tested than the regular way.  
> >>
> >> Switch to mmap() I guess you meant?  
> 
> Yes, sorry.
> 
> >> powerpc has a couple of bugs in corner cases, so those should be fixed
> >> according to intended policy for stable kernels I think.
> >>
> >> But I question the policy. Just seems like an ugly and ineffective wart.
> >> Exactly for such cases as this -- behaviour would change from run to run
> >> depending on your address space randomization for example! In case your
> >> brk happens to land nicely on 128TB then the next one would succeed.  
> > 
> > Why? It should not change from run to run. We limit the free
> > area search range based on hint address. So we should get consistent
> > results across runs, even if we changed the context.addr_limit.
> 
> The size of the gap to the 128 TiB limit varies between runs because of 
> ASLR.  So some runs would use brk alone, others would use brk + malloc. 
> That's not really desirable IMHO.



Re: [PATCH v3] kernel/module_64.c: Add REL24 relocation support of livepatch symbols

2017-11-06 Thread Josh Poimboeuf
On Tue, Oct 31, 2017 at 07:39:59PM +0100, Torsten Duwe wrote:
> On Tue, Oct 31, 2017 at 09:53:16PM +0530, Naveen N . Rao wrote:
> > On 2017/10/31 03:30PM, Torsten Duwe wrote:
> > > 
> > > Maybe I failed to express my views properly; I find the whole approach
> [...]
> > > NAK'd-by: Torsten Duwe 
> > 
> > Hmm... that wasn't evident at all given Balbir's response to your 
> > previous concerns and your lack of response for the same:
> > https://www.mail-archive.com/linuxppc-dev@lists.ozlabs.org/msg125350.html
> 
> To me it was obvious that the root cause was kpatch's current inability to
> deal with ppc calling conventions when copying binary functions. Hence my
> hint at the discussion about a possible source-level solution that would
> work nicely for all architectures.

That other discussion isn't relevant.  Even if we do eventually decide
to go with a source-based approach, that's still a long ways off.

For the foreseeable future, kpatch-build is the only available safe way
to create live patches.  We need to figure out a way to make it work,
one way or another.

If I understand correctly, the main problem here is that a call to a
previously-local-but-now-global function is missing a needed nop
instruction after the call, which is needed for restoring r2 (the TOC
pointer).

So, just brainstorming a bit, here are the possible solutions I can
think of:

a) Create a special klp stub for such calls (as in Kamalesh's patch)

b) Have kpatch-build rewrite the function to insert nops after calls to
   previously-local functions.  This would also involve adjusting the
   offsets of intra-function branches and relocations which come
   afterwards in the same section.  And also patching up the DWARF
   debuginfo, if we care about that (I think we do).  And also patching
   up the jump tables which GCC sometimes creates for switch statements.
   Yuck.  I'm pretty sure this is a horrible idea.

c) Create a new GCC flag which treats all calls as global, which can be
   used by kpatch-build to generate the right code (assuming this flag
   doesn't already exist).  This would be a good option, I think.

d) Have kpatch-build do some other kind of transformation?  For example,
   maybe it could generate klp stubs which the callee calls into.  Each
   klp stub could then do a proper global call to the SHN_LIVEPATCH
   symbol.

Do I understand the problem correctly?  Do the potential solutions make
sense?  Any other possible solutions I missed?

-- 
Josh


[PATCH kernel v2] powerpc/powernv/ioda: Remove explicit max window size check

2017-11-06 Thread Alexey Kardashevskiy
DMA windows can only have a size of power of two on IODA2 hardware and
using memory_hotplug_max() to determine the upper limit won't work
correctly if it returns a non-power-of-two value.

This removes the check as the platform code does this check in
pnv_pci_ioda2_setup_default_config() anyway; the other client is VFIO
and that thing checks against locked_vm limit which prevents the userspace
from locking too much memory.

It is expected to impact DPDK on machines with non-power-of-two RAM size,
mostly. KVM guests are less likely to be affected as usually guests get
less than half of the host's RAM.

Signed-off-by: Alexey Kardashevskiy 
---
Changes:
v2:
* instead of relaxing the check, this simply removes it
---
 arch/powerpc/platforms/powernv/pci-ioda.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index 1de94fb..433cf84 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2776,7 +2776,7 @@ static long pnv_pci_ioda2_table_alloc_pages(int nid, 
__u64 bus_offset,
if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS))
return -EINVAL;
 
-   if ((window_size > memory_hotplug_max()) || !is_power_of_2(window_size))
+   if (!is_power_of_2(window_size))
return -EINVAL;
 
/* Adjust direct table size from window_size and levels */
-- 
2.11.0
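
To make the failure mode concrete: a standalone sketch of the arithmetic,
with is_power_of_2() and the round-up reimplemented locally (the kernel has
its own helpers), showing why a non-power-of-two memory_hotplug_max() value
rejected the smallest usable window under the old check:

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

static bool is_power_of_2(uint64_t n) { return n && !(n & (n - 1)); }

static uint64_t roundup_pow_of_two(uint64_t n)
{
	uint64_t p = 1;
	while (p < n)
		p <<= 1;
	return p;
}

int main(void)
{
	uint64_t max = 6ULL << 30;		   /* e.g. 6 GiB of RAM: not a power of two */
	uint64_t window = roundup_pow_of_two(max); /* 8 GiB: smallest window covering RAM */

	/* the check that this patch removes */
	bool rejected = (window > max) || !is_power_of_2(window);
	printf("max=0x%llx window=0x%llx rejected=%d\n",
	       (unsigned long long)max, (unsigned long long)window, rejected);
	return 0;
}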



Re: [PATCH 2/5] powerpc/64s/hash: Allow MAP_FIXED allocations to cross 128TB boundary

2017-11-06 Thread Nicholas Piggin
On Tue, 07 Nov 2017 13:28:08 +1100
Michael Ellerman  wrote:

> Nicholas Piggin  writes:
> 
> > On Mon, 06 Nov 2017 16:14:10 +0530
> > "Aneesh Kumar K.V"  wrote:
> >  
> >> Nicholas Piggin  writes:
> >>   
> >> > While mapping hints with a length that cross 128TB are disallowed,
> >> > MAP_FIXED allocations that cross 128TB are allowed. These are failing
> >> > on hash (on radix they succeed). Add an additional case for fixed
> >> > mappings to expand the addr_limit when crossing 128TB.
> >> 
> >> Shouldn't that be fixed in radix. But i see x86 also doing this?
> >> 
> >> 
> >>if (flags & MAP_FIXED)
> >>return addr;
> >> 
> >> Kiril,
> >> 
> >> Is that expected?  
> >
> > I should actually reply to this one because the other did not
> > have Kirill on cc.
> >
> > Generic mapping code appears it will always succeed when given an
> > explicit hint request, even if the address is below the boundary
> > and address + length is above it. Even when !MAP_FIXED. This is the
> > sane behaviour AFAIKS.  
> 
> It's "sane" if you want the 128T boundary to be invisible.

Not invisible: mmap(NULL, length, ...) + length < 128TB

> But the whole goddamn point was that we wanted apps to have to opt-in to
> using > 128T, and having brk accidentally go over the 128T boundary does
> not count as opting in.

If brk() is given an explicit address > 128TB, then that's opting
in, right?

If malloc is doing some allocation scheme which "opts in", such as
using explicit brk or mmap, then there is no way for the kernel to
solve that. Making increasingly convoluted policy to try to allow
the good and reject the bad is not right IMO.

> So it's a mess as usual.

If we wanted to make this ABI-identical with existing systems, then
it can't work without out-of-band opt-in. Why wasn't a personality
+ sysctl used for this, as we apparently have for a previous address
space layout change?

> 
> > So we should switch powerpc to match, shouldn't we?  
> 
> Yes we should do whatever the other arches do. 
> 
> Actually we should do what the man page describes .. except it doesn't.



Re: [PATCH 2/5] powerpc/64s/hash: Allow MAP_FIXED allocations to cross 128TB boundary

2017-11-06 Thread Michael Ellerman
Nicholas Piggin  writes:

> On Mon, 06 Nov 2017 16:14:10 +0530
> "Aneesh Kumar K.V"  wrote:
>
>> Nicholas Piggin  writes:
>> 
>> > While mapping hints with a length that cross 128TB are disallowed,
>> > MAP_FIXED allocations that cross 128TB are allowed. These are failing
>> > on hash (on radix they succeed). Add an additional case for fixed
>> > mappings to expand the addr_limit when crossing 128TB.  
>> 
>> Shouldn't that be fixed in radix. But i see x86 also doing this?
>> 
>> 
>>  if (flags & MAP_FIXED)
>>  return addr;
>> 
>> Kiril,
>> 
>> Is that expected?
>
> I should actually reply to this one because the other did not
> have Kirill on cc.
>
> Generic mapping code appears it will always succeed when given an
> explicit hint request, even if the address is below the boundary
> and address + length is above it. Even when !MAP_FIXED. This is the
> sane behaviour AFAIKS.

It's "sane" if you want the 128T boundary to be invisible.

But the whole goddamn point was that we wanted apps to have to opt-in to
using > 128T, and having brk accidentally go over the 128T boundary does
not count as opting in.

So it's a mess as usual.

> So we should switch powerpc to match, shouldn't we?

Yes we should do whatever the other arches do. 

Actually we should do what the man page describes .. except it doesn't.

cheers


Re: [PATCH kernel] powerpc/powernv/ioda: Relax max DMA window size check

2017-11-06 Thread Alexey Kardashevskiy
On 06/11/17 21:45, Michael Ellerman wrote:
> Alexey Kardashevskiy  writes:
> 
>> On 31/10/17 15:04, Alexey Kardashevskiy wrote:
>>> DMA windows can only have a size of power of two on IODA2 hardware and
>>> using memory_hotplug_max() to determine the upper limit won't work
>>> correctly if it returns a non-power-of-two value.
>>>
>>> This relaxes the check by rounding up the value returned by
>>> memory_hotplug_max().
>>>
>>> It is expected to impact DPDK on machines with non-power-of-two RAM size,
>>> mostly. KVM guests are less likely to be affected as usually guests get
>>> less than half of hosts RAM.
>>
>>
>> It was pointed out that this check is quite useless anyway as the vm_locked
>> memory limit should hit first, and if that is not set or the user got the
>> root privilege level, then there are easier ways to crash the host so I am
>> thinking of:
>>
>>
>> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c
>> b/arch/powerpc/platforms/powernv/pci-ioda.c
>> index 269f119e4b3c..a47e4cf343b2 100644
>> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
>> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>> @@ -2769,7 +2769,7 @@ static long pnv_pci_ioda2_table_alloc_pages(int nid,
>> __u64 bus_offset,
>> if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS))
>> return -EINVAL;
>>
>> -   if ((window_size > memory_hotplug_max()) ||
>> !is_power_of_2(window_size))
>> +   if (!is_power_of_2(window_size))
>> return -EINVAL;
>>
>>
>>
>> Makes sense?
> 
> Sounds reasonable.
> 
> Execpt where is the vm_locked check? I think it's in the VFIO driver?

Yes, as Jonas already said.

> If
> so I guess the only concern is that this code might be called via some
> other path that doesn't do that check.

It is also called from pnv_pci_ioda2_setup_default_config() to create a
32bit DMA window which is limited by
__rounddown_pow_of_two(memory_hotplug_max()). I'll repost. Thanks.


-- 
Alexey


Re: [PATCH 1/5] powerpc/64s/hash: Fix 128TB-512TB virtual address boundary case allocation

2017-11-06 Thread Nicholas Piggin
On Tue, 7 Nov 2017 07:30:51 +0530
"Aneesh Kumar K.V"  wrote:

> On 11/06/2017 04:35 PM, Aneesh Kumar K.V wrote:
> > 
> > 
> > On 11/06/2017 04:24 PM, Nicholas Piggin wrote:  
> >> On Mon, 06 Nov 2017 16:08:06 +0530
> >> "Aneesh Kumar K.V"  wrote:
> >>  
> >>> Nicholas Piggin  writes:
> >>>  
>  When allocating VA space with a hint that crosses 128TB, the SLB 
>  addr_limit
>  variable is not expanded if addr is not > 128TB, but the slice 
>  allocation
>  looks at task_size, which is 512TB. This results in slice_check_fit()
>  incorrectly succeeding because the slice_count truncates off bit 128 
>  of the
>  requested mask, so the comparison to the available mask succeeds.  
> >>>
> >>>
> >>> But then the mask passed to slice_check_fit() is generated using
> >>> context.addr_limit as max value. So how did that return success? ie,
> >>> we get the request mask via
> >>>
> >>> slice_range_to_mask(addr, len, &mask);
> >>>
> >>> And the potential/possible mask using
> >>>
> >>> slice_mask_for_size(mm, psize, &good_mask);
> >>>
> >>> So how did slice_check_fit() return success with
> >>>
> >>> slice_check_fit(mm, mask, good_mask);  
> >>
> >> Because the addr_limit check is used to *limit* the comparison.
> >>
> >> The available mask had bit up to 127 set, and the mask had 127 and
> >> 128 set. However the 128T addr_limit causes only bits 0-127 to be
> >> compared.
> >>  
> > 
> > Should we fix it then via ? I haven't tested this yet. Also this results 
> > in us comparing more bits?
> > 
> > modified   arch/powerpc/mm/slice.c
> > @@ -169,13 +169,12 @@ static int slice_check_fit(struct mm_struct *mm,
> >      struct slice_mask mask, struct slice_mask available)
> >   {
> >   DECLARE_BITMAP(result, SLICE_NUM_HIGH);
> > -    unsigned long slice_count = 
> > GET_HIGH_SLICE_INDEX(mm->context.addr_limit);
> > 
> >   bitmap_and(result, mask.high_slices,
> > -   available.high_slices, slice_count);
> > +   available.high_slices, SLICE_NUM_HIGH);
> > 
> >   return (mask.low_slices & available.low_slices) == mask.low_slices &&
> > -    bitmap_equal(result, mask.high_slices, slice_count);
> > +    bitmap_equal(result, mask.high_slices, SLICE_NUM_HIGH)
> > 
> >   
> 
> Florian, will you be able to test this patch ? We may not really want to 
> push this. But it will confirm that we end up getting >128TB address 
> because of this.

Oh we are, I went through and traced it, and this is the reason hash's
get_unmapped_area gives out > 128TB addresses if addr + len crosses
128TB.
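
To make the truncation concrete, a standalone model with local stand-ins for
the kernel bitmap helpers; 512 high slices and bit counts that are multiples
of 64 are simplifying assumptions:

/* Comparing only the first `nbits` bits of the requested mask against the
 * available mask lets a request that also needs slice 128 appear to fit in
 * an available mask covering slices 0-127 only. */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include <string.h>

#define HIGH_SLICES 512			/* 512 TB of 1 TB slices (assumption) */
#define WORDS (HIGH_SLICES / 64)

static void set_bit(uint64_t *b, int n) { b[n / 64] |= 1ULL << (n % 64); }

static bool fits(const uint64_t *mask, const uint64_t *avail, int nbits)
{
	uint64_t tmp[WORDS];
	int words = nbits / 64;

	for (int i = 0; i < words; i++)
		tmp[i] = mask[i] & avail[i];
	/* "fits" if every requested slice below nbits is also available */
	return memcmp(tmp, mask, words * sizeof(uint64_t)) == 0;
}

int main(void)
{
	uint64_t avail[WORDS] = { 0 }, mask[WORDS] = { 0 };

	for (int i = 0; i < 128; i++)	/* available: slices 0..127 (addr_limit = 128 TB) */
		set_bit(avail, i);
	set_bit(mask, 127);		/* request straddles the boundary: slices 127 and 128 */
	set_bit(mask, 128);

	printf("compare 128 bits (truncated): fits = %d\n", fits(mask, avail, 128));
	printf("compare all %d bits:          fits = %d\n", HIGH_SLICES, fits(mask, avail, HIGH_SLICES));
	return 0;
}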

Thanks,
Nick


Re: [PATCH 1/5] powerpc/64s/hash: Fix 128TB-512TB virtual address boundary case allocation

2017-11-06 Thread Aneesh Kumar K.V



On 11/06/2017 04:35 PM, Aneesh Kumar K.V wrote:



On 11/06/2017 04:24 PM, Nicholas Piggin wrote:

On Mon, 06 Nov 2017 16:08:06 +0530
"Aneesh Kumar K.V"  wrote:


Nicholas Piggin  writes:

When allocating VA space with a hint that crosses 128TB, the SLB 
addr_limit
variable is not expanded if addr is not > 128TB, but the slice 
allocation

looks at task_size, which is 512TB. This results in slice_check_fit()
incorrectly succeeding because the slice_count truncates off bit 128 
of the

requested mask, so the comparison to the available mask succeeds.



But then the mask passed to slice_check_fit() is generated using
context.addr_limit as max value. So how did that return success? ie,
we get the request mask via

slice_range_to_mask(addr, len, &mask);

And the potential/possible mask using

slice_mask_for_size(mm, psize, &good_mask);

So how did slice_check_fit() return success with

slice_check_fit(mm, mask, good_mask);


Because the addr_limit check is used to *limit* the comparison.

The available mask had bit up to 127 set, and the mask had 127 and
128 set. However the 128T addr_limit causes only bits 0-127 to be
compared.



Should we fix it then via ? I haven't tested this yet. Also this results 
in us comparing more bits?


modified   arch/powerpc/mm/slice.c
@@ -169,13 +169,12 @@ static int slice_check_fit(struct mm_struct *mm,
     struct slice_mask mask, struct slice_mask available)
  {
  DECLARE_BITMAP(result, SLICE_NUM_HIGH);
-    unsigned long slice_count = 
GET_HIGH_SLICE_INDEX(mm->context.addr_limit);


  bitmap_and(result, mask.high_slices,
-   available.high_slices, slice_count);
+   available.high_slices, SLICE_NUM_HIGH);

  return (mask.low_slices & available.low_slices) == mask.low_slices &&
-    bitmap_equal(result, mask.high_slices, slice_count);
+    bitmap_equal(result, mask.high_slices, SLICE_NUM_HIGH)




Florian, will you be able to test this patch ? We may not really want to 
push this. But it will confirm that we end up getting >128TB address 
because of this.


-aneesh



Re: [PATCH 0/5] VA allocator fixes

2017-11-06 Thread Aneesh Kumar K.V



On 11/07/2017 05:36 AM, Nicholas Piggin wrote:

On Mon, 6 Nov 2017 16:16:07 +0100
Florian Weimer  wrote:


On 11/06/2017 11:03 AM, Nicholas Piggin wrote:

Florian found a nasty corner case with the VA allocation logic
for crossing from 128TB to 512TB limit on hash, and made a
really superb report of the problem -- traces, reproducer recipes,
analysis, etc. which already mostly solved it.

The first patch in the series should solve Florian's particular
case, the next 3 are other issues with addr_limit. The last
patch is technically a cleanup but I think it's fairly important
in terms of understanding the code and also enabling some BUG
checks (when addr_limit == 0).

I have not tested these exactly on Florian's test case, but
some tests of my own behave better afterwards. Hopefully he has
time to re-test. Some careful review would be welcome too.


I think I have applied the five patches you posted, but I still get a
brk value above 128 TiB:

# /lib64/ld64.so.1 ./a.out
initial brk value: 0x7fffde96
probing at 0x8001fffc

I assumed you wanted to reject those?


It was difficult to understand what the intended semantics are, but I
think brk should succeed (it is implemented with MAP_FIXED). Of course
it should not succeed then segfault when you try to access it.



In either case, I recommend to tweak the VM layout, so that ld.so does
not land so close to the 128 TiB limit, so that the brk failure or 
returning of 48-bit addresses is avoided.


Yeah well that's yet another issue. I was not really involved with the
address space extension work. Aneesh, Kirill, was the intention for the
128T->512T extension logic to be a no-op for all address space allocation
except those with explicit addresses?



yes.

-aneesh



Re: [PATCH v9 00/51] powerpc, mm: Memory Protection Keys

2017-11-06 Thread Ram Pai
On Mon, Nov 06, 2017 at 10:28:41PM +0100, Florian Weimer wrote:
> * Ram Pai:
> 
> > Testing:
> > ---
> > This patch series has passed all the protection key
> > tests available in the selftest directory.The
> > tests are updated to work on both x86 and powerpc.
> > The selftests have passed on x86 and powerpc hardware.
> 
> How do you deal with the key reuse problem?  Is it the same as x86-64,
> where it's quite easy to accidentally grant existing threads access to
> a just-allocated key, either due to key reuse or a changed init_pkru
> parameter?

I am not sure how on x86-64, two threads get allocated the same key
at the same time? the key allocation is guarded under the mmap_sem
semaphore. So there cannot be a race where two threads get allocated
the same key.

Can you point me to the issue, if it is already discussed somewhere?

As far as the semantics is concerned, a key allocated in one thread's
context has no meaning if used in some other thread's context within the
same process.  The app should not try to re-use a key allocated in a
thread's context in some other thread's context.

> 
> What about siglongjmp from a signal handler?

On powerpc there is some relief.  the permissions on a key can be
modified from anywhere, including from the signal handler, and the
effect will be immediate.  You don't have to wait till the
signal handler returns for the key permissions to be restored.

also after return from the sigsetjmp();
possibly caused by siglongjmp(), the program can restore the permission
on any key.

At least that is my theory. Can you give me a testcase; if you have one
handy.

> 
>   
> 
> 
> I wonder if it's possible to fix some of these things before the exact
> semantics of these interfaces are set in stone.

Will try.

RP



Re: [PATCH v2 14/18] powerpc: Define set_thread_used_vas()

2017-11-06 Thread Nicholas Piggin
On Fri,  6 Oct 2017 19:28:14 -0700
Sukadev Bhattiprolu  wrote:

> A CP_ABORT instruction is required in processes that have mapped a VAS
> "paste address" with the intention of using COPY/PASTE instructions.
> But since CP_ABORT is expensive, we want to restrict it to only processes
> that use/intend to use COPY/PASTE.
> 
> Define an interface, set_thread_used_vas(), that VAS can use to indicate
> that the current process opened a send window. During context switch,
> issue CP_ABORT only for processes that have the flag set.
> 
> Thanks for input from Nick Piggin, Michael Ellerman.
> 
> Signed-off-by: Sukadev Bhattiprolu 
> ---
>  arch/powerpc/include/asm/processor.h |  2 ++
>  arch/powerpc/include/asm/switch_to.h |  2 ++
>  arch/powerpc/kernel/process.c| 32 ++--
>  3 files changed, 26 insertions(+), 10 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/processor.h 
> b/arch/powerpc/include/asm/processor.h
> index 58cc212..bdab3b74 100644
> --- a/arch/powerpc/include/asm/processor.h
> +++ b/arch/powerpc/include/asm/processor.h
> @@ -341,7 +341,9 @@ struct thread_struct {
>   unsigned long   sier;
>   unsigned long   mmcr2;
>   unsigned        mmcr0;
> +
>   unsigned        used_ebb;
> + unsigned int    used_vas;
>  #endif
>  };
>  
> diff --git a/arch/powerpc/include/asm/switch_to.h 
> b/arch/powerpc/include/asm/switch_to.h
> index f5da32f..aeb305b 100644
> --- a/arch/powerpc/include/asm/switch_to.h
> +++ b/arch/powerpc/include/asm/switch_to.h
> @@ -91,6 +91,8 @@ static inline void clear_task_ebb(struct task_struct *t)
>  #endif
>  }
>  
> +extern int set_thread_used_vas(void);
> +
>  extern int set_thread_tidr(struct task_struct *t);
>  extern void clear_thread_tidr(struct task_struct *t);
>  
> diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
> index d861fcd..cb5f108 100644
> --- a/arch/powerpc/kernel/process.c
> +++ b/arch/powerpc/kernel/process.c
> @@ -1234,17 +1234,17 @@ struct task_struct *__switch_to(struct task_struct 
> *prev,
>* The copy-paste buffer can only store into foreign real
>* addresses, so unprivileged processes can not see the
>* data or use it in any way unless they have foreign real
> -  * mappings. We don't have a VAS driver that allocates those
> -  * yet, so no cpabort is required.
> +  * mappings. If the new process has the foreign real address
> +  * mappings, we must issue a cp_abort to clear any state and
> +  * prevent a covert channel being setup.
> +  *
> +  * DD1 allows paste into normal system memory so we do an
> +  * unpaired copy, rather than cp_abort, to clear the buffer,
> +  * since cp_abort is quite expensive.
>*/
> - if (cpu_has_feature(CPU_FTR_POWER9_DD1)) {
> - /*
> -  * DD1 allows paste into normal system memory, so we
> -  * do an unpaired copy here to clear the buffer and
> -  * prevent a covert channel being set up.
> -  *
> -  * cpabort is not used because it is quite expensive.
> -  */
> + if (new_thread->used_vas) {
> + asm volatile(PPC_CP_ABORT);
> + } else if (cpu_has_feature(CPU_FTR_POWER9_DD1)) {
>   asm volatile(PPC_COPY(%0, %1)
>   : : "r"(dummy_copy_buffer), "r"(0));
>   }

I *think* we're okay with this now, right? If we are switching to a thread
with a foreign address mapping (that could interact with something in the
copy buffer from a previous thread), then we cp_abort to ensure the copy
buffer is clear.

It's not just a covert channel, but also snooping, or accidental or
deliberate corruption.


> @@ -1445,6 +1445,18 @@ void flush_thread(void)
>  #endif /* CONFIG_HAVE_HW_BREAKPOINT */
>  }
>  
> +int set_thread_used_vas(void)
> +{

At the risk of nitpicking, I would like to change from used (past
tense) to something else that indicates future tense. We have to set
this before starting to use vas.

I think we should pull the abort into this function rather than
the caller, and your caller in patch 18 does not check the return
code which it should.

Other than these small bits, it looks much better, thanks!

> +#ifdef CONFIG_PPC_BOOK3S_64
> + if (!cpu_has_feature(CPU_FTR_ARCH_300))
> + return -EINVAL;
> +
> + current->thread.used_vas = 1;
> +
> +#endif /* CONFIG_PPC_BOOK3S_64 */
> + return 0;
> +}
> +
>  #ifdef CONFIG_PPC64
>  static DEFINE_SPINLOCK(vas_thread_id_lock);
>  static DEFINE_IDA(vas_thread_ida);



Re: [PATCH 0/5] VA allocator fixes

2017-11-06 Thread Nicholas Piggin
On Mon, 6 Nov 2017 16:16:07 +0100
Florian Weimer  wrote:

> On 11/06/2017 11:03 AM, Nicholas Piggin wrote:
> > Florian found a nasty corner case with the VA allocation logic
> > for crossing from 128TB to 512TB limit on hash, and made a
> > really superb report of the problem -- traces, reproducer recipes,
> > analysis, etc. which already mostly solved it.
> > 
> > The first patch in the series should solve Florian's particular
> > case, the next 3 are other issues with addr_limit. The last
> > patch is technically a cleanup but I think it's fairly important
> > in terms of understanding the code and also enabling some BUG
> > checks (when addr_limit == 0).
> > 
> > I have not tested these exactly on Florian's test case, but
> > some tests of my own behave better afterwards. Hopefully he has
> > time to re-test. Some careful review would be welcome too.  
> 
> I think I have applied the five patches you posted, but I still get a 
> brk value above 128 TiB:
> 
> # /lib64/ld64.so.1 ./a.out
> initial brk value: 0x7fffde96
> probing at 0x8001fffc
> 
> I assumed you wanted to reject those?

It was difficult to understand what the intended semantics are, but I
think brk should succeed (it is implemented with MAP_FIXED). Of course
it should not succeed then segfault when you try to access it.

> 
> In either case, I recommend to tweak the VM layout, so that ld.so does 
> not land so close to the 128 TiB limit, so that the brk failure or 
> returning of 48-bit addresses is avoided.

Yeah well that's yet another issue. I was not really involved with the
address space extension work. Aneesh, Kirill, was the intention for the
128T->512T extension logic to be a no-op for all address space allocation
except those with explicit addresses?

Thanks,
Nick


Re: [PATCH v5 06/10] powerpc/opal: Rework the opal-async interface

2017-11-06 Thread Cyril Bur
On Mon, 2017-11-06 at 20:41 +1100, Michael Ellerman wrote:
> Cyril Bur  writes:
> 
> > diff --git a/arch/powerpc/platforms/powernv/opal-async.c 
> > b/arch/powerpc/platforms/powernv/opal-async.c
> > index c43421ab2d2f..fbae8a37ce2c 100644
> > --- a/arch/powerpc/platforms/powernv/opal-async.c
> > +++ b/arch/powerpc/platforms/powernv/opal-async.c
> > @@ -23,40 +23,45 @@
> >  #include 
> >  #include 
> >  
> > -#define N_ASYNC_COMPLETIONS64
> > +enum opal_async_token_state {
> > +   ASYNC_TOKEN_UNALLOCATED = 0,
> > +   ASYNC_TOKEN_ALLOCATED,
> > +   ASYNC_TOKEN_COMPLETED
> > +};
> > +
> > +struct opal_async_token {
> > +   enum opal_async_token_state state;
> > +   struct opal_msg response;
> > +};
> >  
> > -static DECLARE_BITMAP(opal_async_complete_map, N_ASYNC_COMPLETIONS) = 
> > {~0UL};
> > -static DECLARE_BITMAP(opal_async_token_map, N_ASYNC_COMPLETIONS);
> >  static DECLARE_WAIT_QUEUE_HEAD(opal_async_wait);
> >  static DEFINE_SPINLOCK(opal_async_comp_lock);
> >  static struct semaphore opal_async_sem;
> > -static struct opal_msg *opal_async_responses;
> >  static unsigned int opal_max_async_tokens;
> > +static struct opal_async_token *opal_async_tokens;
> >  
> >  static int __opal_async_get_token(void)
> >  {
> > unsigned long flags;
> > -   int token;
> > +   int token = -EBUSY;
> >  
> > spin_lock_irqsave(&opal_async_comp_lock, flags);
> > -   token = find_first_bit(opal_async_complete_map, opal_max_async_tokens);
> > -   if (token >= opal_max_async_tokens) {
> > -   token = -EBUSY;
> > -   goto out;
> > +   for (token = 0; token < opal_max_async_tokens; token++) {
> > +   if (opal_async_tokens[token].state == ASYNC_TOKEN_UNALLOCATED) {
> > +   opal_async_tokens[token].state = ASYNC_TOKEN_ALLOCATED;
> > +   goto out;
> > +   }
> > }
> > -
> > -   if (__test_and_set_bit(token, opal_async_token_map)) {
> > -   token = -EBUSY;
> > -   goto out;
> > -   }
> > -
> > -   __clear_bit(token, opal_async_complete_map);
> > -
> >  out:
> > spin_unlock_irqrestore(&opal_async_comp_lock, flags);
> > return token;
> >  }
> 
> Resulting in:
> 
>  static int __opal_async_get_token(void)
>  {
>   unsigned long flags;
> + int token = -EBUSY;
>  
>   spin_lock_irqsave(&opal_async_comp_lock, flags);
> + for (token = 0; token < opal_max_async_tokens; token++) {
> + if (opal_async_tokens[token].state == ASYNC_TOKEN_UNALLOCATED) {
> + opal_async_tokens[token].state = ASYNC_TOKEN_ALLOCATED;
> + goto out;
> + }
>   }
>  out:
>   spin_unlock_irqrestore(&opal_async_comp_lock, flags);
>   return token;
>  }
> 
> So when no unallocated token is found we return opal_max_async_tokens :(
> 
> I changed it to:
> 
> static int __opal_async_get_token(void)
> {
>   unsigned long flags;
>   int i, token = -EBUSY;
> 
>   spin_lock_irqsave(&opal_async_comp_lock, flags);
> 
>   for (i = 0; i < opal_max_async_tokens; i++) {
>   if (opal_async_tokens[i].state == ASYNC_TOKEN_UNALLOCATED) {
>   opal_async_tokens[i].state = ASYNC_TOKEN_ALLOCATED;
>   token = i;
>   break;
>   }
>   }
> 
>   spin_unlock_irqrestore(&opal_async_comp_lock, flags);
>   return token;
> }
> 
> 

Thanks!!

> >  
> > +/*
> > + * Note: If the returned token is used in an opal call and opal returns
> > + * OPAL_ASYNC_COMPLETION you MUST opal_async_wait_response() before
> 
>  ^
>  call
> 
> 
> cheers


Re: [PATCH v9 00/51] powerpc, mm: Memory Protection Keys

2017-11-06 Thread Florian Weimer
* Ram Pai:

> Testing:
> ---
> This patch series has passed all the protection key
> tests available in the selftest directory.The
> tests are updated to work on both x86 and powerpc.
> The selftests have passed on x86 and powerpc hardware.

How do you deal with the key reuse problem?  Is it the same as x86-64,
where it's quite easy to accidentally grant existing threads access to
a just-allocated key, either due to key reuse or a changed init_pkru
parameter?

What about siglongjmp from a signal handler?

  

I wonder if it's possible to fix some of these things before the exact
semantics of these interfaces are set in stone.


[RFC PATCH for 4.15 07/14] Restartable sequences: Wire up powerpc system call

2017-11-06 Thread Mathieu Desnoyers
From: Boqun Feng 

Wire up the rseq system call on powerpc.

This provides an ABI improving the speed of a user-space getcpu
operation on powerpc by skipping the getcpu system call on the fast
path, as well as improving the speed of user-space operations on per-cpu
data compared to using load-reservation/store-conditional atomics.

Signed-off-by: Boqun Feng 
Signed-off-by: Mathieu Desnoyers 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Michael Ellerman 
CC: Peter Zijlstra 
CC: "Paul E. McKenney" 
CC: linuxppc-dev@lists.ozlabs.org
---
 arch/powerpc/include/asm/systbl.h  | 1 +
 arch/powerpc/include/asm/unistd.h  | 2 +-
 arch/powerpc/include/uapi/asm/unistd.h | 1 +
 3 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/systbl.h 
b/arch/powerpc/include/asm/systbl.h
index 449912f057f6..964321a5799c 100644
--- a/arch/powerpc/include/asm/systbl.h
+++ b/arch/powerpc/include/asm/systbl.h
@@ -389,3 +389,4 @@ COMPAT_SYS_SPU(preadv2)
 COMPAT_SYS_SPU(pwritev2)
 SYSCALL(kexec_file_load)
 SYSCALL(statx)
+SYSCALL(rseq)
diff --git a/arch/powerpc/include/asm/unistd.h 
b/arch/powerpc/include/asm/unistd.h
index 9ba11dbcaca9..e76bd5601ea4 100644
--- a/arch/powerpc/include/asm/unistd.h
+++ b/arch/powerpc/include/asm/unistd.h
@@ -12,7 +12,7 @@
 #include 
 
 
-#define NR_syscalls   384
+#define NR_syscalls   385
 
 #define __NR__exit __NR_exit
 
diff --git a/arch/powerpc/include/uapi/asm/unistd.h 
b/arch/powerpc/include/uapi/asm/unistd.h
index df8684f31919..b1980fcd56d5 100644
--- a/arch/powerpc/include/uapi/asm/unistd.h
+++ b/arch/powerpc/include/uapi/asm/unistd.h
@@ -395,5 +395,6 @@
 #define __NR_pwritev2  381
 #define __NR_kexec_file_load   382
 #define __NR_statx 383
+#define __NR_rseq  384
 
 #endif /* _UAPI_ASM_POWERPC_UNISTD_H_ */
-- 
2.11.0
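
A sketch of the fast path this enables: reading the current CPU from a
registered struct rseq instead of making a getcpu system call. The field
names, the four-argument syscall and RSEQ_SIG follow the rseq ABI as it was
eventually merged, so this RFC's exact layout and syscall number may differ;
a libc that registers rseq itself will also make the explicit registration
below fail:

#define _GNU_SOURCE
#include <linux/rseq.h>		/* struct rseq, on kernels/headers that ship it */
#include <sched.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#define RSEQ_SIG 0x53053053	/* arbitrary signature chosen at registration */

static __thread struct rseq rs __attribute__((aligned(32))) = {
	.cpu_id = RSEQ_CPU_ID_UNINITIALIZED,
};

static int rseq_getcpu(void)
{
	int cpu = __atomic_load_n(&rs.cpu_id, __ATOMIC_RELAXED);

	if (cpu < 0)			/* not registered or not supported */
		return sched_getcpu();	/* slow path: getcpu system call */
	return cpu;			/* fast path: plain load, kept current by the kernel */
}

int main(void)
{
	if (syscall(__NR_rseq, &rs, sizeof(rs), 0, RSEQ_SIG) != 0)
		perror("rseq registration");
	printf("running on cpu %d\n", rseq_getcpu());
	return 0;
}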



[RFC PATCH for 4.15 10/14] cpu_opv: Wire up powerpc system call

2017-11-06 Thread Mathieu Desnoyers
Signed-off-by: Mathieu Desnoyers 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Michael Ellerman 
CC: Boqun Feng 
CC: Peter Zijlstra 
CC: "Paul E. McKenney" 
CC: linuxppc-dev@lists.ozlabs.org
---
 arch/powerpc/include/asm/systbl.h  | 1 +
 arch/powerpc/include/asm/unistd.h  | 2 +-
 arch/powerpc/include/uapi/asm/unistd.h | 1 +
 3 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/systbl.h 
b/arch/powerpc/include/asm/systbl.h
index 964321a5799c..f9cdb896fbaa 100644
--- a/arch/powerpc/include/asm/systbl.h
+++ b/arch/powerpc/include/asm/systbl.h
@@ -390,3 +390,4 @@ COMPAT_SYS_SPU(pwritev2)
 SYSCALL(kexec_file_load)
 SYSCALL(statx)
 SYSCALL(rseq)
+SYSCALL(cpu_opv)
diff --git a/arch/powerpc/include/asm/unistd.h 
b/arch/powerpc/include/asm/unistd.h
index e76bd5601ea4..48f80f452e31 100644
--- a/arch/powerpc/include/asm/unistd.h
+++ b/arch/powerpc/include/asm/unistd.h
@@ -12,7 +12,7 @@
 #include 
 
 
-#define NR_syscalls   385
+#define NR_syscalls   386
 
 #define __NR__exit __NR_exit
 
diff --git a/arch/powerpc/include/uapi/asm/unistd.h 
b/arch/powerpc/include/uapi/asm/unistd.h
index b1980fcd56d5..972a7d68c143 100644
--- a/arch/powerpc/include/uapi/asm/unistd.h
+++ b/arch/powerpc/include/uapi/asm/unistd.h
@@ -396,5 +396,6 @@
 #define __NR_kexec_file_load   382
 #define __NR_statx 383
 #define __NR_rseq  384
+#define __NR_cpu_opv   385
 
 #endif /* _UAPI_ASM_POWERPC_UNISTD_H_ */
-- 
2.11.0



[RFC PATCH for 4.15 06/14] Restartable sequences: powerpc architecture support

2017-11-06 Thread Mathieu Desnoyers
From: Boqun Feng 

Call the rseq_handle_notify_resume() function on return to userspace if
TIF_NOTIFY_RESUME thread flag is set.

Increment the event counter and perform fixup on the pre-signal when a
signal is delivered on top of a restartable sequence critical section.

Signed-off-by: Boqun Feng 
Signed-off-by: Mathieu Desnoyers 
CC: Benjamin Herrenschmidt 
CC: Paul Mackerras 
CC: Michael Ellerman 
CC: Peter Zijlstra 
CC: "Paul E. McKenney" 
CC: linuxppc-dev@lists.ozlabs.org
---
 arch/powerpc/Kconfig | 1 +
 arch/powerpc/kernel/signal.c | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index cb782ac1c35d..41d1dae3b1b5 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -219,6 +219,7 @@ config PPC
select HAVE_SYSCALL_TRACEPOINTS
select HAVE_VIRT_CPU_ACCOUNTING
select HAVE_IRQ_TIME_ACCOUNTING
+   select HAVE_RSEQ
select IRQ_DOMAIN
select IRQ_FORCED_THREADING
select MODULES_USE_ELF_RELA
diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c
index e9436c5e1e09..17a994b801b1 100644
--- a/arch/powerpc/kernel/signal.c
+++ b/arch/powerpc/kernel/signal.c
@@ -133,6 +133,8 @@ static void do_signal(struct task_struct *tsk)
/* Re-enable the breakpoints for the signal stack */
thread_change_pc(tsk, tsk->thread.regs);
 
+   rseq_signal_deliver(tsk->thread.regs);
+
if (is32) {
if (ksig.ka.sa.sa_flags & SA_SIGINFO)
ret = handle_rt_signal32(, oldset, tsk);
@@ -161,6 +163,7 @@ void do_notify_resume(struct pt_regs *regs, unsigned long 
thread_info_flags)
if (thread_info_flags & _TIF_NOTIFY_RESUME) {
clear_thread_flag(TIF_NOTIFY_RESUME);
tracehook_notify_resume(regs);
+   rseq_handle_notify_resume(regs);
}
 
if (thread_info_flags & _TIF_PATCH_PENDING)
-- 
2.11.0



Re: [PATCH kernel] powerpc/powernv/ioda: Relax max DMA window size check

2017-11-06 Thread Jonas Pfefferle1

Michael Ellerman  wrote on 11/06/2017 11:45:34 AM:

> From: Michael Ellerman 
> To: Alexey Kardashevskiy , David Gibson
> 
> Cc: linuxppc-dev@lists.ozlabs.org, Jonas Pfefferle1
> , Nicholas Piggin 
> Date: 11/06/2017 11:45 AM
> Subject: Re: [PATCH kernel] powerpc/powernv/ioda: Relax max DMA
> window size check
>
> Alexey Kardashevskiy  writes:
>
> > On 31/10/17 15:04, Alexey Kardashevskiy wrote:
> >> DMA windows can only have a size of power of two on IODA2 hardware and
> >> using memory_hotplug_max() to determine the upper limit won't work
> >> correctly if it returns a non-power-of-two value.
> >>
> >> This relaxes the check by rounding up the value returned by
> >> memory_hotplug_max().
> >>
> >> It is expected to impact DPDK on machines with non-power-of-two RAM
size,
> >> mostly. KVM guests are less likely to be affected as usually guests
get
> >> less than half of hosts RAM.
> >
> >
> > It was pointed out that this check is quite useless anyway as the
vm_locked
> > memory limit should hit first, and if that is not set or the user got
the
> > root privilege level, then there are easier ways to crash the host so I
am
> > thinking of:
> >
> >
> > diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c
> > b/arch/powerpc/platforms/powernv/pci-ioda.c
> > index 269f119e4b3c..a47e4cf343b2 100644
> > --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> > +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> > @@ -2769,7 +2769,7 @@ static long pnv_pci_ioda2_table_alloc_pages(int
nid,
> > __u64 bus_offset,
> > if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS))
> > return -EINVAL;
> >
> > -   if ((window_size > memory_hotplug_max()) ||
> > !is_power_of_2(window_size))
> > +   if (!is_power_of_2(window_size))
> > return -EINVAL;
> >
> >
> >
> > Makes sense?
>
> Sounds reasonable.
>
> Execpt where is the vm_locked check? I think it's in the VFIO driver? If
> so I guess the only concern is that this code might be called via some
> other path that doesn't do that check.
>
> cheers
>

The vm_locked is incremented here:
http://elixir.free-electrons.com/linux/v4.13.11/source/drivers/vfio/vfio_iommu_spapr_tce.c#L176
resp.
http://elixir.free-electrons.com/linux/v4.13.11/source/arch/powerpc/mm/mmu_context_iommu.c#L124
on VFIO_IOMMU_SPAPR_REGISTER_MEMORY. From my understanding only pages
that have been registered through here can be mapped with MAP_DMA.

Cheers,
Jonas


[PATCH v3] ppc64 boot: Wait for boot cpu to show up if nr_cpus limit is about to hit.

2017-11-06 Thread Thadeu Lima de Souza Cascardo
From: Mahesh Salgaonkar 

The kernel boot parameter 'nr_cpus=' allows one to specify the number of
possible cpus in the system. In the normal scenario the first cpu (cpu0)
that shows up is the boot cpu and hence it gets covered under nr_cpus
limit.

But this assumption breaks in the kdump scenario, where the kdump kernel
after a crash can boot up on a non-zero boot cpu. The paca structure
allocation depends on the value of nr_cpus and is indexed using logical cpu
ids. This will definitely be an issue if boot cpu id > nr_cpus.

This patch modifies allocate_pacas() and smp_setup_cpu_maps() to
accommodate boot cpu for the case where boot_cpuid > nr_cpu_ids.

This change would help to reduce the memory reservation requirement for
kdump on ppc64.

Signed-off-by: Mahesh Salgaonkar 
Signed-off-by: Thadeu Lima de Souza Cascardo 
---

v3: fix up signedness of nr_cpus to match nr_cpu_ids
 and fix conflict due to change from %d to %u

Resending this as it was not applied, and I can reproduce the issue with
v4.14-rc8 when booting a kdump kernel after a crash that has been given
nr_cpus=1 as a parameter. With this patch, I can't reproduce it anymore.

---
 arch/powerpc/include/asm/paca.h|  3 +++
 arch/powerpc/include/asm/smp.h |  1 +
 arch/powerpc/kernel/paca.c | 23 +-
 arch/powerpc/kernel/prom.c | 39 +-
 arch/powerpc/kernel/setup-common.c | 25 
 5 files changed, 85 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 04b60af027ae..ea0dbf2bbeef 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -49,6 +49,9 @@ extern unsigned int debug_smp_processor_id(void); /* from 
linux/smp.h */
 #define get_lppaca()   (get_paca()->lppaca_ptr)
 #define get_slb_shadow()   (get_paca()->slb_shadow_ptr)
 
+/* Maximum number of threads per core. */
+#define MAX_SMT 8
+
 struct task_struct;
 
 /*
diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index fac963e10d39..553cd22b2ccc 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -30,6 +30,7 @@
 #include 
 
 extern int boot_cpuid;
+extern int boot_hw_cpuid;
 extern int spinning_secondaries;
 
 extern void cpu_die(void);
diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c
index 2ff2b8a19f71..9c689ee4b6a3 100644
--- a/arch/powerpc/kernel/paca.c
+++ b/arch/powerpc/kernel/paca.c
@@ -207,6 +207,7 @@ void __init allocate_pacas(void)
 {
u64 limit;
int cpu;
+   unsigned int nr_cpus;
 
limit = ppc64_rma_size;
 
@@ -219,20 +220,32 @@ void __init allocate_pacas(void)
limit = min(0x1000ULL, limit);
 #endif
 
-   paca_size = PAGE_ALIGN(sizeof(struct paca_struct) * nr_cpu_ids);
+   /*
+* Always align up the nr_cpu_ids to SMT threads and allocate
+* the paca. This will help us to prepare for a situation where
+* boot cpu id > nr_cpu_ids. We will use the last nthreads
+* slots (nthreads == threads per core) to accommodate a core
+* that contains boot cpu thread.
+*
+* Do not change nr_cpu_ids value here. Let us do that in
+* early_init_dt_scan_cpus() where we know exact value
+* of threads per core.
+*/
+   nr_cpus = _ALIGN_UP(nr_cpu_ids, MAX_SMT);
+   paca_size = PAGE_ALIGN(sizeof(struct paca_struct) * nr_cpus);
 
paca = __va(memblock_alloc_base(paca_size, PAGE_SIZE, limit));
memset(paca, 0, paca_size);
 
printk(KERN_DEBUG "Allocated %u bytes for %u pacas at %p\n",
-   paca_size, nr_cpu_ids, paca);
+   paca_size, nr_cpus, paca);
 
-   allocate_lppacas(nr_cpu_ids, limit);
+   allocate_lppacas(nr_cpus, limit);
 
-   allocate_slb_shadows(nr_cpu_ids, limit);
+   allocate_slb_shadows(nr_cpus, limit);
 
/* Can't use for_each_*_cpu, as they aren't functional yet */
-   for (cpu = 0; cpu < nr_cpu_ids; cpu++)
+   for (cpu = 0; cpu < nr_cpus; cpu++)
initialise_paca(&paca[cpu], cpu);
 }
 
diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index f83056297441..93837093c5cb 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -302,6 +302,29 @@ static void __init check_cpu_feature_properties(unsigned 
long node)
}
 }
 
+/*
+ * Adjust the logical id of a boot cpu to fall under nr_cpu_ids. Map it to
+ * last core slot in the allocated paca array.
+ *
+ * e.g. on SMT=8 system, kernel booted with nr_cpus=1 and boot cpu = 33,
+ * align nr_cpu_ids to MAX_SMT value 8. Allocate paca array to hold up-to
+ * MAX_SMT=8 cpus. Since boot cpu 33 is greater than nr_cpus (8), adjust
+ * its logical id so that new id becomes less than nr_cpu_ids. Make sure
+ * that boot cpu's new logical id is aligned 
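For illustration only, a minimal userspace-style sketch of the remapping
described above (the helper name and the exact formula are assumptions made
for the example, not the patch itself):

#include <stdio.h>

/* Round n up to a multiple of align (align must be a power of two). */
static unsigned int align_up(unsigned int n, unsigned int align)
{
	return (n + align - 1) & ~(align - 1);
}

/*
 * Illustrative sketch: the boot cpu keeps its thread position within
 * its core, but the core is mapped onto the last core's worth of slots
 * in the (aligned) paca array.
 */
static unsigned int remap_boot_cpu(unsigned int boot_hw_id,
				   unsigned int nr_cpu_ids,
				   unsigned int nthreads)
{
	unsigned int slots = align_up(nr_cpu_ids, nthreads);
	unsigned int base = slots - nthreads;	/* first slot of last core */

	return base + (boot_hw_id % nthreads);	/* keep the thread offset */
}

int main(void)
{
	/* SMT=8 system, nr_cpus=1, kdump kernel lands on hw cpu 33 */
	printf("boot cpu 33 -> logical cpu %u\n", remap_boot_cpu(33, 1, 8));
	return 0;
}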

Re: [PATCH 0/5] VA allocator fixes

2017-11-06 Thread Florian Weimer

On 11/06/2017 11:03 AM, Nicholas Piggin wrote:

Florian found a nasty corner case with the VA allocation logic
for crossing from 128TB to 512TB limit on hash, and made a
really superb report of the problem -- traces, reproducer recipes,
analysis, etc. which already mostly solved it.

The first patch in the series should solve Florian's particular
case, the next 3 are other issues with addr_limit. The last
patch is technically a cleanup but I think it's fairly important
in terms of understanding the code and also enabling some BUG
checks (when addr_limit == 0).

I have not tested these exactly on Florian's test case, but
some tests of my own behave better afterwards. Hopefully he has
time to re-test. Some careful review would be welcome too.


I think I have applied the five patches you posted, but I still get a 
brk value above 128 TiB:


# /lib64/ld64.so.1 ./a.out
initial brk value: 0x7fffde96
probing at 0x8001fffc
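
(For reference, a minimal sketch of this kind of probe, not the actual
reproducer: print the initial break, extend the heap, and write to the new
region.)

#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	void *start = sbrk(0);

	printf("initial brk value: %p\n", start);

	/* grow the heap by 1 MiB and touch it */
	if (sbrk(1 << 20) == (void *)-1) {
		perror("sbrk");
		return 1;
	}
	printf("probing at %p\n", start);
	memset(start, 0, 1 << 20);
	return 0;
}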

I assumed you wanted to reject those?

In either case, I recommend tweaking the VM layout so that ld.so does 
not land close to the 128 TiB limit, so that the brk failure or the 
returning of 48-bit addresses is avoided.


Thanks,
Florian


[PATCH] powerpc/mm: Remove unused flag arg in global_invalidates

2017-11-06 Thread Aneesh Kumar K.V
Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/kvm/book3s_hv_rm_mmu.c | 9 -
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c 
b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 4efe364f1188..d80240ba6de4 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -42,7 +42,7 @@ static void *real_vmalloc_addr(void *x)
 }
 
 /* Return 1 if we need to do a global tlbie, 0 if we can use tlbiel */
-static int global_invalidates(struct kvm *kvm, unsigned long flags)
+static int global_invalidates(struct kvm *kvm)
 {
int global;
int cpu;
@@ -499,7 +499,7 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long 
flags,
if (v & HPTE_V_VALID) {
hpte[0] &= ~cpu_to_be64(HPTE_V_VALID);
rb = compute_tlbie_rb(v, pte_r, pte_index);
-   do_tlbies(kvm, &rb, 1, global_invalidates(kvm, flags), true);
+   do_tlbies(kvm, &rb, 1, global_invalidates(kvm), true);
/*
 * The reference (R) and change (C) bits in a HPT
 * entry can be set by hardware at any time up until
@@ -549,7 +549,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
 
if (kvm_is_radix(kvm))
return H_FUNCTION;
-   global = global_invalidates(kvm, 0);
+   global = global_invalidates(kvm);
for (i = 0; i < 4 && ret == H_SUCCESS; ) {
n = 0;
for (; i < 4; ++i) {
@@ -709,8 +709,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long 
flags,
rb = compute_tlbie_rb(v, r, pte_index);
hpte[0] = cpu_to_be64((pte_v & ~HPTE_V_VALID) |
  HPTE_V_ABSENT);
-   do_tlbies(kvm, &rb, 1, global_invalidates(kvm, flags),
- true);
+   do_tlbies(kvm, &rb, 1, global_invalidates(kvm), true);
/* Don't lose R/C bit updates done by hardware */
r |= be64_to_cpu(hpte[1]) & (HPTE_R_R | HPTE_R_C);
hpte[1] = cpu_to_be64(r);
-- 
2.13.6



Re: [PATCH 2/5] powerpc/64s/hash: Allow MAP_FIXED allocations to cross 128TB boundary

2017-11-06 Thread Nicholas Piggin
On Mon, 06 Nov 2017 16:14:10 +0530
"Aneesh Kumar K.V"  wrote:

> Nicholas Piggin  writes:
> 
> > While mapping hints with a length that cross 128TB are disallowed,
> > MAP_FIXED allocations that cross 128TB are allowed. These are failing
> > on hash (on radix they succeed). Add an additional case for fixed
> > mappings to expand the addr_limit when crossing 128TB.  
> 
> Shouldn't that be fixed in radix. But i see x86 also doing this?
> 
> 
>   if (flags & MAP_FIXED)
>   return addr;
> 
> Kiril,
> 
> Is that expected?

I should actually reply to this one because the other did not
have Kirill on cc.

The generic mapping code appears to always succeed when given an
explicit hint request, even if the address is below the boundary
and address + length is above it, and even when !MAP_FIXED. This is the
sane behaviour AFAIKS. So we should switch powerpc to match,
shouldn't we?
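
As a concrete illustration (a standalone sketch, not from the kernel tree),
the case in question is a hint just below the boundary with a length that
crosses it; whether the hint is honoured is exactly what differs between
the implementations:

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	/* hint 1 MiB below the 128 TiB boundary, length 4 MiB crosses it */
	unsigned long hint = (128UL << 40) - (1UL << 20);
	size_t len = 4UL << 20;
	void *p = mmap((void *)hint, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		perror("mmap");
	else
		printf("mapped at %p (hint was %#lx)\n", p, hint);
	return 0;
}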

Thanks,
Nick


Re: [PATCH 4/5] powerpc/64s/radix: Fix 128TB-512TB virtual address boundary case allocation

2017-11-06 Thread Nicholas Piggin
On Mon, 06 Nov 2017 16:44:48 +0530
"Aneesh Kumar K.V"  wrote:

> Nicholas Piggin  writes:
> 
> > Radix VA space allocations test addresses against mm->task_size which is
> > 512TB, even in cases where the intention is to limit allocation to below
> > 128TB.
> >
> > This results in mmap with a hint address below 128TB but address + length
> > above 128TB succeeding when it should fail (as hash does after the
> > previous patch).
> >
> > Set the high address limit to be considered up front, and base subsequent
> > allocation checks on that consistently.  
> 
> Doesn't setting info.high_limit take care of that ? I would expect
> vm_unmapped_area to fail based on info.high_limit.

No, it is the hint address case. info.high_limit only gets involved if
the hint area was unavailable.

I prefer the behaviour without this fix because I disagree that the explicit
address request should fail, but this is what you asked for.

Actually now I come to look again, it seems that generic code does *not*
fail in this case either! Any explicit hint will succeed if it partially
or completely crosses 128TB. This is much better behaviour, so I think
powerpc has it wrong.

> Is this with MAP_FIXED?

With MAP_FIXED, it remains as succeeding as expected (like generic code
and hash). I did not change that case.

> 
> 
> >
> > Cc: "Aneesh Kumar K.V" 
> > Fixes: f4ea6dcb08 ("powerpc/mm: Enable mappings above 128TB")
> > Signed-off-by: Nicholas Piggin 
> > ---
> >  arch/powerpc/mm/hugetlbpage-radix.c | 13 +++--
> >  arch/powerpc/mm/mmap.c  | 27 ++-
> >  2 files changed, 21 insertions(+), 19 deletions(-)
> >
> > diff --git a/arch/powerpc/mm/hugetlbpage-radix.c 
> > b/arch/powerpc/mm/hugetlbpage-radix.c
> > index a12e86395025..9c6a411e9c85 100644
> > --- a/arch/powerpc/mm/hugetlbpage-radix.c
> > +++ b/arch/powerpc/mm/hugetlbpage-radix.c
> > @@ -48,14 +48,18 @@ radix__hugetlb_get_unmapped_area(struct file *file, 
> > unsigned long addr,
> > struct mm_struct *mm = current->mm;
> > struct vm_area_struct *vma;
> > struct hstate *h = hstate_file(file);
> > +   unsigned long high_limit = DEFAULT_MAP_WINDOW;
> > struct vm_unmapped_area_info info;
> >
> > if (unlikely(addr > mm->context.addr_limit && addr < TASK_SIZE))
> > mm->context.addr_limit = TASK_SIZE;
> >
> > +   if (addr > high_limit)
> > +   high_limit = TASK_SIZE;
> > +
> > if (len & ~huge_page_mask(h))
> > return -EINVAL;
> > -   if (len > mm->task_size)
> > +   if (len > high_limit)
> > return -ENOMEM;
> >
> > if (flags & MAP_FIXED) {
> > @@ -67,7 +71,7 @@ radix__hugetlb_get_unmapped_area(struct file *file, 
> > unsigned long addr,
> > if (addr) {
> > addr = ALIGN(addr, huge_page_size(h));
> > vma = find_vma(mm, addr);
> > -   if (mm->task_size - len >= addr &&
> > +   if (high_limit - len >= addr &&
> > (!vma || addr + len <= vm_start_gap(vma)))
> > return addr;
> > }
> > @@ -78,12 +82,9 @@ radix__hugetlb_get_unmapped_area(struct file *file, 
> > unsigned long addr,
> > info.flags = VM_UNMAPPED_AREA_TOPDOWN;
> > info.length = len;
> > info.low_limit = PAGE_SIZE;
> > -   info.high_limit = current->mm->mmap_base;
> > +   info.high_limit = mm->mmap_base + (high_limit - DEFAULT_MAP_WINDOW);
> > info.align_mask = PAGE_MASK & ~huge_page_mask(h);
> > info.align_offset = 0;
> >
> > -   if (addr > DEFAULT_MAP_WINDOW)
> > -   info.high_limit += mm->context.addr_limit - DEFAULT_MAP_WINDOW;
> > -
> > return vm_unmapped_area(&info);
> >  }
> > diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c
> > index 5d78b193fec4..e6cb3b3f7e93 100644
> > --- a/arch/powerpc/mm/mmap.c
> > +++ b/arch/powerpc/mm/mmap.c
> > @@ -106,13 +106,17 @@ radix__arch_get_unmapped_area(struct file *filp, 
> > unsigned long addr,
> >  {
> > struct mm_struct *mm = current->mm;
> > struct vm_area_struct *vma;
> > +   unsigned long high_limit = DEFAULT_MAP_WINDOW;
> > struct vm_unmapped_area_info info;
> >
> > if (unlikely(addr > mm->context.addr_limit &&
> >  mm->context.addr_limit != TASK_SIZE))
> > mm->context.addr_limit = TASK_SIZE;
> >
> > -   if (len > mm->task_size - mmap_min_addr)
> > +   if (addr > high_limit)
> > +   high_limit = TASK_SIZE;
> > +
> > +   if (len > high_limit - mmap_min_addr)
> > return -ENOMEM;
> >
> > if (flags & MAP_FIXED)
> > @@ -121,7 +125,7 @@ radix__arch_get_unmapped_area(struct file *filp, 
> > unsigned long addr,
> > if (addr) {
> > addr = PAGE_ALIGN(addr);
> > vma = find_vma(mm, addr);
> > -   if (mm->task_size - len >= addr && addr >= mmap_min_addr &&
> > +   if (high_limit - len >= addr && addr >= mmap_min_addr &&
> > (!vma || 

Re: [PATCH 1/5] powerpc/64s/hash: Fix 128TB-512TB virtual address boundary case allocation

2017-11-06 Thread Nicholas Piggin
On Mon, 6 Nov 2017 16:35:43 +0530
"Aneesh Kumar K.V"  wrote:

> On 11/06/2017 04:24 PM, Nicholas Piggin wrote:
> > On Mon, 06 Nov 2017 16:08:06 +0530
> > "Aneesh Kumar K.V"  wrote:
> >   
> >> Nicholas Piggin  writes:
> >>  
> >>> When allocating VA space with a hint that crosses 128TB, the SLB 
> >>> addr_limit
> >>> variable is not expanded if addr is not > 128TB, but the slice allocation
> >>> looks at task_size, which is 512TB. This results in slice_check_fit()
> >>> incorrectly succeeding because the slice_count truncates off bit 128 of 
> >>> the
> >>> requested mask, so the comparison to the available mask succeeds.  
> >>
> >>
> >> But then the mask passed to slice_check_fit() is generated using
> >> context.addr_limit as max value. So how did that return succcess? ie,
> >> we get the request mask via
> >>
> >> slice_range_to_mask(addr, len, );
> >>
> >> And the potential/possible mask using
> >>
> >> slice_mask_for_size(mm, psize, _mask);
> >>
> >> So how did slice_check_fit() return sucess with
> >>
> >> slice_check_fit(mm, mask, good_mask);  
> > 
> > Because the addr_limit check is used to *limit* the comparison.
> > 
> > The available mask had bit up to 127 set, and the mask had 127 and
> > 128 set. However the 128T addr_limit causes only bits 0-127 to be
> > compared.
> >  
> 
> Should we fix it then via ? I haven't tested this yet. Also this result 
> in us comparing more bits?

I prefer not to rely on that as the fix because we should not be calling
into the slice code with an address beyond addr_limit IMO. There are quite
a few other places that use addr_limit. So I prefer my patch.

You could add this as an extra check, but yes it does result in more bitmap
to test. So if anything I would prefer to go the other way and actually
reduce the scope of *other* bitmap operations that are now using
SLICE_NUM_HIGH by similarly using addr_limit (if there are other
performance critical ones).

We could add some VM_BUG_ON checks to ensure tail bits are zero if
that's a concern.
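
Something along these lines, assuming the existing slice.c definitions
(GET_HIGH_SLICE_INDEX, SLICE_NUM_HIGH, struct slice_mask); just a sketch of
the idea, not a tested patch:

static inline void slice_check_tail_bits(struct mm_struct *mm,
					 const struct slice_mask *mask)
{
	unsigned long limit = GET_HIGH_SLICE_INDEX(mm->context.addr_limit);

	/* no slice bit may be set above the current addr_limit */
	VM_BUG_ON(find_next_bit(mask->high_slices, SLICE_NUM_HIGH, limit)
		  < SLICE_NUM_HIGH);
}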

> 
> modified   arch/powerpc/mm/slice.c
> @@ -169,13 +169,12 @@ static int slice_check_fit(struct mm_struct *mm,
>  struct slice_mask mask, struct slice_mask available)
>   {
>   DECLARE_BITMAP(result, SLICE_NUM_HIGH);
> - unsigned long slice_count = 
> GET_HIGH_SLICE_INDEX(mm->context.addr_limit);
> 
>   bitmap_and(result, mask.high_slices,
> -available.high_slices, slice_count);
> +available.high_slices, SLICE_NUM_HIGH);
> 
>   return (mask.low_slices & available.low_slices) == mask.low_slices &&
> - bitmap_equal(result, mask.high_slices, slice_count);
> + bitmap_equal(result, mask.high_slices, SLICE_NUM_HIGH)
> 
> 
> -aneesh
> 



Re: [PATCH v3 1/2] livepatch: send a fake signal to all blocking tasks

2017-11-06 Thread Pavel Machek
Hi!

> --- a/Documentation/ABI/testing/sysfs-kernel-livepatch
> +++ b/Documentation/ABI/testing/sysfs-kernel-livepatch
> @@ -33,6 +33,15 @@ Contact:   live-patch...@vger.kernel.org
>   An attribute which indicates whether the patch is currently in
>   transition.
>  
> +What:/sys/kernel/livepatch/<patch>/signal
> +Date:Oct 2017
> +KernelVersion:   4.15.0
> +Contact: live-patch...@vger.kernel.org
> +Description:
> + A writable attribute that allows administrator to affect the
> + course of an existing transition. Writing 1 sends a signal to
> + all remaining blocking tasks.

What kind of signal?

>  What:/sys/kernel/livepatch//
>  Date:Nov 2014
>  KernelVersion:   3.19.0
> diff --git a/Documentation/livepatch/livepatch.txt 
> b/Documentation/livepatch/livepatch.txt
> index ecdb18104ab0..6694530d0894 100644
> --- a/Documentation/livepatch/livepatch.txt
> +++ b/Documentation/livepatch/livepatch.txt
> @@ -178,6 +178,12 @@ transition, it shows -1.  Any tasks which are blocking 
> the transition
>  can be signaled with SIGSTOP and SIGCONT to force them to change their
>  patched state.
>  
> +Administrator can also affect a transition through
> +/sys/kernel/livepatch/<patch>/signal attribute. Writing 1 to the attribute 
> sends
> +a signal to all remaining blocking tasks. This is an alternative for
> +SIGSTOP/SIGCONT approach mentioned in the previous paragraph. It should also 
> be
> +less harmful to the system.

Well... If SIGSTOP / SIGCONT is considered harmful (it probably is),
it should be mentioned above, and not in a note here...
Pavel

-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) 
http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html


signature.asc
Description: Digital signature


Re: [PATCH 4/5] powerpc/64s/radix: Fix 128TB-512TB virtual address boundary case allocation

2017-11-06 Thread Aneesh Kumar K.V
Nicholas Piggin  writes:

> Radix VA space allocations test addresses against mm->task_size which is
> 512TB, even in cases where the intention is to limit allocation to below
> 128TB.
>
> This results in mmap with a hint address below 128TB but address + length
> above 128TB succeeding when it should fail (as hash does after the
> previous patch).
>
> Set the high address limit to be considered up front, and base subsequent
> allocation checks on that consistently.

Doesn't setting info.high_limit take care of that ? I would expect
vm_unmapped_area to fail based on info.high_limit. Is this with MAP_FIXED?


>
> Cc: "Aneesh Kumar K.V" 
> Fixes: f4ea6dcb08 ("powerpc/mm: Enable mappings above 128TB")
> Signed-off-by: Nicholas Piggin 
> ---
>  arch/powerpc/mm/hugetlbpage-radix.c | 13 +++--
>  arch/powerpc/mm/mmap.c  | 27 ++-
>  2 files changed, 21 insertions(+), 19 deletions(-)
>
> diff --git a/arch/powerpc/mm/hugetlbpage-radix.c 
> b/arch/powerpc/mm/hugetlbpage-radix.c
> index a12e86395025..9c6a411e9c85 100644
> --- a/arch/powerpc/mm/hugetlbpage-radix.c
> +++ b/arch/powerpc/mm/hugetlbpage-radix.c
> @@ -48,14 +48,18 @@ radix__hugetlb_get_unmapped_area(struct file *file, 
> unsigned long addr,
>   struct mm_struct *mm = current->mm;
>   struct vm_area_struct *vma;
>   struct hstate *h = hstate_file(file);
> + unsigned long high_limit = DEFAULT_MAP_WINDOW;
>   struct vm_unmapped_area_info info;
>
>   if (unlikely(addr > mm->context.addr_limit && addr < TASK_SIZE))
>   mm->context.addr_limit = TASK_SIZE;
>
> + if (addr > high_limit)
> + high_limit = TASK_SIZE;
> +
>   if (len & ~huge_page_mask(h))
>   return -EINVAL;
> - if (len > mm->task_size)
> + if (len > high_limit)
>   return -ENOMEM;
>
>   if (flags & MAP_FIXED) {
> @@ -67,7 +71,7 @@ radix__hugetlb_get_unmapped_area(struct file *file, 
> unsigned long addr,
>   if (addr) {
>   addr = ALIGN(addr, huge_page_size(h));
>   vma = find_vma(mm, addr);
> - if (mm->task_size - len >= addr &&
> + if (high_limit - len >= addr &&
>   (!vma || addr + len <= vm_start_gap(vma)))
>   return addr;
>   }
> @@ -78,12 +82,9 @@ radix__hugetlb_get_unmapped_area(struct file *file, 
> unsigned long addr,
>   info.flags = VM_UNMAPPED_AREA_TOPDOWN;
>   info.length = len;
>   info.low_limit = PAGE_SIZE;
> - info.high_limit = current->mm->mmap_base;
> + info.high_limit = mm->mmap_base + (high_limit - DEFAULT_MAP_WINDOW);
>   info.align_mask = PAGE_MASK & ~huge_page_mask(h);
>   info.align_offset = 0;
>
> - if (addr > DEFAULT_MAP_WINDOW)
> - info.high_limit += mm->context.addr_limit - DEFAULT_MAP_WINDOW;
> -
> return vm_unmapped_area(&info);
>  }
> diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c
> index 5d78b193fec4..e6cb3b3f7e93 100644
> --- a/arch/powerpc/mm/mmap.c
> +++ b/arch/powerpc/mm/mmap.c
> @@ -106,13 +106,17 @@ radix__arch_get_unmapped_area(struct file *filp, 
> unsigned long addr,
>  {
>   struct mm_struct *mm = current->mm;
>   struct vm_area_struct *vma;
> + unsigned long high_limit = DEFAULT_MAP_WINDOW;
>   struct vm_unmapped_area_info info;
>
>   if (unlikely(addr > mm->context.addr_limit &&
>mm->context.addr_limit != TASK_SIZE))
>   mm->context.addr_limit = TASK_SIZE;
>
> - if (len > mm->task_size - mmap_min_addr)
> + if (addr > high_limit)
> + high_limit = TASK_SIZE;
> +
> + if (len > high_limit - mmap_min_addr)
>   return -ENOMEM;
>
>   if (flags & MAP_FIXED)
> @@ -121,7 +125,7 @@ radix__arch_get_unmapped_area(struct file *filp, unsigned 
> long addr,
>   if (addr) {
>   addr = PAGE_ALIGN(addr);
>   vma = find_vma(mm, addr);
> - if (mm->task_size - len >= addr && addr >= mmap_min_addr &&
> + if (high_limit - len >= addr && addr >= mmap_min_addr &&
>   (!vma || addr + len <= vm_start_gap(vma)))
>   return addr;
>   }
> @@ -129,13 +133,9 @@ radix__arch_get_unmapped_area(struct file *filp, 
> unsigned long addr,
>   info.flags = 0;
>   info.length = len;
>   info.low_limit = mm->mmap_base;
> + info.high_limit = high_limit;
>   info.align_mask = 0;
>
> - if (unlikely(addr > DEFAULT_MAP_WINDOW))
> - info.high_limit = mm->context.addr_limit;
> - else
> - info.high_limit = DEFAULT_MAP_WINDOW;
> -
> return vm_unmapped_area(&info);
>  }
>
> @@ -149,14 +149,18 @@ radix__arch_get_unmapped_area_topdown(struct file *filp,
>   struct vm_area_struct *vma;
>   struct mm_struct *mm = current->mm;
>   unsigned long addr = addr0;
> + unsigned long high_limit = DEFAULT_MAP_WINDOW;
> 

Re: [PATCH 1/5] powerpc/64s/hash: Fix 128TB-512TB virtual address boundary case allocation

2017-11-06 Thread Aneesh Kumar K.V



On 11/06/2017 04:24 PM, Nicholas Piggin wrote:

On Mon, 06 Nov 2017 16:08:06 +0530
"Aneesh Kumar K.V"  wrote:


Nicholas Piggin  writes:


When allocating VA space with a hint that crosses 128TB, the SLB addr_limit
variable is not expanded if addr is not > 128TB, but the slice allocation
looks at task_size, which is 512TB. This results in slice_check_fit()
incorrectly succeeding because the slice_count truncates off bit 128 of the
requested mask, so the comparison to the available mask succeeds.



But then the mask passed to slice_check_fit() is generated using
context.addr_limit as max value. So how did that return succcess? ie,
we get the request mask via

slice_range_to_mask(addr, len, );

And the potential/possible mask using

slice_mask_for_size(mm, psize, _mask);

So how did slice_check_fit() return sucess with

slice_check_fit(mm, mask, good_mask);


Because the addr_limit check is used to *limit* the comparison.

The available mask had bit up to 127 set, and the mask had 127 and
128 set. However the 128T addr_limit causes only bits 0-127 to be
compared.



Should we fix it then via ? I haven't tested this yet. Also this results 
in us comparing more bits?


modified   arch/powerpc/mm/slice.c
@@ -169,13 +169,12 @@ static int slice_check_fit(struct mm_struct *mm,
   struct slice_mask mask, struct slice_mask available)
 {
DECLARE_BITMAP(result, SLICE_NUM_HIGH);
-   unsigned long slice_count = 
GET_HIGH_SLICE_INDEX(mm->context.addr_limit);

bitmap_and(result, mask.high_slices,
-  available.high_slices, slice_count);
+  available.high_slices, SLICE_NUM_HIGH);

return (mask.low_slices & available.low_slices) == mask.low_slices &&
-   bitmap_equal(result, mask.high_slices, slice_count);
+   bitmap_equal(result, mask.high_slices, SLICE_NUM_HIGH)


-aneesh



Re: [PATCH 1/5] powerpc/64s/hash: Fix 128TB-512TB virtual address boundary case allocation

2017-11-06 Thread Nicholas Piggin
On Mon, 06 Nov 2017 16:08:06 +0530
"Aneesh Kumar K.V"  wrote:

> Nicholas Piggin  writes:
> 
> > When allocating VA space with a hint that crosses 128TB, the SLB addr_limit
> > variable is not expanded if addr is not > 128TB, but the slice allocation
> > looks at task_size, which is 512TB. This results in slice_check_fit()
> > incorrectly succeeding because the slice_count truncates off bit 128 of the
> > requested mask, so the comparison to the available mask succeeds.  
> 
> 
> But then the mask passed to slice_check_fit() is generated using
> context.addr_limit as max value. So how did that return succcess? ie,
> we get the request mask via
> 
> slice_range_to_mask(addr, len, );
> 
> And the potential/possible mask using
> 
> slice_mask_for_size(mm, psize, _mask);
> 
> So how did slice_check_fit() return sucess with
> 
> slice_check_fit(mm, mask, good_mask);

Because the addr_limit check is used to *limit* the comparison.

The available mask had bits up to 127 set, and the request mask had bits
127 and 128 set. However the 128T addr_limit causes only bits 0-127 to be
compared.
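
A standalone illustration of that truncation (plain C, not the kernel bitmap
API): with 128 slices available and a request for slices 127 and 128,
comparing only the first 128 bits silently drops the out-of-range bit.

#include <stdio.h>

#define NBITS 512
#define WORDS (NBITS / 64)

static void set_bit64(unsigned long long *b, unsigned int n)
{
	b[n / 64] |= 1ULL << (n % 64);
}

static int test_bit64(const unsigned long long *b, unsigned int n)
{
	return (b[n / 64] >> (n % 64)) & 1;
}

/* "is every requested bit also available?", limited to the first nbits */
static int fits(const unsigned long long *mask,
		const unsigned long long *avail, unsigned int nbits)
{
	for (unsigned int i = 0; i < nbits; i++)
		if (test_bit64(mask, i) && !test_bit64(avail, i))
			return 0;
	return 1;
}

int main(void)
{
	unsigned long long mask[WORDS] = { 0 }, avail[WORDS] = { 0 };
	unsigned int i;

	for (i = 0; i < 128; i++)	/* slices below 128TB are available */
		set_bit64(avail, i);
	set_bit64(mask, 127);		/* the request crosses the boundary */
	set_bit64(mask, 128);

	printf("compare 128 bits: fits=%d (bit 128 silently ignored)\n",
	       fits(mask, avail, 128));
	printf("compare 512 bits: fits=%d (request correctly rejected)\n",
	       fits(mask, avail, 512));
	return 0;
}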

> > Fix this by using mm->context.addr_limit instead of mm->task_size for
> > testing allocation limits. This causes such allocations to fail.
> >
> > Cc: "Aneesh Kumar K.V" 
> > Fixes: f4ea6dcb08 ("powerpc/mm: Enable mappings above 128TB")
> > Reported-by: Florian Weimer 
> > Signed-off-by: Nicholas Piggin 
> > ---
> >  arch/powerpc/mm/slice.c | 11 ++-
> >  1 file changed, 6 insertions(+), 5 deletions(-)
> >
> > diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
> > index 45f6740dd407..567db541c0a1 100644
> > --- a/arch/powerpc/mm/slice.c
> > +++ b/arch/powerpc/mm/slice.c
> > @@ -96,7 +96,7 @@ static int slice_area_is_free(struct mm_struct *mm, 
> > unsigned long addr,
> >  {
> > struct vm_area_struct *vma;
> >
> > -   if ((mm->task_size - len) < addr)
> > +   if ((mm->context.addr_limit - len) < addr)  
> 
> I was looking at these as generic boundary check against task size and
> for specific range check we should have created mask always using
> context.addr_limit. That should keep the boundary condition check same
> across radix/hash.

We need to actually fix the radix case too for other-but-similar reasons,
so fixing it like this does end up with the same tests for both. See
the later radix patch.

Thanks,
Nick


Re: [PATCH 3/5] powerpc/64s/hash: Fix fork() with 512TB process address space

2017-11-06 Thread Aneesh Kumar K.V
Nicholas Piggin  writes:

> Hash unconditionally resets the addr_limit to default (128TB) when
> the mm context is initialised. If a process has > 128TB mappings when
> it forks, the child will not get the 512TB addr_limit, so accesses to
> valid > 128TB mappings will fail in the child.
>
> Fix this by only resetting the addr_limit to default if it was 0. Non
> zero indicates it was duplicated from the parent (0 means exec()).
>

Reviewed-by: Aneesh Kumar K.V 

> Cc: "Aneesh Kumar K.V" 
> Fixes: f4ea6dcb08 ("powerpc/mm: Enable mappings above 128TB")
> Signed-off-by: Nicholas Piggin 
> ---
>  arch/powerpc/mm/mmu_context_book3s64.c | 8 
>  1 file changed, 4 insertions(+), 4 deletions(-)
>
> diff --git a/arch/powerpc/mm/mmu_context_book3s64.c 
> b/arch/powerpc/mm/mmu_context_book3s64.c
> index 05e15386d4cb..b94fb62e60fd 100644
> --- a/arch/powerpc/mm/mmu_context_book3s64.c
> +++ b/arch/powerpc/mm/mmu_context_book3s64.c
> @@ -93,11 +93,11 @@ static int hash__init_new_context(struct mm_struct *mm)
>   return index;
>
>   /*
> -  * We do switch_slb() early in fork, even before we setup the
> -  * mm->context.addr_limit. Default to max task size so that we copy the
> -  * default values to paca which will help us to handle slb miss early.
> +  * In the case of exec, use the default limit,
> +  * otherwise inherit it from the mm we are duplicating.
>*/
> - mm->context.addr_limit = DEFAULT_MAP_WINDOW_USER64;
> + if (!mm->context.addr_limit)
> + mm->context.addr_limit = DEFAULT_MAP_WINDOW_USER64;
>
>   /*
>* The old code would re-promote on fork, we don't do that when using
> -- 
> 2.15.0



Re: [PATCH kernel] powerpc/powernv/ioda: Relax max DMA window size check

2017-11-06 Thread Michael Ellerman
Alexey Kardashevskiy  writes:

> On 31/10/17 15:04, Alexey Kardashevskiy wrote:
>> DMA windows can only have a size of power of two on IODA2 hardware and
>> using memory_hotplug_max() to determine the upper limit won't work
>> correcly if it returns not power of two value.
>> 
>> This relaxes the check by rounding up the value returned by
>> memory_hotplug_max().
>> 
>> It is expected to impact DPDK on machines with non-power-of-two RAM size,
>> mostly. KVM guests are less likely to be affected as usually guests get
>> less than half of hosts RAM.
>
>
> It was pointed out that this check is quite useless anyway as the vm_locked
> memory limit should hit first, and if that is not set or the user got the
> root privilege level, then there are easier ways to crash the host so I am
> thinking of:
>
>
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c
> b/arch/powerpc/platforms/powernv/pci-ioda.c
> index 269f119e4b3c..a47e4cf343b2 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -2769,7 +2769,7 @@ static long pnv_pci_ioda2_table_alloc_pages(int nid,
> __u64 bus_offset,
> if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS))
> return -EINVAL;
>
> -   if ((window_size > memory_hotplug_max()) ||
> !is_power_of_2(window_size))
> +   if (!is_power_of_2(window_size))
> return -EINVAL;
>
>
>
> Makes sense?

Sounds reasonable.

Except where is the vm_locked check? I think it's in the VFIO driver? If
so I guess the only concern is that this code might be called via some
other path that doesn't do that check.

cheers


Re: [PATCH 2/5] powerpc/64s/hash: Allow MAP_FIXED allocations to cross 128TB boundary

2017-11-06 Thread Aneesh Kumar K.V
Nicholas Piggin  writes:

> While mapping hints with a length that cross 128TB are disallowed,
> MAP_FIXED allocations that cross 128TB are allowed. These are failing
> on hash (on radix they succeed). Add an additional case for fixed
> mappings to expand the addr_limit when crossing 128TB.

Shouldn't that be fixed in radix? But I see x86 also doing this:


if (flags & MAP_FIXED)
return addr;

Kirill,

Is that expected?


>
> Cc: "Aneesh Kumar K.V" 
> Fixes: f4ea6dcb08 ("powerpc/mm: Enable mappings above 128TB")
> Signed-off-by: Nicholas Piggin 
> ---
>  arch/powerpc/mm/slice.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
> index 567db541c0a1..f980397b449d 100644
> --- a/arch/powerpc/mm/slice.c
> +++ b/arch/powerpc/mm/slice.c
> @@ -419,7 +419,8 @@ unsigned long slice_get_unmapped_area(unsigned long addr, 
> unsigned long len,
>   /*
>* Check if we need to expland slice area.
>*/
> - if (unlikely(addr > mm->context.addr_limit &&
> + if (unlikely(((addr > mm->context.addr_limit) ||
> + (fixed && addr + len > mm->context.addr_limit)) &&
>mm->context.addr_limit != TASK_SIZE)) {
>   mm->context.addr_limit = TASK_SIZE;
>   on_each_cpu(slice_flush_segments, mm, 1);
> -- 
> 2.15.0


-aneesh



Re: powerpc/perf: Fix a sizeof() typo so we allocate less memory

2017-11-06 Thread Michael Ellerman
Dan Carpenter  writes:

> We're allocating the size of the struct which is 32 bytes when we should
> be allocating sizeof(void *) which is 4 or 8 bytes depending on the
> architecture.
>
> Fixes: 885dcd709ba9 ("powerpc/perf: Add nest IMC PMU support")
> Signed-off-by: Dan Carpenter 

Thanks.

I think your patch is right, but the code would be much simpler if we
actually did just allocate an array of struct imc_events, rather than an
array of pointers to structs.

Some of the cleanup code is written as if that is already the case, ie.
it frees the pmu->events array, but not the things the array points to.
So there's a memory leak there too.

Anju, Maddy, can one of you rework the code to just use an array of
structs? Basically imc_parse_event() wouldn't allocate a struct, we
would pass it a pointer to a slot in the array.
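
Roughly the difference, as a hedged sketch in plain C (the field names are
illustrative, not the real struct imc_events layout):

#include <stdlib.h>

struct event { char *name; char *unit; char *scale; unsigned int value; };

/* Current scheme: an array of pointers. The outer allocation must size
 * pointers, every element needs its own allocation, and freeing only the
 * outer array leaks all of the elements. */
static struct event **alloc_ptr_array(size_t ct)
{
	struct event **events = calloc(ct, sizeof(*events));
	size_t i;

	for (i = 0; events && i < ct; i++)
		events[i] = calloc(1, sizeof(**events));
	return events;
}

/* Suggested rework: an array of structs. One allocation, one free, and
 * the parser fills a caller-provided slot instead of allocating. */
static struct event *alloc_struct_array(size_t ct)
{
	return calloc(ct, sizeof(struct event));
}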

cheers

> diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
> index 88126245881b..91c90ddab807 100644
> --- a/arch/powerpc/perf/imc-pmu.c
> +++ b/arch/powerpc/perf/imc-pmu.c
> @@ -210,7 +210,7 @@ static int update_events_in_group(struct device_node 
> *node, struct imc_pmu *pmu)
>   of_property_read_u32(node, "reg", _reg);
>  
>   /* Allocate memory for the events */
> - pmu->events = kcalloc(ct, sizeof(struct imc_events), GFP_KERNEL);
> + pmu->events = kcalloc(ct, sizeof(*pmu->events), GFP_KERNEL);
>   if (!pmu->events)
>   return -ENOMEM;
>  


Re: [PATCH 1/5] powerpc/64s/hash: Fix 128TB-512TB virtual address boundary case allocation

2017-11-06 Thread Aneesh Kumar K.V
Nicholas Piggin  writes:

> When allocating VA space with a hint that crosses 128TB, the SLB addr_limit
> variable is not expanded if addr is not > 128TB, but the slice allocation
> looks at task_size, which is 512TB. This results in slice_check_fit()
> incorrectly succeeding because the slice_count truncates off bit 128 of the
> requested mask, so the comparison to the available mask succeeds.


But then the mask passed to slice_check_fit() is generated using
context.addr_limit as max value. So how did that return success? ie,
we get the request mask via

slice_range_to_mask(addr, len, );

And the potential/possible mask using

slice_mask_for_size(mm, psize, _mask);

So how did slice_check_fit() return success with

slice_check_fit(mm, mask, good_mask);


>
> Fix this by using mm->context.addr_limit instead of mm->task_size for
> testing allocation limits. This causes such allocations to fail.
>
> Cc: "Aneesh Kumar K.V" 
> Fixes: f4ea6dcb08 ("powerpc/mm: Enable mappings above 128TB")
> Reported-by: Florian Weimer 
> Signed-off-by: Nicholas Piggin 
> ---
>  arch/powerpc/mm/slice.c | 11 ++-
>  1 file changed, 6 insertions(+), 5 deletions(-)
>
> diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
> index 45f6740dd407..567db541c0a1 100644
> --- a/arch/powerpc/mm/slice.c
> +++ b/arch/powerpc/mm/slice.c
> @@ -96,7 +96,7 @@ static int slice_area_is_free(struct mm_struct *mm, 
> unsigned long addr,
>  {
>   struct vm_area_struct *vma;
>
> - if ((mm->task_size - len) < addr)
> + if ((mm->context.addr_limit - len) < addr)

I was looking at these as generic boundary check against task size and
for specific range check we should have created mask always using
context.addr_limit. That should keep the boundary condition check same
across radix/hash.

>   return 0;
>   vma = find_vma(mm, addr);
>   return (!vma || (addr + len) <= vm_start_gap(vma));
> @@ -133,7 +133,7 @@ static void slice_mask_for_free(struct mm_struct *mm, 
> struct slice_mask *ret)
>   if (!slice_low_has_vma(mm, i))
>   ret->low_slices |= 1u << i;
>
> - if (mm->task_size <= SLICE_LOW_TOP)
> + if (mm->context.addr_limit <= SLICE_LOW_TOP)
>   return;
>
>   for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.addr_limit); i++)
> @@ -446,19 +446,20 @@ unsigned long slice_get_unmapped_area(unsigned long 
> addr, unsigned long len,
>
>   /* Sanity checks */
>   BUG_ON(mm->task_size == 0);
> + BUG_ON(mm->context.addr_limit == 0);
>   VM_BUG_ON(radix_enabled());
>
>   slice_dbg("slice_get_unmapped_area(mm=%p, psize=%d...\n", mm, psize);
>   slice_dbg(" addr=%lx, len=%lx, flags=%lx, topdown=%d\n",
> addr, len, flags, topdown);
>
> - if (len > mm->task_size)
> + if (len > mm->context.addr_limit)
>   return -ENOMEM;
>   if (len & ((1ul << pshift) - 1))
>   return -EINVAL;
>   if (fixed && (addr & ((1ul << pshift) - 1)))
>   return -EINVAL;
> - if (fixed && addr > (mm->task_size - len))
> + if (fixed && addr > (mm->context.addr_limit - len))
>   return -ENOMEM;
>
>   /* If hint, make sure it matches our alignment restrictions */
> @@ -466,7 +467,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, 
> unsigned long len,
>   addr = _ALIGN_UP(addr, 1ul << pshift);
>   slice_dbg(" aligned addr=%lx\n", addr);
>   /* Ignore hint if it's too large or overlaps a VMA */
> - if (addr > mm->task_size - len ||
> + if (addr > mm->context.addr_limit - len ||
>   !slice_area_is_free(mm, addr, len))
>   addr = 0;
>   }
> -- 
> 2.15.0



Re: [PATCH] powerpc/opal: Fix EBUSY bug in acquiring tokens

2017-11-06 Thread Michael Ellerman
William Kennington  writes:

>> On Nov 4, 2017, at 2:14 AM, Michael Ellerman > > wrote:
>> 
>> "William A. Kennington III" > 
>> writes:
>> 
>>> The current code checks the completion map to look for the first token
>>> that is complete. In some cases, a completion can come in but the token
>>> can still be on lease to the caller processing the completion. If this
>>> completed but unreleased token is the first token found in the bitmap by
>>> another tasks trying to acquire a token, then the __test_and_set_bit
>>> call will fail since the token will still be on lease. The acquisition
>>> will then fail with an EBUSY.
>>> 
>>> This patch reorganizes the acquisition code to look at the
>>> opal_async_token_map for an unleased token. If the token has no lease it
>>> must have no outstanding completions so we should never see an EBUSY,
>>> unless we have leased out too many tokens. Since
>>> opal_async_get_token_inrerruptible is protected by a semaphore, we will
>>> practically never see EBUSY anymore.
>>> 
>>> Signed-off-by: William A. Kennington III >> >
>>> ---
>>> arch/powerpc/platforms/powernv/opal-async.c | 6 +++---
>>> 1 file changed, 3 insertions(+), 3 deletions(-)
>> 
>> I think this is superseeded by Cyrils rework (which he's finally
>> posted):
>> 
>>  http://patchwork.ozlabs.org/patch/833630/ 
>> 
>> 
>> If not please let us know.
>
> Yeah, I think Cyril’s rework fixes this. I wasn’t sure how long it
> would take for master to receive his changes so I figured we could use
> something in the interim to fix the locking failures. If his changes
> will be mailed into the next merge window then we should have the
> issue fixed in master. I understand that rework probably won’t make it
> into stable kernels? If not then we should probably send this along to
> stable kernel maintainers.

OK. I didn't realise the bug was sufficiently bad to need a backport
to stable.

To make a backport easier I've merged this patch first, and then Cyril's
on top of it (which essentially deletes this patch).

I assume you've tested this patch at least somewhat? :)

cheers


RE: [PATCH] [net-next,v2] ibmvnic: Feature implementation of Vital Product Data (VPD) for the ibmvnic driver

2017-11-06 Thread David Laight
From: David Miller
> Sent: 04 November 2017 13:21
> From: Desnes Augusto Nunes do Rosario 
> Date: Wed,  1 Nov 2017 19:03:32 -0200
> 
> > +   substr = strnstr(adapter->vpd->buff, "RM", adapter->vpd->len);
> > +   if (!substr) {
> > +   dev_info(dev, "No FW level provided by VPD\n");
> > +   complete(&adapter->fw_done);
> > +   return;
> > +   }
> > +
> > +   /* get length of firmware level ASCII substring */
> > +   fw_level_len = *(substr + 2);
> > +
> > +   /* copy firmware version string from vpd into adapter */
> > +   ptr = strncpy((char *)adapter->fw_version,
> > + substr + 3, fw_level_len);
> 
> You have to be more careful here, making sure first that
> (substr + 2) < (adapter->vpd->buff + adapter->vpd->len),
> and next that (substr + 2 + fw_level_len) is in range
> as well.

And that the copy isn't longer than the target buffer.
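
Roughly the checks being asked for, as a sketch against the quoted driver
code (the error handling, the fw_version size and the messages are
assumptions, not the driver's actual implementation):

	substr = strnstr(adapter->vpd->buff, "RM", adapter->vpd->len);
	if (!substr) {
		dev_info(dev, "No FW level provided by VPD\n");
		goto complete;
	}

	/* the length byte itself must lie inside the VPD buffer */
	if (substr + 3 > adapter->vpd->buff + adapter->vpd->len) {
		dev_info(dev, "Truncated FW level in VPD\n");
		goto complete;
	}

	fw_level_len = *(unsigned char *)(substr + 2);

	/* the value must lie inside the buffer... */
	if (substr + 3 + fw_level_len > adapter->vpd->buff + adapter->vpd->len) {
		dev_info(dev, "Truncated FW level in VPD\n");
		goto complete;
	}

	/* ...and must not overflow the destination */
	fw_level_len = min_t(size_t, fw_level_len,
			     sizeof(adapter->fw_version) - 1);
	memcpy(adapter->fw_version, substr + 3, fw_level_len);

complete:
	complete(&adapter->fw_done);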

David



Re: [v5,22/22] powerpc/mm: Add speculative page fault

2017-11-06 Thread Sergey Senozhatsky
On (11/02/17 15:11), Laurent Dufour wrote:
> On 26/10/2017 10:14, kemi wrote:
> > Some regression is found by LKP-tools(linux kernel performance) on this 
> > patch series
> > tested on Intel 2s/4s Skylake platform. 
> > The regression result is sorted by the metric will-it-scale.per_process_ops.
> 
> Hi Kemi,
> 
> Thanks for reporting this, I'll try to address it by turning some features
> of the SPF path off when the process is monothreaded.

make them madvise()-able?
Not all multi-threaded apps will necessarily benefit from SPF, right?
Just an idea.

-ss


Re: POWER: Unexpected fault when writing to brk-allocated memory

2017-11-06 Thread Nicholas Piggin
On Mon, 6 Nov 2017 09:32:25 +0100
Florian Weimer  wrote:

> On 11/06/2017 09:30 AM, Aneesh Kumar K.V wrote:
> > On 11/06/2017 01:55 PM, Nicholas Piggin wrote:  
> >> On Mon, 6 Nov 2017 09:11:37 +0100
> >> Florian Weimer  wrote:
> >>  
> >>> On 11/06/2017 07:47 AM, Nicholas Piggin wrote:  
>  "You get < 128TB unless explicitly requested."
> 
>  Simple, reasonable, obvious rule. Avoids breaking apps that store
>  some bits in the top of pointers (provided that memory allocator
>  userspace libraries also do the right thing).  
> >>>
> >>> So brk would simplify fail instead of crossing the 128 TiB threshold?  
> >>
> >> Yes, that was the intention and that's what x86 seems to do.
> >>  
> >>>
> >>> glibc malloc should cope with that and switch to malloc, but this code
> >>> path is obviously less well-tested than the regular way.  
> >>
> >> Switch to mmap() I guess you meant?  
> 
> Yes, sorry.
> 
> >> powerpc has a couple of bugs in corner cases, so those should be fixed
> >> according to intended policy for stable kernels I think.
> >>
> >> But I question the policy. Just seems like an ugly and ineffective wart.
> >> Exactly for such cases as this -- behaviour would change from run to run
> >> depending on your address space randomization for example! In case your
> >> brk happens to land nicely on 128TB then the next one would succeed.  
> > 
> > Why ? It should not change between run to run. We limit the free
> > area search range based on hint address. So we should get consistent 
> > results across run. even if we changed the context.addr_limit.  
> 
> The size of the gap to the 128 TiB limit varies between runs because of 
> ASLR.  So some runs would use brk alone, others would use brk + malloc. 
> That's not really desirable IMHO.

Yeah. Actually I looked at the code a bit more, and it seems that the
intention is for MAP_FIXED to do exactly what I wanted. brk() uses
MAP_FIXED under the covers, so this case should be okay I think. I'm
just slightly happier now, but I still think it's not the right thing
to do to fail an explicit request for crossing 128TB with a hint. Same
fundamental criticism still applies -- it does not really solve bugs
and just adds an unintuitive wart to the API, and a random change in
behaviour based on randomization.

Anyway I sent some patches that are split up better and hopefully solve
some bugs for powerpc without changing intended policy. That's left for
another discussion.

Thanks,
Nick


[PATCH 5/5] powerpc/64s: mm_context.addr_limit is only used on hash

2017-11-06 Thread Nicholas Piggin
Radix keeps no meaningful state in addr_limit, so remove it from
radix code and rename to slb_addr_limit to make it clear it applies
to hash only.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/book3s/64/mmu-hash.h |  2 +-
 arch/powerpc/include/asm/book3s/64/mmu.h  |  2 +-
 arch/powerpc/include/asm/paca.h   |  2 +-
 arch/powerpc/kernel/asm-offsets.c |  2 +-
 arch/powerpc/kernel/paca.c|  4 ++--
 arch/powerpc/kernel/setup-common.c|  3 ++-
 arch/powerpc/mm/hugetlbpage-radix.c   |  3 ---
 arch/powerpc/mm/mmap.c|  8 ---
 arch/powerpc/mm/mmu_context_book3s64.c|  4 ++--
 arch/powerpc/mm/slb_low.S |  2 +-
 arch/powerpc/mm/slice.c   | 34 +--
 11 files changed, 28 insertions(+), 38 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h 
b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index 508275bb05d5..e91e115a816f 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -606,7 +606,7 @@ extern void slb_set_size(u16 size);
 
 /* 4 bits per slice and we have one slice per 1TB */
 #define SLICE_ARRAY_SIZE   (H_PGTABLE_RANGE >> 41)
-#define TASK_SLICE_ARRAY_SZ(x) ((x)->context.addr_limit >> 41)
+#define TASK_SLICE_ARRAY_SZ(x) ((x)->context.slb_addr_limit >> 41)
 
 #ifndef __ASSEMBLY__
 
diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h 
b/arch/powerpc/include/asm/book3s/64/mmu.h
index c3b00e8ff791..49a07c5d9e50 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu.h
@@ -92,7 +92,7 @@ typedef struct {
 #ifdef CONFIG_PPC_MM_SLICES
u64 low_slices_psize;   /* SLB page size encodings */
unsigned char high_slices_psize[SLICE_ARRAY_SIZE];
-   unsigned long addr_limit;
+   unsigned long slb_addr_limit;
 #else
u16 sllp;   /* SLB page size encoding */
 #endif
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 7125efa6a6ae..2ef0c0da4bb7 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -143,7 +143,7 @@ struct paca_struct {
 #ifdef CONFIG_PPC_MM_SLICES
u64 mm_ctx_low_slices_psize;
unsigned char mm_ctx_high_slices_psize[SLICE_ARRAY_SIZE];
-   unsigned long addr_limit;
+   unsigned long mm_ctx_slb_addr_limit;
 #else
u16 mm_ctx_user_psize;
u16 mm_ctx_sllp;
diff --git a/arch/powerpc/kernel/asm-offsets.c 
b/arch/powerpc/kernel/asm-offsets.c
index 96c52235ecdc..912880873dfc 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -185,7 +185,7 @@ int main(void)
 #ifdef CONFIG_PPC_MM_SLICES
OFFSET(PACALOWSLICESPSIZE, paca_struct, mm_ctx_low_slices_psize);
OFFSET(PACAHIGHSLICEPSIZE, paca_struct, mm_ctx_high_slices_psize);
-   DEFINE(PACA_ADDR_LIMIT, offsetof(struct paca_struct, addr_limit));
+   OFFSET(PACA_SLB_ADDR_LIMIT, paca_struct, mm_ctx_slb_addr_limit);
DEFINE(MMUPSIZEDEFSIZE, sizeof(struct mmu_psize_def));
 #endif /* CONFIG_PPC_MM_SLICES */
 #endif
diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c
index 2ff2b8a19f71..4c69d335863c 100644
--- a/arch/powerpc/kernel/paca.c
+++ b/arch/powerpc/kernel/paca.c
@@ -262,8 +262,8 @@ void copy_mm_to_paca(struct mm_struct *mm)
 
get_paca()->mm_ctx_id = context->id;
 #ifdef CONFIG_PPC_MM_SLICES
-   VM_BUG_ON(!mm->context.addr_limit);
-   get_paca()->addr_limit = mm->context.addr_limit;
+   VM_BUG_ON(!mm->context.slb_addr_limit);
+   get_paca()->mm_ctx_slb_addr_limit = mm->context.slb_addr_limit;
get_paca()->mm_ctx_low_slices_psize = context->low_slices_psize;
memcpy(&get_paca()->mm_ctx_high_slices_psize,
   &context->high_slices_psize, TASK_SLICE_ARRAY_SZ(mm));
diff --git a/arch/powerpc/kernel/setup-common.c 
b/arch/powerpc/kernel/setup-common.c
index 2e3bc16d02b2..8c4fa6086b39 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -898,7 +898,8 @@ void __init setup_arch(char **cmdline_p)
 
 #ifdef CONFIG_PPC_MM_SLICES
 #ifdef CONFIG_PPC64
-   init_mm.context.addr_limit = DEFAULT_MAP_WINDOW_USER64;
+   if (!radix_enabled())
+   init_mm.context.slb_addr_limit = DEFAULT_MAP_WINDOW_USER64;
 #else
 #error "context.addr_limit not initialized."
 #endif
diff --git a/arch/powerpc/mm/hugetlbpage-radix.c 
b/arch/powerpc/mm/hugetlbpage-radix.c
index 9c6a411e9c85..0f69bdf33367 100644
--- a/arch/powerpc/mm/hugetlbpage-radix.c
+++ b/arch/powerpc/mm/hugetlbpage-radix.c
@@ -51,9 +51,6 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned 
long addr,
unsigned long high_limit = DEFAULT_MAP_WINDOW;
struct vm_unmapped_area_info info;
 
-   if (unlikely(addr > mm->context.addr_limit && addr < TASK_SIZE))
-   

[PATCH 4/5] powerpc/64s/radix: Fix 128TB-512TB virtual address boundary case allocation

2017-11-06 Thread Nicholas Piggin
Radix VA space allocations test addresses against mm->task_size which is
512TB, even in cases where the intention is to limit allocation to below
128TB.

This results in mmap with a hint address below 128TB but address + length
above 128TB succeeding when it should fail (as hash does after the
previous patch).

Set the high address limit to be considered up front, and base subsequent
allocation checks on that consistently.

Cc: "Aneesh Kumar K.V" 
Fixes: f4ea6dcb08 ("powerpc/mm: Enable mappings above 128TB")
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/mm/hugetlbpage-radix.c | 13 +++--
 arch/powerpc/mm/mmap.c  | 27 ++-
 2 files changed, 21 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/mm/hugetlbpage-radix.c 
b/arch/powerpc/mm/hugetlbpage-radix.c
index a12e86395025..9c6a411e9c85 100644
--- a/arch/powerpc/mm/hugetlbpage-radix.c
+++ b/arch/powerpc/mm/hugetlbpage-radix.c
@@ -48,14 +48,18 @@ radix__hugetlb_get_unmapped_area(struct file *file, 
unsigned long addr,
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
struct hstate *h = hstate_file(file);
+   unsigned long high_limit = DEFAULT_MAP_WINDOW;
struct vm_unmapped_area_info info;
 
if (unlikely(addr > mm->context.addr_limit && addr < TASK_SIZE))
mm->context.addr_limit = TASK_SIZE;
 
+   if (addr > high_limit)
+   high_limit = TASK_SIZE;
+
if (len & ~huge_page_mask(h))
return -EINVAL;
-   if (len > mm->task_size)
+   if (len > high_limit)
return -ENOMEM;
 
if (flags & MAP_FIXED) {
@@ -67,7 +71,7 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned 
long addr,
if (addr) {
addr = ALIGN(addr, huge_page_size(h));
vma = find_vma(mm, addr);
-   if (mm->task_size - len >= addr &&
+   if (high_limit - len >= addr &&
(!vma || addr + len <= vm_start_gap(vma)))
return addr;
}
@@ -78,12 +82,9 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned 
long addr,
info.flags = VM_UNMAPPED_AREA_TOPDOWN;
info.length = len;
info.low_limit = PAGE_SIZE;
-   info.high_limit = current->mm->mmap_base;
+   info.high_limit = mm->mmap_base + (high_limit - DEFAULT_MAP_WINDOW);
info.align_mask = PAGE_MASK & ~huge_page_mask(h);
info.align_offset = 0;
 
-   if (addr > DEFAULT_MAP_WINDOW)
-   info.high_limit += mm->context.addr_limit - DEFAULT_MAP_WINDOW;
-
return vm_unmapped_area(&info);
 }
diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c
index 5d78b193fec4..e6cb3b3f7e93 100644
--- a/arch/powerpc/mm/mmap.c
+++ b/arch/powerpc/mm/mmap.c
@@ -106,13 +106,17 @@ radix__arch_get_unmapped_area(struct file *filp, unsigned 
long addr,
 {
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
+   unsigned long high_limit = DEFAULT_MAP_WINDOW;
struct vm_unmapped_area_info info;
 
if (unlikely(addr > mm->context.addr_limit &&
 mm->context.addr_limit != TASK_SIZE))
mm->context.addr_limit = TASK_SIZE;
 
-   if (len > mm->task_size - mmap_min_addr)
+   if (addr > high_limit)
+   high_limit = TASK_SIZE;
+
+   if (len > high_limit - mmap_min_addr)
return -ENOMEM;
 
if (flags & MAP_FIXED)
@@ -121,7 +125,7 @@ radix__arch_get_unmapped_area(struct file *filp, unsigned 
long addr,
if (addr) {
addr = PAGE_ALIGN(addr);
vma = find_vma(mm, addr);
-   if (mm->task_size - len >= addr && addr >= mmap_min_addr &&
+   if (high_limit - len >= addr && addr >= mmap_min_addr &&
(!vma || addr + len <= vm_start_gap(vma)))
return addr;
}
@@ -129,13 +133,9 @@ radix__arch_get_unmapped_area(struct file *filp, unsigned 
long addr,
info.flags = 0;
info.length = len;
info.low_limit = mm->mmap_base;
+   info.high_limit = high_limit;
info.align_mask = 0;
 
-   if (unlikely(addr > DEFAULT_MAP_WINDOW))
-   info.high_limit = mm->context.addr_limit;
-   else
-   info.high_limit = DEFAULT_MAP_WINDOW;
-
return vm_unmapped_area(&info);
 }
 
@@ -149,14 +149,18 @@ radix__arch_get_unmapped_area_topdown(struct file *filp,
struct vm_area_struct *vma;
struct mm_struct *mm = current->mm;
unsigned long addr = addr0;
+   unsigned long high_limit = DEFAULT_MAP_WINDOW;
struct vm_unmapped_area_info info;
 
if (unlikely(addr > mm->context.addr_limit &&
 mm->context.addr_limit != TASK_SIZE))
mm->context.addr_limit = TASK_SIZE;
 
+   if (addr > high_limit)
+   high_limit = TASK_SIZE;
+
   

[PATCH 3/5] powerpc/64s/hash: Fix fork() with 512TB process address space

2017-11-06 Thread Nicholas Piggin
Hash unconditionally resets the addr_limit to default (128TB) when
the mm context is initialised. If a process has > 128TB mappings when
it forks, the child will not get the 512TB addr_limit, so accesses to
valid > 128TB mappings will fail in the child.

Fix this by only resetting the addr_limit to the default if it was 0.
Non-zero indicates it was duplicated from the parent (0 means exec()).

Cc: "Aneesh Kumar K.V" 
Fixes: f4ea6dcb08 ("powerpc/mm: Enable mappings above 128TB")
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/mm/mmu_context_book3s64.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/mm/mmu_context_book3s64.c 
b/arch/powerpc/mm/mmu_context_book3s64.c
index 05e15386d4cb..b94fb62e60fd 100644
--- a/arch/powerpc/mm/mmu_context_book3s64.c
+++ b/arch/powerpc/mm/mmu_context_book3s64.c
@@ -93,11 +93,11 @@ static int hash__init_new_context(struct mm_struct *mm)
return index;
 
/*
-* We do switch_slb() early in fork, even before we setup the
-* mm->context.addr_limit. Default to max task size so that we copy the
-* default values to paca which will help us to handle slb miss early.
+* In the case of exec, use the default limit,
+* otherwise inherit it from the mm we are duplicating.
 */
-   mm->context.addr_limit = DEFAULT_MAP_WINDOW_USER64;
+   if (!mm->context.addr_limit)
+   mm->context.addr_limit = DEFAULT_MAP_WINDOW_USER64;
 
/*
 * The old code would re-promote on fork, we don't do that when using
-- 
2.15.0



[PATCH 2/5] powerpc/64s/hash: Allow MAP_FIXED allocations to cross 128TB boundary

2017-11-06 Thread Nicholas Piggin
While mapping hints with a length that crosses 128TB are disallowed,
MAP_FIXED allocations that cross 128TB are allowed. These are failing
on hash (on radix they succeed). Add an additional case for fixed
mappings to expand the addr_limit when crossing 128TB.

Cc: "Aneesh Kumar K.V" 
Fixes: f4ea6dcb08 ("powerpc/mm: Enable mappings above 128TB")
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/mm/slice.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
index 567db541c0a1..f980397b449d 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -419,7 +419,8 @@ unsigned long slice_get_unmapped_area(unsigned long addr, 
unsigned long len,
/*
 * Check if we need to expland slice area.
 */
-   if (unlikely(addr > mm->context.addr_limit &&
+   if (unlikely(((addr > mm->context.addr_limit) ||
+   (fixed && addr + len > mm->context.addr_limit)) &&
 mm->context.addr_limit != TASK_SIZE)) {
mm->context.addr_limit = TASK_SIZE;
on_each_cpu(slice_flush_segments, mm, 1);
-- 
2.15.0



[PATCH 1/5] powerpc/64s/hash: Fix 128TB-512TB virtual address boundary case allocation

2017-11-06 Thread Nicholas Piggin
When allocating VA space with a hint that crosses 128TB, the SLB addr_limit
variable is not expanded if addr is not > 128TB, but the slice allocation
looks at task_size, which is 512TB. This results in slice_check_fit()
incorrectly succeeding because the slice_count truncates off bit 128 of the
requested mask, so the comparison to the available mask succeeds.

Fix this by using mm->context.addr_limit instead of mm->task_size for
testing allocation limits. This causes such allocations to fail.

Cc: "Aneesh Kumar K.V" 
Fixes: f4ea6dcb08 ("powerpc/mm: Enable mappings above 128TB")
Reported-by: Florian Weimer 
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/mm/slice.c | 11 ++-
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
index 45f6740dd407..567db541c0a1 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -96,7 +96,7 @@ static int slice_area_is_free(struct mm_struct *mm, unsigned 
long addr,
 {
struct vm_area_struct *vma;
 
-   if ((mm->task_size - len) < addr)
+   if ((mm->context.addr_limit - len) < addr)
return 0;
vma = find_vma(mm, addr);
return (!vma || (addr + len) <= vm_start_gap(vma));
@@ -133,7 +133,7 @@ static void slice_mask_for_free(struct mm_struct *mm, 
struct slice_mask *ret)
if (!slice_low_has_vma(mm, i))
ret->low_slices |= 1u << i;
 
-   if (mm->task_size <= SLICE_LOW_TOP)
+   if (mm->context.addr_limit <= SLICE_LOW_TOP)
return;
 
for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.addr_limit); i++)
@@ -446,19 +446,20 @@ unsigned long slice_get_unmapped_area(unsigned long addr, 
unsigned long len,
 
/* Sanity checks */
BUG_ON(mm->task_size == 0);
+   BUG_ON(mm->context.addr_limit == 0);
VM_BUG_ON(radix_enabled());
 
slice_dbg("slice_get_unmapped_area(mm=%p, psize=%d...\n", mm, psize);
slice_dbg(" addr=%lx, len=%lx, flags=%lx, topdown=%d\n",
  addr, len, flags, topdown);
 
-   if (len > mm->task_size)
+   if (len > mm->context.addr_limit)
return -ENOMEM;
if (len & ((1ul << pshift) - 1))
return -EINVAL;
if (fixed && (addr & ((1ul << pshift) - 1)))
return -EINVAL;
-   if (fixed && addr > (mm->task_size - len))
+   if (fixed && addr > (mm->context.addr_limit - len))
return -ENOMEM;
 
/* If hint, make sure it matches our alignment restrictions */
@@ -466,7 +467,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, 
unsigned long len,
addr = _ALIGN_UP(addr, 1ul << pshift);
slice_dbg(" aligned addr=%lx\n", addr);
/* Ignore hint if it's too large or overlaps a VMA */
-   if (addr > mm->task_size - len ||
+   if (addr > mm->context.addr_limit - len ||
!slice_area_is_free(mm, addr, len))
addr = 0;
}
-- 
2.15.0



[PATCH 0/5] VA allocator fixes

2017-11-06 Thread Nicholas Piggin
Florian found a nasty corner case with the VA allocation logic
for crossing from 128TB to 512TB limit on hash, and made a
really superb report of the problem -- traces, reproducer recipes,
analysis, etc. which already mostly solved it.

The first patch in the series should solve Florian's particular
case, the next 3 are other issues with addr_limit. The last
patch is technically a cleanup but I think it's fairly important
in terms of understanding the code and also enabling some BUG
checks (when addr_limit == 0).

I have not tested these exactly on Florian's test case, but
some tests of my own behave better afterwards. Hopefully he has
time to re-test. Some careful review would be welcome too.

Thanks,
Nick

Nicholas Piggin (5):
  powerpc/64s/hash: Fix 128TB-512TB virtual address boundary case
allocation
  powerpc/64s/hash: Allow MAP_FIXED allocations to cross 128TB boundary
  powerpc/64s/hash: Fix fork() with 512TB process address space
  powerpc/64s/radix: Fix 128TB-512TB virtual address boundary case
allocation
  powerpc/64s: mm_context.addr_limit is only used on hash

 arch/powerpc/include/asm/book3s/64/mmu-hash.h |  2 +-
 arch/powerpc/include/asm/book3s/64/mmu.h  |  2 +-
 arch/powerpc/include/asm/paca.h   |  2 +-
 arch/powerpc/kernel/asm-offsets.c |  2 +-
 arch/powerpc/kernel/paca.c|  4 ++--
 arch/powerpc/kernel/setup-common.c|  3 ++-
 arch/powerpc/mm/hugetlbpage-radix.c   | 14 +---
 arch/powerpc/mm/mmap.c| 31 ++
 arch/powerpc/mm/mmu_context_book3s64.c|  8 +++
 arch/powerpc/mm/slb_low.S |  2 +-
 arch/powerpc/mm/slice.c   | 32 ++-
 11 files changed, 48 insertions(+), 54 deletions(-)

-- 
2.15.0



Re: [PATCH v5 07/22] mm: Protect VMA modifications using VMA sequence count

2017-11-06 Thread Laurent Dufour
Hi Andrea,

On 02/11/2017 21:08, Andrea Arcangeli wrote:
> On Thu, Nov 02, 2017 at 06:25:11PM +0100, Laurent Dufour wrote:
>> I think there is some memory barrier missing when the VMA is modified so
>> currently the modifications done in the VMA structure may not be written
>> down at the time the pte is locked. So doing that change will also requires
>> to call smp_wmb() before locking the page tables. In the current patch this
>> is ensured by the call to write_seqcount_end().
>> Doing so will still require to have a memory barrier when touching the VMA.
>> Not sure we get far better performance compared to the sequence count
>> change. But I'll give it a try anyway ;)
> 
> Luckily smp_wmb is a noop on x86. I would suggest to ignore the above
> issue completely if you give it a try, and then if this performs, we
> can just embed a smp_wmb() before spin_lock() somewhere in
> pte_offset_map_lock/pte_lockptr/spin_lock_nested for those archs whose
> spin_lock isn't a smp_wmb() equivalent. I would focus on flushing
> writes before every pagetable spin_lock for non-x86 archs, rather than
> after all vma modifications. That should be easier to keep under
> control and it's going to be more efficient too as, if anything, there
> are fewer spin locks than vma modifications.

I do agree that would simplify the patch series a lot.
I'll double check that the pte lock is not taken in a loop, otherwise
having an smp_wmb() there will be bad.

Another point I'm trying to double check is that we may have an
inconsistency when reading the vma's flags in the page fault path until
the memory barrier is reached in the VMA modification path. In particular,
we may see vm_flags and vm_page_prot not matching at all, which couldn't
happen when checking the vm_sequence count.

> 
> For non-x86 archs we may then need a smp_wmb__before_spin_lock. That
> looks more self contained than surrounding all vma modifications and
> it's a noop on x86 anyway.
> 
> I thought about the contention detection logic too yesterday: to
> detect contention we could have a mm->mmap_sem_contention_jiffies and
> if down_read_trylock_exclusive() [same as down_read_if_not_hold in
> prev mail] fails (and it'll fail if either the read or write mmap_sem is
> held, so also covering mremap/mprotect etc..) we set
> mm->mmap_sem_contention_jiffies = jiffies and then, to know if you must
> not touch the mmap_sem at all, you compare jiffies against
> mmap_sem_contention_jiffies; if it's equal we go speculative. If
> that's not enough we can just keep going speculative for a few more
> jiffies with time_before(). The srcu lock is not concerning because the
> inc/dec of the fast path is in a per-cpu cacheline of course, no false
> sharing possible there or it wouldn't be any better than a normal lock.
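
A minimal sketch of the jiffies-based contention check described above
(mm->mmap_sem_contention_jiffies and down_read_trylock_exclusive() are
hypothetical names taken from this discussion, not existing kernel API):

/*
 * Returns true when the classic, mmap_sem-protected fault path may be
 * used; false when recent contention says to go speculative instead.
 */
static bool try_classic_fault_path(struct mm_struct *mm)
{
        /* stay speculative for a couple of jiffies after contention */
        if (time_before(jiffies, mm->mmap_sem_contention_jiffies + 2))
                return false;

        /* hypothetical: fails if mmap_sem is held for read *or* write */
        if (down_read_trylock_exclusive(&mm->mmap_sem))
                return true;

        /* contended right now: remember it and go speculative */
        mm->mmap_sem_contention_jiffies = jiffies;
        return false;
}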

I'm sorry, I must have missed something here. I can't see how this would
help fix the case where a thread enters the page fault handler, sees that
no one else holds the mmap_sem, and then grabs it. While it is processing
the page fault, another thread enters mprotect for instance, and thus will
wait for the mmap_sem to be released by the thread processing the page
fault.

Cheers,
Laurent.

> The vma revalidation is already done by khugepaged and mm/userfaultfd,
> both need to drop the mmap_sem and continue working on the pagetables,
> so we already know it's workable and not too slow.
> 
> Summarizing: by using a runtime contention-triggered speculative
> design that goes speculative only when contention is runtime-detected
> using the above logic (or equivalent), and by having to revalidate the
> vma by hand with find_vma without knowing instantly if the vma become
> stale, we will run with a substantially slower speculative page fault
> than with your current speculative always-on design, but the slower
> speculative page fault runtime will still scale 100% in SMP so it
> should still be faster on large SMP systems. The pro is that it won't
> regress the mmap/brk vma modifications. The whole complexity of
> tracking the vma modifications should also go away and the resulting
> code should be more maintainable and less risky to break in subtle
> ways impossible to reproduce.
> 
> Thanks!
> Andrea
> 



Re: [PATCH v5 06/10] powerpc/opal: Rework the opal-async interface

2017-11-06 Thread Michael Ellerman
Cyril Bur  writes:

> diff --git a/arch/powerpc/platforms/powernv/opal-async.c 
> b/arch/powerpc/platforms/powernv/opal-async.c
> index c43421ab2d2f..fbae8a37ce2c 100644
> --- a/arch/powerpc/platforms/powernv/opal-async.c
> +++ b/arch/powerpc/platforms/powernv/opal-async.c
> @@ -23,40 +23,45 @@
>  #include 
>  #include 
>  
> -#define N_ASYNC_COMPLETIONS  64
> +enum opal_async_token_state {
> + ASYNC_TOKEN_UNALLOCATED = 0,
> + ASYNC_TOKEN_ALLOCATED,
> + ASYNC_TOKEN_COMPLETED
> +};
> +
> +struct opal_async_token {
> + enum opal_async_token_state state;
> + struct opal_msg response;
> +};
>  
> -static DECLARE_BITMAP(opal_async_complete_map, N_ASYNC_COMPLETIONS) = {~0UL};
> -static DECLARE_BITMAP(opal_async_token_map, N_ASYNC_COMPLETIONS);
>  static DECLARE_WAIT_QUEUE_HEAD(opal_async_wait);
>  static DEFINE_SPINLOCK(opal_async_comp_lock);
>  static struct semaphore opal_async_sem;
> -static struct opal_msg *opal_async_responses;
>  static unsigned int opal_max_async_tokens;
> +static struct opal_async_token *opal_async_tokens;
>  
>  static int __opal_async_get_token(void)
>  {
>   unsigned long flags;
> - int token;
> + int token = -EBUSY;
>  
>   spin_lock_irqsave(_async_comp_lock, flags);
> - token = find_first_bit(opal_async_complete_map, opal_max_async_tokens);
> - if (token >= opal_max_async_tokens) {
> - token = -EBUSY;
> - goto out;
> + for (token = 0; token < opal_max_async_tokens; token++) {
> + if (opal_async_tokens[token].state == ASYNC_TOKEN_UNALLOCATED) {
> + opal_async_tokens[token].state = ASYNC_TOKEN_ALLOCATED;
> + goto out;
> + }
>   }
> -
> - if (__test_and_set_bit(token, opal_async_token_map)) {
> - token = -EBUSY;
> - goto out;
> - }
> -
> - __clear_bit(token, opal_async_complete_map);
> -
>  out:
>   spin_unlock_irqrestore(_async_comp_lock, flags);
>   return token;
>  }

Resulting in:

 static int __opal_async_get_token(void)
 {
unsigned long flags;
+   int token = -EBUSY;
 
spin_lock_irqsave(_async_comp_lock, flags);
+   for (token = 0; token < opal_max_async_tokens; token++) {
+   if (opal_async_tokens[token].state == ASYNC_TOKEN_UNALLOCATED) {
+   opal_async_tokens[token].state = ASYNC_TOKEN_ALLOCATED;
+   goto out;
+   }
}
 out:
spin_unlock_irqrestore(_async_comp_lock, flags);
return token;
 }

So when no unallocated token is found we return opal_max_async_tokens :(

I changed it to:

static int __opal_async_get_token(void)
{
unsigned long flags;
int i, token = -EBUSY;

spin_lock_irqsave(_async_comp_lock, flags);

for (i = 0; i < opal_max_async_tokens; i++) {
if (opal_async_tokens[i].state == ASYNC_TOKEN_UNALLOCATED) {
opal_async_tokens[i].state = ASYNC_TOKEN_ALLOCATED;
token = i;
break;
}
}

spin_unlock_irqrestore(_async_comp_lock, flags);
return token;
}


>  
> +/*
> + * Note: If the returned token is used in an opal call and opal returns
> + * OPAL_ASYNC_COMPLETION you MUST opal_async_wait_response() before
 ^
 call
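
For reference, the caller pattern this token API feeds into looks roughly
like the following -- a sketch loosely modelled on opal_get_sensor_data();
the async helpers are the existing opal-async ones, but the surrounding
function and its error handling are illustrative only:

static int example_sensor_read(u32 sensor_hndl, u32 *out)
{
        struct opal_msg msg;
        __be32 data;
        int token, rc;

        token = opal_async_get_token_interruptible();
        if (token < 0)
                return token;

        rc = opal_sensor_read(sensor_hndl, token, &data);
        if (rc == OPAL_ASYNC_COMPLETION) {
                /* per the note above: wait before the token is reused */
                rc = opal_async_wait_response(token, &msg);
                if (!rc)
                        rc = opal_get_async_rc(msg);
        }

        if (rc == OPAL_SUCCESS)
                *out = be32_to_cpu(data);

        opal_async_release_token(token);
        return rc == OPAL_SUCCESS ? 0 : -EIO;
}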


cheers


[PATCH v9 51/51] selftests/powerpc: Add core file test for Protection Key register

2017-11-06 Thread Ram Pai
From: Thiago Jung Bauermann 

This test verifies that the AMR is being written to a
process' core file.

Signed-off-by: Thiago Jung Bauermann 
---
 tools/testing/selftests/powerpc/ptrace/Makefile|2 +-
 tools/testing/selftests/powerpc/ptrace/core-pkey.c |  438 
 2 files changed, 439 insertions(+), 1 deletions(-)
 create mode 100644 tools/testing/selftests/powerpc/ptrace/core-pkey.c

diff --git a/tools/testing/selftests/powerpc/ptrace/Makefile 
b/tools/testing/selftests/powerpc/ptrace/Makefile
index fd896b2..ca25fda 100644
--- a/tools/testing/selftests/powerpc/ptrace/Makefile
+++ b/tools/testing/selftests/powerpc/ptrace/Makefile
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 TEST_PROGS := ptrace-gpr ptrace-tm-gpr ptrace-tm-spd-gpr \
   ptrace-tar ptrace-tm-tar ptrace-tm-spd-tar ptrace-vsx 
ptrace-tm-vsx \
-  ptrace-tm-spd-vsx ptrace-tm-spr ptrace-pkey
+  ptrace-tm-spd-vsx ptrace-tm-spr ptrace-pkey core-pkey
 
 include ../../lib.mk
 
diff --git a/tools/testing/selftests/powerpc/ptrace/core-pkey.c 
b/tools/testing/selftests/powerpc/ptrace/core-pkey.c
new file mode 100644
index 000..2328f8c
--- /dev/null
+++ b/tools/testing/selftests/powerpc/ptrace/core-pkey.c
@@ -0,0 +1,438 @@
+/*
+ * Ptrace test for Memory Protection Key registers
+ *
+ * Copyright (C) 2015 Anshuman Khandual, IBM Corporation.
+ * Copyright (C) 2017 IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "ptrace.h"
+
+#ifndef __NR_pkey_alloc
+#define __NR_pkey_alloc 384
+#endif
+
+#ifndef __NR_pkey_free
+#define __NR_pkey_free 385
+#endif
+
+#ifndef NT_PPC_PKEY
+#define NT_PPC_PKEY 0x110
+#endif
+
+#ifndef PKEY_DISABLE_EXECUTE
+#define PKEY_DISABLE_EXECUTE   0x4
+#endif
+
+#define AMR_BITS_PER_PKEY 2
+#define PKEY_REG_BITS (sizeof(u64) * 8)
+#define pkeyshift(pkey) (PKEY_REG_BITS - ((pkey + 1) * AMR_BITS_PER_PKEY))
+
+#define CORE_FILE_LIMIT (5 * 1024 * 1024)   /* 5 MB should be enough */
+
+static const char core_pattern_file[] = "/proc/sys/kernel/core_pattern";
+
+static const char user_write[] = "[User Write (Running)]";
+static const char core_read_running[] = "[Core Read (Running)]";
+
+/* Information shared between the parent and the child. */
+struct shared_info {
+   /* AMR value the parent expects to read in the core file. */
+   unsigned long amr;
+
+   /* IAMR value the parent expects to read from the child. */
+   unsigned long iamr;
+
+   /* UAMOR value the parent expects to read from the child. */
+   unsigned long uamor;
+
+   /* When the child crashed. */
+   time_t core_time;
+};
+
+static int sys_pkey_alloc(unsigned long flags, unsigned long 
init_access_rights)
+{
+   return syscall(__NR_pkey_alloc, flags, init_access_rights);
+}
+
+static int sys_pkey_free(int pkey)
+{
+   return syscall(__NR_pkey_free, pkey);
+}
+
+static int increase_core_file_limit(void)
+{
+   struct rlimit rlim;
+   int ret;
+
+   ret = getrlimit(RLIMIT_CORE, );
+   FAIL_IF(ret);
+
+   if (rlim.rlim_cur != RLIM_INFINITY && rlim.rlim_cur < CORE_FILE_LIMIT) {
+   rlim.rlim_cur = CORE_FILE_LIMIT;
+
+   if (rlim.rlim_max != RLIM_INFINITY &&
+   rlim.rlim_max < CORE_FILE_LIMIT)
+   rlim.rlim_max = CORE_FILE_LIMIT;
+
+   ret = setrlimit(RLIMIT_CORE, );
+   FAIL_IF(ret);
+   }
+
+   ret = getrlimit(RLIMIT_FSIZE, );
+   FAIL_IF(ret);
+
+   if (rlim.rlim_cur != RLIM_INFINITY && rlim.rlim_cur < CORE_FILE_LIMIT) {
+   rlim.rlim_cur = CORE_FILE_LIMIT;
+
+   if (rlim.rlim_max != RLIM_INFINITY &&
+   rlim.rlim_max < CORE_FILE_LIMIT)
+   rlim.rlim_max = CORE_FILE_LIMIT;
+
+   ret = setrlimit(RLIMIT_FSIZE, );
+   FAIL_IF(ret);
+   }
+
+   return TEST_PASS;
+}
+
+static int child(struct shared_info *info)
+{
+   bool disable_execute = true;
+   int pkey1, pkey2, pkey3;
+   int *ptr, ret;
+
+   ret = increase_core_file_limit();
+   FAIL_IF(ret);
+
+   /* Get some pkeys so that we can change their bits in the AMR. */
+   pkey1 = sys_pkey_alloc(0, PKEY_DISABLE_EXECUTE);
+   if (pkey1 < 0) {
+   pkey1 = sys_pkey_alloc(0, 0);
+   FAIL_IF(pkey1 < 0);
+
+   disable_execute = false;
+   }
+
+   pkey2 = sys_pkey_alloc(0, 0);
+   FAIL_IF(pkey2 < 0);
+
+   pkey3 = sys_pkey_alloc(0, 0);
+   FAIL_IF(pkey3 < 0);
+
+   info->amr = 

[PATCH v9 50/51] selftests/powerpc: Add ptrace tests for Protection Key register

2017-11-06 Thread Ram Pai
From: Thiago Jung Bauermann 

This test exercises read and write access to the AMR.

Signed-off-by: Thiago Jung Bauermann 
---
 tools/testing/selftests/powerpc/include/reg.h  |1 +
 tools/testing/selftests/powerpc/ptrace/Makefile|5 +-
 .../testing/selftests/powerpc/ptrace/ptrace-pkey.c |  443 
 3 files changed, 448 insertions(+), 1 deletions(-)
 create mode 100644 tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c

diff --git a/tools/testing/selftests/powerpc/include/reg.h 
b/tools/testing/selftests/powerpc/include/reg.h
index 4afdebc..7f348c0 100644
--- a/tools/testing/selftests/powerpc/include/reg.h
+++ b/tools/testing/selftests/powerpc/include/reg.h
@@ -54,6 +54,7 @@
 #define SPRN_DSCR_PRIV 0x11/* Privilege State DSCR */
 #define SPRN_DSCR  0x03/* Data Stream Control Register */
 #define SPRN_PPR   896 /* Program Priority Register */
+#define SPRN_AMR   13  /* Authority Mask Register - problem state */
 
 /* TEXASR register bits */
 #define TEXASR_FC  0xFE00
diff --git a/tools/testing/selftests/powerpc/ptrace/Makefile 
b/tools/testing/selftests/powerpc/ptrace/Makefile
index 4803052..fd896b2 100644
--- a/tools/testing/selftests/powerpc/ptrace/Makefile
+++ b/tools/testing/selftests/powerpc/ptrace/Makefile
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 TEST_PROGS := ptrace-gpr ptrace-tm-gpr ptrace-tm-spd-gpr \
   ptrace-tar ptrace-tm-tar ptrace-tm-spd-tar ptrace-vsx 
ptrace-tm-vsx \
-  ptrace-tm-spd-vsx ptrace-tm-spr
+  ptrace-tm-spd-vsx ptrace-tm-spr ptrace-pkey
 
 include ../../lib.mk
 
@@ -9,6 +9,9 @@ all: $(TEST_PROGS)
 
 CFLAGS += -m64 -I../../../../../usr/include -I../tm -mhtm -fno-pie
 
+ptrace-pkey: ../harness.c ../utils.c ../lib/reg.S ptrace.h ptrace-pkey.c
+   $(LINK.c) $^ $(LDLIBS) -pthread -o $@
+
 $(TEST_PROGS): ../harness.c ../utils.c ../lib/reg.S ptrace.h
 
 clean:
diff --git a/tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c 
b/tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c
new file mode 100644
index 000..2e5b676
--- /dev/null
+++ b/tools/testing/selftests/powerpc/ptrace/ptrace-pkey.c
@@ -0,0 +1,443 @@
+/*
+ * Ptrace test for Memory Protection Key registers
+ *
+ * Copyright (C) 2015 Anshuman Khandual, IBM Corporation.
+ * Copyright (C) 2017 IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include 
+#include "ptrace.h"
+
+#ifndef __NR_pkey_alloc
+#define __NR_pkey_alloc 384
+#endif
+
+#ifndef __NR_pkey_free
+#define __NR_pkey_free 385
+#endif
+
+#ifndef NT_PPC_PKEY
+#define NT_PPC_PKEY 0x110
+#endif
+
+#ifndef PKEY_DISABLE_EXECUTE
+#define PKEY_DISABLE_EXECUTE   0x4
+#endif
+
+#define AMR_BITS_PER_PKEY 2
+#define PKEY_REG_BITS (sizeof(u64) * 8)
+#define pkeyshift(pkey) (PKEY_REG_BITS - ((pkey + 1) * AMR_BITS_PER_PKEY))
+
+static const char user_read[] = "[User Read (Running)]";
+static const char user_write[] = "[User Write (Running)]";
+static const char ptrace_read_running[] = "[Ptrace Read (Running)]";
+static const char ptrace_write_running[] = "[Ptrace Write (Running)]";
+
+/* Information shared between the parent and the child. */
+struct shared_info {
+   /* AMR value the parent expects to read from the child. */
+   unsigned long amr1;
+
+   /* AMR value the parent is expected to write to the child. */
+   unsigned long amr2;
+
+   /* AMR value that ptrace should refuse to write to the child. */
+   unsigned long amr3;
+
+   /* IAMR value the parent expects to read from the child. */
+   unsigned long expected_iamr;
+
+   /* UAMOR value the parent expects to read from the child. */
+   unsigned long expected_uamor;
+
+   /*
+* IAMR and UAMOR values that ptrace should refuse to write to the child
+* (even though they're valid ones) because userspace doesn't have
+* access to those registers.
+*/
+   unsigned long new_iamr;
+   unsigned long new_uamor;
+
+   /* The parent waits on this semaphore. */
+   sem_t sem_parent;
+
+   /* If true, the child should give up as well. */
+   bool parent_gave_up;
+
+   /* The child waits on this semaphore. */
+   sem_t sem_child;
+
+   /* If true, the parent should give up as well. */
+   bool child_gave_up;
+};
+
+#define CHILD_FAIL_IF(x, info) \
+   do {\
+   if ((x)) {  \
+   fprintf(stderr, \
+   "[FAIL] Test FAILED on line %d\n", 

[PATCH v9 49/51] selftest/vm: sub-page allocator

2017-11-06 Thread Ram Pai
Introduce a new allocator that allocates 4k hardware pages to back
a 64k linux page. This allocator is only applicable on powerpc.

Signed-off-by: Ram Pai 
---
 tools/testing/selftests/vm/protection_keys.c |   30 ++
 1 files changed, 30 insertions(+), 0 deletions(-)

diff --git a/tools/testing/selftests/vm/protection_keys.c 
b/tools/testing/selftests/vm/protection_keys.c
index c790bff..7b3649f 100644
--- a/tools/testing/selftests/vm/protection_keys.c
+++ b/tools/testing/selftests/vm/protection_keys.c
@@ -765,6 +765,35 @@ void free_pkey_malloc(void *ptr)
return ptr;
 }
 
+void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey)
+{
+#ifdef __powerpc64__
+   void *ptr;
+   int ret;
+
+   dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
+   size, prot, pkey);
+   pkey_assert(pkey < NR_PKEYS);
+   ptr = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+   pkey_assert(ptr != (void *)-1);
+
+   ret = syscall(__NR_subpage_prot, ptr, size, NULL);
+   if (ret) {
+   perror("subpage_perm");
+   return PTR_ERR_ENOTSUP;
+   }
+
+   ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey);
+   pkey_assert(!ret);
+   record_pkey_malloc(ptr, size);
+
+   dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr);
+   return ptr;
+#else /*  __powerpc64__ */
+   return PTR_ERR_ENOTSUP;
+#endif /*  __powerpc64__ */
+}
+
 void *malloc_pkey_anon_huge(long size, int prot, u16 pkey)
 {
int ret;
@@ -887,6 +916,7 @@ void setup_hugetlbfs(void)
 void *(*pkey_malloc[])(long size, int prot, u16 pkey) = {
 
malloc_pkey_with_mprotect,
+   malloc_pkey_with_mprotect_subpage,
malloc_pkey_anon_huge,
malloc_pkey_hugetlb
 /* can not do direct with the pkey_mprotect() API:
-- 
1.7.1



[PATCH v9 48/51] selftest/vm: detect write violation on a mapped access-denied-key page

2017-11-06 Thread Ram Pai
Detect a write violation on a page to which an access-disabled
key is associated well after the page is mapped.

Signed-off-by: Ram Pai 
---
 tools/testing/selftests/vm/protection_keys.c |   13 +
 1 files changed, 13 insertions(+), 0 deletions(-)

diff --git a/tools/testing/selftests/vm/protection_keys.c 
b/tools/testing/selftests/vm/protection_keys.c
index 0b7b826..c790bff 100644
--- a/tools/testing/selftests/vm/protection_keys.c
+++ b/tools/testing/selftests/vm/protection_keys.c
@@ -1058,6 +1058,18 @@ void test_write_of_access_disabled_region(int *ptr, u16 
pkey)
*ptr = __LINE__;
expected_pkey_fault(pkey);
 }
+
+void test_write_of_access_disabled_region_with_page_already_mapped(int *ptr,
+   u16 pkey)
+{
+   *ptr = __LINE__;
+   dprintf1("disabling access; after accessing the page, "
+   " to PKEY[%02d], doing write\n", pkey);
+   pkey_access_deny(pkey);
+   *ptr = __LINE__;
+   expected_pkey_fault(pkey);
+}
+
 void test_kernel_write_of_access_disabled_region(int *ptr, u16 pkey)
 {
int ret;
@@ -1342,6 +1354,7 @@ void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 
pkey)
test_write_of_write_disabled_region,
test_write_of_write_disabled_region_with_page_already_mapped,
test_write_of_access_disabled_region,
+   test_write_of_access_disabled_region_with_page_already_mapped,
test_kernel_write_of_access_disabled_region,
test_kernel_write_of_write_disabled_region,
test_kernel_gup_of_access_disabled_region,
-- 
1.7.1



[PATCH v9 47/51] selftest/vm: associate key on a mapped page and detect write violation

2017-11-06 Thread Ram Pai
Detect a write violation on a page to which a write-disabled
key is associated well after the page is mapped.

Signed-off-by: Ram Pai 
---
 tools/testing/selftests/vm/protection_keys.c |   12 
 1 files changed, 12 insertions(+), 0 deletions(-)

diff --git a/tools/testing/selftests/vm/protection_keys.c 
b/tools/testing/selftests/vm/protection_keys.c
index 998a44f..0b7b826 100644
--- a/tools/testing/selftests/vm/protection_keys.c
+++ b/tools/testing/selftests/vm/protection_keys.c
@@ -1033,6 +1033,17 @@ void 
test_read_of_access_disabled_region_with_page_already_mapped(int *ptr,
expected_pkey_fault(pkey);
 }
 
+void test_write_of_write_disabled_region_with_page_already_mapped(int *ptr,
+   u16 pkey)
+{
+   *ptr = __LINE__;
+   dprintf1("disabling write access; after accessing the page, "
+   "to PKEY[%02d], doing write\n", pkey);
+   pkey_write_deny(pkey);
+   *ptr = __LINE__;
+   expected_pkey_fault(pkey);
+}
+
 void test_write_of_write_disabled_region(int *ptr, u16 pkey)
 {
dprintf1("disabling write access to PKEY[%02d], doing write\n", pkey);
@@ -1329,6 +1340,7 @@ void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 
pkey)
test_read_of_access_disabled_region,
test_read_of_access_disabled_region_with_page_already_mapped,
test_write_of_write_disabled_region,
+   test_write_of_write_disabled_region_with_page_already_mapped,
test_write_of_access_disabled_region,
test_kernel_write_of_access_disabled_region,
test_kernel_write_of_write_disabled_region,
-- 
1.7.1



[PATCH v9 46/51] selftest/vm: associate key on a mapped page and detect access violation

2017-11-06 Thread Ram Pai
Detect an access violation on a page to which an access-disabled
key is associated well after the page is mapped.

Signed-off-by: Ram Pai 
---
 tools/testing/selftests/vm/protection_keys.c |   19 +++
 1 files changed, 19 insertions(+), 0 deletions(-)

diff --git a/tools/testing/selftests/vm/protection_keys.c 
b/tools/testing/selftests/vm/protection_keys.c
index 8f0dd94..998a44f 100644
--- a/tools/testing/selftests/vm/protection_keys.c
+++ b/tools/testing/selftests/vm/protection_keys.c
@@ -1015,6 +1015,24 @@ void test_read_of_access_disabled_region(int *ptr, u16 
pkey)
dprintf1("*ptr: %d\n", ptr_contents);
expected_pkey_fault(pkey);
 }
+
+void test_read_of_access_disabled_region_with_page_already_mapped(int *ptr,
+   u16 pkey)
+{
+   int ptr_contents;
+
+   dprintf1("disabling access to PKEY[%02d], doing read @ %p\n",
+   pkey, ptr);
+   ptr_contents = read_ptr(ptr);
+   dprintf1("reading ptr before disabling the read : %d\n",
+   ptr_contents);
+   rdpkey_reg();
+   pkey_access_deny(pkey);
+   ptr_contents = read_ptr(ptr);
+   dprintf1("*ptr: %d\n", ptr_contents);
+   expected_pkey_fault(pkey);
+}
+
 void test_write_of_write_disabled_region(int *ptr, u16 pkey)
 {
dprintf1("disabling write access to PKEY[%02d], doing write\n", pkey);
@@ -1309,6 +1327,7 @@ void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 
pkey)
 void (*pkey_tests[])(int *ptr, u16 pkey) = {
test_read_of_write_disabled_region,
test_read_of_access_disabled_region,
+   test_read_of_access_disabled_region_with_page_already_mapped,
test_write_of_write_disabled_region,
test_write_of_access_disabled_region,
test_kernel_write_of_access_disabled_region,
-- 
1.7.1



[PATCH v9 45/51] selftest/vm: fix an assertion in test_pkey_alloc_exhaust()

2017-11-06 Thread Ram Pai
The maximum number of keys that can be allocated has to
take into consideration that some keys are reserved by
the architecture for specific purposes and hence cannot
be allocated.

Fix the assertion in test_pkey_alloc_exhaust().

Signed-off-by: Ram Pai 
---
 tools/testing/selftests/vm/pkey-helpers.h|   14 ++
 tools/testing/selftests/vm/protection_keys.c |9 -
 2 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/tools/testing/selftests/vm/pkey-helpers.h 
b/tools/testing/selftests/vm/pkey-helpers.h
index f764d66..3ea3e06 100644
--- a/tools/testing/selftests/vm/pkey-helpers.h
+++ b/tools/testing/selftests/vm/pkey-helpers.h
@@ -388,4 +388,18 @@ static inline int get_start_key(void)
 #endif /* arch */
 }
 
+static inline int arch_reserved_keys(void)
+{
+#if defined(__i386__) || defined(__x86_64__) /* arch */
+   return NR_RESERVED_PKEYS;
+#elif __powerpc64__ /* arch */
+   if (sysconf(_SC_PAGESIZE) == 4096)
+   return NR_RESERVED_PKEYS_4K;
+   else
+   return NR_RESERVED_PKEYS_64K;
+#else /* arch */
+   NOT SUPPORTED
+#endif /* arch */
+}
+
 #endif /* _PKEYS_HELPER_H */
diff --git a/tools/testing/selftests/vm/protection_keys.c 
b/tools/testing/selftests/vm/protection_keys.c
index 4fe42cc..8f0dd94 100644
--- a/tools/testing/selftests/vm/protection_keys.c
+++ b/tools/testing/selftests/vm/protection_keys.c
@@ -1166,12 +1166,11 @@ void test_pkey_alloc_exhaust(int *ptr, u16 pkey)
pkey_assert(i < NR_PKEYS*2);
 
/*
-* There are 16 pkeys supported in hardware.  One is taken
-* up for the default (0) and another can be taken up by
-* an execute-only mapping.  Ensure that we can allocate
-* at least 14 (16-2).
+* There are NR_PKEYS pkeys supported in hardware. arch_reserved_keys()
+* are reserved. One   can   be   taken   up by an execute-only mapping.
+* Ensure that we can allocate at least the remaining.
 */
-   pkey_assert(i >= NR_PKEYS-2);
+   pkey_assert(i >= (NR_PKEYS-arch_reserved_keys()-1));
 
for (i = 0; i < nr_allocated_pkeys; i++) {
err = sys_pkey_free(allocated_pkeys[i]);
-- 
1.7.1



[PATCH v9 44/51] selftest/vm: powerpc implementation for generic abstraction

2017-11-06 Thread Ram Pai
Introduce the powerpc implementation of the different
abstractions.

Signed-off-by: Ram Pai 
---
 tools/testing/selftests/vm/pkey-helpers.h|  109 ++
 tools/testing/selftests/vm/protection_keys.c |   38 ++
 2 files changed, 117 insertions(+), 30 deletions(-)

diff --git a/tools/testing/selftests/vm/pkey-helpers.h 
b/tools/testing/selftests/vm/pkey-helpers.h
index 30755be..f764d66 100644
--- a/tools/testing/selftests/vm/pkey-helpers.h
+++ b/tools/testing/selftests/vm/pkey-helpers.h
@@ -18,27 +18,54 @@
 #define u16 uint16_t
 #define u32 uint32_t
 #define u64 uint64_t
-#define pkey_reg_t u32
 
-#ifdef __i386__
+#if defined(__i386__) || defined(__x86_64__) /* arch */
+
+#ifdef __i386__ /* arch */
 #define SYS_mprotect_key 380
-#define SYS_pkey_alloc  381
-#define SYS_pkey_free   382
+#define SYS_pkey_alloc   381
+#define SYS_pkey_free382
 #define REG_IP_IDX REG_EIP
 #define si_pkey_offset 0x14
-#else
+#elif __x86_64__
 #define SYS_mprotect_key 329
-#define SYS_pkey_alloc  330
-#define SYS_pkey_free   331
+#define SYS_pkey_alloc   330
+#define SYS_pkey_free331
 #define REG_IP_IDX REG_RIP
 #define si_pkey_offset 0x20
-#endif
+#endif /* __x86_64__ */
+
+#define NR_PKEYS   16
+#define NR_RESERVED_PKEYS  1
+#define PKEY_BITS_PER_PKEY 2
+#define PKEY_DISABLE_ACCESS0x1
+#define PKEY_DISABLE_WRITE 0x2
+#define HPAGE_SIZE (1UL<<21)
+#define pkey_reg_t u32
 
-#define NR_PKEYS 16
-#define PKEY_BITS_PER_PKEY 2
-#define PKEY_DISABLE_ACCESS0x1
-#define PKEY_DISABLE_WRITE 0x2
-#define HPAGE_SIZE (1UL<<21)
+#elif __powerpc64__ /* arch */
+
+#define SYS_mprotect_key 386
+#define SYS_pkey_alloc  384
+#define SYS_pkey_free   385
+#define si_pkey_offset 0x20
+#define REG_IP_IDX PT_NIP
+#define REG_TRAPNO PT_TRAP
+#define gregs gp_regs
+#define fpregs fp_regs
+
+#define NR_PKEYS   32
+#define NR_RESERVED_PKEYS_4K   26
+#define NR_RESERVED_PKEYS_64K  3
+#define PKEY_BITS_PER_PKEY 2
+#define PKEY_DISABLE_ACCESS0x3  /* disable read and write */
+#define PKEY_DISABLE_WRITE 0x2
+#define HPAGE_SIZE (1UL<<24)
+#define pkey_reg_t u64
+
+#else /* arch */
+   NOT SUPPORTED
+#endif /* arch */
 
 #ifndef DEBUG_LEVEL
 #define DEBUG_LEVEL 0
@@ -47,7 +74,11 @@
 
 static inline u32 pkey_to_shift(int pkey)
 {
+#if defined(__i386__) || defined(__x86_64__) /* arch */
return pkey * PKEY_BITS_PER_PKEY;
+#elif __powerpc64__ /* arch */
+   return (NR_PKEYS - pkey - 1) * PKEY_BITS_PER_PKEY;
+#endif /* arch */
 }
 
 static inline pkey_reg_t reset_bits(int pkey, pkey_reg_t bits)
@@ -108,6 +139,7 @@ static inline void sigsafe_printf(const char *format, ...)
 extern pkey_reg_t shadow_pkey_reg;
 static inline pkey_reg_t __rdpkey_reg(void)
 {
+#if defined(__i386__) || defined(__x86_64__) /* arch */
unsigned int eax, edx;
unsigned int ecx = 0;
pkey_reg_t pkey_reg;
@@ -115,7 +147,13 @@ static inline pkey_reg_t __rdpkey_reg(void)
asm volatile(".byte 0x0f,0x01,0xee\n\t"
 : "=a" (eax), "=d" (edx)
 : "c" (ecx));
-   pkey_reg = eax;
+#elif __powerpc64__ /* arch */
+   pkey_reg_t eax;
+   pkey_reg_t pkey_reg;
+
+   asm volatile("mfspr %0, 0xd" : "=r" ((pkey_reg_t)(eax)));
+#endif /* arch */
+   pkey_reg = (pkey_reg_t)eax;
return pkey_reg;
 }
 
@@ -135,6 +173,7 @@ static inline pkey_reg_t _rdpkey_reg(int line)
 static inline void __wrpkey_reg(pkey_reg_t pkey_reg)
 {
pkey_reg_t eax = pkey_reg;
+#if defined(__i386__) || defined(__x86_64__) /* arch */
pkey_reg_t ecx = 0;
pkey_reg_t edx = 0;
 
@@ -143,6 +182,14 @@ static inline void __wrpkey_reg(pkey_reg_t pkey_reg)
asm volatile(".byte 0x0f,0x01,0xef\n\t"
 : : "a" (eax), "c" (ecx), "d" (edx));
assert(pkey_reg == __rdpkey_reg());
+
+#elif __powerpc64__ /* arch */
+   dprintf4("%s() changing %llx to %llx\n",
+__func__, __rdpkey_reg(), pkey_reg);
+   asm volatile("mtspr 0xd, %0" : : "r" ((unsigned long)(eax)) : "memory");
+#endif /* arch */
+   dprintf4("%s() pkey register after changing %016lx to %016lx\n",
+__func__, __rdpkey_reg(), pkey_reg);
 }
 
 static inline void wrpkey_reg(pkey_reg_t pkey_reg)
@@ -189,6 +236,8 @@ static inline void __pkey_write_allow(int pkey, int 
do_allow_write)
dprintf4("pkey_reg now: %08x\n", rdpkey_reg());
 }
 
+#if defined(__i386__) || defined(__x86_64__) /* arch */
+
 #define PAGE_SIZE 4096
 #define MB (1<<20)
 
@@ -271,8 +320,18 @@ static inline void __page_o_noops(void)
/* 8-bytes of instruction * 512 bytes = 1 page */
asm(".rept 512 ; nopl 0x7eee(%eax) ; .endr");
 }
+#elif __powerpc64__ /* arch */
 
-#endif /* _PKEYS_HELPER_H */
+#define PAGE_SIZE (0x1UL << 16)
+static inline int cpu_has_pku(void)
+{
+   return 1;
+}
+
+/* 8-bytes of instruction * 16384bytes = 1 page */
+#define 

[PATCH v9 43/51] selftest/vm: generic cleanup

2017-11-06 Thread Ram Pai
Clean up the code to satisfy coding style.

Signed-off-by: Ram Pai 
---
 tools/testing/selftests/vm/protection_keys.c |   81 ++
 1 files changed, 43 insertions(+), 38 deletions(-)

diff --git a/tools/testing/selftests/vm/protection_keys.c 
b/tools/testing/selftests/vm/protection_keys.c
index 2600f7a..3868434 100644
--- a/tools/testing/selftests/vm/protection_keys.c
+++ b/tools/testing/selftests/vm/protection_keys.c
@@ -4,7 +4,7 @@
  *
  * There are examples in here of:
  *  * how to set protection keys on memory
- *  * how to set/clear bits in pkey registers (the rights register)
+ *  * how to set/clear bits in Protection Key registers (the rights register)
  *  * how to handle SEGV_PKUERR signals and extract pkey-relevant
  *information from the siginfo
  *
@@ -13,13 +13,18 @@
  * prefault pages in at malloc, or not
  * protect MPX bounds tables with protection keys?
  * make sure VMA splitting/merging is working correctly
- * OOMs can destroy mm->mmap (see exit_mmap()), so make sure it is immune 
to pkeys
- * look for pkey "leaks" where it is still set on a VMA but "freed" back 
to the kernel
- * do a plain mprotect() to a mprotect_pkey() area and make sure the pkey 
sticks
+ * OOMs can destroy mm->mmap (see exit_mmap()),
+ * so make sure it is immune to pkeys
+ * look for pkey "leaks" where it is still set on a VMA
+ *  but "freed" back to the kernel
+ * do a plain mprotect() to a mprotect_pkey() area and make
+ *  sure the pkey sticks
  *
  * Compile like this:
- * gcc  -o protection_keys-O2 -g -std=gnu99 -pthread -Wall 
protection_keys.c -lrt -ldl -lm
- * gcc -m32 -o protection_keys_32 -O2 -g -std=gnu99 -pthread -Wall 
protection_keys.c -lrt -ldl -lm
+ * gcc  -o protection_keys-O2 -g -std=gnu99
+ *  -pthread -Wall protection_keys.c -lrt -ldl -lm
+ * gcc -m32 -o protection_keys_32 -O2 -g -std=gnu99
+ *  -pthread -Wall protection_keys.c -lrt -ldl -lm
  */
 #define _GNU_SOURCE
 #include 
@@ -251,26 +256,11 @@ void signal_handler(int signum, siginfo_t *si, void 
*vucontext)
dprintf1("signal pkey_reg from  pkey_reg: %016lx\n", __rdpkey_reg());
dprintf1("si_pkey from siginfo: %jx\n", si_pkey);
*(u64 *)pkey_reg_ptr = 0x;
-   dprintf1("WARNING: set PRKU=0 to allow faulting instruction to 
continue\n");
+   dprintf1("WARNING: set PKEY_REG=0 to allow faulting instruction "
+   "to continue\n");
pkey_faults++;
dprintf1("==\n");
return;
-   if (trapno == 14) {
-   fprintf(stderr,
-   "ERROR: In signal handler, page fault, trapno = %d, ip 
= %016lx\n",
-   trapno, ip);
-   fprintf(stderr, "si_addr %p\n", si->si_addr);
-   fprintf(stderr, "REG_ERR: %lx\n",
-   (unsigned 
long)uctxt->uc_mcontext.gregs[REG_ERR]);
-   exit(1);
-   } else {
-   fprintf(stderr, "unexpected trap %d! at 0x%lx\n", trapno, ip);
-   fprintf(stderr, "si_addr %p\n", si->si_addr);
-   fprintf(stderr, "REG_ERR: %lx\n",
-   (unsigned 
long)uctxt->uc_mcontext.gregs[REG_ERR]);
-   exit(2);
-   }
-   dprint_in_signal = 0;
 }
 
 int wait_all_children(void)
@@ -415,7 +405,7 @@ void pkey_disable_set(int pkey, int flags)
 {
unsigned long syscall_flags = 0;
int ret;
-   int pkey_rights;
+   u32 pkey_rights;
pkey_reg_t orig_pkey_reg = rdpkey_reg();
 
dprintf1("START->%s(%d, 0x%x)\n", __func__,
@@ -453,7 +443,7 @@ void pkey_disable_clear(int pkey, int flags)
 {
unsigned long syscall_flags = 0;
int ret;
-   int pkey_rights = pkey_get(pkey, syscall_flags);
+   u32 pkey_rights = pkey_get(pkey, syscall_flags);
pkey_reg_t orig_pkey_reg = rdpkey_reg();
 
pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
@@ -516,9 +506,10 @@ int sys_mprotect_pkey(void *ptr, size_t size, unsigned 
long orig_prot,
return sret;
 }
 
-int sys_pkey_alloc(unsigned long flags, unsigned long init_val)
+int sys_pkey_alloc(unsigned long flags, u64 init_val)
 {
int ret = syscall(SYS_pkey_alloc, flags, init_val);
+
dprintf1("%s(flags=%lx, init_val=%lx) syscall ret: %d errno: %d\n",
__func__, flags, init_val, ret, errno);
return ret;
@@ -542,7 +533,7 @@ void pkey_set_shadow(u32 key, u64 init_val)
 int alloc_pkey(void)
 {
int ret;
-   unsigned long init_val = 0x0;
+   u64 init_val = 0x0;
 
dprintf1("%s()::%d, pkey_reg: 0x%016lx shadow: %016lx\n", __func__,
__LINE__, __rdpkey_reg(), shadow_pkey_reg);
@@ -692,7 +683,9 @@ void record_pkey_malloc(void 

[PATCH v9 42/51] selftest/vm: pkey register should match shadow pkey

2017-11-06 Thread Ram Pai
expected_pkey_fault() compares the contents of the pkey
register with 0. This may not hold all the time: there
could be bits set by default by the architecture
which can never be changed. Hence compare the value against
the shadow pkey register, which is supposed to track the bits
accurately throughout.

Signed-off-by: Ram Pai 
---
 tools/testing/selftests/vm/protection_keys.c |4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/vm/protection_keys.c 
b/tools/testing/selftests/vm/protection_keys.c
index 19ae991..2600f7a 100644
--- a/tools/testing/selftests/vm/protection_keys.c
+++ b/tools/testing/selftests/vm/protection_keys.c
@@ -926,10 +926,10 @@ void expected_pkey_fault(int pkey)
pkey_assert(last_pkey_faults + 1 == pkey_faults);
pkey_assert(last_si_pkey == pkey);
/*
-* The signal handler shold have cleared out PKEY register to let the
+* The signal handler shold have cleared out pkey-register to let the
 * test program continue.  We now have to restore it.
 */
-   if (__rdpkey_reg() != 0)
+   if (__rdpkey_reg() != shadow_pkey_reg)
pkey_assert(0);
 
__wrpkey_reg(shadow_pkey_reg);
-- 
1.7.1



[PATCH v9 41/51] selftest/vm: introduce two arch independent abstraction

2017-11-06 Thread Ram Pai
open_hugepage_file() <-- opens the huge page file
get_start_key()      <-- provides the first non-reserved key.

Signed-off-by: Ram Pai 
---
 tools/testing/selftests/vm/pkey-helpers.h|   11 +++
 tools/testing/selftests/vm/protection_keys.c |6 +++---
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/vm/pkey-helpers.h 
b/tools/testing/selftests/vm/pkey-helpers.h
index d521f53..30755be 100644
--- a/tools/testing/selftests/vm/pkey-helpers.h
+++ b/tools/testing/selftests/vm/pkey-helpers.h
@@ -301,3 +301,14 @@ static inline void __page_o_noops(void)
}   \
 } while (0)
 #define raw_assert(cond) assert(cond)
+
+static inline int open_hugepage_file(int flag)
+{
+   return open("/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages",
+O_RDONLY);
+}
+
+static inline int get_start_key(void)
+{
+   return 1;
+}
diff --git a/tools/testing/selftests/vm/protection_keys.c 
b/tools/testing/selftests/vm/protection_keys.c
index 1a14027..19ae991 100644
--- a/tools/testing/selftests/vm/protection_keys.c
+++ b/tools/testing/selftests/vm/protection_keys.c
@@ -809,7 +809,7 @@ void setup_hugetlbfs(void)
 * Now go make sure that we got the pages and that they
 * are 2M pages.  Someone might have made 1G the default.
 */
-   fd = open("/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages", 
O_RDONLY);
+   fd = open_hugepage_file(O_RDONLY);
if (fd < 0) {
perror("opening sysfs 2M hugetlb config");
return;
@@ -1087,10 +1087,10 @@ void test_kernel_gup_write_to_write_disabled_region(int 
*ptr, u16 pkey)
 void test_pkey_syscalls_on_non_allocated_pkey(int *ptr, u16 pkey)
 {
int err;
-   int i;
+   int i = get_start_key();
 
/* Note: 0 is the default pkey, so don't mess with it */
-   for (i = 1; i < NR_PKEYS; i++) {
+   for (; i < NR_PKEYS; i++) {
if (pkey == i)
continue;
 
-- 
1.7.1



[PATCH v9 40/51] selftest/vm: fix alloc_random_pkey() to make it really random

2017-11-06 Thread Ram Pai
alloc_random_pkey() was allocating the same pkey every time,
so not all pkeys were getting tested. Fixed it.

Signed-off-by: Ram Pai 
---
 tools/testing/selftests/vm/protection_keys.c |   10 +++---
 1 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/vm/protection_keys.c 
b/tools/testing/selftests/vm/protection_keys.c
index 2823d4d..1a14027 100644
--- a/tools/testing/selftests/vm/protection_keys.c
+++ b/tools/testing/selftests/vm/protection_keys.c
@@ -24,6 +24,7 @@
 #define _GNU_SOURCE
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -602,13 +603,15 @@ int alloc_random_pkey(void)
int alloced_pkeys[NR_PKEYS];
int nr_alloced = 0;
int random_index;
+
memset(alloced_pkeys, 0, sizeof(alloced_pkeys));
+   srand((unsigned int)time(NULL));
 
/* allocate every possible key and make a note of which ones we got */
max_nr_pkey_allocs = NR_PKEYS;
-   max_nr_pkey_allocs = 1;
for (i = 0; i < max_nr_pkey_allocs; i++) {
int new_pkey = alloc_pkey();
+
if (new_pkey < 0)
break;
alloced_pkeys[nr_alloced++] = new_pkey;
@@ -624,13 +627,14 @@ int alloc_random_pkey(void)
/* go through the allocated ones that we did not want and free them */
for (i = 0; i < nr_alloced; i++) {
int free_ret;
+
if (!alloced_pkeys[i])
continue;
free_ret = sys_pkey_free(alloced_pkeys[i]);
pkey_assert(!free_ret);
}
-   dprintf1("%s()::%d, ret: %d pkey_reg: 0x%x shadow: 0x%x\n", __func__,
-   __LINE__, ret, __rdpkey_reg(), shadow_pkey_reg);
+   dprintf1("%s()::%d, ret: %d pkey_reg: 0x%x shadow: 0x%016lx\n",
+   __func__, __LINE__, ret, __rdpkey_reg(), shadow_pkey_reg);
return ret;
 }
 
-- 
1.7.1



[PATCH v9 39/51] selftest/vm: clear the bits in shadow reg when a pkey is freed.

2017-11-06 Thread Ram Pai
When a key is freed, the key is no longer effective.
Clear the bits corresponding to the pkey in the shadow
register. Otherwise it will carry some spurious bits
which can trigger false-positive asserts.

Signed-off-by: Ram Pai 
---
 tools/testing/selftests/vm/protection_keys.c |3 +++
 1 files changed, 3 insertions(+), 0 deletions(-)

diff --git a/tools/testing/selftests/vm/protection_keys.c 
b/tools/testing/selftests/vm/protection_keys.c
index 384cc9a..2823d4d 100644
--- a/tools/testing/selftests/vm/protection_keys.c
+++ b/tools/testing/selftests/vm/protection_keys.c
@@ -582,6 +582,9 @@ int alloc_pkey(void)
 int sys_pkey_free(unsigned long pkey)
 {
int ret = syscall(SYS_pkey_free, pkey);
+
+   if (!ret)
+   shadow_pkey_reg &= reset_bits(pkey, PKEY_DISABLE_ACCESS);
dprintf1("%s(pkey=%ld) syscall ret: %d\n", __func__, pkey, ret);
return ret;
 }
-- 
1.7.1



[PATCH v9 38/51] selftest/vm: fixed bugs in pkey_disable_clear()

2017-11-06 Thread Ram Pai
Instead of clearing the bits, pkey_disable_clear() was setting
them. Fixed it.

Also fixed a wrong assertion in that function: when bits are
cleared, the resulting value will be less than the original.

Signed-off-by: Ram Pai 
---
 tools/testing/selftests/vm/protection_keys.c |4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/vm/protection_keys.c 
b/tools/testing/selftests/vm/protection_keys.c
index 5aba137..384cc9a 100644
--- a/tools/testing/selftests/vm/protection_keys.c
+++ b/tools/testing/selftests/vm/protection_keys.c
@@ -461,7 +461,7 @@ void pkey_disable_clear(int pkey, int flags)
pkey, pkey, pkey_rights);
pkey_assert(pkey_rights >= 0);
 
-   pkey_rights |= flags;
+   pkey_rights &= ~flags;
 
ret = pkey_set(pkey, pkey_rights, 0);
/* pkey_reg and flags have the same format */
@@ -475,7 +475,7 @@ void pkey_disable_clear(int pkey, int flags)
dprintf1("%s(%d) pkey_reg: 0x%016lx\n", __func__,
pkey, rdpkey_reg());
if (flags)
-   assert(rdpkey_reg() > orig_pkey_reg);
+   assert(rdpkey_reg() < orig_pkey_reg);
 }
 
 void pkey_write_allow(int pkey)
-- 
1.7.1



[PATCH v9 37/51] selftest/vm: fix the wrong assert in pkey_disable_set()

2017-11-06 Thread Ram Pai
If the flag is 0, no bits will be set. Hence we can't expect
the resulting bitmap to have a higher value than it had
before.

Signed-off-by: Ram Pai 
---
 tools/testing/selftests/vm/protection_keys.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/tools/testing/selftests/vm/protection_keys.c 
b/tools/testing/selftests/vm/protection_keys.c
index 8e2e277..5aba137 100644
--- a/tools/testing/selftests/vm/protection_keys.c
+++ b/tools/testing/selftests/vm/protection_keys.c
@@ -443,7 +443,7 @@ void pkey_disable_set(int pkey, int flags)
dprintf1("%s(%d) pkey_reg: 0x%lx\n",
__func__, pkey, rdpkey_reg());
if (flags)
-   pkey_assert(rdpkey_reg() > orig_pkey_reg);
+   pkey_assert(rdpkey_reg() >= orig_pkey_reg);
dprintf1("END<---%s(%d, 0x%x)\n", __func__,
pkey, flags);
 }
-- 
1.7.1



[PATCH v9 36/51] selftest/vm: generic function to handle shadow key register

2017-11-06 Thread Ram Pai
Helper functions to handle the shadow pkey register.

Signed-off-by: Ram Pai 
---
 tools/testing/selftests/vm/pkey-helpers.h|   27 
 tools/testing/selftests/vm/protection_keys.c |   34 -
 2 files changed, 49 insertions(+), 12 deletions(-)

diff --git a/tools/testing/selftests/vm/pkey-helpers.h 
b/tools/testing/selftests/vm/pkey-helpers.h
index b03f7e5..d521f53 100644
--- a/tools/testing/selftests/vm/pkey-helpers.h
+++ b/tools/testing/selftests/vm/pkey-helpers.h
@@ -44,6 +44,33 @@
 #define DEBUG_LEVEL 0
 #endif
 #define DPRINT_IN_SIGNAL_BUF_SIZE 4096
+
+static inline u32 pkey_to_shift(int pkey)
+{
+   return pkey * PKEY_BITS_PER_PKEY;
+}
+
+static inline pkey_reg_t reset_bits(int pkey, pkey_reg_t bits)
+{
+   u32 shift = pkey_to_shift(pkey);
+
+   return ~(bits << shift);
+}
+
+static inline pkey_reg_t left_shift_bits(int pkey, pkey_reg_t bits)
+{
+   u32 shift = pkey_to_shift(pkey);
+
+   return (bits << shift);
+}
+
+static inline pkey_reg_t right_shift_bits(int pkey, pkey_reg_t bits)
+{
+   u32 shift = pkey_to_shift(pkey);
+
+   return (bits >> shift);
+}
+
 extern int dprint_in_signal;
 extern char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE];
 static inline void sigsafe_printf(const char *format, ...)
diff --git a/tools/testing/selftests/vm/protection_keys.c 
b/tools/testing/selftests/vm/protection_keys.c
index 2e8de01..8e2e277 100644
--- a/tools/testing/selftests/vm/protection_keys.c
+++ b/tools/testing/selftests/vm/protection_keys.c
@@ -374,7 +374,7 @@ u32 pkey_get(int pkey, unsigned long flags)
__func__, pkey, flags, 0, 0);
dprintf2("%s() raw pkey_reg: %x\n", __func__, pkey_reg);
 
-   shifted_pkey_reg = (pkey_reg >> (pkey * PKEY_BITS_PER_PKEY));
+   shifted_pkey_reg = right_shift_bits(pkey, pkey_reg);
dprintf2("%s() shifted_pkey_reg: %x\n", __func__, shifted_pkey_reg);
masked_pkey_reg = shifted_pkey_reg & mask;
dprintf2("%s() masked  pkey_reg: %x\n", __func__, masked_pkey_reg);
@@ -397,9 +397,9 @@ int pkey_set(int pkey, unsigned long rights, unsigned long 
flags)
/* copy old pkey_reg */
new_pkey_reg = old_pkey_reg;
/* mask out bits from pkey in old value: */
-   new_pkey_reg &= ~(mask << (pkey * PKEY_BITS_PER_PKEY));
+   new_pkey_reg &= reset_bits(pkey, mask);
/* OR in new bits for pkey: */
-   new_pkey_reg |= (rights << (pkey * PKEY_BITS_PER_PKEY));
+   new_pkey_reg |= left_shift_bits(pkey, rights);
 
__wrpkey_reg(new_pkey_reg);
 
@@ -430,7 +430,7 @@ void pkey_disable_set(int pkey, int flags)
ret = pkey_set(pkey, pkey_rights, syscall_flags);
assert(!ret);
/*pkey_reg and flags have the same format */
-   shadow_pkey_reg |= flags << (pkey * 2);
+   shadow_pkey_reg |= left_shift_bits(pkey, flags);
dprintf1("%s(%d) shadow: 0x%016lx\n",
__func__, pkey, shadow_pkey_reg);
 
@@ -465,7 +465,7 @@ void pkey_disable_clear(int pkey, int flags)
 
ret = pkey_set(pkey, pkey_rights, 0);
/* pkey_reg and flags have the same format */
-   shadow_pkey_reg &= ~(flags << (pkey * 2));
+   shadow_pkey_reg &= reset_bits(pkey, flags);
pkey_assert(ret >= 0);
 
pkey_rights = pkey_get(pkey, syscall_flags);
@@ -523,6 +523,21 @@ int sys_pkey_alloc(unsigned long flags, unsigned long 
init_val)
return ret;
 }
 
+void pkey_setup_shadow(void)
+{
+   shadow_pkey_reg = __rdpkey_reg();
+}
+
+void pkey_reset_shadow(u32 key)
+{
+   shadow_pkey_reg &= reset_bits(key, 0x3);
+}
+
+void pkey_set_shadow(u32 key, u64 init_val)
+{
+   shadow_pkey_reg |=  left_shift_bits(key, init_val);
+}
+
 int alloc_pkey(void)
 {
int ret;
@@ -540,7 +555,7 @@ int alloc_pkey(void)
shadow_pkey_reg);
if (ret) {
/* clear both the bits: */
-   shadow_pkey_reg &= ~(0x3  << (ret * 2));
+   pkey_reset_shadow(ret);
dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016lx "
"shadow: 0x%016lx\n",
__func__,
@@ -550,7 +565,7 @@ int alloc_pkey(void)
 * move the new state in from init_val
 * (remember, we cheated and init_val == pkey_reg format)
 */
-   shadow_pkey_reg |=  (init_val << (ret * 2));
+   pkey_set_shadow(ret, init_val);
}
dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016lx shadow: 0x%016lx\n",
__func__, __LINE__, ret, __rdpkey_reg(),
@@ -1322,11 +1337,6 @@ void run_tests_once(void)
iteration_nr++;
 }
 
-void pkey_setup_shadow(void)
-{
-   shadow_pkey_reg = __rdpkey_reg();
-}
-
 int main(void)
 {
int nr_iterations = 22;
-- 
1.7.1



[PATCH v9 35/51] selftest/vm: typecast the pkey register

2017-11-06 Thread Ram Pai
This is in preparation for accommodating a different register size
across architectures.

Signed-off-by: Ram Pai 
---
 tools/testing/selftests/vm/pkey-helpers.h|   27 +-
 tools/testing/selftests/vm/protection_keys.c |   71 ++
 2 files changed, 52 insertions(+), 46 deletions(-)

diff --git a/tools/testing/selftests/vm/pkey-helpers.h 
b/tools/testing/selftests/vm/pkey-helpers.h
index 1b15b54..b03f7e5 100644
--- a/tools/testing/selftests/vm/pkey-helpers.h
+++ b/tools/testing/selftests/vm/pkey-helpers.h
@@ -18,6 +18,7 @@
 #define u16 uint16_t
 #define u32 uint32_t
 #define u64 uint64_t
+#define pkey_reg_t u32
 
 #ifdef __i386__
 #define SYS_mprotect_key 380
@@ -77,12 +78,12 @@ static inline void sigsafe_printf(const char *format, ...)
 #define dprintf3(args...) dprintf_level(3, args)
 #define dprintf4(args...) dprintf_level(4, args)
 
-extern unsigned int shadow_pkey_reg;
-static inline unsigned int __rdpkey_reg(void)
+extern pkey_reg_t shadow_pkey_reg;
+static inline pkey_reg_t __rdpkey_reg(void)
 {
unsigned int eax, edx;
unsigned int ecx = 0;
-   unsigned int pkey_reg;
+   pkey_reg_t pkey_reg;
 
asm volatile(".byte 0x0f,0x01,0xee\n\t"
 : "=a" (eax), "=d" (edx)
@@ -91,11 +92,11 @@ static inline unsigned int __rdpkey_reg(void)
return pkey_reg;
 }
 
-static inline unsigned int _rdpkey_reg(int line)
+static inline pkey_reg_t _rdpkey_reg(int line)
 {
-   unsigned int pkey_reg = __rdpkey_reg();
+   pkey_reg_t pkey_reg = __rdpkey_reg();
 
-   dprintf4("rdpkey_reg(line=%d) pkey_reg: %x shadow: %x\n",
+   dprintf4("rdpkey_reg(line=%d) pkey_reg: %016lx shadow: %016lx\n",
line, pkey_reg, shadow_pkey_reg);
assert(pkey_reg == shadow_pkey_reg);
 
@@ -104,11 +105,11 @@ static inline unsigned int _rdpkey_reg(int line)
 
 #define rdpkey_reg() _rdpkey_reg(__LINE__)
 
-static inline void __wrpkey_reg(unsigned int pkey_reg)
+static inline void __wrpkey_reg(pkey_reg_t pkey_reg)
 {
-   unsigned int eax = pkey_reg;
-   unsigned int ecx = 0;
-   unsigned int edx = 0;
+   pkey_reg_t eax = pkey_reg;
+   pkey_reg_t ecx = 0;
+   pkey_reg_t edx = 0;
 
dprintf4("%s() changing %08x to %08x\n", __func__,
__rdpkey_reg(), pkey_reg);
@@ -117,7 +118,7 @@ static inline void __wrpkey_reg(unsigned int pkey_reg)
assert(pkey_reg == __rdpkey_reg());
 }
 
-static inline void wrpkey_reg(unsigned int pkey_reg)
+static inline void wrpkey_reg(pkey_reg_t pkey_reg)
 {
dprintf4("%s() changing %08x to %08x\n", __func__,
__rdpkey_reg(), pkey_reg);
@@ -135,7 +136,7 @@ static inline void wrpkey_reg(unsigned int pkey_reg)
  */
 static inline void __pkey_access_allow(int pkey, int do_allow)
 {
-   unsigned int pkey_reg = rdpkey_reg();
+   pkey_reg_t pkey_reg = rdpkey_reg();
int bit = pkey * 2;
 
if (do_allow)
@@ -149,7 +150,7 @@ static inline void __pkey_access_allow(int pkey, int 
do_allow)
 
 static inline void __pkey_write_allow(int pkey, int do_allow_write)
 {
-   long pkey_reg = rdpkey_reg();
+   pkey_reg_t pkey_reg = rdpkey_reg();
int bit = pkey * 2 + 1;
 
if (do_allow_write)
diff --git a/tools/testing/selftests/vm/protection_keys.c 
b/tools/testing/selftests/vm/protection_keys.c
index dec05e0..2e8de01 100644
--- a/tools/testing/selftests/vm/protection_keys.c
+++ b/tools/testing/selftests/vm/protection_keys.c
@@ -48,7 +48,7 @@
 int iteration_nr = 1;
 int test_nr;
 
-unsigned int shadow_pkey_reg;
+pkey_reg_t shadow_pkey_reg;
 int dprint_in_signal;
 char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE];
 
@@ -158,7 +158,7 @@ void dump_mem(void *dumpme, int len_bytes)
 
for (i = 0; i < len_bytes; i += sizeof(u64)) {
u64 *ptr = (u64 *)(c + i);
-   dprintf1("dump[%03d][@%p]: %016jx\n", i, ptr, *ptr);
+   dprintf1("dump[%03d][@%p]: %016lx\n", i, ptr, *ptr);
}
 }
 
@@ -186,15 +186,16 @@ void signal_handler(int signum, siginfo_t *si, void 
*vucontext)
int trapno;
unsigned long ip;
char *fpregs;
-   u32 *pkey_reg_ptr;
-   u64 si_pkey;
+   pkey_reg_t *pkey_reg_ptr;
+   u32 si_pkey;
u32 *si_pkey_ptr;
int pkey_reg_offset;
fpregset_t fpregset;
 
dprint_in_signal = 1;
dprintf1("===SIGSEGV\n");
-   dprintf1("%s()::%d, pkey_reg: 0x%x shadow: %x\n", __func__, __LINE__,
+   dprintf1("%s()::%d, pkey_reg: 0x%016lx shadow: %016lx\n",
+   __func__, __LINE__,
__rdpkey_reg(), shadow_pkey_reg);
 
trapno = uctxt->uc_mcontext.gregs[REG_TRAPNO];
@@ -202,8 +203,9 @@ void signal_handler(int signum, siginfo_t *si, void 
*vucontext)
fpregset = uctxt->uc_mcontext.fpregs;
fpregs = (void *)fpregset;
 
-   dprintf2("%s() trapno: %d ip: 

[PATCH v9 34/51] selftest/vm: move generic definitions to header file

2017-11-06 Thread Ram Pai
Move all the generic definitions and helper functions to the
header file.

Signed-off-by: Ram Pai 
---
 tools/testing/selftests/vm/pkey-helpers.h|   62 +++--
 tools/testing/selftests/vm/protection_keys.c |   54 --
 2 files changed, 57 insertions(+), 59 deletions(-)

diff --git a/tools/testing/selftests/vm/pkey-helpers.h 
b/tools/testing/selftests/vm/pkey-helpers.h
index 2d91d34..1b15b54 100644
--- a/tools/testing/selftests/vm/pkey-helpers.h
+++ b/tools/testing/selftests/vm/pkey-helpers.h
@@ -13,8 +13,31 @@
 #include 
 #include 
 
+/* Define some kernel-like types */
+#define  u8 uint8_t
+#define u16 uint16_t
+#define u32 uint32_t
+#define u64 uint64_t
+
+#ifdef __i386__
+#define SYS_mprotect_key 380
+#define SYS_pkey_alloc  381
+#define SYS_pkey_free   382
+#define REG_IP_IDX REG_EIP
+#define si_pkey_offset 0x14
+#else
+#define SYS_mprotect_key 329
+#define SYS_pkey_alloc  330
+#define SYS_pkey_free   331
+#define REG_IP_IDX REG_RIP
+#define si_pkey_offset 0x20
+#endif
+
 #define NR_PKEYS 16
 #define PKEY_BITS_PER_PKEY 2
+#define PKEY_DISABLE_ACCESS0x1
+#define PKEY_DISABLE_WRITE 0x2
+#define HPAGE_SIZE (1UL<<21)
 
 #ifndef DEBUG_LEVEL
 #define DEBUG_LEVEL 0
@@ -138,11 +161,6 @@ static inline void __pkey_write_allow(int pkey, int 
do_allow_write)
dprintf4("pkey_reg now: %08x\n", rdpkey_reg());
 }
 
-#define PROT_PKEY0 0x10/* protection key value (bit 0) */
-#define PROT_PKEY1 0x20/* protection key value (bit 1) */
-#define PROT_PKEY2 0x40/* protection key value (bit 2) */
-#define PROT_PKEY3 0x80/* protection key value (bit 3) */
-
 #define PAGE_SIZE 4096
 #define MB (1<<20)
 
@@ -220,4 +238,38 @@ int pkey_reg_xstate_offset(void)
return xstate_offset;
 }
 
+static inline void __page_o_noops(void)
+{
+   /* 8-bytes of instruction * 512 bytes = 1 page */
+   asm(".rept 512 ; nopl 0x7eee(%eax) ; .endr");
+}
+
 #endif /* _PKEYS_HELPER_H */
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
+#define ALIGN_UP(x, align_to)  (((x) + ((align_to)-1)) & ~((align_to)-1))
+#define ALIGN_DOWN(x, align_to) ((x) & ~((align_to)-1))
+#define ALIGN_PTR_UP(p, ptr_align_to)  \
+   ((typeof(p))ALIGN_UP((unsigned long)(p), ptr_align_to))
+#define ALIGN_PTR_DOWN(p, ptr_align_to) \
+   ((typeof(p))ALIGN_DOWN((unsigned long)(p), ptr_align_to))
+#define __stringify_1(x...) #x
+#define __stringify(x...)   __stringify_1(x)
+
+#define PTR_ERR_ENOTSUP ((void *)-ENOTSUP)
+
+int dprint_in_signal;
+char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE];
+
+extern void abort_hooks(void);
+#define pkey_assert(condition) do {\
+   if (!(condition)) { \
+   dprintf0("assert() at %s::%d test_nr: %d iteration: %d\n", \
+   __FILE__, __LINE__, \
+   test_nr, iteration_nr); \
+   dprintf0("errno at assert: %d", errno); \
+   abort_hooks();  \
+   assert(condition);  \
+   }   \
+} while (0)
+#define raw_assert(cond) assert(cond)
diff --git a/tools/testing/selftests/vm/protection_keys.c 
b/tools/testing/selftests/vm/protection_keys.c
index 27b11e6..dec05e0 100644
--- a/tools/testing/selftests/vm/protection_keys.c
+++ b/tools/testing/selftests/vm/protection_keys.c
@@ -49,34 +49,9 @@
 int test_nr;
 
 unsigned int shadow_pkey_reg;
-
-#define HPAGE_SIZE (1UL<<21)
-#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
-#define ALIGN_UP(x, align_to)  (((x) + ((align_to)-1)) & ~((align_to)-1))
-#define ALIGN_DOWN(x, align_to) ((x) & ~((align_to)-1))
-#define ALIGN_PTR_UP(p, ptr_align_to)  ((typeof(p))ALIGN_UP((unsigned 
long)(p),ptr_align_to))
-#define ALIGN_PTR_DOWN(p, ptr_align_to)
((typeof(p))ALIGN_DOWN((unsigned long)(p),  ptr_align_to))
-#define __stringify_1(x...) #x
-#define __stringify(x...)   __stringify_1(x)
-
-#define PTR_ERR_ENOTSUP ((void *)-ENOTSUP)
-
 int dprint_in_signal;
 char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE];
 
-extern void abort_hooks(void);
-#define pkey_assert(condition) do {\
-   if (!(condition)) { \
-   dprintf0("assert() at %s::%d test_nr: %d iteration: %d\n", \
-   __FILE__, __LINE__, \
-   test_nr, iteration_nr); \
-   dprintf0("errno at assert: %d", errno); \
-   abort_hooks();  \
-   assert(condition);  \
-   }   \
-} while (0)
-#define raw_assert(cond) assert(cond)
-
 void cat_into_file(char *str, char *file)
 {
int fd = open(file, O_RDWR);
@@ -154,12 +129,6 @@ void abort_hooks(void)
 #endif
 }
 
-static inline void __page_o_noops(void)
-{
-   /* 8-bytes of 

[PATCH v9 33/51] selftest/vm: rename all references to pkru to a generic name

2017-11-06 Thread Ram Pai
Some pkru references are renamed to pkey_reg
and some pkru references are renamed to pkey.

Signed-off-by: Ram Pai 
---
 tools/testing/selftests/vm/pkey-helpers.h|   85 +-
 tools/testing/selftests/vm/protection_keys.c |  227 ++
 2 files changed, 164 insertions(+), 148 deletions(-)

diff --git a/tools/testing/selftests/vm/pkey-helpers.h 
b/tools/testing/selftests/vm/pkey-helpers.h
index 3818f25..2d91d34 100644
--- a/tools/testing/selftests/vm/pkey-helpers.h
+++ b/tools/testing/selftests/vm/pkey-helpers.h
@@ -14,7 +14,7 @@
 #include 
 
 #define NR_PKEYS 16
-#define PKRU_BITS_PER_PKEY 2
+#define PKEY_BITS_PER_PKEY 2
 
 #ifndef DEBUG_LEVEL
 #define DEBUG_LEVEL 0
@@ -54,85 +54,88 @@ static inline void sigsafe_printf(const char *format, ...)
 #define dprintf3(args...) dprintf_level(3, args)
 #define dprintf4(args...) dprintf_level(4, args)
 
-extern unsigned int shadow_pkru;
-static inline unsigned int __rdpkru(void)
+extern unsigned int shadow_pkey_reg;
+static inline unsigned int __rdpkey_reg(void)
 {
unsigned int eax, edx;
unsigned int ecx = 0;
-   unsigned int pkru;
+   unsigned int pkey_reg;
 
asm volatile(".byte 0x0f,0x01,0xee\n\t"
 : "=a" (eax), "=d" (edx)
 : "c" (ecx));
-   pkru = eax;
-   return pkru;
+   pkey_reg = eax;
+   return pkey_reg;
 }
 
-static inline unsigned int _rdpkru(int line)
+static inline unsigned int _rdpkey_reg(int line)
 {
-   unsigned int pkru = __rdpkru();
+   unsigned int pkey_reg = __rdpkey_reg();
 
-   dprintf4("rdpkru(line=%d) pkru: %x shadow: %x\n",
-   line, pkru, shadow_pkru);
-   assert(pkru == shadow_pkru);
+   dprintf4("rdpkey_reg(line=%d) pkey_reg: %x shadow: %x\n",
+   line, pkey_reg, shadow_pkey_reg);
+   assert(pkey_reg == shadow_pkey_reg);
 
-   return pkru;
+   return pkey_reg;
 }
 
-#define rdpkru() _rdpkru(__LINE__)
+#define rdpkey_reg() _rdpkey_reg(__LINE__)
 
-static inline void __wrpkru(unsigned int pkru)
+static inline void __wrpkey_reg(unsigned int pkey_reg)
 {
-   unsigned int eax = pkru;
+   unsigned int eax = pkey_reg;
unsigned int ecx = 0;
unsigned int edx = 0;
 
-   dprintf4("%s() changing %08x to %08x\n", __func__, __rdpkru(), pkru);
+   dprintf4("%s() changing %08x to %08x\n", __func__,
+   __rdpkey_reg(), pkey_reg);
asm volatile(".byte 0x0f,0x01,0xef\n\t"
 : : "a" (eax), "c" (ecx), "d" (edx));
-   assert(pkru == __rdpkru());
+   assert(pkey_reg == __rdpkey_reg());
 }
 
-static inline void wrpkru(unsigned int pkru)
+static inline void wrpkey_reg(unsigned int pkey_reg)
 {
-   dprintf4("%s() changing %08x to %08x\n", __func__, __rdpkru(), pkru);
+   dprintf4("%s() changing %08x to %08x\n", __func__,
+   __rdpkey_reg(), pkey_reg);
/* will do the shadow check for us: */
-   rdpkru();
-   __wrpkru(pkru);
-   shadow_pkru = pkru;
-   dprintf4("%s(%08x) pkru: %08x\n", __func__, pkru, __rdpkru());
+   rdpkey_reg();
+   __wrpkey_reg(pkey_reg);
+   shadow_pkey_reg = pkey_reg;
+   dprintf4("%s(%08x) pkey_reg: %08x\n", __func__,
+   pkey_reg, __rdpkey_reg());
 }
 
 /*
  * These are technically racy. since something could
- * change PKRU between the read and the write.
+ * change PKEY register between the read and the write.
  */
 static inline void __pkey_access_allow(int pkey, int do_allow)
 {
-   unsigned int pkru = rdpkru();
+   unsigned int pkey_reg = rdpkey_reg();
int bit = pkey * 2;
 
if (do_allow)
-   pkru &= (1<

[PATCH v9 32/51] selftest/x86: Move protection key selftest to arch neutral directory

2017-11-06 Thread Ram Pai
Signed-off-by: Ram Pai 
---
 tools/testing/selftests/vm/Makefile   |1 +
 tools/testing/selftests/vm/pkey-helpers.h |  220 
 tools/testing/selftests/vm/protection_keys.c  | 1395 +
 tools/testing/selftests/x86/Makefile  |2 +-
 tools/testing/selftests/x86/pkey-helpers.h|  220 
 tools/testing/selftests/x86/protection_keys.c | 1395 -
 6 files changed, 1617 insertions(+), 1616 deletions(-)
 create mode 100644 tools/testing/selftests/vm/pkey-helpers.h
 create mode 100644 tools/testing/selftests/vm/protection_keys.c
 delete mode 100644 tools/testing/selftests/x86/pkey-helpers.h
 delete mode 100644 tools/testing/selftests/x86/protection_keys.c

diff --git a/tools/testing/selftests/vm/Makefile 
b/tools/testing/selftests/vm/Makefile
index e49eca1..6f18ef4 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -18,6 +18,7 @@ TEST_GEN_FILES += transhuge-stress
 TEST_GEN_FILES += userfaultfd
 TEST_GEN_FILES += mlock-random-test
 TEST_GEN_FILES += virtual_address_range
+TEST_GEN_FILES += protection_keys
 
 TEST_PROGS := run_vmtests
 
diff --git a/tools/testing/selftests/vm/pkey-helpers.h 
b/tools/testing/selftests/vm/pkey-helpers.h
new file mode 100644
index 000..3818f25
--- /dev/null
+++ b/tools/testing/selftests/vm/pkey-helpers.h
@@ -0,0 +1,220 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _PKEYS_HELPER_H
+#define _PKEYS_HELPER_H
+#define _GNU_SOURCE
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define NR_PKEYS 16
+#define PKRU_BITS_PER_PKEY 2
+
+#ifndef DEBUG_LEVEL
+#define DEBUG_LEVEL 0
+#endif
+#define DPRINT_IN_SIGNAL_BUF_SIZE 4096
+extern int dprint_in_signal;
+extern char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE];
+static inline void sigsafe_printf(const char *format, ...)
+{
+   va_list ap;
+
+   va_start(ap, format);
+   if (!dprint_in_signal) {
+   vprintf(format, ap);
+   } else {
+   int len = vsnprintf(dprint_in_signal_buffer,
+   DPRINT_IN_SIGNAL_BUF_SIZE,
+   format, ap);
+   /*
+* len is amount that would have been printed,
+* but actual write is truncated at BUF_SIZE.
+*/
+   if (len > DPRINT_IN_SIGNAL_BUF_SIZE)
+   len = DPRINT_IN_SIGNAL_BUF_SIZE;
+   write(1, dprint_in_signal_buffer, len);
+   }
+   va_end(ap);
+}
+#define dprintf_level(level, args...) do { \
+   if (level <= DEBUG_LEVEL)   \
+   sigsafe_printf(args);   \
+   fflush(NULL);   \
+} while (0)
+#define dprintf0(args...) dprintf_level(0, args)
+#define dprintf1(args...) dprintf_level(1, args)
+#define dprintf2(args...) dprintf_level(2, args)
+#define dprintf3(args...) dprintf_level(3, args)
+#define dprintf4(args...) dprintf_level(4, args)
+
+extern unsigned int shadow_pkru;
+static inline unsigned int __rdpkru(void)
+{
+   unsigned int eax, edx;
+   unsigned int ecx = 0;
+   unsigned int pkru;
+
+   asm volatile(".byte 0x0f,0x01,0xee\n\t"
+: "=a" (eax), "=d" (edx)
+: "c" (ecx));
+   pkru = eax;
+   return pkru;
+}
+
+static inline unsigned int _rdpkru(int line)
+{
+   unsigned int pkru = __rdpkru();
+
+   dprintf4("rdpkru(line=%d) pkru: %x shadow: %x\n",
+   line, pkru, shadow_pkru);
+   assert(pkru == shadow_pkru);
+
+   return pkru;
+}
+
+#define rdpkru() _rdpkru(__LINE__)
+
+static inline void __wrpkru(unsigned int pkru)
+{
+   unsigned int eax = pkru;
+   unsigned int ecx = 0;
+   unsigned int edx = 0;
+
+   dprintf4("%s() changing %08x to %08x\n", __func__, __rdpkru(), pkru);
+   asm volatile(".byte 0x0f,0x01,0xef\n\t"
+: : "a" (eax), "c" (ecx), "d" (edx));
+   assert(pkru == __rdpkru());
+}
+
+static inline void wrpkru(unsigned int pkru)
+{
+   dprintf4("%s() changing %08x to %08x\n", __func__, __rdpkru(), pkru);
+   /* will do the shadow check for us: */
+   rdpkru();
+   __wrpkru(pkru);
+   shadow_pkru = pkru;
+   dprintf4("%s(%08x) pkru: %08x\n", __func__, pkru, __rdpkru());
+}
+
+/*
+ * These are technically racy. since something could
+ * change PKRU between the read and the write.
+ */
+static inline void __pkey_access_allow(int pkey, int do_allow)
+{
+   unsigned int pkru = rdpkru();
+   int bit = pkey * 2;
+
+   if (do_allow)
+   pkru &= (1<

[PATCH v9 31/51] Documentation/vm: PowerPC specific updates to memory protection keys

2017-11-06 Thread Ram Pai
Add documentation updates that capture PowerPC-specific changes.

Signed-off-by: Thiago Jung Bauermann 
Signed-off-by: Ram Pai 
---
 Documentation/vm/protection-keys.txt |  126 +++---
 1 files changed, 101 insertions(+), 25 deletions(-)

diff --git a/Documentation/vm/protection-keys.txt 
b/Documentation/vm/protection-keys.txt
index fa46dcb..bc079b3 100644
--- a/Documentation/vm/protection-keys.txt
+++ b/Documentation/vm/protection-keys.txt
@@ -1,22 +1,46 @@
-Memory Protection Keys for Userspace (PKU aka PKEYs) is a CPU feature
-which will be found on future Intel CPUs.
-
-Memory Protection Keys provides a mechanism for enforcing page-based
-protections, but without requiring modification of the page tables
-when an application changes protection domains.  It works by
-dedicating 4 previously ignored bits in each page table entry to a
-"protection key", giving 16 possible keys.
-
-There is also a new user-accessible register (PKRU) with two separate
-bits (Access Disable and Write Disable) for each key.  Being a CPU
-register, PKRU is inherently thread-local, potentially giving each
-thread a different set of protections from every other thread.
-
-There are two new instructions (RDPKRU/WRPKRU) for reading and writing
-to the new register.  The feature is only available in 64-bit mode,
-even though there is theoretically space in the PAE PTEs.  These
-permissions are enforced on data access only and have no effect on
-instruction fetches.
+Memory Protection Keys for Userspace (PKU aka PKEYs) is a CPU feature found on
+future Intel CPUs and on PowerPC 5 and higher CPUs.
+
+Memory Protection Keys provide a mechanism for enforcing page-based
+protections, but without requiring modification of the page tables when an
+application changes protection domains.
+
+It works by dedicating bits in each page table entry to a "protection key".
+There is also a user-accessible register with two separate bits for each
+key.  Being a CPU register, the user-accessible register is inherently
+thread-local, potentially giving each thread a different set of protections
+from every other thread.
+
+On Intel:
+
+   Four previously ignored bits in each page table entry are used, giving 16
+   possible keys.
+
+   The user-accessible register (PKRU) has two bits per key, one to disable
+   access and one to disable write.
+
+   The feature is only available in 64-bit mode, even though there is
+   theoretically space in the PAE PTEs.  These permissions are enforced on
+   data access only and have no effect on instruction fetches.
+
+On PowerPC:
+
+   Five bits in the page table entry are used giving 32 possible keys.
+   This support is currently for Hash Page Table mode only.
+
+   The user-accessible register (AMR) has two bits per key, one to disable
+   read and one to disable write. Access can be disabled by disabling both
+   read and write.
+
+   'mfspr mem, 0xd' reads the AMR register
+   'mtspr 0xd, mem' writes into the AMR register.
+
+   Execution can be disabled by allocating a key with execute-disabled
+   permission. The execute permission on a key, however, cannot be
+   changed through a user-accessible register. Instead, a powerpc-specific
+   system call, sys_pkey_modify(), must be used. The CPU will not allow
+   execution of instructions in pages that are associated with an
+   execute-disabled key.
+
 
 === Syscalls ===
 
@@ -28,9 +52,9 @@ There are 3 system calls which directly interact with pkeys:
  unsigned long prot, int pkey);
 
 Before a pkey can be used, it must first be allocated with
-pkey_alloc().  An application calls the WRPKRU instruction
+pkey_alloc().  An application calls the WRPKRU/AMR instruction
 directly in order to change access permissions to memory covered
-with a key.  In this example WRPKRU is wrapped by a C function
+with a key.  In this example WRPKRU/AMR is wrapped by a C function
 called pkey_set().
 
int real_prot = PROT_READ|PROT_WRITE;
@@ -52,11 +76,11 @@ is no longer in use:
munmap(ptr, PAGE_SIZE);
pkey_free(pkey);
 
-(Note: pkey_set() is a wrapper for the RDPKRU and WRPKRU instructions.
+(Note: pkey_set() is a wrapper for the RDPKRU, WRPKRU or AMR instructions.
  An example implementation can be found in
- tools/testing/selftests/x86/protection_keys.c)
+ tools/testing/selftests/vm/protection_keys.c)
 
-=== Behavior ===
+=== Behavior ===
 
 The kernel attempts to make protection keys consistent with the
 behavior of a plain mprotect().  For instance if you do this:
@@ -66,7 +90,7 @@ behavior of a plain mprotect().  For instance if you do this:
 
 you can expect the same effects with protection keys when doing this:
 
-   pkey = pkey_alloc(0, PKEY_DISABLE_WRITE | 

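To make the AMR description in the patch above concrete, here is a minimal
userspace sketch of a pkey_set()-style helper for powerpc. The function names,
the (31 - pkey) * 2 bit layout and the unchecked read-modify-write are
illustrative assumptions, not code from this patch; only the use of SPR 13
(0xd) for the AMR comes from the documentation text.

    #include <stdint.h>

    #define SPRN_AMR            13      /* 0xd, the AMR special purpose register */
    #define AMR_BITS_PER_PKEY   2

    static inline uint64_t read_amr(void)
    {
            uint64_t amr;

            asm volatile("mfspr %0, 13" : "=r"(amr));
            return amr;
    }

    static inline void write_amr(uint64_t amr)
    {
            asm volatile("mtspr 13, %0" : : "r"(amr));
    }

    /*
     * Set the two AMR disable bits for 'pkey' to 'rights'
     * (e.g. both bits set disables read and write for that key).
     */
    static void pkey_set(int pkey, uint64_t rights)
    {
            int shift = (31 - pkey) * AMR_BITS_PER_PKEY;    /* assumed layout */
            uint64_t amr = read_amr();

            amr &= ~(3UL << shift);
            amr |= (rights & 3UL) << shift;
            write_amr(amr);
    }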
[PATCH v9 30/51] Documentation/x86: Move protecton key documentation to arch neutral directory

2017-11-06 Thread Ram Pai
Since PowerPC and Intel both support memory protection keys, move
the documentation to the arch-neutral directory.

Signed-off-by: Ram Pai 
---
 Documentation/vm/protection-keys.txt  |   85 +
 Documentation/x86/protection-keys.txt |   85 -
 2 files changed, 85 insertions(+), 85 deletions(-)
 create mode 100644 Documentation/vm/protection-keys.txt
 delete mode 100644 Documentation/x86/protection-keys.txt

diff --git a/Documentation/vm/protection-keys.txt 
b/Documentation/vm/protection-keys.txt
new file mode 100644
index 000..fa46dcb
--- /dev/null
+++ b/Documentation/vm/protection-keys.txt
@@ -0,0 +1,85 @@
+Memory Protection Keys for Userspace (PKU aka PKEYs) is a CPU feature
+which will be found on future Intel CPUs.
+
+Memory Protection Keys provides a mechanism for enforcing page-based
+protections, but without requiring modification of the page tables
+when an application changes protection domains.  It works by
+dedicating 4 previously ignored bits in each page table entry to a
+"protection key", giving 16 possible keys.
+
+There is also a new user-accessible register (PKRU) with two separate
+bits (Access Disable and Write Disable) for each key.  Being a CPU
+register, PKRU is inherently thread-local, potentially giving each
+thread a different set of protections from every other thread.
+
+There are two new instructions (RDPKRU/WRPKRU) for reading and writing
+to the new register.  The feature is only available in 64-bit mode,
+even though there is theoretically space in the PAE PTEs.  These
+permissions are enforced on data access only and have no effect on
+instruction fetches.
+
+=== Syscalls ===
+
+There are 3 system calls which directly interact with pkeys:
+
+   int pkey_alloc(unsigned long flags, unsigned long init_access_rights)
+   int pkey_free(int pkey);
+   int pkey_mprotect(unsigned long start, size_t len,
+ unsigned long prot, int pkey);
+
+Before a pkey can be used, it must first be allocated with
+pkey_alloc().  An application calls the WRPKRU instruction
+directly in order to change access permissions to memory covered
+with a key.  In this example WRPKRU is wrapped by a C function
+called pkey_set().
+
+   int real_prot = PROT_READ|PROT_WRITE;
+   pkey = pkey_alloc(0, PKEY_DISABLE_WRITE);
+   ptr = mmap(NULL, PAGE_SIZE, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 
0);
+   ret = pkey_mprotect(ptr, PAGE_SIZE, real_prot, pkey);
+   ... application runs here
+
+Now, if the application needs to update the data at 'ptr', it can
+gain access, do the update, then remove its write access:
+
+   pkey_set(pkey, 0); // clear PKEY_DISABLE_WRITE
+   *ptr = foo; // assign something
+   pkey_set(pkey, PKEY_DISABLE_WRITE); // set PKEY_DISABLE_WRITE again
+
+Now when it frees the memory, it will also free the pkey since it
+is no longer in use:
+
+   munmap(ptr, PAGE_SIZE);
+   pkey_free(pkey);
+
+(Note: pkey_set() is a wrapper for the RDPKRU and WRPKRU instructions.
+ An example implementation can be found in
+ tools/testing/selftests/x86/protection_keys.c)
+
+=== Behavior ===
+
+The kernel attempts to make protection keys consistent with the
+behavior of a plain mprotect().  For instance if you do this:
+
+   mprotect(ptr, size, PROT_NONE);
+   something(ptr);
+
+you can expect the same effects with protection keys when doing this:
+
+   pkey = pkey_alloc(0, PKEY_DISABLE_WRITE | PKEY_DISABLE_READ);
+   pkey_mprotect(ptr, size, PROT_READ|PROT_WRITE, pkey);
+   something(ptr);
+
+That should be true whether something() is a direct access to 'ptr'
+like:
+
+   *ptr = foo;
+
+or when the kernel does the access on the application's behalf like
+with a read():
+
+   read(fd, ptr, 1);
+
+The kernel will send a SIGSEGV in both cases, but si_code will be set
+to SEGV_PKERR when violating protection keys versus SEGV_ACCERR when
+the plain mprotect() permissions are violated.
diff --git a/Documentation/x86/protection-keys.txt 
b/Documentation/x86/protection-keys.txt
deleted file mode 100644
index fa46dcb..000
--- a/Documentation/x86/protection-keys.txt
+++ /dev/null
@@ -1,85 +0,0 @@
-Memory Protection Keys for Userspace (PKU aka PKEYs) is a CPU feature
-which will be found on future Intel CPUs.
-
-Memory Protection Keys provides a mechanism for enforcing page-based
-protections, but without requiring modification of the page tables
-when an application changes protection domains.  It works by
-dedicating 4 previously ignored bits in each page table entry to a
-"protection key", giving 16 possible keys.
-
-There is also a new user-accessible register (PKRU) with two separate
-bits (Access Disable and Write Disable) for each key.  Being a CPU
-register, PKRU is inherently thread-local, potentially giving each
-thread a different 

[PATCH v9 29/51] mm/mprotect, powerpc/mm/pkeys, x86/mm/pkeys: Add sysfs interface

2017-11-06 Thread Ram Pai
From: Thiago Jung Bauermann 

Expose useful information for programs using memory protection keys.
Provide implementation for powerpc and x86.

On a powerpc system with pkeys support, here is what is shown:

$ head /sys/kernel/mm/protection_keys/*
==> /sys/kernel/mm/protection_keys/disable_access_supported <==
true

==> /sys/kernel/mm/protection_keys/disable_execute_supported <==
true

==> /sys/kernel/mm/protection_keys/disable_write_supported <==
true

==> /sys/kernel/mm/protection_keys/total_keys <==
31

==> /sys/kernel/mm/protection_keys/usable_keys <==
27

And on an x86 without pkeys support:

$ head /sys/kernel/mm/protection_keys/*
==> /sys/kernel/mm/protection_keys/disable_access_supported <==
false

==> /sys/kernel/mm/protection_keys/disable_execute_supported <==
false

==> /sys/kernel/mm/protection_keys/disable_write_supported <==
false

==> /sys/kernel/mm/protection_keys/total_keys <==
1

==> /sys/kernel/mm/protection_keys/usable_keys <==
0

Signed-off-by: Ram Pai 
Signed-off-by: Thiago Jung Bauermann 
---
 arch/powerpc/include/asm/pkeys.h   |2 +
 arch/powerpc/mm/pkeys.c|   24 ++
 arch/x86/include/asm/mmu_context.h |4 +-
 arch/x86/include/asm/pkeys.h   |1 +
 arch/x86/mm/pkeys.c|9 
 include/linux/pkeys.h  |2 +-
 mm/mprotect.c  |   88 
 7 files changed, 128 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/pkeys.h b/arch/powerpc/include/asm/pkeys.h
index 333fb28..6d70b1a 100644
--- a/arch/powerpc/include/asm/pkeys.h
+++ b/arch/powerpc/include/asm/pkeys.h
@@ -237,6 +237,8 @@ static inline void pkey_mmu_values(int total_data, int 
total_execute)
pkeys_total = total_data;
 }
 
+extern bool arch_supports_pkeys(int cap);
+extern unsigned int arch_usable_pkeys(void);
 extern void thread_pkey_regs_save(struct thread_struct *thread);
 extern void thread_pkey_regs_restore(struct thread_struct *new_thread,
 struct thread_struct *old_thread);
diff --git a/arch/powerpc/mm/pkeys.c b/arch/powerpc/mm/pkeys.c
index 2612f61..7e8468f 100644
--- a/arch/powerpc/mm/pkeys.c
+++ b/arch/powerpc/mm/pkeys.c
@@ -421,6 +421,30 @@ bool arch_vma_access_permitted(struct vm_area_struct *vma, 
bool write,
return pkey_access_permitted(vma_pkey(vma), write, execute);
 }
 
+unsigned int arch_usable_pkeys(void)
+{
+   unsigned int reserved;
+
+   if (static_branch_likely(_disabled))
+   return 0;
+
+   /* Reserve one more to account for the execute-only pkey. */
+   reserved = hweight32(initial_allocation_mask) + 1;
+
+   return pkeys_total > reserved ? pkeys_total - reserved : 0;
+}
+
+bool arch_supports_pkeys(int cap)
+{
+   if (static_branch_likely(_disabled))
+   return false;
+
+   if (cap & PKEY_DISABLE_EXECUTE)
+   return pkey_execute_disable_supported;
+
+   return (cap & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
+}
+
 long sys_pkey_modify(int pkey, unsigned long new_val)
 {
bool ret;
diff --git a/arch/x86/include/asm/mmu_context.h 
b/arch/x86/include/asm/mmu_context.h
index 6699fc4..e3efabb 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -129,6 +129,8 @@ static inline void switch_ldt(struct mm_struct *prev, 
struct mm_struct *next)
 
 void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
 
+#define PKEY_INITIAL_ALLOCATION_MAP1
+
 static inline int init_new_context(struct task_struct *tsk,
   struct mm_struct *mm)
 {
@@ -138,7 +140,7 @@ static inline int init_new_context(struct task_struct *tsk,
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
/* pkey 0 is the default and always allocated */
-   mm->context.pkey_allocation_map = 0x1;
+   mm->context.pkey_allocation_map = PKEY_INITIAL_ALLOCATION_MAP;
/* -1 means unallocated or invalid */
mm->context.execute_only_pkey = -1;
}
diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h
index f6c287b..6807288 100644
--- a/arch/x86/include/asm/pkeys.h
+++ b/arch/x86/include/asm/pkeys.h
@@ -106,5 +106,6 @@ extern int arch_set_user_pkey_access(struct task_struct 
*tsk, int pkey,
 extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
unsigned long init_val);
 extern void copy_init_pkru_to_fpregs(void);
+extern unsigned int arch_usable_pkeys(void);
 
 #endif /*_ASM_X86_PKEYS_H */
diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c
index d7bc0ee..3083a59 100644
--- a/arch/x86/mm/pkeys.c
+++ b/arch/x86/mm/pkeys.c
@@ -122,6 +122,15 @@ int __arch_override_mprotect_pkey(struct vm_area_struct 
*vma, int prot, int pkey
return vma_pkey(vma);
 }
 

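As a usage note, a program could consult this interface at runtime before
deciding whether to rely on pkeys. A minimal sketch, assuming only the sysfs
paths shown in the changelog above (the helper name is illustrative):

    #include <stdio.h>

    /* Returns the advertised number of usable keys, or 0 on any failure. */
    static unsigned int usable_pkeys(void)
    {
            unsigned int n = 0;
            FILE *f = fopen("/sys/kernel/mm/protection_keys/usable_keys", "r");

            if (!f)
                    return 0;
            if (fscanf(f, "%u", &n) != 1)
                    n = 0;
            fclose(f);
            return n;
    }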
[PATCH v9 28/51] mm: display pkey in smaps if arch_pkeys_enabled() is true

2017-11-06 Thread Ram Pai
Currently the architecture-specific code is expected to
display the protection keys in smaps for a given vma.
This can lead to redundant code and possibly to divergent
formats in which the key gets displayed.

This patch changes the implementation. It displays the
pkey only if the architecture supports pkeys.

The x86 arch_show_smap() function is not needed anymore.
Delete it.

Signed-off-by: Ram Pai 
---
 arch/x86/kernel/setup.c |8 
 fs/proc/task_mmu.c  |   11 ++-
 2 files changed, 6 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 0957dd7..b8b8d0e 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1357,11 +1357,3 @@ static int __init register_kernel_offset_dumper(void)
return 0;
 }
 __initcall(register_kernel_offset_dumper);
-
-void arch_show_smap(struct seq_file *m, struct vm_area_struct *vma)
-{
-   if (!boot_cpu_has(X86_FEATURE_OSPKE))
-   return;
-
-   seq_printf(m, "ProtectionKey:  %8u\n", vma_pkey(vma));
-}
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index fad19a0..5ce3ec0 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -18,6 +18,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -731,10 +732,6 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long 
hmask,
 }
 #endif /* HUGETLB_PAGE */
 
-void __weak arch_show_smap(struct seq_file *m, struct vm_area_struct *vma)
-{
-}
-
 static int show_smap(struct seq_file *m, void *v, int is_pid)
 {
struct proc_maps_private *priv = m->private;
@@ -854,9 +851,13 @@ static int show_smap(struct seq_file *m, void *v, int 
is_pid)
   (unsigned long)(mss->pss >> (10 + PSS_SHIFT)));
 
if (!rollup_mode) {
-   arch_show_smap(m, vma);
+#ifdef CONFIG_ARCH_HAS_PKEYS
+   if (arch_pkeys_enabled())
+   seq_printf(m, "ProtectionKey:  %8u\n", vma_pkey(vma));
+#endif
show_smap_vma_flags(m, vma);
}
+
m_cache_vma(m, vma);
return ret;
 }
-- 
1.7.1
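For reference, the line this change emits can be consumed from userspace like
any other smaps field. A small illustrative sketch (the helper name is an
assumption) that prints every ProtectionKey: entry for the current process:

    #include <stdio.h>
    #include <string.h>

    /* Scan /proc/self/smaps and print each ProtectionKey: line verbatim. */
    static void dump_protection_keys(void)
    {
            char line[256];
            FILE *f = fopen("/proc/self/smaps", "r");

            if (!f)
                    return;
            while (fgets(line, sizeof(line), f))
                    if (!strncmp(line, "ProtectionKey:", 14))
                            fputs(line, stdout);
            fclose(f);
    }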



[PATCH v9 27/51] mm, x86 : introduce arch_pkeys_enabled()

2017-11-06 Thread Ram Pai
Arch neutral code needs to know if the architecture supports
protection keys in order to display the protection key in smaps.
Hence introduce arch_pkeys_enabled().

This patch also provides x86 implementation for
arch_pkeys_enabled().

Signed-off-by: Ram Pai 
---
 arch/x86/include/asm/pkeys.h |1 +
 arch/x86/kernel/fpu/xstate.c |5 +
 include/linux/pkeys.h|5 +
 3 files changed, 11 insertions(+), 0 deletions(-)

diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h
index a0ba1ff..f6c287b 100644
--- a/arch/x86/include/asm/pkeys.h
+++ b/arch/x86/include/asm/pkeys.h
@@ -6,6 +6,7 @@
 
 extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
unsigned long init_val);
+extern bool arch_pkeys_enabled(void);
 
 /*
  * Try to dedicate one of the protection keys to be used as an
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index f1d5476..a43db74 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -942,6 +942,11 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int 
pkey,
 
return 0;
 }
+
+bool arch_pkeys_enabled(void)
+{
+   return boot_cpu_has(X86_FEATURE_OSPKE);
+}
 #endif /* ! CONFIG_ARCH_HAS_PKEYS */
 
 /*
diff --git a/include/linux/pkeys.h b/include/linux/pkeys.h
index 0794ca7..3ca2e44 100644
--- a/include/linux/pkeys.h
+++ b/include/linux/pkeys.h
@@ -35,6 +35,11 @@ static inline int arch_set_user_pkey_access(struct 
task_struct *tsk, int pkey,
return 0;
 }
 
+static inline bool arch_pkeys_enabled(void)
+{
+   return false;
+}
+
 static inline void copy_init_pkru_to_fpregs(void)
 {
 }
-- 
1.7.1



[PATCH v9 26/51] powerpc: add sys_pkey_modify() system call

2017-11-06 Thread Ram Pai
sys_pkey_modify() is a powerpc-specific system call. It
enables the ability to modify *any* attribute of a key.

Since powerpc disallows modification of the IAMR from user space,
an application is unable to change a key's execute attribute.

This system call helps accomplish the above.

Signed-off-by: Ram Pai 
---
 arch/powerpc/include/asm/systbl.h  |1 +
 arch/powerpc/include/asm/unistd.h  |2 +-
 arch/powerpc/include/uapi/asm/unistd.h |1 +
 arch/powerpc/kernel/entry_64.S |9 +
 arch/powerpc/mm/pkeys.c|   17 +
 5 files changed, 29 insertions(+), 1 deletions(-)

diff --git a/arch/powerpc/include/asm/systbl.h 
b/arch/powerpc/include/asm/systbl.h
index d61f9c9..533cdc5 100644
--- a/arch/powerpc/include/asm/systbl.h
+++ b/arch/powerpc/include/asm/systbl.h
@@ -392,3 +392,4 @@
 SYSCALL(pkey_alloc)
 SYSCALL(pkey_free)
 SYSCALL(pkey_mprotect)
+PPC64ONLY(pkey_modify)
diff --git a/arch/powerpc/include/asm/unistd.h 
b/arch/powerpc/include/asm/unistd.h
index daf1ba9..1e97086 100644
--- a/arch/powerpc/include/asm/unistd.h
+++ b/arch/powerpc/include/asm/unistd.h
@@ -12,7 +12,7 @@
 #include 
 
 
-#define NR_syscalls387
+#define NR_syscalls388
 
 #define __NR__exit __NR_exit
 
diff --git a/arch/powerpc/include/uapi/asm/unistd.h 
b/arch/powerpc/include/uapi/asm/unistd.h
index 389c36f..318cd79 100644
--- a/arch/powerpc/include/uapi/asm/unistd.h
+++ b/arch/powerpc/include/uapi/asm/unistd.h
@@ -398,5 +398,6 @@
 #define __NR_pkey_alloc384
 #define __NR_pkey_free 385
 #define __NR_pkey_mprotect 386
+#define __NR_pkey_modify   387
 
 #endif /* _UAPI_ASM_POWERPC_UNISTD_H_ */
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 4a0fd4f..47c85f9 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -455,6 +455,15 @@ _GLOBAL(ppc_switch_endian)
bl  sys_switch_endian
b   .Lsyscall_exit
 
+_GLOBAL(ppc_pkey_modify)
+   bl  save_nvgprs
+#ifdef  CONFIG_PPC_MEM_KEYS
+   bl  sys_pkey_modify
+#else
+   bl  sys_ni_syscall
+#endif
+   b   .Lsyscall_exit
+
 _GLOBAL(ret_from_fork)
bl  schedule_tail
REST_NVGPRS(r1)
diff --git a/arch/powerpc/mm/pkeys.c b/arch/powerpc/mm/pkeys.c
index 5047371..2612f61 100644
--- a/arch/powerpc/mm/pkeys.c
+++ b/arch/powerpc/mm/pkeys.c
@@ -420,3 +420,20 @@ bool arch_vma_access_permitted(struct vm_area_struct *vma, 
bool write,
 
return pkey_access_permitted(vma_pkey(vma), write, execute);
 }
+
+long sys_pkey_modify(int pkey, unsigned long new_val)
+{
+   bool ret;
+   /* Check for unsupported init values */
+   if (new_val & ~PKEY_ACCESS_MASK)
+   return -EINVAL;
+
+   down_write(>mm->mmap_sem);
+   ret = mm_pkey_is_allocated(current->mm, pkey);
+   up_write(>mm->mmap_sem);
+
+   if (!ret)
+   return -EINVAL;
+
+   return __arch_set_user_pkey_access(current, pkey, new_val);
+}
-- 
1.7.1
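Since glibc provides no wrapper for this powerpc-only call, userspace would
invoke it through syscall(2). A minimal sketch using the __NR_pkey_modify
number wired up above; the wrapper name is illustrative:

    #include <sys/syscall.h>
    #include <unistd.h>

    #ifndef __NR_pkey_modify
    #define __NR_pkey_modify 387        /* powerpc, per this patch */
    #endif

    /* Change all attributes of 'pkey', e.g. pass PKEY_DISABLE_EXECUTE. */
    static int pkey_modify(int pkey, unsigned long new_rights)
    {
            return syscall(__NR_pkey_modify, pkey, new_rights);
    }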



[PATCH v9 25/51] powerpc: sys_pkey_mprotect() system call

2017-11-06 Thread Ram Pai
This patch provides the ability for a process to
associate a pkey with an address range.

Signed-off-by: Ram Pai 
---
 arch/powerpc/include/asm/systbl.h  |1 +
 arch/powerpc/include/asm/unistd.h  |4 +---
 arch/powerpc/include/uapi/asm/unistd.h |1 +
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/systbl.h 
b/arch/powerpc/include/asm/systbl.h
index dea4a95..d61f9c9 100644
--- a/arch/powerpc/include/asm/systbl.h
+++ b/arch/powerpc/include/asm/systbl.h
@@ -391,3 +391,4 @@
 SYSCALL(statx)
 SYSCALL(pkey_alloc)
 SYSCALL(pkey_free)
+SYSCALL(pkey_mprotect)
diff --git a/arch/powerpc/include/asm/unistd.h 
b/arch/powerpc/include/asm/unistd.h
index e0273bc..daf1ba9 100644
--- a/arch/powerpc/include/asm/unistd.h
+++ b/arch/powerpc/include/asm/unistd.h
@@ -12,12 +12,10 @@
 #include 
 
 
-#define NR_syscalls386
+#define NR_syscalls387
 
 #define __NR__exit __NR_exit
 
-#define __IGNORE_pkey_mprotect
-
 #ifndef __ASSEMBLY__
 
 #include 
diff --git a/arch/powerpc/include/uapi/asm/unistd.h 
b/arch/powerpc/include/uapi/asm/unistd.h
index 5db4385..389c36f 100644
--- a/arch/powerpc/include/uapi/asm/unistd.h
+++ b/arch/powerpc/include/uapi/asm/unistd.h
@@ -397,5 +397,6 @@
 #define __NR_statx 383
 #define __NR_pkey_alloc384
 #define __NR_pkey_free 385
+#define __NR_pkey_mprotect 386
 
 #endif /* _UAPI_ASM_POWERPC_UNISTD_H_ */
-- 
1.7.1



[PATCH v9 24/51] powerpc: sys_pkey_alloc() and sys_pkey_free() system calls

2017-11-06 Thread Ram Pai
Finally this patch provides the ability for a process to
allocate and free a protection key.

Signed-off-by: Ram Pai 
---
 arch/powerpc/include/asm/systbl.h  |2 ++
 arch/powerpc/include/asm/unistd.h  |4 +---
 arch/powerpc/include/uapi/asm/unistd.h |2 ++
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/systbl.h 
b/arch/powerpc/include/asm/systbl.h
index 449912f..dea4a95 100644
--- a/arch/powerpc/include/asm/systbl.h
+++ b/arch/powerpc/include/asm/systbl.h
@@ -389,3 +389,5 @@
 COMPAT_SYS_SPU(pwritev2)
 SYSCALL(kexec_file_load)
 SYSCALL(statx)
+SYSCALL(pkey_alloc)
+SYSCALL(pkey_free)
diff --git a/arch/powerpc/include/asm/unistd.h 
b/arch/powerpc/include/asm/unistd.h
index 9ba11db..e0273bc 100644
--- a/arch/powerpc/include/asm/unistd.h
+++ b/arch/powerpc/include/asm/unistd.h
@@ -12,13 +12,11 @@
 #include 
 
 
-#define NR_syscalls384
+#define NR_syscalls386
 
 #define __NR__exit __NR_exit
 
 #define __IGNORE_pkey_mprotect
-#define __IGNORE_pkey_alloc
-#define __IGNORE_pkey_free
 
 #ifndef __ASSEMBLY__
 
diff --git a/arch/powerpc/include/uapi/asm/unistd.h 
b/arch/powerpc/include/uapi/asm/unistd.h
index df8684f..5db4385 100644
--- a/arch/powerpc/include/uapi/asm/unistd.h
+++ b/arch/powerpc/include/uapi/asm/unistd.h
@@ -395,5 +395,7 @@
 #define __NR_pwritev2  381
 #define __NR_kexec_file_load   382
 #define __NR_statx 383
+#define __NR_pkey_alloc384
+#define __NR_pkey_free 385
 
 #endif /* _UAPI_ASM_POWERPC_UNISTD_H_ */
-- 
1.7.1
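With pkey_alloc(), pkey_free() and (from the previous patch) pkey_mprotect()
now wired up, a userspace round trip on powerpc could look roughly like the
sketch below. The raw syscall numbers come from these patches; error handling
is omitted for brevity and the flow is illustrative only:

    #include <sys/mman.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    int main(void)
    {
            long pkey;
            void *ptr = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                             MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);

            pkey = syscall(384 /* __NR_pkey_alloc */, 0, 0);
            syscall(386 /* __NR_pkey_mprotect */, ptr, 4096,
                    PROT_READ | PROT_WRITE, pkey);
            /* ... use the mapping under the key's permissions ... */
            syscall(385 /* __NR_pkey_free */, pkey);
            munmap(ptr, 4096);
            return 0;
    }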



[PATCH v9 23/51] powerpc: Enable pkey subsystem

2017-11-06 Thread Ram Pai
PAPR defines the 'ibm,processor-storage-keys' property. It exports two
values. The first value holds the number of data-access keys and the
second holds the number of instruction-access keys. Due to a bug in
the firmware, the number of instruction-access keys is always reported
as zero. However, any key can be configured to disable data access
and/or disable execution access. The unavailability of the second value
is not a big handicap, though it could have been used to determine if
the platform supports disabling execution access.

Non-PAPR platforms do not define this property in the device tree yet.
Here, we hardcode the CPUs that support pkeys by consulting
PowerISA 3.0.

This patch calculates the number of keys supported by the platform.
It also determines the platform's support for read/write/execute
access disable for pkeys.

Signed-off-by: Ram Pai 
---
 arch/powerpc/include/asm/cputable.h|   15 +
 arch/powerpc/include/asm/mmu_context.h |1 +
 arch/powerpc/include/asm/pkeys.h   |   10 +
 arch/powerpc/kernel/prom.c |   18 +
 arch/powerpc/mm/pkeys.c|   33 +--
 5 files changed, 61 insertions(+), 16 deletions(-)

diff --git a/arch/powerpc/include/asm/cputable.h 
b/arch/powerpc/include/asm/cputable.h
index 53b31c2..b288735 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -215,7 +215,9 @@ enum {
 #define CPU_FTR_DAWR   LONG_ASM_CONST(0x0400)
 #define CPU_FTR_DABRX  LONG_ASM_CONST(0x0800)
 #define CPU_FTR_PMAO_BUG   LONG_ASM_CONST(0x1000)
+#define CPU_FTR_PKEY   LONG_ASM_CONST(0x2000)
 #define CPU_FTR_POWER9_DD1 LONG_ASM_CONST(0x4000)
+#define CPU_FTR_PKEY_EXECUTE   LONG_ASM_CONST(0x8000)
 
 #ifndef __ASSEMBLY__
 
@@ -436,7 +438,8 @@ enum {
CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
CPU_FTR_MMCRA | CPU_FTR_SMT | \
CPU_FTR_COHERENT_ICACHE | CPU_FTR_PURR | \
-   CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_DABRX)
+   CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_DABRX | \
+   CPU_FTR_PKEY)
 #define CPU_FTRS_POWER6 (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
CPU_FTR_MMCRA | CPU_FTR_SMT | \
@@ -444,7 +447,7 @@ enum {
CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \
CPU_FTR_DSCR | CPU_FTR_UNALIGNED_LD_STD | \
CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_CFAR | \
-   CPU_FTR_DABRX)
+   CPU_FTR_DABRX | CPU_FTR_PKEY)
 #define CPU_FTRS_POWER7 (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | CPU_FTR_ARCH_206 |\
CPU_FTR_MMCRA | CPU_FTR_SMT | \
@@ -453,7 +456,7 @@ enum {
CPU_FTR_DSCR | CPU_FTR_SAO  | CPU_FTR_ASYM_SMT | \
CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \
CPU_FTR_ICSWX | CPU_FTR_CFAR | CPU_FTR_HVMODE | \
-   CPU_FTR_VMX_COPY | CPU_FTR_HAS_PPR | CPU_FTR_DABRX)
+   CPU_FTR_VMX_COPY | CPU_FTR_HAS_PPR | CPU_FTR_DABRX | CPU_FTR_PKEY)
 #define CPU_FTRS_POWER8 (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | CPU_FTR_ARCH_206 |\
CPU_FTR_MMCRA | CPU_FTR_SMT | \
@@ -463,7 +466,8 @@ enum {
CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \
CPU_FTR_ICSWX | CPU_FTR_CFAR | CPU_FTR_HVMODE | CPU_FTR_VMX_COPY | \
CPU_FTR_DBELL | CPU_FTR_HAS_PPR | CPU_FTR_DAWR | \
-   CPU_FTR_ARCH_207S | CPU_FTR_TM_COMP)
+   CPU_FTR_ARCH_207S | CPU_FTR_TM_COMP | CPU_FTR_PKEY |\
+   CPU_FTR_PKEY_EXECUTE)
 #define CPU_FTRS_POWER8E (CPU_FTRS_POWER8 | CPU_FTR_PMAO_BUG)
 #define CPU_FTRS_POWER8_DD1 (CPU_FTRS_POWER8 & ~CPU_FTR_DBELL)
 #define CPU_FTRS_POWER9 (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
@@ -475,7 +479,8 @@ enum {
CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \
CPU_FTR_CFAR | CPU_FTR_HVMODE | CPU_FTR_VMX_COPY | \
CPU_FTR_DBELL | CPU_FTR_HAS_PPR | CPU_FTR_DAWR | \
-   CPU_FTR_ARCH_207S | CPU_FTR_TM_COMP | CPU_FTR_ARCH_300)
+   CPU_FTR_ARCH_207S | CPU_FTR_TM_COMP | CPU_FTR_ARCH_300 | \
+   CPU_FTR_PKEY | CPU_FTR_PKEY_EXECUTE)
 #define CPU_FTRS_POWER9_DD1 ((CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD1) & \
 (~CPU_FTR_SAO))
 #define CPU_FTRS_CELL  (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
diff --git a/arch/powerpc/include/asm/mmu_context.h 
b/arch/powerpc/include/asm/mmu_context.h
index 95a3288..5a15d37 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -152,6 +152,7 @@ static inline bool arch_vma_access_permitted(struct 
vm_area_struct *vma,
 #define 

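For illustration, the property described above could be parsed from the flat
device tree along the lines below. This is a sketch only: the callback name is
made up, node matching is omitted, and pkey_mmu_values() is assumed to be the
consumer declared in asm/pkeys.h; the patch's actual prom.c change is not
reproduced here.

    /*
     * Sketch only: parse "ibm,processor-storage-keys" from the flat
     * device tree.  The property holds two cells: the number of
     * data-access keys and the number of instruction-access keys.
     */
    static int __init early_init_dt_scan_storage_keys(unsigned long node,
                                                      const char *uname,
                                                      int depth, void *data)
    {
            const __be32 *prop;
            int len;

            prop = of_get_flat_dt_prop(node, "ibm,processor-storage-keys",
                                       &len);
            if (!prop || len < 2 * sizeof(u32))
                    return 0;

            /* pkey_mmu_values() records the counts for the pkey code. */
            pkey_mmu_values(be32_to_cpu(prop[0]), be32_to_cpu(prop[1]));
            return 1;
    }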
[PATCH v9 22/51] powerpc/ptrace: Add memory protection key regset

2017-11-06 Thread Ram Pai
From: Thiago Jung Bauermann 

The AMR/IAMR/UAMOR are part of the program context.
Allow it to be accessed via ptrace and through core files.

Signed-off-by: Ram Pai 
Signed-off-by: Thiago Jung Bauermann 
---
 arch/powerpc/include/asm/pkeys.h|5 +++
 arch/powerpc/include/uapi/asm/elf.h |1 +
 arch/powerpc/kernel/ptrace.c|   66 +++
 arch/powerpc/kernel/traps.c |7 
 include/uapi/linux/elf.h|1 +
 5 files changed, 80 insertions(+), 0 deletions(-)

diff --git a/arch/powerpc/include/asm/pkeys.h b/arch/powerpc/include/asm/pkeys.h
index 3437a50..9ee4731 100644
--- a/arch/powerpc/include/asm/pkeys.h
+++ b/arch/powerpc/include/asm/pkeys.h
@@ -213,6 +213,11 @@ static inline int arch_set_user_pkey_access(struct 
task_struct *tsk, int pkey,
return __arch_set_user_pkey_access(tsk, pkey, init_val);
 }
 
+static inline bool arch_pkeys_enabled(void)
+{
+   return !static_branch_likely(_disabled);
+}
+
 static inline void pkey_mm_init(struct mm_struct *mm)
 {
if (static_branch_likely(_disabled))
diff --git a/arch/powerpc/include/uapi/asm/elf.h 
b/arch/powerpc/include/uapi/asm/elf.h
index 5f201d4..860c592 100644
--- a/arch/powerpc/include/uapi/asm/elf.h
+++ b/arch/powerpc/include/uapi/asm/elf.h
@@ -97,6 +97,7 @@
 #define ELF_NTMSPRREG  3   /* include tfhar, tfiar, texasr */
 #define ELF_NEBB   3   /* includes ebbrr, ebbhr, bescr */
 #define ELF_NPMU   5   /* includes siar, sdar, sier, mmcr2, mmcr0 */
+#define ELF_NPKEY  3   /* includes amr, iamr, uamor */
 
 typedef unsigned long elf_greg_t64;
 typedef elf_greg_t64 elf_gregset_t64[ELF_NGREG];
diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c
index f52ad5b..3718a04 100644
--- a/arch/powerpc/kernel/ptrace.c
+++ b/arch/powerpc/kernel/ptrace.c
@@ -35,6 +35,7 @@
 #include 
 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -1775,6 +1776,61 @@ static int pmu_set(struct task_struct *target,
return ret;
 }
 #endif
+
+#ifdef CONFIG_PPC_MEM_KEYS
+static int pkey_active(struct task_struct *target,
+  const struct user_regset *regset)
+{
+   if (!arch_pkeys_enabled())
+   return -ENODEV;
+
+   return regset->n;
+}
+
+static int pkey_get(struct task_struct *target,
+   const struct user_regset *regset,
+   unsigned int pos, unsigned int count,
+   void *kbuf, void __user *ubuf)
+{
+   BUILD_BUG_ON(TSO(amr) + sizeof(unsigned long) != TSO(iamr));
+   BUILD_BUG_ON(TSO(iamr) + sizeof(unsigned long) != TSO(uamor));
+
+   if (!arch_pkeys_enabled())
+   return -ENODEV;
+
+   return user_regset_copyout(, , , ,
+  >thread.amr, 0,
+  ELF_NPKEY * sizeof(unsigned long));
+}
+
+static int pkey_set(struct task_struct *target,
+ const struct user_regset *regset,
+ unsigned int pos, unsigned int count,
+ const void *kbuf, const void __user *ubuf)
+{
+   u64 new_amr;
+   int ret;
+
+   if (!arch_pkeys_enabled())
+   return -ENODEV;
+
+   /* Only the AMR can be set from userspace */
+   if (pos != 0 || count != sizeof(new_amr))
+   return -EINVAL;
+
+   ret = user_regset_copyin(, , , ,
+_amr, 0, sizeof(new_amr));
+   if (ret)
+   return ret;
+
+   /* UAMOR determines which bits of the AMR can be set from userspace. */
+   target->thread.amr = (new_amr & target->thread.uamor) |
+   (target->thread.amr & ~target->thread.uamor);
+
+   return 0;
+}
+#endif /* CONFIG_PPC_MEM_KEYS */
+
 /*
  * These are our native regset flavors.
  */
@@ -1809,6 +1865,9 @@ enum powerpc_regset {
REGSET_EBB, /* EBB registers */
REGSET_PMR, /* Performance Monitor Registers */
 #endif
+#ifdef CONFIG_PPC_MEM_KEYS
+   REGSET_PKEY,/* AMR register */
+#endif
 };
 
 static const struct user_regset native_regsets[] = {
@@ -1914,6 +1973,13 @@ enum powerpc_regset {
.active = pmu_active, .get = pmu_get, .set = pmu_set
},
 #endif
+#ifdef CONFIG_PPC_MEM_KEYS
+   [REGSET_PKEY] = {
+   .core_note_type = NT_PPC_PKEY, .n = ELF_NPKEY,
+   .size = sizeof(u64), .align = sizeof(u64),
+   .active = pkey_active, .get = pkey_get, .set = pkey_set
+   },
+#endif
 };
 
 static const struct user_regset_view user_ppc_native_view = {
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index ed1c39b..f449dc5 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -291,6 +291,13 @@ void _exception_pkey(int signr, struct pt_regs *regs, int 
code, unsigned long ad
local_irq_enable();
 
   

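From the tracer's side, the new regset could be read roughly as follows. This
sketch assumes NT_PPC_PKEY is visible through the installed uapi headers and
that the tracee is already attached and stopped; the function name is
illustrative:

    #include <elf.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <sys/ptrace.h>
    #include <sys/types.h>
    #include <sys/uio.h>

    /* Read AMR, IAMR and UAMOR (ELF_NPKEY == 3 u64 values) from a tracee. */
    static void dump_pkey_regs(pid_t pid)
    {
            uint64_t regs[3];       /* amr, iamr, uamor */
            struct iovec iov = { .iov_base = regs, .iov_len = sizeof(regs) };

            if (ptrace(PTRACE_GETREGSET, pid,
                       (void *)(long)NT_PPC_PKEY, &iov) == 0)
                    printf("AMR=%llx IAMR=%llx UAMOR=%llx\n",
                           (unsigned long long)regs[0],
                           (unsigned long long)regs[1],
                           (unsigned long long)regs[2]);
    }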
[PATCH v9 21/51] powerpc: Deliver SEGV signal on pkey violation

2017-11-06 Thread Ram Pai
The value of the pkey whose protection got violated
is made available in the si_pkey field of the siginfo structure.

Signed-off-by: Ram Pai 
---
 arch/powerpc/include/asm/bug.h |1 +
 arch/powerpc/kernel/traps.c|   12 -
 arch/powerpc/mm/fault.c|   55 ++-
 3 files changed, 43 insertions(+), 25 deletions(-)

diff --git a/arch/powerpc/include/asm/bug.h b/arch/powerpc/include/asm/bug.h
index 3c04249..97c3847 100644
--- a/arch/powerpc/include/asm/bug.h
+++ b/arch/powerpc/include/asm/bug.h
@@ -133,6 +133,7 @@
 extern int do_page_fault(struct pt_regs *, unsigned long, unsigned long);
 extern void bad_page_fault(struct pt_regs *, unsigned long, int);
 extern void _exception(int, struct pt_regs *, int, unsigned long);
+extern void _exception_pkey(int, struct pt_regs *, int, unsigned long, int);
 extern void die(const char *, struct pt_regs *, long);
 extern bool die_will_crash(void);
 
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 13c9dcd..ed1c39b 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -265,7 +266,9 @@ void user_single_step_siginfo(struct task_struct *tsk,
info->si_addr = (void __user *)regs->nip;
 }
 
-void _exception(int signr, struct pt_regs *regs, int code, unsigned long addr)
+
+void _exception_pkey(int signr, struct pt_regs *regs, int code, unsigned long 
addr,
+   int key)
 {
siginfo_t info;
const char fmt32[] = KERN_INFO "%s[%d]: unhandled signal %d " \
@@ -292,9 +295,16 @@ void _exception(int signr, struct pt_regs *regs, int code, 
unsigned long addr)
info.si_signo = signr;
info.si_code = code;
info.si_addr = (void __user *) addr;
+   info.si_pkey = key;
+
force_sig_info(signr, , current);
 }
 
+void _exception(int signr, struct pt_regs *regs, int code, unsigned long addr)
+{
+   _exception_pkey(signr, regs, code, addr, 0);
+}
+
 void system_reset_exception(struct pt_regs *regs)
 {
/*
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index dfcd0e4..84523ed 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -107,7 +107,8 @@ static bool store_updates_sp(struct pt_regs *regs)
  */
 
 static int
-__bad_area_nosemaphore(struct pt_regs *regs, unsigned long address, int 
si_code)
+__bad_area_nosemaphore(struct pt_regs *regs, unsigned long address, int 
si_code,
+   int pkey)
 {
/*
 * If we are in kernel mode, bail out with a SEGV, this will
@@ -117,17 +118,18 @@ static bool store_updates_sp(struct pt_regs *regs)
if (!user_mode(regs))
return SIGSEGV;
 
-   _exception(SIGSEGV, regs, si_code, address);
+   _exception_pkey(SIGSEGV, regs, si_code, address, pkey);
 
return 0;
 }
 
 static noinline int bad_area_nosemaphore(struct pt_regs *regs, unsigned long 
address)
 {
-   return __bad_area_nosemaphore(regs, address, SEGV_MAPERR);
+   return __bad_area_nosemaphore(regs, address, SEGV_MAPERR, 0);
 }
 
-static int __bad_area(struct pt_regs *regs, unsigned long address, int si_code)
+static int __bad_area(struct pt_regs *regs, unsigned long address, int si_code,
+   int pkey)
 {
struct mm_struct *mm = current->mm;
 
@@ -137,30 +139,18 @@ static int __bad_area(struct pt_regs *regs, unsigned long 
address, int si_code)
 */
up_read(>mmap_sem);
 
-   return __bad_area_nosemaphore(regs, address, si_code);
+   return __bad_area_nosemaphore(regs, address, si_code, pkey);
 }
 
 static noinline int bad_area(struct pt_regs *regs, unsigned long address)
 {
-   return __bad_area(regs, address, SEGV_MAPERR);
+   return __bad_area(regs, address, SEGV_MAPERR, 0);
 }
 
-static int bad_page_fault_exception(struct pt_regs *regs, unsigned long 
address,
-   int si_code)
+static int bad_key_fault_exception(struct pt_regs *regs, unsigned long address,
+   int pkey)
 {
-   int sig = SIGBUS;
-   int code = BUS_OBJERR;
-
-#ifdef CONFIG_PPC_MEM_KEYS
-   if (si_code & DSISR_KEYFAULT) {
-   perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
-   sig = SIGSEGV;
-   code = SEGV_PKUERR;
-   }
-#endif /* CONFIG_PPC_MEM_KEYS */
-
-   _exception(sig, regs, code, address);
-   return 0;
+   return __bad_area_nosemaphore(regs, address, SEGV_PKUERR, pkey);
 }
 
 static int do_sigbus(struct pt_regs *regs, unsigned long address,
@@ -411,7 +401,16 @@ static int __do_page_fault(struct pt_regs *regs, unsigned 
long address,
if (unlikely(page_fault_is_bad(error_code))) {
if (!is_user)
return SIGBUS;
-   return bad_page_fault_exception(regs, address, error_code);
+
+   if 

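On the receiving side, a handler can now distinguish key faults and see which
key fired. A hedged sketch, assuming a libc whose siginfo_t exposes si_pkey
and whose headers define SEGV_PKUERR; fprintf() is used only for illustration
and is not async-signal-safe:

    #define _GNU_SOURCE
    #include <signal.h>
    #include <stdio.h>
    #include <string.h>

    static void segv_handler(int sig, siginfo_t *si, void *ucontext)
    {
            if (si->si_code == SEGV_PKUERR)
                    fprintf(stderr, "pkey fault at %p, key %d\n",
                            si->si_addr, si->si_pkey);
    }

    static void install_segv_handler(void)
    {
            struct sigaction sa;

            memset(&sa, 0, sizeof(sa));
            sa.sa_sigaction = segv_handler;
            sa.sa_flags = SA_SIGINFO;
            sigaction(SIGSEGV, &sa, NULL);
    }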
[PATCH v9 20/51] powerpc: introduce get_mm_addr_key() helper

2017-11-06 Thread Ram Pai
The get_mm_addr_key() helper returns the pkey associated with
an address in a given mm_struct.

Signed-off-by: Ram Pai 
---
 arch/powerpc/include/asm/mmu.h  |9 +
 arch/powerpc/mm/hash_utils_64.c |   24 
 2 files changed, 33 insertions(+), 0 deletions(-)

diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h
index 6364f5c..bb38312 100644
--- a/arch/powerpc/include/asm/mmu.h
+++ b/arch/powerpc/include/asm/mmu.h
@@ -260,6 +260,15 @@ static inline bool early_radix_enabled(void)
 }
 #endif
 
+#ifdef CONFIG_PPC_MEM_KEYS
+extern u16 get_mm_addr_key(struct mm_struct *mm, unsigned long address);
+#else
+static inline u16 get_mm_addr_key(struct mm_struct *mm, unsigned long address)
+{
+   return 0;
+}
+#endif /* CONFIG_PPC_MEM_KEYS */
+
 #endif /* !__ASSEMBLY__ */
 
 /* The kernel use the constants below to index in the page sizes array.
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index ddfc673..0108d12 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -1575,6 +1575,30 @@ void hash_preload(struct mm_struct *mm, unsigned long ea,
local_irq_restore(flags);
 }
 
+#ifdef CONFIG_PPC_MEM_KEYS
+/*
+ * Return the protection key associated with the given address and the
+ * mm_struct.
+ */
+u16 get_mm_addr_key(struct mm_struct *mm, unsigned long address)
+{
+   pte_t *ptep;
+   u16 pkey = 0;
+   unsigned long flags;
+
+   if (!mm || !mm->pgd)
+   return 0;
+
+   local_irq_save(flags);
+   ptep = find_linux_pte(mm->pgd, address, NULL, NULL);
+   if (ptep)
+   pkey = pte_to_pkey_bits(pte_val(READ_ONCE(*ptep)));
+   local_irq_restore(flags);
+
+   return pkey;
+}
+#endif /* CONFIG_PPC_MEM_KEYS */
+
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
 static inline void tm_flush_hash_page(int local)
 {
-- 
1.7.1



[PATCH v9 19/51] powerpc: Handle exceptions caused by pkey violation

2017-11-06 Thread Ram Pai
Handle data and instruction exceptions caused by memory
protection keys.

The CPU will detect the key fault if the HPTE is already
programmed with the key.

However, if the HPTE is not hashed, a key fault will not
be detected by the hardware. The software will detect the
pkey violation in such a case.

Signed-off-by: Ram Pai 
---
 arch/powerpc/mm/fault.c |   32 +++-
 1 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 4797d08..dfcd0e4 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -145,6 +145,24 @@ static noinline int bad_area(struct pt_regs *regs, 
unsigned long address)
return __bad_area(regs, address, SEGV_MAPERR);
 }
 
+static int bad_page_fault_exception(struct pt_regs *regs, unsigned long 
address,
+   int si_code)
+{
+   int sig = SIGBUS;
+   int code = BUS_OBJERR;
+
+#ifdef CONFIG_PPC_MEM_KEYS
+   if (si_code & DSISR_KEYFAULT) {
+   perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+   sig = SIGSEGV;
+   code = SEGV_PKUERR;
+   }
+#endif /* CONFIG_PPC_MEM_KEYS */
+
+   _exception(sig, regs, code, address);
+   return 0;
+}
+
 static int do_sigbus(struct pt_regs *regs, unsigned long address,
 unsigned int fault)
 {
@@ -391,11 +409,9 @@ static int __do_page_fault(struct pt_regs *regs, unsigned 
long address,
return 0;
 
if (unlikely(page_fault_is_bad(error_code))) {
-   if (is_user) {
-   _exception(SIGBUS, regs, BUS_OBJERR, address);
-   return 0;
-   }
-   return SIGBUS;
+   if (!is_user)
+   return SIGBUS;
+   return bad_page_fault_exception(regs, address, error_code);
}
 
/* Additional sanity check(s) */
@@ -498,6 +514,12 @@ static int __do_page_fault(struct pt_regs *regs, unsigned 
long address,
 * the fault.
 */
fault = handle_mm_fault(vma, address, flags);
+
+#ifdef CONFIG_PPC_MEM_KEYS
+   if (unlikely(fault & VM_FAULT_SIGSEGV))
+   return __bad_area(regs, address, SEGV_PKUERR);
+#endif /* CONFIG_PPC_MEM_KEYS */
+
major |= fault & VM_FAULT_MAJOR;
 
/*
-- 
1.7.1



[PATCH v9 18/51] powerpc: implementation for arch_vma_access_permitted()

2017-11-06 Thread Ram Pai
This patch provides the implementation for
arch_vma_access_permitted(). It returns true if the
requested access is allowed by the pkey associated with the
vma.

Signed-off-by: Ram Pai 
---
 arch/powerpc/include/asm/mmu_context.h |5 +++-
 arch/powerpc/mm/pkeys.c|   34 
 2 files changed, 38 insertions(+), 1 deletions(-)

diff --git a/arch/powerpc/include/asm/mmu_context.h 
b/arch/powerpc/include/asm/mmu_context.h
index a557735..95a3288 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -136,6 +136,10 @@ static inline void arch_bprm_mm_init(struct mm_struct *mm,
 {
 }
 
+#ifdef CONFIG_PPC_MEM_KEYS
+bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write,
+  bool execute, bool foreign);
+#else /* CONFIG_PPC_MEM_KEYS */
 static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
bool write, bool execute, bool foreign)
 {
@@ -143,7 +147,6 @@ static inline bool arch_vma_access_permitted(struct 
vm_area_struct *vma,
return true;
 }
 
-#ifndef CONFIG_PPC_MEM_KEYS
 #define pkey_initialize()
 #define pkey_mm_init(mm)
 #define thread_pkey_regs_save(thread)
diff --git a/arch/powerpc/mm/pkeys.c b/arch/powerpc/mm/pkeys.c
index 13902be..3b221bd 100644
--- a/arch/powerpc/mm/pkeys.c
+++ b/arch/powerpc/mm/pkeys.c
@@ -375,3 +375,37 @@ bool arch_pte_access_permitted(u64 pte, bool write, bool 
execute)
 
return pkey_access_permitted(pte_to_pkey_bits(pte), write, execute);
 }
+
+/*
+ * We only want to enforce protection keys on the current thread because we
+ * effectively have no access to AMR/IAMR for other threads or any way to tell
+ * which AMR/IAMR in a threaded process we could use.
+ *
+ * So do not enforce things if the VMA is not from the current mm, or if we are
+ * in a kernel thread.
+ */
+static inline bool vma_is_foreign(struct vm_area_struct *vma)
+{
+   if (!current->mm)
+   return true;
+
+   /* if it is not our ->mm, it has to be foreign */
+   if (current->mm != vma->vm_mm)
+   return true;
+
+   return false;
+}
+
+bool arch_vma_access_permitted(struct vm_area_struct *vma, bool write,
+  bool execute, bool foreign)
+{
+   if (static_branch_likely(_disabled))
+   return true;
+   /*
+* Do not enforce our key-permissions on a foreign vma.
+*/
+   if (foreign || vma_is_foreign(vma))
+   return true;
+
+   return pkey_access_permitted(vma_pkey(vma), write, execute);
+}
-- 
1.7.1



[PATCH v9 17/51] powerpc: check key protection for user page access

2017-11-06 Thread Ram Pai
Make sure that the kernel does not access user pages without
checking their key-protection.

Signed-off-by: Ram Pai 
---
 arch/powerpc/include/asm/book3s/64/pgtable.h |   13 +
 1 files changed, 13 insertions(+), 0 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
b/arch/powerpc/include/asm/book3s/64/pgtable.h
index c277a63..5ecb846 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -464,6 +464,19 @@ static inline void huge_ptep_set_wrprotect(struct 
mm_struct *mm,
 
 #ifdef CONFIG_PPC_MEM_KEYS
 extern bool arch_pte_access_permitted(u64 pte, bool write, bool execute);
+
+#define pte_access_permitted(pte, write) \
+   (pte_present(pte) && \
+((!(write) || pte_write(pte)) && \
+ arch_pte_access_permitted(pte_val(pte), !!write, 0)))
+
+/*
+ * We store key in pmd for huge tlb pages. So need to check for key protection.
+ */
+#define pmd_access_permitted(pmd, write) \
+   (pmd_present(pmd) && \
+((!(write) || pmd_write(pmd)) && \
+ arch_pte_access_permitted(pmd_val(pmd), !!write, 0)))
 #endif /* CONFIG_PPC_MEM_KEYS */
 
 #define __HAVE_ARCH_PTEP_GET_AND_CLEAR
-- 
1.7.1



[PATCH v9 16/51] powerpc: helper to validate key-access permissions of a pte

2017-11-06 Thread Ram Pai
Add a helper function that checks whether read/write/execute is
allowed on the pte.

Signed-off-by: Ram Pai 
---
 arch/powerpc/include/asm/book3s/64/pgtable.h |4 +++
 arch/powerpc/include/asm/pkeys.h |9 
 arch/powerpc/mm/pkeys.c  |   28 ++
 3 files changed, 41 insertions(+), 0 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 4c1ee6e..c277a63 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -462,6 +462,10 @@ static inline void huge_ptep_set_wrprotect(struct 
mm_struct *mm,
pte_update(mm, addr, ptep, 0, _PAGE_PRIVILEGED, 1);
 }
 
+#ifdef CONFIG_PPC_MEM_KEYS
+extern bool arch_pte_access_permitted(u64 pte, bool write, bool execute);
+#endif /* CONFIG_PPC_MEM_KEYS */
+
 #define __HAVE_ARCH_PTEP_GET_AND_CLEAR
 static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
   unsigned long addr, pte_t *ptep)
diff --git a/arch/powerpc/include/asm/pkeys.h b/arch/powerpc/include/asm/pkeys.h
index 06a58fe..3437a50 100644
--- a/arch/powerpc/include/asm/pkeys.h
+++ b/arch/powerpc/include/asm/pkeys.h
@@ -82,6 +82,15 @@ static inline u64 pte_to_hpte_pkey_bits(u64 pteflags)
((pteflags & H_PTE_PKEY_BIT4) ? HPTE_R_KEY_BIT4 : 0x0UL));
 }
 
+static inline u16 pte_to_pkey_bits(u64 pteflags)
+{
+   return (((pteflags & H_PTE_PKEY_BIT0) ? 0x10 : 0x0UL) |
+   ((pteflags & H_PTE_PKEY_BIT1) ? 0x8 : 0x0UL) |
+   ((pteflags & H_PTE_PKEY_BIT2) ? 0x4 : 0x0UL) |
+   ((pteflags & H_PTE_PKEY_BIT3) ? 0x2 : 0x0UL) |
+   ((pteflags & H_PTE_PKEY_BIT4) ? 0x1 : 0x0UL));
+}
+
 #define pkey_alloc_mask(pkey) (0x1 << pkey)
 
 #define mm_pkey_allocation_map(mm) (mm->context.pkey_allocation_map)
diff --git a/arch/powerpc/mm/pkeys.c b/arch/powerpc/mm/pkeys.c
index f1c6195..13902be 100644
--- a/arch/powerpc/mm/pkeys.c
+++ b/arch/powerpc/mm/pkeys.c
@@ -347,3 +347,31 @@ int __arch_override_mprotect_pkey(struct vm_area_struct 
*vma, int prot,
/* Nothing to override. */
return vma_pkey(vma);
 }
+
+static bool pkey_access_permitted(int pkey, bool write, bool execute)
+{
+   int pkey_shift;
+   u64 amr;
+
+   if (!pkey)
+   return true;
+
+   if (!is_pkey_enabled(pkey))
+   return true;
+
+   pkey_shift = pkeyshift(pkey);
+   if (execute && !(read_iamr() & (IAMR_EX_BIT << pkey_shift)))
+   return true;
+
+   amr = read_amr(); /* Delay reading amr until absolutely needed */
+   return ((!write && !(amr & (AMR_RD_BIT << pkey_shift))) ||
+   (write &&  !(amr & (AMR_WR_BIT << pkey_shift;
+}
+
+bool arch_pte_access_permitted(u64 pte, bool write, bool execute)
+{
+   if (static_branch_likely(_disabled))
+   return true;
+
+   return pkey_access_permitted(pte_to_pkey_bits(pte), write, execute);
+}
-- 
1.7.1



[PATCH v9 15/51] powerpc: Program HPTE key protection bits

2017-11-06 Thread Ram Pai
Map the PTE protection key bits to the HPTE key protection bits
while creating HPTE entries.

Acked-by: Balbir Singh 
Signed-off-by: Ram Pai 
---
 arch/powerpc/include/asm/book3s/64/mmu-hash.h |5 +
 arch/powerpc/include/asm/mmu_context.h|6 ++
 arch/powerpc/include/asm/pkeys.h  |9 +
 arch/powerpc/mm/hash_utils_64.c   |1 +
 4 files changed, 21 insertions(+), 0 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h 
b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index 508275b..2e22357 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -90,6 +90,8 @@
 #define HPTE_R_PP0 ASM_CONST(0x8000)
 #define HPTE_R_TS  ASM_CONST(0x4000)
 #define HPTE_R_KEY_HI  ASM_CONST(0x3000)
+#define HPTE_R_KEY_BIT0ASM_CONST(0x2000)
+#define HPTE_R_KEY_BIT1ASM_CONST(0x1000)
 #define HPTE_R_RPN_SHIFT   12
 #define HPTE_R_RPN ASM_CONST(0x0000)
 #define HPTE_R_RPN_3_0 ASM_CONST(0x01fff000)
@@ -104,6 +106,9 @@
 #define HPTE_R_C   ASM_CONST(0x0080)
 #define HPTE_R_R   ASM_CONST(0x0100)
 #define HPTE_R_KEY_LO  ASM_CONST(0x0e00)
+#define HPTE_R_KEY_BIT2ASM_CONST(0x0800)
+#define HPTE_R_KEY_BIT3ASM_CONST(0x0400)
+#define HPTE_R_KEY_BIT4ASM_CONST(0x0200)
 #define HPTE_R_KEY (HPTE_R_KEY_LO | HPTE_R_KEY_HI)
 
 #define HPTE_V_1TB_SEG ASM_CONST(0x4000)
diff --git a/arch/powerpc/include/asm/mmu_context.h 
b/arch/powerpc/include/asm/mmu_context.h
index a83d540..a557735 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -154,6 +154,12 @@ static inline int vma_pkey(struct vm_area_struct *vma)
 {
return 0;
 }
+
+static inline u64 pte_to_hpte_pkey_bits(u64 pteflags)
+{
+   return 0x0UL;
+}
+
 #endif /* CONFIG_PPC_MEM_KEYS */
 
 #endif /* __KERNEL__ */
diff --git a/arch/powerpc/include/asm/pkeys.h b/arch/powerpc/include/asm/pkeys.h
index cfe61a9..06a58fe 100644
--- a/arch/powerpc/include/asm/pkeys.h
+++ b/arch/powerpc/include/asm/pkeys.h
@@ -73,6 +73,15 @@ static inline int vma_pkey(struct vm_area_struct *vma)
 
 #define arch_max_pkey() pkeys_total
 
+static inline u64 pte_to_hpte_pkey_bits(u64 pteflags)
+{
+   return (((pteflags & H_PTE_PKEY_BIT0) ? HPTE_R_KEY_BIT0 : 0x0UL) |
+   ((pteflags & H_PTE_PKEY_BIT1) ? HPTE_R_KEY_BIT1 : 0x0UL) |
+   ((pteflags & H_PTE_PKEY_BIT2) ? HPTE_R_KEY_BIT2 : 0x0UL) |
+   ((pteflags & H_PTE_PKEY_BIT3) ? HPTE_R_KEY_BIT3 : 0x0UL) |
+   ((pteflags & H_PTE_PKEY_BIT4) ? HPTE_R_KEY_BIT4 : 0x0UL));
+}
+
 #define pkey_alloc_mask(pkey) (0x1 << pkey)
 
 #define mm_pkey_allocation_map(mm) (mm->context.pkey_allocation_map)
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 1e74590..ddfc673 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -232,6 +232,7 @@ unsigned long htab_convert_pte_flags(unsigned long pteflags)
 */
rflags |= HPTE_R_M;
 
+   rflags |= pte_to_hpte_pkey_bits(pteflags);
return rflags;
 }
 
-- 
1.7.1



[PATCH v9 14/51] powerpc: map vma key-protection bits to pte key bits.

2017-11-06 Thread Ram Pai
Map the key protection bits of the vma to the pkey bits in
the PTE.

The PTE bits used for the pkey are 3, 4, 5, 6 and 57. The first
four bits are the same four bits that were freed up initially
in this patch series. Remember? :-) Without those four bits
this patch wouldn't be possible.

BUT, on the 4k kernel, bits 3 and 4 could not be freed up. Remember?
Hence we have to be satisfied with 5, 6 and 7.

Signed-off-by: Ram Pai 
---
 arch/powerpc/include/asm/book3s/64/pgtable.h |   25 -
 arch/powerpc/include/asm/mman.h  |6 ++
 arch/powerpc/include/asm/pkeys.h |   12 
 3 files changed, 42 insertions(+), 1 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 9a677cd..4c1ee6e 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -39,6 +39,7 @@
 #define _RPAGE_RSV20x0800UL
 #define _RPAGE_RSV30x0400UL
 #define _RPAGE_RSV40x0200UL
+#define _RPAGE_RSV50x00040UL
 
 #define _PAGE_PTE  0x4000UL/* distinguishes PTEs 
from pointers */
 #define _PAGE_PRESENT  0x8000UL/* pte contains a 
translation */
@@ -58,6 +59,25 @@
 /* Max physical address bit as per radix table */
 #define _RPAGE_PA_MAX  57
 
+#ifdef CONFIG_PPC_MEM_KEYS
+#ifdef CONFIG_PPC_64K_PAGES
+#define H_PTE_PKEY_BIT0_RPAGE_RSV1
+#define H_PTE_PKEY_BIT1_RPAGE_RSV2
+#else /* CONFIG_PPC_64K_PAGES */
+#define H_PTE_PKEY_BIT00 /* _RPAGE_RSV1 is not available */
+#define H_PTE_PKEY_BIT10 /* _RPAGE_RSV2 is not available */
+#endif /* CONFIG_PPC_64K_PAGES */
+#define H_PTE_PKEY_BIT2_RPAGE_RSV3
+#define H_PTE_PKEY_BIT3_RPAGE_RSV4
+#define H_PTE_PKEY_BIT4_RPAGE_RSV5
+#else /*  CONFIG_PPC_MEM_KEYS */
+#define H_PTE_PKEY_BIT00
+#define H_PTE_PKEY_BIT10
+#define H_PTE_PKEY_BIT20
+#define H_PTE_PKEY_BIT30
+#define H_PTE_PKEY_BIT40
+#endif /*  CONFIG_PPC_MEM_KEYS */
+
 /*
  * Max physical address bit we will use for now.
  *
@@ -121,13 +141,16 @@
 #define _PAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_HPTEFLAGS | _PAGE_DIRTY | \
 _PAGE_ACCESSED | _PAGE_SPECIAL | _PAGE_PTE |   \
 _PAGE_SOFT_DIRTY)
+
+#define H_PTE_PKEY  (H_PTE_PKEY_BIT0 | H_PTE_PKEY_BIT1 | H_PTE_PKEY_BIT2 | \
+H_PTE_PKEY_BIT3 | H_PTE_PKEY_BIT4)
 /*
  * Mask of bits returned by pte_pgprot()
  */
 #define PAGE_PROT_BITS  (_PAGE_SAO | _PAGE_NON_IDEMPOTENT | _PAGE_TOLERANT | \
 H_PAGE_4K_PFN | _PAGE_PRIVILEGED | _PAGE_ACCESSED | \
 _PAGE_READ | _PAGE_WRITE |  _PAGE_DIRTY | _PAGE_EXEC | 
\
-_PAGE_SOFT_DIRTY)
+_PAGE_SOFT_DIRTY | H_PTE_PKEY)
 /*
  * We define 2 sets of base prot bits, one for basic pages (ie,
  * cacheable kernel and user pages) and one for non cacheable
diff --git a/arch/powerpc/include/asm/mman.h b/arch/powerpc/include/asm/mman.h
index 2999478..07e3f54 100644
--- a/arch/powerpc/include/asm/mman.h
+++ b/arch/powerpc/include/asm/mman.h
@@ -33,7 +33,13 @@ static inline unsigned long arch_calc_vm_prot_bits(unsigned 
long prot,
 
 static inline pgprot_t arch_vm_get_page_prot(unsigned long vm_flags)
 {
+#ifdef CONFIG_PPC_MEM_KEYS
+   return (vm_flags & VM_SAO) ?
+   __pgprot(_PAGE_SAO | vmflag_to_pte_pkey_bits(vm_flags)) :
+   __pgprot(0 | vmflag_to_pte_pkey_bits(vm_flags));
+#else
return (vm_flags & VM_SAO) ? __pgprot(_PAGE_SAO) : __pgprot(0);
+#endif
 }
 #define arch_vm_get_page_prot(vm_flags) arch_vm_get_page_prot(vm_flags)
 
diff --git a/arch/powerpc/include/asm/pkeys.h b/arch/powerpc/include/asm/pkeys.h
index 441bbf3..cfe61a9 100644
--- a/arch/powerpc/include/asm/pkeys.h
+++ b/arch/powerpc/include/asm/pkeys.h
@@ -52,6 +52,18 @@ static inline u64 pkey_to_vmflag_bits(u16 pkey)
return (((u64)pkey << VM_PKEY_SHIFT) & ARCH_VM_PKEY_FLAGS);
 }
 
+static inline u64 vmflag_to_pte_pkey_bits(u64 vm_flags)
+{
+   if (static_branch_likely(_disabled))
+   return 0x0UL;
+
+   return (((vm_flags & VM_PKEY_BIT0) ? H_PTE_PKEY_BIT4 : 0x0UL) |
+   ((vm_flags & VM_PKEY_BIT1) ? H_PTE_PKEY_BIT3 : 0x0UL) |
+   ((vm_flags & VM_PKEY_BIT2) ? H_PTE_PKEY_BIT2 : 0x0UL) |
+   ((vm_flags & VM_PKEY_BIT3) ? H_PTE_PKEY_BIT1 : 0x0UL) |
+   ((vm_flags & VM_PKEY_BIT4) ? H_PTE_PKEY_BIT0 : 0x0UL));
+}
+
 static inline int vma_pkey(struct vm_area_struct *vma)
 {
if (static_branch_likely(_disabled))
-- 
1.7.1



[PATCH v9 13/51] powerpc: implementation for arch_override_mprotect_pkey()

2017-11-06 Thread Ram Pai
The arch-independent code calls arch_override_mprotect_pkey()
to return a pkey that best matches the requested protection.

This patch provides the implementation.
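
A user-space sketch of the convention this relies on (assuming the
glibc pkey_alloc()/pkey_mprotect() wrappers, or equivalent raw
syscalls, are available): the generic mprotect path passes pkey == -1
only for plain mprotect() calls, so only those can have their key
overridden; a key passed explicitly through pkey_mprotect() is always
kept.

#define _GNU_SOURCE
#include <sys/mman.h>

int main(void)
{
        void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        int pkey = pkey_alloc(0, 0);

        if (p == MAP_FAILED || pkey < 0)
                return 1;

        /* Explicit key: arch_override_mprotect_pkey() sees pkey != -1 and keeps it. */
        pkey_mprotect(p, 4096, PROT_READ, pkey);

        /* Plain mprotect(): the generic code passes pkey == -1, so the arch may pick one. */
        mprotect(p, 4096, PROT_READ | PROT_WRITE);

        pkey_free(pkey);
        return 0;
}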

Signed-off-by: Ram Pai 
---
 arch/powerpc/include/asm/mmu_context.h |5 
 arch/powerpc/include/asm/pkeys.h   |   21 +-
 arch/powerpc/mm/pkeys.c|   36 
 3 files changed, 61 insertions(+), 1 deletions(-)

diff --git a/arch/powerpc/include/asm/mmu_context.h 
b/arch/powerpc/include/asm/mmu_context.h
index 4eccc2f..a83d540 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -149,6 +149,11 @@ static inline bool arch_vma_access_permitted(struct 
vm_area_struct *vma,
 #define thread_pkey_regs_save(thread)
 #define thread_pkey_regs_restore(new_thread, old_thread)
 #define thread_pkey_regs_init(thread)
+
+static inline int vma_pkey(struct vm_area_struct *vma)
+{
+   return 0;
+}
 #endif /* CONFIG_PPC_MEM_KEYS */
 
 #endif /* __KERNEL__ */
diff --git a/arch/powerpc/include/asm/pkeys.h b/arch/powerpc/include/asm/pkeys.h
index 1bd41ef..441bbf3 100644
--- a/arch/powerpc/include/asm/pkeys.h
+++ b/arch/powerpc/include/asm/pkeys.h
@@ -52,6 +52,13 @@ static inline u64 pkey_to_vmflag_bits(u16 pkey)
return (((u64)pkey << VM_PKEY_SHIFT) & ARCH_VM_PKEY_FLAGS);
 }
 
+static inline int vma_pkey(struct vm_area_struct *vma)
+{
+   if (static_branch_likely(_disabled))
+   return 0;
+   return (vma->vm_flags & ARCH_VM_PKEY_FLAGS) >> VM_PKEY_SHIFT;
+}
+
 #define arch_max_pkey() pkeys_total
 
 #define pkey_alloc_mask(pkey) (0x1 << pkey)
@@ -148,10 +155,22 @@ static inline int execute_only_pkey(struct mm_struct *mm)
return __execute_only_pkey(mm);
 }
 
+extern int __arch_override_mprotect_pkey(struct vm_area_struct *vma,
+int prot, int pkey);
 static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma,
  int prot, int pkey)
 {
-   return 0;
+   if (static_branch_likely(_disabled))
+   return 0;
+
+   /*
+* Is this an mprotect_pkey() call? If so, never override the value that
+* came from the user.
+*/
+   if (pkey != -1)
+   return pkey;
+
+   return __arch_override_mprotect_pkey(vma, prot, pkey);
 }
 
 extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
diff --git a/arch/powerpc/mm/pkeys.c b/arch/powerpc/mm/pkeys.c
index 4d704ea..f1c6195 100644
--- a/arch/powerpc/mm/pkeys.c
+++ b/arch/powerpc/mm/pkeys.c
@@ -311,3 +311,39 @@ int __execute_only_pkey(struct mm_struct *mm)
mm->context.execute_only_pkey = execute_only_pkey;
return execute_only_pkey;
 }
+
+static inline bool vma_is_pkey_exec_only(struct vm_area_struct *vma)
+{
+   /* Do this check first since the vm_flags should be hot */
+   if ((vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) != VM_EXEC)
+   return false;
+
+   return (vma_pkey(vma) == vma->vm_mm->context.execute_only_pkey);
+}
+
+/*
+ * This should only be called for *plain* mprotect calls.
+ */
+int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot,
+ int pkey)
+{
+   /*
+* If the currently associated pkey is execute-only, but the requested
+* protection requires read or write, move it back to the default pkey.
+*/
+   if (vma_is_pkey_exec_only(vma) && (prot & (PROT_READ | PROT_WRITE)))
+   return 0;
+
+   /*
+* The requested protection is execute-only. Hence let's use an
+* execute-only pkey.
+*/
+   if (prot == PROT_EXEC) {
+   pkey = execute_only_pkey(vma->vm_mm);
+   if (pkey > 0)
+   return pkey;
+   }
+
+   /* Nothing to override. */
+   return vma_pkey(vma);
+}
-- 
1.7.1



[PATCH v9 12/51] powerpc: ability to associate pkey to a vma

2017-11-06 Thread Ram Pai
arch-independent code expects the arch to  map
a  pkey  into the vma's protection bit setting.
The patch provides that ability.
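
A stand-alone illustration of the shift-and-mask behind
pkey_to_vmflag_bits() (the shift value below is made up; the real one
is VM_PKEY_SHIFT from the generic headers): the key value is simply
shifted into five contiguous vma-flag bits, so pkey 3 ends up as
VM_PKEY_BIT0 | VM_PKEY_BIT1.

#include <assert.h>
#include <stdint.h>

#define DEMO_VM_PKEY_SHIFT      32              /* assumed position of VM_PKEY_BIT0 */
#define DEMO_VM_PKEY_MASK       (0x1fULL << DEMO_VM_PKEY_SHIFT) /* five contiguous bits */

static uint64_t demo_pkey_to_vmflag_bits(uint16_t pkey)
{
        return ((uint64_t)pkey << DEMO_VM_PKEY_SHIFT) & DEMO_VM_PKEY_MASK;
}

int main(void)
{
        /* pkey 3 = 0b00011 -> the two lowest of the five vma key bits */
        assert(demo_pkey_to_vmflag_bits(3) == (0x3ULL << DEMO_VM_PKEY_SHIFT));
        return 0;
}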

Signed-off-by: Ram Pai 
---
 arch/powerpc/include/asm/mman.h  |7 ++-
 arch/powerpc/include/asm/pkeys.h |   11 +++
 arch/powerpc/mm/pkeys.c  |8 
 3 files changed, 25 insertions(+), 1 deletions(-)

diff --git a/arch/powerpc/include/asm/mman.h b/arch/powerpc/include/asm/mman.h
index 30922f6..2999478 100644
--- a/arch/powerpc/include/asm/mman.h
+++ b/arch/powerpc/include/asm/mman.h
@@ -13,6 +13,7 @@
 
 #include 
 #include 
+#include 
 #include 
 
 /*
@@ -22,7 +23,11 @@
 static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot,
unsigned long pkey)
 {
-   return (prot & PROT_SAO) ? VM_SAO : 0;
+#ifdef CONFIG_PPC_MEM_KEYS
+   return (((prot & PROT_SAO) ? VM_SAO : 0) | pkey_to_vmflag_bits(pkey));
+#else
+   return ((prot & PROT_SAO) ? VM_SAO : 0);
+#endif
 }
 #define arch_calc_vm_prot_bits(prot, pkey) arch_calc_vm_prot_bits(prot, pkey)
 
diff --git a/arch/powerpc/include/asm/pkeys.h b/arch/powerpc/include/asm/pkeys.h
index 20d1f0e..1bd41ef 100644
--- a/arch/powerpc/include/asm/pkeys.h
+++ b/arch/powerpc/include/asm/pkeys.h
@@ -41,6 +41,17 @@
 #define ARCH_VM_PKEY_FLAGS (VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2 | \
VM_PKEY_BIT3 | VM_PKEY_BIT4)
 
+/* Override any generic PKEY permission defines */
+#define PKEY_DISABLE_EXECUTE   0x4
+#define PKEY_ACCESS_MASK   (PKEY_DISABLE_ACCESS | \
+   PKEY_DISABLE_WRITE  | \
+   PKEY_DISABLE_EXECUTE)
+
+static inline u64 pkey_to_vmflag_bits(u16 pkey)
+{
+   return (((u64)pkey << VM_PKEY_SHIFT) & ARCH_VM_PKEY_FLAGS);
+}
+
 #define arch_max_pkey() pkeys_total
 
 #define pkey_alloc_mask(pkey) (0x1 << pkey)
diff --git a/arch/powerpc/mm/pkeys.c b/arch/powerpc/mm/pkeys.c
index 5da94fe..4d704ea 100644
--- a/arch/powerpc/mm/pkeys.c
+++ b/arch/powerpc/mm/pkeys.c
@@ -39,6 +39,14 @@ void __init pkey_initialize(void)
 (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
 
/*
+* pkey_to_vmflag_bits() assumes that the pkey bits are contiguous
+* in the vmaflag. Make sure that is really the case.
+*/
+   BUILD_BUG_ON(__builtin_clzl(ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT) +
+__builtin_popcountl(ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT)
+   != (sizeof(u64) * BITS_PER_BYTE));
+
+   /*
 * Disable the pkey system till everything is in place. A subsequent
 * patch will enable it.
 */
-- 
1.7.1



[PATCH v9 11/51] powerpc: introduce execute-only pkey

2017-11-06 Thread Ram Pai
This patch provides the implementation of the execute-only pkey.
The architecture-independent layer expects the arch-dependent
layer to support the ability to create and enable a special
key which has execute-only permission.
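
Seen from user space, the effect is roughly the following (a sketch
that assumes the mprotect() override from the related patch in this
series is also applied): a plain PROT_EXEC-only protection request
makes the kernel lazily allocate one key per mm and program its AMR
bits to deny read and write, while its IAMR bit is left clear so
instruction fetch still succeeds.

#include <sys/mman.h>
#include <stdio.h>

int main(void)
{
        void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED)
                return 1;

        /*
         * An execute-only request is what ends up in execute_only_pkey():
         * the key's AMR bits get PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE,
         * so a later load or store through this mapping takes a key fault,
         * while execution is still allowed.
         */
        if (mprotect(p, 4096, PROT_EXEC))
                perror("mprotect");

        return 0;
}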

Acked-by: Balbir Singh 
Signed-off-by: Ram Pai 
---
 arch/powerpc/include/asm/book3s/64/mmu.h |1 +
 arch/powerpc/include/asm/pkeys.h |8 -
 arch/powerpc/mm/pkeys.c  |   56 ++
 3 files changed, 64 insertions(+), 1 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h 
b/arch/powerpc/include/asm/book3s/64/mmu.h
index df17fbc..44dbc91 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu.h
@@ -116,6 +116,7 @@ struct patb_entry {
 * bit unset -> key available for allocation
 */
u32 pkey_allocation_map;
+   s16 execute_only_pkey; /* key holding execute-only protection */
 #endif
 } mm_context_t;
 
diff --git a/arch/powerpc/include/asm/pkeys.h b/arch/powerpc/include/asm/pkeys.h
index 0b2d9f0..20d1f0e 100644
--- a/arch/powerpc/include/asm/pkeys.h
+++ b/arch/powerpc/include/asm/pkeys.h
@@ -128,9 +128,13 @@ static inline int mm_pkey_free(struct mm_struct *mm, int 
pkey)
  * Try to dedicate one of the protection keys to be used as an
  * execute-only protection key.
  */
+extern int __execute_only_pkey(struct mm_struct *mm);
 static inline int execute_only_pkey(struct mm_struct *mm)
 {
-   return 0;
+   if (static_branch_likely(_disabled))
+   return -1;
+
+   return __execute_only_pkey(mm);
 }
 
 static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma,
@@ -154,6 +158,8 @@ static inline void pkey_mm_init(struct mm_struct *mm)
if (static_branch_likely(_disabled))
return;
mm_pkey_allocation_map(mm) = initial_allocation_mask;
+   /* -1 means unallocated or invalid */
+   mm->context.execute_only_pkey = -1;
 }
 
 extern void thread_pkey_regs_save(struct thread_struct *thread);
diff --git a/arch/powerpc/mm/pkeys.c b/arch/powerpc/mm/pkeys.c
index 469f370..5da94fe 100644
--- a/arch/powerpc/mm/pkeys.c
+++ b/arch/powerpc/mm/pkeys.c
@@ -247,3 +247,59 @@ void thread_pkey_regs_init(struct thread_struct *thread)
write_iamr(read_iamr() & pkey_iamr_mask);
write_uamor(read_uamor() & pkey_amr_uamor_mask);
 }
+
+static inline bool pkey_allows_readwrite(int pkey)
+{
+   int pkey_shift = pkeyshift(pkey);
+
+   if (!is_pkey_enabled(pkey))
+   return true;
+
+   return !(read_amr() & ((AMR_RD_BIT|AMR_WR_BIT) << pkey_shift));
+}
+
+int __execute_only_pkey(struct mm_struct *mm)
+{
+   bool need_to_set_mm_pkey = false;
+   int execute_only_pkey = mm->context.execute_only_pkey;
+   int ret;
+
+   /* Do we need to assign a pkey for mm's execute-only maps? */
+   if (execute_only_pkey == -1) {
+   /* Go allocate one to use, which might fail */
+   execute_only_pkey = mm_pkey_alloc(mm);
+   if (execute_only_pkey < 0)
+   return -1;
+   need_to_set_mm_pkey = true;
+   }
+
+   /*
+* We do not want to go through the relatively costly dance to set AMR
+* if we do not need to. Check it first and assume that if the
+* execute-only pkey is readwrite-disabled then we do not have to set it
+* ourselves.
+*/
+   if (!need_to_set_mm_pkey && !pkey_allows_readwrite(execute_only_pkey))
+   return execute_only_pkey;
+
+   /*
+* Set up AMR so that it denies access for everything other than
+* execution.
+*/
+   ret = __arch_set_user_pkey_access(current, execute_only_pkey,
+ PKEY_DISABLE_ACCESS |
+ PKEY_DISABLE_WRITE);
+   /*
+* If the AMR-set operation failed somehow, just return 0 and
+* effectively disable execute-only support.
+*/
+   if (ret) {
+   mm_pkey_free(mm, execute_only_pkey);
+   return -1;
+   }
+
+   /* We got one, store it and use it from here on out */
+   if (need_to_set_mm_pkey)
+   mm->context.execute_only_pkey = execute_only_pkey;
+   return execute_only_pkey;
+}
-- 
1.7.1



[PATCH v9 10/51] powerpc: store and restore the pkey state across context switches

2017-11-06 Thread Ram Pai
Store and restore the AMR, IAMR and UAMOR register state of the task
before scheduling out and after scheduling in, respectively.
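
A rough stand-alone model of the save/restore pairing (a paraphrase,
not the patch text; the SPR accessors are stubbed out here, and the
skip-if-unchanged restore is an assumption about the likely shape of
the helpers): the outgoing task's AMR/IAMR/UAMOR values are captured
into its thread_struct, and the incoming task's values are written
back on switch-in.

#include <stdint.h>

struct demo_thread {
        uint64_t amr, iamr, uamor;      /* mirrors the new thread_struct fields */
};

/* Stand-ins for mfspr/mtspr on SPRN_AMR, SPRN_IAMR and SPRN_UAMOR. */
static uint64_t spr_amr, spr_iamr, spr_uamor;

static void demo_pkey_regs_save(struct demo_thread *t)
{
        t->amr   = spr_amr;
        t->iamr  = spr_iamr;
        t->uamor = spr_uamor;
}

static void demo_pkey_regs_restore(struct demo_thread *new, struct demo_thread *old)
{
        /* Only touch an SPR if the incoming value actually differs. */
        if (old->amr != new->amr)
                spr_amr = new->amr;
        if (old->iamr != new->iamr)
                spr_iamr = new->iamr;
        if (old->uamor != new->uamor)
                spr_uamor = new->uamor;
}

int main(void)
{
        struct demo_thread prev = { 0 }, next = { .amr = 0x3 };

        demo_pkey_regs_save(&prev);
        demo_pkey_regs_restore(&next, &prev);
        return 0;
}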

Signed-off-by: Ram Pai 
---
 arch/powerpc/include/asm/mmu_context.h |3 ++
 arch/powerpc/include/asm/pkeys.h   |4 ++
 arch/powerpc/include/asm/processor.h   |5 +++
 arch/powerpc/kernel/process.c  |7 
 arch/powerpc/mm/pkeys.c|   49 +++-
 5 files changed, 67 insertions(+), 1 deletions(-)

diff --git a/arch/powerpc/include/asm/mmu_context.h 
b/arch/powerpc/include/asm/mmu_context.h
index 6d7c4f1..4eccc2f 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -146,6 +146,9 @@ static inline bool arch_vma_access_permitted(struct 
vm_area_struct *vma,
 #ifndef CONFIG_PPC_MEM_KEYS
 #define pkey_initialize()
 #define pkey_mm_init(mm)
+#define thread_pkey_regs_save(thread)
+#define thread_pkey_regs_restore(new_thread, old_thread)
+#define thread_pkey_regs_init(thread)
 #endif /* CONFIG_PPC_MEM_KEYS */
 
 #endif /* __KERNEL__ */
diff --git a/arch/powerpc/include/asm/pkeys.h b/arch/powerpc/include/asm/pkeys.h
index 652c750..0b2d9f0 100644
--- a/arch/powerpc/include/asm/pkeys.h
+++ b/arch/powerpc/include/asm/pkeys.h
@@ -156,5 +156,9 @@ static inline void pkey_mm_init(struct mm_struct *mm)
mm_pkey_allocation_map(mm) = initial_allocation_mask;
 }
 
+extern void thread_pkey_regs_save(struct thread_struct *thread);
+extern void thread_pkey_regs_restore(struct thread_struct *new_thread,
+struct thread_struct *old_thread);
+extern void thread_pkey_regs_init(struct thread_struct *thread);
 extern void pkey_initialize(void);
 #endif /*_ASM_POWERPC_KEYS_H */
diff --git a/arch/powerpc/include/asm/processor.h 
b/arch/powerpc/include/asm/processor.h
index fab7ff8..e3c417c 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -309,6 +309,11 @@ struct thread_struct {
struct thread_vr_state ckvr_state; /* Checkpointed VR state */
unsigned long   ckvrsave; /* Checkpointed VRSAVE */
 #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
+#ifdef CONFIG_PPC_MEM_KEYS
+   unsigned long   amr;
+   unsigned long   iamr;
+   unsigned long   uamor;
+#endif
 #ifdef CONFIG_KVM_BOOK3S_32_HANDLER
void*   kvm_shadow_vcpu; /* KVM internal data */
 #endif /* CONFIG_KVM_BOOK3S_32_HANDLER */
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index a0c74bb..148b934 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -42,6 +42,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -1085,6 +1086,8 @@ static inline void save_sprs(struct thread_struct *t)
t->tar = mfspr(SPRN_TAR);
}
 #endif
+
+   thread_pkey_regs_save(t);
 }
 
 static inline void restore_sprs(struct thread_struct *old_thread,
@@ -1120,6 +1123,8 @@ static inline void restore_sprs(struct thread_struct 
*old_thread,
mtspr(SPRN_TAR, new_thread->tar);
}
 #endif
+
+   thread_pkey_regs_restore(new_thread, old_thread);
 }
 
 #ifdef CONFIG_PPC_BOOK3S_64
@@ -1705,6 +1710,8 @@ void start_thread(struct pt_regs *regs, unsigned long 
start, unsigned long sp)
current->thread.tm_tfiar = 0;
current->thread.load_tm = 0;
 #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
+
+   thread_pkey_regs_init(>thread);
 }
 EXPORT_SYMBOL(start_thread);
 
diff --git a/arch/powerpc/mm/pkeys.c b/arch/powerpc/mm/pkeys.c
index 3ddc13a..469f370 100644
--- a/arch/powerpc/mm/pkeys.c
+++ b/arch/powerpc/mm/pkeys.c
@@ -16,6 +16,8 @@
 bool pkey_execute_disable_supported;
 int  pkeys_total;  /* Total pkeys as per device tree */
 u32  initial_allocation_mask;  /* Bits set for reserved keys */
+u64  pkey_amr_uamor_mask;  /* Bits in AMR/UMOR not to be touched */
+u64  pkey_iamr_mask;   /* Bits in AMR not to be touched */
 
 #define AMR_BITS_PER_PKEY 2
 #define AMR_RD_BIT 0x1UL
@@ -74,8 +76,16 @@ void __init pkey_initialize(void)
 *  programming note.
 */
initial_allocation_mask = ~0x0;
-   for (i = 2; i < (pkeys_total - os_reserved); i++)
+
+   /* register mask is in BE format */
+   pkey_amr_uamor_mask = ~0x0ul;
+   pkey_iamr_mask = ~0x0ul;
+
+   for (i = 2; i < (pkeys_total - os_reserved); i++) {
initial_allocation_mask &= ~(0x1 << i);
+   pkey_amr_uamor_mask &= ~(0x3ul << pkeyshift(i));
+   pkey_iamr_mask &= ~(0x1ul << pkeyshift(i));
+   }
 }
 
 static inline u64 read_amr(void)
@@ -200,3 +210,40 @@ int __arch_set_user_pkey_access(struct task_struct *tsk, 
int pkey,
init_amr(pkey, new_amr_bits);
return 0;
 }
+
+void thread_pkey_regs_save(struct thread_struct *thread)
+{
+   if (static_branch_likely(_disabled))
+   return;
+
+   /*
+* TODO: Skip 

[PATCH v9 09/51] powerpc: ability to create execute-disabled pkeys

2017-11-06 Thread Ram Pai
powerpc has hardware support to disable execute on a pkey.
This patch enables the ability to create execute-disabled
keys.
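
From user space this surfaces through the access-rights argument of
pkey_alloc(). A sketch assuming the glibc wrapper is available and
that the powerpc PKEY_DISABLE_EXECUTE value from this patch is
visible (it is redefined below only for the demo):

#define _GNU_SOURCE
#include <sys/mman.h>

#ifndef PKEY_DISABLE_EXECUTE
#define PKEY_DISABLE_EXECUTE    0x4     /* powerpc-specific value from this patch */
#endif

int main(void)
{
        /* Ask for a key whose IAMR bit blocks instruction fetch. */
        int pkey = pkey_alloc(0, PKEY_DISABLE_EXECUTE);

        if (pkey < 0)
                return 1;       /* e.g. older kernel, or execute-disable unsupported */

        pkey_free(pkey);
        return 0;
}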

Signed-off-by: Ram Pai 
---
 arch/powerpc/include/uapi/asm/mman.h |6 ++
 arch/powerpc/mm/pkeys.c  |   16 
 2 files changed, 22 insertions(+), 0 deletions(-)

diff --git a/arch/powerpc/include/uapi/asm/mman.h 
b/arch/powerpc/include/uapi/asm/mman.h
index e63bc37..65065ce 100644
--- a/arch/powerpc/include/uapi/asm/mman.h
+++ b/arch/powerpc/include/uapi/asm/mman.h
@@ -30,4 +30,10 @@
 #define MAP_STACK  0x2 /* give out an address that is best 
suited for process/thread stacks */
 #define MAP_HUGETLB0x4 /* create a huge page mapping */
 
+/* Override any generic PKEY permission defines */
+#define PKEY_DISABLE_EXECUTE   0x4
+#undef PKEY_ACCESS_MASK
+#define PKEY_ACCESS_MASK   (PKEY_DISABLE_ACCESS |\
+   PKEY_DISABLE_WRITE  |\
+   PKEY_DISABLE_EXECUTE)
 #endif /* _UAPI_ASM_POWERPC_MMAN_H */
diff --git a/arch/powerpc/mm/pkeys.c b/arch/powerpc/mm/pkeys.c
index 4a01c2f..3ddc13a 100644
--- a/arch/powerpc/mm/pkeys.c
+++ b/arch/powerpc/mm/pkeys.c
@@ -29,6 +29,14 @@ void __init pkey_initialize(void)
int os_reserved, i;
 
/*
+* We define PKEY_DISABLE_EXECUTE in addition to the arch-neutral
+* generic defines for PKEY_DISABLE_ACCESS and PKEY_DISABLE_WRITE.
+* Ensure that the bits are distinct.
+*/
+   BUILD_BUG_ON(PKEY_DISABLE_EXECUTE &
+(PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE));
+
+   /*
 * Disable the pkey system till everything is in place. A subsequent
 * patch will enable it.
 */
@@ -171,10 +179,18 @@ int __arch_set_user_pkey_access(struct task_struct *tsk, 
int pkey,
unsigned long init_val)
 {
u64 new_amr_bits = 0x0ul;
+   u64 new_iamr_bits = 0x0ul;
 
if (!is_pkey_enabled(pkey))
return -EINVAL;
 
+   if (init_val & PKEY_DISABLE_EXECUTE) {
+   if (!pkey_execute_disable_supported)
+   return -EINVAL;
+   new_iamr_bits |= IAMR_EX_BIT;
+   }
+   init_iamr(pkey, new_iamr_bits);
+
/* Set the bits we need in AMR: */
if (init_val & PKEY_DISABLE_ACCESS)
new_amr_bits |= AMR_RD_BIT | AMR_WR_BIT;
-- 
1.7.1



[PATCH v9 08/51] powerpc: implementation for arch_set_user_pkey_access()

2017-11-06 Thread Ram Pai
This patch provides the detailed implementation for
a user to allocate a key and enable it in the hardware.

It provides the plumbing, but it cannot be used till
the system call is implemented. The next patch  will
do so.

Reviewed-by: Thiago Jung Bauermann 
Signed-off-by: Ram Pai 
---
 arch/powerpc/include/asm/pkeys.h |6 -
 arch/powerpc/mm/pkeys.c  |   40 ++
 2 files changed, 45 insertions(+), 1 deletions(-)

diff --git a/arch/powerpc/include/asm/pkeys.h b/arch/powerpc/include/asm/pkeys.h
index 0d00a54..652c750 100644
--- a/arch/powerpc/include/asm/pkeys.h
+++ b/arch/powerpc/include/asm/pkeys.h
@@ -139,10 +139,14 @@ static inline int arch_override_mprotect_pkey(struct 
vm_area_struct *vma,
return 0;
 }
 
+extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
+  unsigned long init_val);
 static inline int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
unsigned long init_val)
 {
-   return 0;
+   if (static_branch_likely(_disabled))
+   return -EINVAL;
+   return __arch_set_user_pkey_access(tsk, pkey, init_val);
 }
 
 static inline void pkey_mm_init(struct mm_struct *mm)
diff --git a/arch/powerpc/mm/pkeys.c b/arch/powerpc/mm/pkeys.c
index f3bf661..4a01c2f 100644
--- a/arch/powerpc/mm/pkeys.c
+++ b/arch/powerpc/mm/pkeys.c
@@ -9,6 +9,7 @@
  * (at your option) any later version.
  */
 
+#include 
 #include 
 
 DEFINE_STATIC_KEY_TRUE(pkey_disabled);
@@ -17,6 +18,9 @@
 u32  initial_allocation_mask;  /* Bits set for reserved keys */
 
 #define AMR_BITS_PER_PKEY 2
+#define AMR_RD_BIT 0x1UL
+#define AMR_WR_BIT 0x2UL
+#define IAMR_EX_BIT 0x1UL
 #define PKEY_REG_BITS (sizeof(u64)*8)
 #define pkeyshift(pkey) (PKEY_REG_BITS - ((pkey+1) * AMR_BITS_PER_PKEY))
 
@@ -102,6 +106,20 @@ static inline void write_uamor(u64 value)
mtspr(SPRN_UAMOR, value);
 }
 
+static bool is_pkey_enabled(int pkey)
+{
+   u64 uamor = read_uamor();
+   u64 pkey_bits = 0x3ul << pkeyshift(pkey);
+   u64 uamor_pkey_bits = (uamor & pkey_bits);
+
+   /*
+* Both the bits in UAMOR corresponding to the key should be set or
+* reset.
+*/
+   WARN_ON(uamor_pkey_bits && (uamor_pkey_bits != pkey_bits));
+   return !!(uamor_pkey_bits);
+}
+
 static inline void init_amr(int pkey, u8 init_bits)
 {
u64 new_amr_bits = (((u64)init_bits & 0x3UL) << pkeyshift(pkey));
@@ -144,3 +162,25 @@ void __arch_deactivate_pkey(int pkey)
 {
pkey_status_change(pkey, false);
 }
+
+/*
+ * Set the access rights in AMR IAMR and UAMOR registers for @pkey to that
+ * specified in @init_val.
+ */
+int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
+   unsigned long init_val)
+{
+   u64 new_amr_bits = 0x0ul;
+
+   if (!is_pkey_enabled(pkey))
+   return -EINVAL;
+
+   /* Set the bits we need in AMR: */
+   if (init_val & PKEY_DISABLE_ACCESS)
+   new_amr_bits |= AMR_RD_BIT | AMR_WR_BIT;
+   else if (init_val & PKEY_DISABLE_WRITE)
+   new_amr_bits |= AMR_WR_BIT;
+
+   init_amr(pkey, new_amr_bits);
+   return 0;
+}
-- 
1.7.1



[PATCH v9 07/51] powerpc: cleanup AMR, IAMR when a key is allocated or freed

2017-11-06 Thread Ram Pai
Clean up the bits corresponding to a key in the AMR and IAMR
registers when the key is newly allocated/activated or freed.
We don't want residual bits to cause the hardware to enforce
unintended behavior when the key is activated or freed.

Reviewed-by: Thiago Jung Bauermann 
Signed-off-by: Ram Pai 
---
 arch/powerpc/include/asm/pkeys.h |   12 
 1 files changed, 12 insertions(+), 0 deletions(-)

diff --git a/arch/powerpc/include/asm/pkeys.h b/arch/powerpc/include/asm/pkeys.h
index e5deac7..0d00a54 100644
--- a/arch/powerpc/include/asm/pkeys.h
+++ b/arch/powerpc/include/asm/pkeys.h
@@ -69,6 +69,8 @@ static inline bool mm_pkey_is_allocated(struct mm_struct *mm, 
int pkey)
__mm_pkey_is_allocated(mm, pkey));
 }
 
+extern void __arch_activate_pkey(int pkey);
+extern void __arch_deactivate_pkey(int pkey);
 /*
  * Returns a positive, 5-bit key on success, or -1 on failure.
  * Relies on the mmap_sem to protect against concurrency in mm_pkey_alloc() and
@@ -96,6 +98,12 @@ static inline int mm_pkey_alloc(struct mm_struct *mm)
 
ret = ffz((u32)mm_pkey_allocation_map(mm));
__mm_pkey_allocated(mm, ret);
+
+   /*
+* Enable the key in the hardware
+*/
+   if (ret > 0)
+   __arch_activate_pkey(ret);
return ret;
 }
 
@@ -107,6 +115,10 @@ static inline int mm_pkey_free(struct mm_struct *mm, int 
pkey)
if (!mm_pkey_is_allocated(mm, pkey))
return -EINVAL;
 
+   /*
+* Disable the key in the hardware
+*/
+   __arch_deactivate_pkey(pkey);
__mm_pkey_free(mm, pkey);
 
return 0;
-- 
1.7.1



[PATCH v9 06/51] powerpc: helper functions to initialize AMR, IAMR and UAMOR registers

2017-11-06 Thread Ram Pai
Introduce helper functions that initialize the bits in the AMR,
IAMR and UAMOR registers that correspond to a given pkey.
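
A small stand-alone check of the field arithmetic these helpers use
(reproducing the pkeyshift() formula from the patch): each key owns
AMR_BITS_PER_PKEY = 2 bits counted down from the top of the 64-bit
register, so key 0 sits at shift 62 and key 2 at shift 58.

#include <assert.h>
#include <stdint.h>

#define AMR_BITS_PER_PKEY       2
#define PKEY_REG_BITS           (sizeof(uint64_t) * 8)
#define pkeyshift(pkey)         (PKEY_REG_BITS - (((pkey) + 1) * AMR_BITS_PER_PKEY))

int main(void)
{
        /* key 0 occupies the two most significant bits; key 2 sits at shift 58 */
        assert(pkeyshift(0) == 62);
        assert(pkeyshift(2) == 58);

        /* the 2-bit AMR mask that init_amr() clears and sets for key 2 */
        assert((0x3ULL << pkeyshift(2)) == 0x0c00000000000000ULL);
        return 0;
}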

Reviewed-by: Thiago Jung Bauermann 
Signed-off-by: Ram Pai 
---
 arch/powerpc/mm/pkeys.c |   47 +++
 1 files changed, 47 insertions(+), 0 deletions(-)

diff --git a/arch/powerpc/mm/pkeys.c b/arch/powerpc/mm/pkeys.c
index b6bdfdf..f3bf661 100644
--- a/arch/powerpc/mm/pkeys.c
+++ b/arch/powerpc/mm/pkeys.c
@@ -16,6 +16,10 @@
 int  pkeys_total;  /* Total pkeys as per device tree */
 u32  initial_allocation_mask;  /* Bits set for reserved keys */
 
+#define AMR_BITS_PER_PKEY 2
+#define PKEY_REG_BITS (sizeof(u64)*8)
+#define pkeyshift(pkey) (PKEY_REG_BITS - ((pkey+1) * AMR_BITS_PER_PKEY))
+
 void __init pkey_initialize(void)
 {
int os_reserved, i;
@@ -97,3 +101,46 @@ static inline void write_uamor(u64 value)
 {
mtspr(SPRN_UAMOR, value);
 }
+
+static inline void init_amr(int pkey, u8 init_bits)
+{
+   u64 new_amr_bits = (((u64)init_bits & 0x3UL) << pkeyshift(pkey));
+   u64 old_amr = read_amr() & ~((u64)(0x3ul) << pkeyshift(pkey));
+
+   write_amr(old_amr | new_amr_bits);
+}
+
+static inline void init_iamr(int pkey, u8 init_bits)
+{
+   u64 new_iamr_bits = (((u64)init_bits & 0x1UL) << pkeyshift(pkey));
+   u64 old_iamr = read_iamr() & ~((u64)(0x1ul) << pkeyshift(pkey));
+
+   write_iamr(old_iamr | new_iamr_bits);
+}
+
+static void pkey_status_change(int pkey, bool enable)
+{
+   u64 old_uamor;
+
+   /* Reset the AMR and IAMR bits for this key */
+   init_amr(pkey, 0x0);
+   init_iamr(pkey, 0x0);
+
+   /* Enable/disable key */
+   old_uamor = read_uamor();
+   if (enable)
+   old_uamor |= (0x3ul << pkeyshift(pkey));
+   else
+   old_uamor &= ~(0x3ul << pkeyshift(pkey));
+   write_uamor(old_uamor);
+}
+
+void __arch_activate_pkey(int pkey)
+{
+   pkey_status_change(pkey, true);
+}
+
+void __arch_deactivate_pkey(int pkey)
+{
+   pkey_status_change(pkey, false);
+}
-- 
1.7.1



[PATCH v9 05/51] powerpc: helper function to read, write AMR, IAMR, UAMOR registers

2017-11-06 Thread Ram Pai
Implement helper functions to read and write the key-related
registers: AMR, IAMR and UAMOR.

The AMR register tracks the read/write permissions of a key.
The IAMR register tracks the execute permission of a key.
The UAMOR register enables and disables a key.

Acked-by: Balbir Singh 
Reviewed-by: Thiago Jung Bauermann 
Signed-off-by: Ram Pai 
---
 arch/powerpc/mm/pkeys.c |   36 
 1 files changed, 36 insertions(+), 0 deletions(-)

diff --git a/arch/powerpc/mm/pkeys.c b/arch/powerpc/mm/pkeys.c
index 512bdf2..b6bdfdf 100644
--- a/arch/powerpc/mm/pkeys.c
+++ b/arch/powerpc/mm/pkeys.c
@@ -61,3 +61,39 @@ void __init pkey_initialize(void)
for (i = 2; i < (pkeys_total - os_reserved); i++)
initial_allocation_mask &= ~(0x1 << i);
 }
+
+static inline u64 read_amr(void)
+{
+   return mfspr(SPRN_AMR);
+}
+
+static inline void write_amr(u64 value)
+{
+   mtspr(SPRN_AMR, value);
+}
+
+static inline u64 read_iamr(void)
+{
+   if (!likely(pkey_execute_disable_supported))
+   return 0x0UL;
+
+   return mfspr(SPRN_IAMR);
+}
+
+static inline void write_iamr(u64 value)
+{
+   if (!likely(pkey_execute_disable_supported))
+   return;
+
+   mtspr(SPRN_IAMR, value);
+}
+
+static inline u64 read_uamor(void)
+{
+   return mfspr(SPRN_UAMOR);
+}
+
+static inline void write_uamor(u64 value)
+{
+   mtspr(SPRN_UAMOR, value);
+}
-- 
1.7.1



[PATCH v9 04/51] powerpc: track allocation status of all pkeys

2017-11-06 Thread Ram Pai
A total of 32 keys are available on POWER7 and above. However,
pkeys 0 and 1 are reserved, so effectively we have 30 pkeys.

On 4K kernels we do not have 5 bits in the PTE to represent
all the keys; we only have 3 bits. Two of those keys, pkey 0
and pkey 1, are reserved, so effectively we have 6 pkeys.

This patch keeps track of reserved keys, allocated keys and
keys that are currently free.

It also adds the skeletal functions and macros that the
architecture-independent code expects to be available.
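
A stand-alone worked example of the reserved-key mask built here (the
same loop as pkey_initialize(), shown for pkeys_total = 32 and no
OS-reserved keys): every bit starts set, bits 2..31 are then cleared,
leaving only the reserved keys 0 and 1 marked as allocated, so the
first ffz()-based allocation hands out key 2.

#include <assert.h>
#include <stdint.h>

int main(void)
{
        uint32_t initial_allocation_mask = ~0x0;
        int pkeys_total = 32, os_reserved = 0, i;

        /* same loop as pkey_initialize(): keys 2..(total - reserved - 1) are allocatable */
        for (i = 2; i < (pkeys_total - os_reserved); i++)
                initial_allocation_mask &= ~(0x1u << i);

        /* only the reserved keys 0 and 1 remain marked */
        assert(initial_allocation_mask == 0x3);

        /* mm_pkey_alloc() uses ffz(); the first free (zero) bit here is key 2 */
        assert(__builtin_ctz(~initial_allocation_mask) == 2);
        return 0;
}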

Reviewed-by: Thiago Jung Bauermann 
Signed-off-by: Ram Pai 
---
 arch/powerpc/include/asm/book3s/64/mmu.h |9 +++
 arch/powerpc/include/asm/mmu_context.h   |1 +
 arch/powerpc/include/asm/pkeys.h |   95 -
 arch/powerpc/mm/mmu_context_book3s64.c   |2 +
 arch/powerpc/mm/pkeys.c  |   33 ++
 5 files changed, 136 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h 
b/arch/powerpc/include/asm/book3s/64/mmu.h
index 37fdede..df17fbc 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu.h
@@ -108,6 +108,15 @@ struct patb_entry {
 #ifdef CONFIG_SPAPR_TCE_IOMMU
struct list_head iommu_group_mem_list;
 #endif
+
+#ifdef CONFIG_PPC_MEM_KEYS
+   /*
+* Each bit represents one protection key.
+* bit set   -> key allocated
+* bit unset -> key available for allocation
+*/
+   u32 pkey_allocation_map;
+#endif
 } mm_context_t;
 
 /*
diff --git a/arch/powerpc/include/asm/mmu_context.h 
b/arch/powerpc/include/asm/mmu_context.h
index 2c24447..6d7c4f1 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -145,6 +145,7 @@ static inline bool arch_vma_access_permitted(struct 
vm_area_struct *vma,
 
 #ifndef CONFIG_PPC_MEM_KEYS
 #define pkey_initialize()
+#define pkey_mm_init(mm)
 #endif /* CONFIG_PPC_MEM_KEYS */
 
 #endif /* __KERNEL__ */
diff --git a/arch/powerpc/include/asm/pkeys.h b/arch/powerpc/include/asm/pkeys.h
index a54cb39..e5deac7 100644
--- a/arch/powerpc/include/asm/pkeys.h
+++ b/arch/powerpc/include/asm/pkeys.h
@@ -15,21 +15,101 @@
 #include 
 
 DECLARE_STATIC_KEY_TRUE(pkey_disabled);
-#define ARCH_VM_PKEY_FLAGS 0
+extern int pkeys_total; /* total pkeys as per device tree */
+extern u32 initial_allocation_mask; /* bits set for reserved keys */
+
+/*
+ * powerpc needs VM_PKEY_BIT* bit to enable pkey system.
+ * Without them, at least compilation needs to succeed.
+ */
+#ifndef VM_PKEY_BIT0
+#define VM_PKEY_SHIFT 0
+#define VM_PKEY_BIT0 0
+#define VM_PKEY_BIT1 0
+#define VM_PKEY_BIT2 0
+#define VM_PKEY_BIT3 0
+#endif
+
+/*
+ * powerpc needs an additional vma bit to support 32 keys. Till the additional
+ * vma bit lands in include/linux/mm.h we can only support 16 keys.
+ */
+#ifndef VM_PKEY_BIT4
+#define VM_PKEY_BIT4 0
+#endif
+
+#define ARCH_VM_PKEY_FLAGS (VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2 | \
+   VM_PKEY_BIT3 | VM_PKEY_BIT4)
+
+#define arch_max_pkey() pkeys_total
+
+#define pkey_alloc_mask(pkey) (0x1 << pkey)
+
+#define mm_pkey_allocation_map(mm) (mm->context.pkey_allocation_map)
+
+#define __mm_pkey_allocated(mm, pkey) {\
+   mm_pkey_allocation_map(mm) |= pkey_alloc_mask(pkey); \
+}
+
+#define __mm_pkey_free(mm, pkey) { \
+   mm_pkey_allocation_map(mm) &= ~pkey_alloc_mask(pkey);   \
+}
+
+#define __mm_pkey_is_allocated(mm, pkey)   \
+   (mm_pkey_allocation_map(mm) & pkey_alloc_mask(pkey))
+
+#define __mm_pkey_is_reserved(pkey) (initial_allocation_mask & \
+  pkey_alloc_mask(pkey))
 
 static inline bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey)
 {
-   return false;
+   /* A reserved key is never considered as 'explicitly allocated' */
+   return ((pkey < arch_max_pkey()) &&
+   !__mm_pkey_is_reserved(pkey) &&
+   __mm_pkey_is_allocated(mm, pkey));
 }
 
+/*
+ * Returns a positive, 5-bit key on success, or -1 on failure.
+ * Relies on the mmap_sem to protect against concurrency in mm_pkey_alloc() and
+ * mm_pkey_free().
+ */
 static inline int mm_pkey_alloc(struct mm_struct *mm)
 {
-   return -1;
+   /*
+* Note: this is the one and only place we make sure that the pkey is
+* valid as far as the hardware is concerned. The rest of the kernel
+* trusts that only good, valid pkeys come out of here.
+*/
+   u32 all_pkeys_mask = (u32)(~(0x0));
+   int ret;
+
+   if (static_branch_likely(_disabled))
+   return -1;
+
+   /*
+* Are we out of pkeys? We must handle this specially because ffz()
+* behavior is undefined if there are no zeros.
+*/
+   if (mm_pkey_allocation_map(mm) == all_pkeys_mask)
+   return -1;
+
+   ret = ffz((u32)mm_pkey_allocation_map(mm));
+   __mm_pkey_allocated(mm, ret);

[PATCH v9 03/51] powerpc: initial pkey plumbing

2017-11-06 Thread Ram Pai
Basic plumbing to initialize the pkey system.
Nothing is enabled yet. A later patch will enable it
once all the infrastructure is in place.

Signed-off-by: Ram Pai 
---
 arch/powerpc/Kconfig   |   15 
 arch/powerpc/include/asm/mmu_context.h |5 +++
 arch/powerpc/include/asm/pkeys.h   |   57 
 arch/powerpc/mm/Makefile   |1 +
 arch/powerpc/mm/hash_utils_64.c|4 ++
 arch/powerpc/mm/pkeys.c|   30 +
 6 files changed, 112 insertions(+), 0 deletions(-)
 create mode 100644 arch/powerpc/include/asm/pkeys.h
 create mode 100644 arch/powerpc/mm/pkeys.c

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index cb782ac..9fd389b 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -865,6 +865,21 @@ config SECCOMP
 
  If unsure, say Y. Only embedded should say N here.
 
+config PPC_MEM_KEYS
+   prompt "PowerPC Memory Protection Keys"
+   def_bool y
+   depends on PPC_BOOK3S_64
+   select ARCH_USES_HIGH_VMA_FLAGS
+   select ARCH_HAS_PKEYS
+   help
+ Memory Protection Keys provides a mechanism for enforcing
+ page-based protections, but without requiring modification of the
+ page tables when an application changes protection domains.
+
+ For details, see Documentation/vm/protection-keys.txt
+
+ If unsure, say y.
+
 endmenu
 
 config ISA_DMA_API
diff --git a/arch/powerpc/include/asm/mmu_context.h 
b/arch/powerpc/include/asm/mmu_context.h
index 492d814..2c24447 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -142,5 +142,10 @@ static inline bool arch_vma_access_permitted(struct 
vm_area_struct *vma,
/* by default, allow everything */
return true;
 }
+
+#ifndef CONFIG_PPC_MEM_KEYS
+#define pkey_initialize()
+#endif /* CONFIG_PPC_MEM_KEYS */
+
 #endif /* __KERNEL__ */
 #endif /* __ASM_POWERPC_MMU_CONTEXT_H */
diff --git a/arch/powerpc/include/asm/pkeys.h b/arch/powerpc/include/asm/pkeys.h
new file mode 100644
index 000..a54cb39
--- /dev/null
+++ b/arch/powerpc/include/asm/pkeys.h
@@ -0,0 +1,57 @@
+/*
+ * PowerPC Memory Protection Keys management
+ * Copyright (c) 2017, IBM Corporation.
+ * Author: Ram Pai 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef _ASM_POWERPC_KEYS_H
+#define _ASM_POWERPC_KEYS_H
+
+#include 
+
+DECLARE_STATIC_KEY_TRUE(pkey_disabled);
+#define ARCH_VM_PKEY_FLAGS 0
+
+static inline bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey)
+{
+   return false;
+}
+
+static inline int mm_pkey_alloc(struct mm_struct *mm)
+{
+   return -1;
+}
+
+static inline int mm_pkey_free(struct mm_struct *mm, int pkey)
+{
+   return -EINVAL;
+}
+
+/*
+ * Try to dedicate one of the protection keys to be used as an
+ * execute-only protection key.
+ */
+static inline int execute_only_pkey(struct mm_struct *mm)
+{
+   return 0;
+}
+
+static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma,
+ int prot, int pkey)
+{
+   return 0;
+}
+
+static inline int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
+   unsigned long init_val)
+{
+   return 0;
+}
+
+extern void pkey_initialize(void);
+#endif /*_ASM_POWERPC_KEYS_H */
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index a0c327d..823b03d 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -44,3 +44,4 @@ obj-$(CONFIG_PPC_COPRO_BASE)  += copro_fault.o
 obj-$(CONFIG_SPAPR_TCE_IOMMU)  += mmu_context_iommu.o
 obj-$(CONFIG_PPC_PTDUMP)   += dump_linuxpagetables.o
 obj-$(CONFIG_PPC_HTDUMP)   += dump_hashpagetable.o
+obj-$(CONFIG_PPC_MEM_KEYS) += pkeys.o
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index 578d5a3..1e74590 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -35,6 +35,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -1050,6 +1051,9 @@ void __init hash__early_init_mmu(void)
pr_info("Initializing hash mmu with SLB\n");
/* Initialize SLB management */
slb_initialize();
+
+   /* initialize the key subsystem */
+   pkey_initialize();
 }
 
 #ifdef CONFIG_SMP
diff --git a/arch/powerpc/mm/pkeys.c b/arch/powerpc/mm/pkeys.c
new file mode 100644
index 000..c97a7a0
--- /dev/null
+++ b/arch/powerpc/mm/pkeys.c
@@ -0,0 +1,30 @@
+/*
+ * PowerPC Memory Protection Keys management
+ * Copyright (c) 2017, IBM Corporation.
+ * Author: Ram Pai 
+ *
+ * This program is free software; you can redistribute it 

[PATCH v9 02/51] mm, powerpc, x86: introduce an additional vma bit for powerpc pkey

2017-11-06 Thread Ram Pai
Currently only 4 bits are allocated in the vma flags to hold 16
keys, which is sufficient for x86. PowerPC supports 32 keys,
which needs 5 bits. This patch allocates an additional bit.

Acked-by: Balbir Singh 
Signed-off-by: Ram Pai 
---
 fs/proc/task_mmu.c |1 +
 include/linux/mm.h |3 ++-
 2 files changed, 3 insertions(+), 1 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 677866e..fad19a0 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -683,6 +683,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct 
vm_area_struct *vma)
[ilog2(VM_PKEY_BIT1)]   = "",
[ilog2(VM_PKEY_BIT2)]   = "",
[ilog2(VM_PKEY_BIT3)]   = "",
+   [ilog2(VM_PKEY_BIT4)]   = "",
 #endif /* CONFIG_ARCH_HAS_PKEYS */
};
size_t i;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2c5ea48..f5330a9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -221,9 +221,10 @@ extern int overcommit_kbytes_handler(struct ctl_table *, 
int, void __user *,
 #ifdef CONFIG_ARCH_HAS_PKEYS
 # define VM_PKEY_SHIFT VM_HIGH_ARCH_BIT_0
 # define VM_PKEY_BIT0  VM_HIGH_ARCH_0  /* A protection key is a 4-bit value */
-# define VM_PKEY_BIT1  VM_HIGH_ARCH_1
+# define VM_PKEY_BIT1  VM_HIGH_ARCH_1  /* on x86 and 5-bit value on ppc64   */
 # define VM_PKEY_BIT2  VM_HIGH_ARCH_2
 # define VM_PKEY_BIT3  VM_HIGH_ARCH_3
+# define VM_PKEY_BIT4  VM_HIGH_ARCH_4
 #endif /* CONFIG_ARCH_HAS_PKEYS */
 
 #if defined(CONFIG_X86)
-- 
1.7.1


