date:20180613

Re: [PATCH v2] powerpc/64s/radix: Fix MADV_[FREE|DONTNEED] TLB flush miss problem with THP

2018-06-13 Thread kbuild test robot

Hi Nicholas,

I love your patch! Yet something to improve:

[auto build test ERROR on powerpc/next]
[also build test ERROR on next-20180613]
[cannot apply to v4.17]
[if your patch is applied to the wrong git tree, please drop us a note to help 
improve the system]

url:
https://github.com/0day-ci/linux/commits/Nicholas-Piggin/powerpc-64s-radix-Fix-MADV_-FREE-DONTNEED-TLB-flush-miss-problem-with-THP/20180614-114728
base:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
config: microblaze-nommu_defconfig (attached as .config)
compiler: microblaze-linux-gcc (GCC) 8.1.0
reproduce:
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
GCC_VERSION=8.1.0 make.cross ARCH=microblaze 

All errors (new ones prefixed by >>):

   In file included from include/linux/mm.h:478,
from arch/microblaze/include/asm/io.h:17,
from include/linux/clocksource.h:21,
from include/linux/clockchips.h:14,
from include/linux/tick.h:8,
from include/linux/sched/isolation.h:6,
from kernel/sched/sched.h:17,
from kernel/sched/loadavg.c:9:
   include/linux/migrate.h: In function 'new_page_nodemask':
>> include/linux/huge_mm.h:82:25: error: 'PMD_SHIFT' undeclared (first use in 
>> this function); did you mean 'NMI_SHIFT'?
#define HPAGE_PMD_SHIFT PMD_SHIFT
^
   include/linux/huge_mm.h:79:26: note: in expansion of macro 'HPAGE_PMD_SHIFT'
#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
 ^~~
   include/linux/migrate.h:47:11: note: in expansion of macro 'HPAGE_PMD_ORDER'
  order = HPAGE_PMD_ORDER;
  ^~~
   include/linux/huge_mm.h:82:25: note: each undeclared identifier is reported 
only once for each function it appears in
#define HPAGE_PMD_SHIFT PMD_SHIFT
^
   include/linux/huge_mm.h:79:26: note: in expansion of macro 'HPAGE_PMD_SHIFT'
#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
 ^~~
   include/linux/migrate.h:47:11: note: in expansion of macro 'HPAGE_PMD_ORDER'
  order = HPAGE_PMD_ORDER;
  ^~~
--
   In file included from include/linux/mm.h:478,
from arch/microblaze/include/asm/io.h:17,
from include/linux/clocksource.h:21,
from include/linux/clockchips.h:14,
from include/linux/tick.h:8,
from include/linux/sched/isolation.h:6,
from kernel/sched/sched.h:17,
from kernel/sched/core.c:8:
   include/linux/migrate.h: In function 'new_page_nodemask':
>> include/linux/huge_mm.h:82:25: error: 'PMD_SHIFT' undeclared (first use in 
>> this function); did you mean 'NMI_SHIFT'?
#define HPAGE_PMD_SHIFT PMD_SHIFT
^
   include/linux/huge_mm.h:79:26: note: in expansion of macro 'HPAGE_PMD_SHIFT'
#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
 ^~~
   include/linux/migrate.h:47:11: note: in expansion of macro 'HPAGE_PMD_ORDER'
  order = HPAGE_PMD_ORDER;
  ^~~
   include/linux/huge_mm.h:82:25: note: each undeclared identifier is reported 
only once for each function it appears in
#define HPAGE_PMD_SHIFT PMD_SHIFT
^
   include/linux/huge_mm.h:79:26: note: in expansion of macro 'HPAGE_PMD_SHIFT'
#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
 ^~~
   include/linux/migrate.h:47:11: note: in expansion of macro 'HPAGE_PMD_ORDER'
  order = HPAGE_PMD_ORDER;
  ^~~
   In file included from kernel/sched/sched.h:63,
from kernel/sched/core.c:8:
   kernel/sched/core.c: At top level:
   include/linux/syscalls.h:233:18: warning: 'sys_sched_rr_get_interval' alias 
between functions of incompatible types 'long int(pid_t,  struct timespec *)' 
{aka 'long int(int,  struct timespec *)'} and 'long int(long int,  long int)' 
[-Wattribute-alias]
 asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) \
 ^~~
   include/linux/syscalls.h:222:2: note: in expansion of macro 
'__SYSCALL_DEFINEx'
 __SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
 ^
   include/linux/syscalls.h:212:36: note: in expansion of macro 
'SYSCALL_DEFINEx'
#define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__)
   ^~~
   kernel/sched/core.c:5274:1: note: in expansion of macro 'SYSCALL_DEFINE2'
SYSCALL_DEFINE2(sch

Re: [PATCH v2] powerpc/64s/radix: Fix MADV_[FREE|DONTNEED] TLB flush miss problem with THP

2018-06-13 Thread kbuild test robot

Hi Nicholas,

I love your patch! Yet something to improve:

[auto build test ERROR on powerpc/next]
[also build test ERROR on next-20180613]
[cannot apply to v4.17]
[if your patch is applied to the wrong git tree, please drop us a note to help 
improve the system]

url:
https://github.com/0day-ci/linux/commits/Nicholas-Piggin/powerpc-64s-radix-Fix-MADV_-FREE-DONTNEED-TLB-flush-miss-problem-with-THP/20180614-114728
base:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
config: arm-allnoconfig (attached as .config)
compiler: arm-linux-gnueabi-gcc (Debian 7.2.0-11) 7.2.0
reproduce:
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
GCC_VERSION=7.2.0 make.cross ARCH=arm 

All error/warnings (new ones prefixed by >>):

   In file included from include/linux/mm.h:478:0,
from include/linux/memcontrol.h:29,
from include/linux/swap.h:9,
from mm/compaction.c:12:
   include/linux/migrate.h: In function 'new_page_nodemask':
>> include/linux/huge_mm.h:82:25: error: 'PMD_SHIFT' undeclared (first use in 
>> this function); did you mean 'PUD_SHIFT'?
#define HPAGE_PMD_SHIFT PMD_SHIFT
^
>> include/linux/huge_mm.h:79:26: note: in expansion of macro 'HPAGE_PMD_SHIFT'
#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
 ^~~
>> include/linux/migrate.h:47:11: note: in expansion of macro 'HPAGE_PMD_ORDER'
  order = HPAGE_PMD_ORDER;
  ^~~
   include/linux/huge_mm.h:82:25: note: each undeclared identifier is reported 
only once for each function it appears in
#define HPAGE_PMD_SHIFT PMD_SHIFT
^
>> include/linux/huge_mm.h:79:26: note: in expansion of macro 'HPAGE_PMD_SHIFT'
#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
 ^~~
>> include/linux/migrate.h:47:11: note: in expansion of macro 'HPAGE_PMD_ORDER'
  order = HPAGE_PMD_ORDER;
  ^~~
--
   In file included from include/linux/mm.h:478:0,
from include/linux/dax.h:6,
from mm/filemap.c:14:
   mm/filemap.c: In function 'page_cache_free_page':
>> include/linux/huge_mm.h:82:25: error: 'PMD_SHIFT' undeclared (first use in 
>> this function); did you mean 'PUD_SHIFT'?
#define HPAGE_PMD_SHIFT PMD_SHIFT
^
>> include/linux/huge_mm.h:79:26: note: in expansion of macro 'HPAGE_PMD_SHIFT'
#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
 ^~~
>> include/linux/huge_mm.h:80:26: note: in expansion of macro 'HPAGE_PMD_ORDER'
#define HPAGE_PMD_NR (1<> mm/filemap.c:279:22: note: in expansion of macro 'HPAGE_PMD_NR'
  page_ref_sub(page, HPAGE_PMD_NR);
 ^~~~
   include/linux/huge_mm.h:82:25: note: each undeclared identifier is reported 
only once for each function it appears in
#define HPAGE_PMD_SHIFT PMD_SHIFT
^
>> include/linux/huge_mm.h:79:26: note: in expansion of macro 'HPAGE_PMD_SHIFT'
#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
 ^~~
>> include/linux/huge_mm.h:80:26: note: in expansion of macro 'HPAGE_PMD_ORDER'
#define HPAGE_PMD_NR (1<> mm/filemap.c:279:22: note: in expansion of macro 'HPAGE_PMD_NR'
  page_ref_sub(page, HPAGE_PMD_NR);
 ^~~~
   mm/filemap.c: In function 'page_cache_tree_delete_batch':
>> include/linux/huge_mm.h:82:25: error: 'PMD_SHIFT' undeclared (first use in 
>> this function); did you mean 'PUD_SHIFT'?
#define HPAGE_PMD_SHIFT PMD_SHIFT
^
>> include/linux/huge_mm.h:79:26: note: in expansion of macro 'HPAGE_PMD_SHIFT'
#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
 ^~~
>> include/linux/huge_mm.h:80:26: note: in expansion of macro 'HPAGE_PMD_ORDER'
#define HPAGE_PMD_NR (1<> include/linux/huge_mm.h:82:25: error: 'PMD_SHIFT' undeclared (first use in 
>> this function); did you mean 'PUD_SHIFT'?
#define HPAGE_PMD_SHIFT PMD_SHIFT
^
>> include/linux/huge_mm.h:79:26: note: in expansion of macro 'HPAGE_PMD_SHIFT'
#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT)
 ^~~
>> include/linux/huge_mm.h:80:26: note: in expansion of macro 'HPAGE_PMD_ORDER'
#define HPAGE_PMD_NR (1<> mm/truncate.c:182:38: note: in expansion of macro 'HPAGE_PMD_NR'
  pgoff_t nr = PageTransHuge(page) ? HPAGE_PMD_NR : 1;
 ^~~~
   in

Re: [RFC PATCH 22/23] watchdog/hardlockup/hpet: Only enable the HPET watchdog via a boot parameter

2018-06-13 Thread Randy Dunlap

On 06/13/2018 05:58 PM, Ricardo Neri wrote:
> On Tue, Jun 12, 2018 at 10:26:57PM -0700, Randy Dunlap wrote:
>> On 06/12/2018 05:57 PM, Ricardo Neri wrote:
>>> diff --git a/Documentation/admin-guide/kernel-parameters.txt 
>>> b/Documentation/admin-guide/kernel-parameters.txt
>>> index f2040d4..a8833c7 100644
>>> --- a/Documentation/admin-guide/kernel-parameters.txt
>>> +++ b/Documentation/admin-guide/kernel-parameters.txt
>>> @@ -2577,7 +2577,7 @@
>>> Format: [state][,regs][,debounce][,die]
>>>  
>>> nmi_watchdog=   [KNL,BUGS=X86] Debugging features for SMP kernels
>>> -   Format: [panic,][nopanic,][num]
>>> +   Format: [panic,][nopanic,][num,][hpet]
>>> Valid num: 0 or 1
>>> 0 - turn hardlockup detector in nmi_watchdog off
>>> 1 - turn hardlockup detector in nmi_watchdog on
>>
>> This says that I can use "nmi_watchdog=hpet" without using 0 or 1.
>> Is that correct?
> 
> Yes, this what I meant. In my view, if you set nmi_watchdog=hpet it
> implies that you want to activate the NMI watchdog. In this case, perf.
> 
> I can see how this will be ambiguous for the case of perf and arch NMI
> watchdogs.
> 
> Alternative, a new parameter could be added; such as nmi_watchdog_type. I
> didn't want to add it in this patchset as I think that a single parameter
> can handle the enablement and type of the NMI watchdog.
> 
> What do you think?

I think it's OK like it is.

thanks,
-- 
~Randy

[PATCH v2] powerpc/64s/radix: Fix MADV_[FREE|DONTNEED] TLB flush miss problem with THP

2018-06-13 Thread Nicholas Piggin

The patch 99baac21e4 ("mm: fix MADV_[FREE|DONTNEED] TLB flush miss
problem") added a force flush mode to the mmu_gather flush, which
unconditionally flushes the entire address range being invalidated
(even if actual ptes only covered a smaller range), to solve a problem
with concurrent threads invalidating the same PTEs causing them to
miss TLBs that need flushing.

This does not work with powerpc that invalidates mmu_gather batches
according to page size. Have powerpc flush all possible page sizes in
the range if it encounters this concurrency condition.

Patch 4647706ebe ("mm: always flush VMA ranges affected by
zap_page_range") does add a TLB flush for all page sizes on powerpc for
the zap_page_range case, but that is to be removed and replaced with
the mmu_gather flush to avoid redundant flushing. It is also thought to
not cover other obscure race conditions:

https://lkml.kernel.org/r/bd3a0ebe-ecf4-41d4-87fa-c755ea9ab...@gmail.com

Hash does not have a problem because it invalidates TLBs inside the
page table locks.

Reported-by: Aneesh Kumar K.V 
Signed-off-by: Nicholas Piggin 
---
Since v1:
- Compile fix or !THP
- Fixed missing PWC flush case
- Fixed concurrent TLB flush test
- Expanded changelog

I think this is a required fix for existing kernels, at least to be
safe and bring the flushig in to line with other architctures I
think we should add this as a fix. For the next kernel release I will
remove the duplicate flush in zap_page_range so this would definitely
be needed.

 arch/powerpc/mm/tlb-radix.c | 92 +
 include/linux/huge_mm.h | 10 +---
 2 files changed, 76 insertions(+), 26 deletions(-)

diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index 67a6e86d3e7e..919232a59ea1 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -689,22 +689,17 @@ EXPORT_SYMBOL(radix__flush_tlb_kernel_range);
 static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
 static unsigned long tlb_local_single_page_flush_ceiling __read_mostly = 
POWER9_TLB_SETS_RADIX * 2;
 
-void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
-unsigned long end)
+static inline void __radix__flush_tlb_range(struct mm_struct *mm,
+   unsigned long start, unsigned long end,
+   bool flush_all_sizes)
 
 {
-   struct mm_struct *mm = vma->vm_mm;
unsigned long pid;
unsigned int page_shift = mmu_psize_defs[mmu_virtual_psize].shift;
unsigned long page_size = 1UL << page_shift;
unsigned long nr_pages = (end - start) >> page_shift;
bool local, full;
 
-#ifdef CONFIG_HUGETLB_PAGE
-   if (is_vm_hugetlb_page(vma))
-   return radix__flush_hugetlb_tlb_range(vma, start, end);
-#endif
-
pid = mm->context.id;
if (unlikely(pid == MMU_NO_CONTEXT))
return;
@@ -738,16 +733,27 @@ void radix__flush_tlb_range(struct vm_area_struct *vma, 
unsigned long start,
_tlbie_pid(pid, RIC_FLUSH_TLB);
}
} else {
-   bool hflush = false;
+   bool hflush = flush_all_sizes;
+   bool gflush = flush_all_sizes;
unsigned long hstart, hend;
+   unsigned long gstart, gend;
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-   hstart = (start + HPAGE_PMD_SIZE - 1) >> HPAGE_PMD_SHIFT;
-   hend = end >> HPAGE_PMD_SHIFT;
-   if (hstart < hend) {
-   hstart <<= HPAGE_PMD_SHIFT;
-   hend <<= HPAGE_PMD_SHIFT;
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE)
+   if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
hflush = true;
+
+   if (hflush) {
+   hstart = (start + HPAGE_PMD_SIZE - 1) & HPAGE_PMD_MASK;
+   hend = end & HPAGE_PMD_MASK;
+   if (hstart == hend)
+   hflush = false;
+   }
+
+   if (gflush) {
+   gstart = (start + HPAGE_PUD_SIZE - 1) & HPAGE_PUD_MASK;
+   gend = end & HPAGE_PUD_MASK;
+   if (gstart == gend)
+   gflush = false;
}
 #endif
 
@@ -757,18 +763,36 @@ void radix__flush_tlb_range(struct vm_area_struct *vma, 
unsigned long start,
if (hflush)
__tlbiel_va_range(hstart, hend, pid,
HPAGE_PMD_SIZE, MMU_PAGE_2M);
+   if (gflush)
+   __tlbiel_va_range(gstart, gend, pid,
+   HPAGE_PUD_SIZE, MMU_PAGE_1G);
asm volatile("ptesync": : :"memory");
} else {

Re: [RFC PATCH 3/3] powerpc/64s/radix: optimise TLB flush with precise TLB ranges in mmu_gather

2018-06-13 Thread Nicholas Piggin

On Tue, 12 Jun 2018 18:10:26 -0700
Linus Torvalds  wrote:

> On Tue, Jun 12, 2018 at 5:12 PM Nicholas Piggin  wrote:
> > >
> > > And in _theory_, maybe you could have just used "invalpg" with a
> > > targeted address instead. In fact, I think a single invlpg invalidates
> > > _all_ caches for the associated MM, but don't quote me on that.  
> 
> Confirmed. The SDK says
> 
>  "INVLPG also invalidates all entries in all paging-structure caches
>   associated with the current PCID, regardless of the linear addresses
>   to which they correspond"

Interesting, so that's very much like powerpc.

> so if x86 wants to do this "separate invalidation for page directory
> entryes", then it would want to
> 
>  (a) remove the __tlb_adjust_range() operation entirely from
> pud_free_tlb() and friends

Revised patch below (only the generic part this time, but powerpc
implementation gives the same result as the last patch).

> 
>  (b) instead just have a single field for "invalidate_tlb_caches",
> which could be a boolean, or could just be one of the addresses

Yeah well powerpc hijacks one of the existing bools in the mmu_gather
for exactly that, and sets it when a page table page is to be freed.

> and then the logic would be that IFF no other tlb invalidate is done
> due to an actual page range, then we look at that
> invalidate_tlb_caches field, and do a single INVLPG instead.
> 
> I still am not sure if this would actually make a difference in
> practice, but I guess it does mean that x86 could at least participate
> in some kind of scheme where we have architecture-specific actions for
> those page directory entries.

I think it could. But yes I don't know how much it would help, I think
x86 tlb invalidation is very fast, and I noticed this mostly at exec
time when you probably lose all your TLBs anyway.

> 
> And we could make the default behavior - if no architecture-specific
> tlb page directory invalidation function exists - be the current
> "__tlb_adjust_range()" case. So the default would be to not change
> behavior, and architectures could opt in to something like this.
> 
> Linus

Yep, is this a bit more to your liking?

---
 include/asm-generic/tlb.h | 17 +++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index faddde44de8c..fa44321bc8dd 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -262,36 +262,49 @@ static inline void 
tlb_remove_check_page_size_change(struct mmu_gather *tlb,
  * architecture to do its own odd thing, not cause pain for others
  * 
http://lkml.kernel.org/r/ca+55afzbggoxtnxqeng5d_mrodnambe5y+urs+phr67nupm...@mail.gmail.com
  *
+ * Powerpc (Book3S 64-bit) with the radix MMU has an architected "page
+ * walk cache" that is invalidated with a specific instruction. It uses
+ * need_flush_all to issue this instruction, which is set by its own
+ * __p??_free_tlb functions.
+ *
  * For now w.r.t page table cache, mark the range_size as PAGE_SIZE
  */
 
+#ifndef pte_free_tlb
 #define pte_free_tlb(tlb, ptep, address)   \
do {\
__tlb_adjust_range(tlb, address, PAGE_SIZE);\
__pte_free_tlb(tlb, ptep, address); \
} while (0)
+#endif
 
+#ifndef pmd_free_tlb
 #define pmd_free_tlb(tlb, pmdp, address)   \
do {\
-   __tlb_adjust_range(tlb, address, PAGE_SIZE);\
+   __tlb_adjust_range(tlb, address, PAGE_SIZE);\
__pmd_free_tlb(tlb, pmdp, address); \
} while (0)
+#endif
 
 #ifndef __ARCH_HAS_4LEVEL_HACK
+#ifndef pud_free_tlb
 #define pud_free_tlb(tlb, pudp, address)   \
do {\
__tlb_adjust_range(tlb, address, PAGE_SIZE);\
__pud_free_tlb(tlb, pudp, address); \
} while (0)
 #endif
+#endif
 
 #ifndef __ARCH_HAS_5LEVEL_HACK
+#ifndef p4d_free_tlb
 #define p4d_free_tlb(tlb, pudp, address)   \
do {\
-   __tlb_adjust_range(tlb, address, PAGE_SIZE);\
+   __tlb_adjust_range(tlb, address, PAGE_SIZE);\
__p4d_free_tlb(tlb, pudp, address); \
} while (0)
 #endif
+#endif
 
 #define tlb_migrate_finish(mm) do {} while (0)
 
-- 
2.17.0

Re: [RFC PATCH 12/23] kernel/watchdog: Introduce a struct for NMI watchdog operations

2018-06-13 Thread Nicholas Piggin

On Wed, 13 Jun 2018 18:31:17 -0700
Ricardo Neri  wrote:

> On Wed, Jun 13, 2018 at 09:52:25PM +1000, Nicholas Piggin wrote:
> > On Wed, 13 Jun 2018 11:26:49 +0200 (CEST)
> > Thomas Gleixner  wrote:
> >   
> > > On Wed, 13 Jun 2018, Peter Zijlstra wrote:  
> > > > On Wed, Jun 13, 2018 at 05:41:41PM +1000, Nicholas Piggin wrote:
> > > > > On Tue, 12 Jun 2018 17:57:32 -0700
> > > > > Ricardo Neri  wrote:
> > > > > 
> > > > > > Instead of exposing individual functions for the operations of the 
> > > > > > NMI
> > > > > > watchdog, define a common interface that can be used across multiple
> > > > > > implementations.
> > > > > > 
> > > > > > The struct nmi_watchdog_ops is defined for such operations. These 
> > > > > > initial
> > > > > > definitions include the enable, disable, start, stop, and cleanup
> > > > > > operations.
> > > > > > 
> > > > > > Only a single NMI watchdog can be used in the system. The 
> > > > > > operations of
> > > > > > this NMI watchdog are accessed via the new variable nmi_wd_ops. This
> > > > > > variable is set to point the operations of the first NMI watchdog 
> > > > > > that
> > > > > > initializes successfully. Even though at this moment, the only 
> > > > > > available
> > > > > > NMI watchdog is the perf-based hardlockup detector. More 
> > > > > > implementations
> > > > > > can be added in the future.
> > > > > 
> > > > > Cool, this looks pretty nice at a quick glance. sparc and powerpc at
> > > > > least have their own NMI watchdogs, it would be good to have those
> > > > > converted as well.
> > > > 
> > > > Yeah, agreed, this looks like half a patch.
> > > 
> > > Though I'm not seeing the advantage of it. That kind of NMI watchdogs are
> > > low level architecture details so having yet another 'ops' data structure
> > > with a gazillion of callbacks, checks and indirections does not provide
> > > value over the currently available weak stubs.  
> > 
> > The other way to go of course is librify the perf watchdog and make an
> > x86 watchdog that selects between perf and hpet... I also probably
> > prefer that for code such as this, but I wouldn't strongly object to
> > ops struct if I'm not writing the code. It's not that bad is it?  
> 
> My motivation to add the ops was that the hpet and perf watchdog share
> significant portions of code.

Right, a good motivation.

> I could look into creating the library for
> common code and relocate the hpet watchdog into arch/x86 for the hpet-
> specific parts.

If you can investigate that approach, that would be appreciated. I hope
I did not misunderstand you there, Thomas.

Basically you would have perf infrastructure and hpet infrastructure,
and then the x86 watchdog driver will use one or the other of those. The
generic watchdog driver will be just a simple shim that uses the perf
infrastructure. Then hopefully the powerpc driver would require almost
no change.

Thanks,
Nick

Re: [RFC PATCH 14/23] watchdog/hardlockup: Decouple the hardlockup detector from perf

2018-06-13 Thread Nicholas Piggin

On Wed, 13 Jun 2018 18:19:01 -0700
Ricardo Neri  wrote:

> On Wed, Jun 13, 2018 at 10:43:24AM +0200, Peter Zijlstra wrote:
> > On Tue, Jun 12, 2018 at 05:57:34PM -0700, Ricardo Neri wrote:  
> > > The current default implementation of the hardlockup detector assumes that
> > > it is implemented using perf events.  
> > 
> > The sparc and powerpc things are very much not using perf.  
> 
> Isn't it true that the current hardlockup detector
> (under kernel/watchdog_hld.c) is based on perf?

arch/powerpc/kernel/watchdog.c is a powerpc implementation that uses
the kernel/watchdog_hld.c framework.

> As far as I understand,
> this hardlockup detector is constructed using perf events for architectures
> that don't provide an NMI watchdog. Perhaps I can be more specific and say
> that this synthetized detector is based on perf.

The perf detector is like that, but we want NMI watchdogs to share
the watchdog_hld code as much as possible even for arch specific NMI
watchdogs, so that kernel and user interfaces and behaviour are
consistent.

Other arch watchdogs like sparc are a little older so they are not
using HLD. You don't have to change those for your series, but it
would be good to bring them into the fold if possible at some time.
IIRC sparc was slightly non-trivial because it has some differences
in sysctl or cmdline APIs that we don't want to break.

But powerpc at least needs to be updated if you change hld apis.

Thanks,
Nick

Re: [RFC PATCH 12/23] kernel/watchdog: Introduce a struct for NMI watchdog operations

2018-06-13 Thread Ricardo Neri

On Wed, Jun 13, 2018 at 09:52:25PM +1000, Nicholas Piggin wrote:
> On Wed, 13 Jun 2018 11:26:49 +0200 (CEST)
> Thomas Gleixner  wrote:
> 
> > On Wed, 13 Jun 2018, Peter Zijlstra wrote:
> > > On Wed, Jun 13, 2018 at 05:41:41PM +1000, Nicholas Piggin wrote:  
> > > > On Tue, 12 Jun 2018 17:57:32 -0700
> > > > Ricardo Neri  wrote:
> > > >   
> > > > > Instead of exposing individual functions for the operations of the NMI
> > > > > watchdog, define a common interface that can be used across multiple
> > > > > implementations.
> > > > > 
> > > > > The struct nmi_watchdog_ops is defined for such operations. These 
> > > > > initial
> > > > > definitions include the enable, disable, start, stop, and cleanup
> > > > > operations.
> > > > > 
> > > > > Only a single NMI watchdog can be used in the system. The operations 
> > > > > of
> > > > > this NMI watchdog are accessed via the new variable nmi_wd_ops. This
> > > > > variable is set to point the operations of the first NMI watchdog that
> > > > > initializes successfully. Even though at this moment, the only 
> > > > > available
> > > > > NMI watchdog is the perf-based hardlockup detector. More 
> > > > > implementations
> > > > > can be added in the future.  
> > > > 
> > > > Cool, this looks pretty nice at a quick glance. sparc and powerpc at
> > > > least have their own NMI watchdogs, it would be good to have those
> > > > converted as well.  
> > > 
> > > Yeah, agreed, this looks like half a patch.  
> > 
> > Though I'm not seeing the advantage of it. That kind of NMI watchdogs are
> > low level architecture details so having yet another 'ops' data structure
> > with a gazillion of callbacks, checks and indirections does not provide
> > value over the currently available weak stubs.
> 
> The other way to go of course is librify the perf watchdog and make an
> x86 watchdog that selects between perf and hpet... I also probably
> prefer that for code such as this, but I wouldn't strongly object to
> ops struct if I'm not writing the code. It's not that bad is it?

My motivation to add the ops was that the hpet and perf watchdog share
significant portions of code. I could look into creating the library for
common code and relocate the hpet watchdog into arch/x86 for the hpet-
specific parts.

Thanks and BR,
Ricardo

Re: [RFC PATCH 12/23] kernel/watchdog: Introduce a struct for NMI watchdog operations

2018-06-13 Thread Ricardo Neri

On Wed, Jun 13, 2018 at 10:42:19AM +0200, Peter Zijlstra wrote:
> On Wed, Jun 13, 2018 at 05:41:41PM +1000, Nicholas Piggin wrote:
> > On Tue, 12 Jun 2018 17:57:32 -0700
> > Ricardo Neri  wrote:
> > 
> > > Instead of exposing individual functions for the operations of the NMI
> > > watchdog, define a common interface that can be used across multiple
> > > implementations.
> > > 
> > > The struct nmi_watchdog_ops is defined for such operations. These initial
> > > definitions include the enable, disable, start, stop, and cleanup
> > > operations.
> > > 
> > > Only a single NMI watchdog can be used in the system. The operations of
> > > this NMI watchdog are accessed via the new variable nmi_wd_ops. This
> > > variable is set to point the operations of the first NMI watchdog that
> > > initializes successfully. Even though at this moment, the only available
> > > NMI watchdog is the perf-based hardlockup detector. More implementations
> > > can be added in the future.
> > 
> > Cool, this looks pretty nice at a quick glance. sparc and powerpc at
> > least have their own NMI watchdogs, it would be good to have those
> > converted as well.
> 
> Yeah, agreed, this looks like half a patch.

I planned to look into the conversion of sparc and powerpc. I just wanted
to see the reception to these patches before jumping and do potentially
useless work. Comments in this thread lean towards keep using the weak
stubs.

Thanks and BR,
Ricardo

Re: [RFC PATCH 14/23] watchdog/hardlockup: Decouple the hardlockup detector from perf

2018-06-13 Thread Ricardo Neri

On Wed, Jun 13, 2018 at 10:43:24AM +0200, Peter Zijlstra wrote:
> On Tue, Jun 12, 2018 at 05:57:34PM -0700, Ricardo Neri wrote:
> > The current default implementation of the hardlockup detector assumes that
> > it is implemented using perf events.
> 
> The sparc and powerpc things are very much not using perf.

Isn't it true that the current hardlockup detector
(under kernel/watchdog_hld.c) is based on perf? As far as I understand,
this hardlockup detector is constructed using perf events for architectures
that don't provide an NMI watchdog. Perhaps I can be more specific and say
that this synthetized detector is based on perf.

On a side note, I saw that powerpc might use a perf-based hardlockup
detector if it has perf events [1].

Please let me know if my understanding is not correct.

Thanks and BR,
Ricardo

[1]. https://elixir.bootlin.com/linux/v4.17/source/arch/powerpc/Kconfig#L218

Re: [RFC PATCH 16/23] watchdog/hardlockup: Add an HPET-based hardlockup detector

2018-06-13 Thread Ricardo Neri

On Tue, Jun 12, 2018 at 10:23:47PM -0700, Randy Dunlap wrote:
> Hi,

Hi Randy,

> 
> On 06/12/2018 05:57 PM, Ricardo Neri wrote:
> > diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
> > index c40c7b7..6e79833 100644
> > --- a/lib/Kconfig.debug
> > +++ b/lib/Kconfig.debug
> > @@ -828,6 +828,16 @@ config HARDLOCKUP_DETECTOR_PERF
> > bool
> > select SOFTLOCKUP_DETECTOR
> >  
> > +config HARDLOCKUP_DETECTOR_HPET
> > +   bool "Use HPET Timer for Hard Lockup Detection"
> > +   select SOFTLOCKUP_DETECTOR
> > +   select HARDLOCKUP_DETECTOR
> > +   depends on HPET_TIMER && HPET
> > +   help
> > + Say y to enable a hardlockup detector that is driven by an 
> > High-Precision
> > + Event Timer. In addition to selecting this option, the command-line
> > + parameter nmi_watchdog option. See 
> > Documentation/admin-guide/kernel-parameters.rst
> 
> The "In addition ..." thing is a broken (incomplete) sentence.

Oops. I apologize. I missed this I will fix it in my next version.

Thanks and BR,
Ricardo

Re: [RFC PATCH 22/23] watchdog/hardlockup/hpet: Only enable the HPET watchdog via a boot parameter

2018-06-13 Thread Ricardo Neri

On Tue, Jun 12, 2018 at 10:26:57PM -0700, Randy Dunlap wrote:
> On 06/12/2018 05:57 PM, Ricardo Neri wrote:
> > diff --git a/Documentation/admin-guide/kernel-parameters.txt 
> > b/Documentation/admin-guide/kernel-parameters.txt
> > index f2040d4..a8833c7 100644
> > --- a/Documentation/admin-guide/kernel-parameters.txt
> > +++ b/Documentation/admin-guide/kernel-parameters.txt
> > @@ -2577,7 +2577,7 @@
> > Format: [state][,regs][,debounce][,die]
> >  
> > nmi_watchdog=   [KNL,BUGS=X86] Debugging features for SMP kernels
> > -   Format: [panic,][nopanic,][num]
> > +   Format: [panic,][nopanic,][num,][hpet]
> > Valid num: 0 or 1
> > 0 - turn hardlockup detector in nmi_watchdog off
> > 1 - turn hardlockup detector in nmi_watchdog on
> 
> This says that I can use "nmi_watchdog=hpet" without using 0 or 1.
> Is that correct?

Yes, this what I meant. In my view, if you set nmi_watchdog=hpet it
implies that you want to activate the NMI watchdog. In this case, perf.

I can see how this will be ambiguous for the case of perf and arch NMI
watchdogs.

Alternative, a new parameter could be added; such as nmi_watchdog_type. I
didn't want to add it in this patchset as I think that a single parameter
can handle the enablement and type of the NMI watchdog.

What do you think?

Thanks and BR,
Ricardo

[PATCH v13 24/24] selftests/vm: test correct behavior of pkey-0

2018-06-13 Thread Ram Pai

Ensure pkey-0 is allocated on start.  Ensure pkey-0 can be attached
dynamically in various modes, without failures.  Ensure pkey-0 can be
freed and allocated.

Signed-off-by: Ram Pai 
---
 tools/testing/selftests/vm/protection_keys.c |   66 +-
 1 files changed, 64 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/vm/protection_keys.c 
b/tools/testing/selftests/vm/protection_keys.c
index cbd87f8..f37b031 100644
--- a/tools/testing/selftests/vm/protection_keys.c
+++ b/tools/testing/selftests/vm/protection_keys.c
@@ -1003,6 +1003,67 @@ void close_test_fds(void)
return *ptr;
 }
 
+void test_pkey_alloc_free_attach_pkey0(int *ptr, u16 pkey)
+{
+   int i, err;
+   int max_nr_pkey_allocs;
+   int alloced_pkeys[NR_PKEYS];
+   int nr_alloced = 0;
+   int newpkey;
+   long size;
+
+   assert(pkey_last_malloc_record);
+   size = pkey_last_malloc_record->size;
+   /*
+* This is a bit of a hack.  But mprotect() requires
+* huge-page-aligned sizes when operating on hugetlbfs.
+* So, make sure that we use something that's a multiple
+* of a huge page when we can.
+*/
+   if (size >= HPAGE_SIZE)
+   size = HPAGE_SIZE;
+
+
+   /* allocate every possible key and make sure key-0 never got allocated 
*/
+   max_nr_pkey_allocs = NR_PKEYS;
+   for (i = 0; i < max_nr_pkey_allocs; i++) {
+   int new_pkey = alloc_pkey();
+   assert(new_pkey != 0);
+
+   if (new_pkey < 0)
+   break;
+   alloced_pkeys[nr_alloced++] = new_pkey;
+   }
+   /* free all the allocated keys */
+   for (i = 0; i < nr_alloced; i++) {
+   int free_ret;
+
+   if (!alloced_pkeys[i])
+   continue;
+   free_ret = sys_pkey_free(alloced_pkeys[i]);
+   pkey_assert(!free_ret);
+   }
+
+   /* attach key-0 in various modes */
+   err = sys_mprotect_pkey(ptr, size, PROT_READ, 0);
+   pkey_assert(!err);
+   err = sys_mprotect_pkey(ptr, size, PROT_WRITE, 0);
+   pkey_assert(!err);
+   err = sys_mprotect_pkey(ptr, size, PROT_EXEC, 0);
+   pkey_assert(!err);
+   err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE, 0);
+   pkey_assert(!err);
+   err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE|PROT_EXEC, 0);
+   pkey_assert(!err);
+
+   /* free key-0 */
+   err = sys_pkey_free(0);
+   pkey_assert(!err);
+
+   newpkey = sys_pkey_alloc(0, 0x0);
+   assert(newpkey == 0);
+}
+
 void test_read_of_write_disabled_region(int *ptr, u16 pkey)
 {
int ptr_contents;
@@ -1153,10 +1214,10 @@ void test_kernel_gup_write_to_write_disabled_region(int 
*ptr, u16 pkey)
 void test_pkey_syscalls_on_non_allocated_pkey(int *ptr, u16 pkey)
 {
int err;
-   int i = get_start_key();
+   int i;
 
/* Note: 0 is the default pkey, so don't mess with it */
-   for (; i < NR_PKEYS; i++) {
+   for (i=1; i < NR_PKEYS; i++) {
if (pkey == i)
continue;
 
@@ -1465,6 +1526,7 @@ void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 
pkey)
test_pkey_syscalls_on_non_allocated_pkey,
test_pkey_syscalls_bad_args,
test_pkey_alloc_exhaust,
+   test_pkey_alloc_free_attach_pkey0,
 };
 
 void run_tests_once(void)
-- 
1.7.1

[PATCH v13 23/24] selftests/vm: sub-page allocator

2018-06-13 Thread Ram Pai

introduce a new allocator that allocates 4k hardware-pages to back
64k linux-page. This allocator is only applicable on powerpc.

cc: Dave Hansen 
cc: Florian Weimer 
Signed-off-by: Ram Pai 
Signed-off-by: Thiago Jung Bauermann 
---
 tools/testing/selftests/vm/pkey-helpers.h|6 ++
 tools/testing/selftests/vm/pkey-powerpc.h|   25 +
 tools/testing/selftests/vm/pkey-x86.h|5 +
 tools/testing/selftests/vm/protection_keys.c |1 +
 4 files changed, 37 insertions(+), 0 deletions(-)

diff --git a/tools/testing/selftests/vm/pkey-helpers.h 
b/tools/testing/selftests/vm/pkey-helpers.h
index 288ccff..a00eee6 100644
--- a/tools/testing/selftests/vm/pkey-helpers.h
+++ b/tools/testing/selftests/vm/pkey-helpers.h
@@ -28,6 +28,9 @@
 extern int dprint_in_signal;
 extern char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE];
 
+extern int test_nr;
+extern int iteration_nr;
+
 #ifdef __GNUC__
 __attribute__((format(printf, 1, 2)))
 #endif
@@ -78,6 +81,9 @@ static inline void sigsafe_printf(const char *format, ...)
 void expected_pkey_fault(int pkey);
 int sys_pkey_alloc(unsigned long flags, u64 init_val);
 int sys_pkey_free(unsigned long pkey);
+int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot,
+   unsigned long pkey);
+void record_pkey_malloc(void *ptr, long size, int prot);
 
 #if defined(__i386__) || defined(__x86_64__) /* arch */
 #include "pkey-x86.h"
diff --git a/tools/testing/selftests/vm/pkey-powerpc.h 
b/tools/testing/selftests/vm/pkey-powerpc.h
index 957f6f6..af44eed 100644
--- a/tools/testing/selftests/vm/pkey-powerpc.h
+++ b/tools/testing/selftests/vm/pkey-powerpc.h
@@ -98,4 +98,29 @@ void expect_fault_on_read_execonly_key(void *p1, u16 pkey)
 /* 8-bytes of instruction * 16384bytes = 1 page */
 #define __page_o_noops() asm(".rept 16384 ; nop; .endr")
 
+void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey)
+{
+   void *ptr;
+   int ret;
+
+   dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
+   size, prot, pkey);
+   pkey_assert(pkey < NR_PKEYS);
+   ptr = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+   pkey_assert(ptr != (void *)-1);
+
+   ret = syscall(__NR_subpage_prot, ptr, size, NULL);
+   if (ret) {
+   perror("subpage_perm");
+   return PTR_ERR_ENOTSUP;
+   }
+
+   ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey);
+   pkey_assert(!ret);
+   record_pkey_malloc(ptr, size, prot);
+
+   dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr);
+   return ptr;
+}
+
 #endif /* _PKEYS_POWERPC_H */
diff --git a/tools/testing/selftests/vm/pkey-x86.h 
b/tools/testing/selftests/vm/pkey-x86.h
index 6820c10..322da49 100644
--- a/tools/testing/selftests/vm/pkey-x86.h
+++ b/tools/testing/selftests/vm/pkey-x86.h
@@ -176,4 +176,9 @@ void expect_fault_on_read_execonly_key(void *p1, u16 pkey)
expected_pkey_fault(pkey);
 }
 
+void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey)
+{
+   return PTR_ERR_ENOTSUP;
+}
+
 #endif /* _PKEYS_X86_H */
diff --git a/tools/testing/selftests/vm/protection_keys.c 
b/tools/testing/selftests/vm/protection_keys.c
index b5a9e6c..cbd87f8 100644
--- a/tools/testing/selftests/vm/protection_keys.c
+++ b/tools/testing/selftests/vm/protection_keys.c
@@ -887,6 +887,7 @@ void setup_hugetlbfs(void)
 void *(*pkey_malloc[])(long size, int prot, u16 pkey) = {
 
malloc_pkey_with_mprotect,
+   malloc_pkey_with_mprotect_subpage,
malloc_pkey_anon_huge,
malloc_pkey_hugetlb
 /* can not do direct with the pkey_mprotect() API:
-- 
1.7.1

[PATCH v13 22/24] selftests/vm: testcases must restore pkey-permissions

2018-06-13 Thread Ram Pai

Generally the signal handler restores the state of the pkey register
before returning. However there are times when the read/write operation
can legitamely fail without invoking the signal handler.  Eg: A
sys_read() operaton to a write-protected page should be disallowed.  In
such a case the state of the pkey register is not restored to its
original state.  The test case is responsible for restoring the key
register state to its original value.

cc: Dave Hansen 
cc: Florian Weimer 
Signed-off-by: Ram Pai 
---
 tools/testing/selftests/vm/protection_keys.c |5 +
 1 files changed, 5 insertions(+), 0 deletions(-)

diff --git a/tools/testing/selftests/vm/protection_keys.c 
b/tools/testing/selftests/vm/protection_keys.c
index caf634e..b5a9e6c 100644
--- a/tools/testing/selftests/vm/protection_keys.c
+++ b/tools/testing/selftests/vm/protection_keys.c
@@ -1011,6 +1011,7 @@ void test_read_of_write_disabled_region(int *ptr, u16 
pkey)
ptr_contents = read_ptr(ptr);
dprintf1("*ptr: %d\n", ptr_contents);
dprintf1("\n");
+   pkey_write_allow(pkey);
 }
 void test_read_of_access_disabled_region(int *ptr, u16 pkey)
 {
@@ -1090,6 +1091,7 @@ void test_kernel_write_of_access_disabled_region(int 
*ptr, u16 pkey)
ret = read(test_fd, ptr, 1);
dprintf1("read ret: %d\n", ret);
pkey_assert(ret);
+   pkey_access_allow(pkey);
 }
 void test_kernel_write_of_write_disabled_region(int *ptr, u16 pkey)
 {
@@ -1102,6 +1104,7 @@ void test_kernel_write_of_write_disabled_region(int *ptr, 
u16 pkey)
if (ret < 0 && (DEBUG_LEVEL > 0))
perror("verbose read result (OK for this to be bad)");
pkey_assert(ret);
+   pkey_write_allow(pkey);
 }
 
 void test_kernel_gup_of_access_disabled_region(int *ptr, u16 pkey)
@@ -1121,6 +1124,7 @@ void test_kernel_gup_of_access_disabled_region(int *ptr, 
u16 pkey)
vmsplice_ret = vmsplice(pipe_fds[1], , 1, SPLICE_F_GIFT);
dprintf1("vmsplice() ret: %d\n", vmsplice_ret);
pkey_assert(vmsplice_ret == -1);
+   pkey_access_allow(pkey);
 
close(pipe_fds[0]);
close(pipe_fds[1]);
@@ -1141,6 +1145,7 @@ void test_kernel_gup_write_to_write_disabled_region(int 
*ptr, u16 pkey)
if (DEBUG_LEVEL > 0)
perror("futex");
dprintf1("futex() ret: %d\n", futex_ret);
+   pkey_write_allow(pkey);
 }
 
 /* Assumes that all pkeys other than 'pkey' are unallocated */
-- 
1.7.1

[PATCH v13 21/24] selftests/vm: detect write violation on a mapped access-denied-key page

2018-06-13 Thread Ram Pai

detect write-violation on a page to which access-disabled
key is associated much after the page is mapped.

cc: Dave Hansen 
cc: Florian Weimer 
Signed-off-by: Ram Pai 
Acked-by: Dave Hansen 
---
 tools/testing/selftests/vm/protection_keys.c |   13 +
 1 files changed, 13 insertions(+), 0 deletions(-)

diff --git a/tools/testing/selftests/vm/protection_keys.c 
b/tools/testing/selftests/vm/protection_keys.c
index f4acd72..caf634e 100644
--- a/tools/testing/selftests/vm/protection_keys.c
+++ b/tools/testing/selftests/vm/protection_keys.c
@@ -1067,6 +1067,18 @@ void test_write_of_access_disabled_region(int *ptr, u16 
pkey)
*ptr = __LINE__;
expected_pkey_fault(pkey);
 }
+
+void test_write_of_access_disabled_region_with_page_already_mapped(int *ptr,
+   u16 pkey)
+{
+   *ptr = __LINE__;
+   dprintf1("disabling access; after accessing the page, "
+   " to PKEY[%02d], doing write\n", pkey);
+   pkey_access_deny(pkey);
+   *ptr = __LINE__;
+   expected_pkey_fault(pkey);
+}
+
 void test_kernel_write_of_access_disabled_region(int *ptr, u16 pkey)
 {
int ret;
@@ -1435,6 +1447,7 @@ void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 
pkey)
test_write_of_write_disabled_region,
test_write_of_write_disabled_region_with_page_already_mapped,
test_write_of_access_disabled_region,
+   test_write_of_access_disabled_region_with_page_already_mapped,
test_kernel_write_of_access_disabled_region,
test_kernel_write_of_write_disabled_region,
test_kernel_gup_of_access_disabled_region,
-- 
1.7.1

[PATCH v13 20/24] selftests/vm: associate key on a mapped page and detect write violation

2018-06-13 Thread Ram Pai

detect write-violation on a page to which write-disabled
key is associated much after the page is mapped.

cc: Dave Hansen 
cc: Florian Weimer 
Signed-off-by: Ram Pai 
Acked-by: Dave Hansen 
---
 tools/testing/selftests/vm/protection_keys.c |   12 
 1 files changed, 12 insertions(+), 0 deletions(-)

diff --git a/tools/testing/selftests/vm/protection_keys.c 
b/tools/testing/selftests/vm/protection_keys.c
index 04d0249..f4acd72 100644
--- a/tools/testing/selftests/vm/protection_keys.c
+++ b/tools/testing/selftests/vm/protection_keys.c
@@ -1042,6 +1042,17 @@ void 
test_read_of_access_disabled_region_with_page_already_mapped(int *ptr,
expected_pkey_fault(pkey);
 }
 
+void test_write_of_write_disabled_region_with_page_already_mapped(int *ptr,
+   u16 pkey)
+{
+   *ptr = __LINE__;
+   dprintf1("disabling write access; after accessing the page, "
+   "to PKEY[%02d], doing write\n", pkey);
+   pkey_write_deny(pkey);
+   *ptr = __LINE__;
+   expected_pkey_fault(pkey);
+}
+
 void test_write_of_write_disabled_region(int *ptr, u16 pkey)
 {
dprintf1("disabling write access to PKEY[%02d], doing write\n", pkey);
@@ -1422,6 +1433,7 @@ void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 
pkey)
test_read_of_access_disabled_region,
test_read_of_access_disabled_region_with_page_already_mapped,
test_write_of_write_disabled_region,
+   test_write_of_write_disabled_region_with_page_already_mapped,
test_write_of_access_disabled_region,
test_kernel_write_of_access_disabled_region,
test_kernel_write_of_write_disabled_region,
-- 
1.7.1

[PATCH v13 19/24] selftests/vm: associate key on a mapped page and detect access violation

2018-06-13 Thread Ram Pai

detect access-violation on a page to which access-disabled
key is associated much after the page is mapped.

cc: Dave Hansen 
cc: Florian Weimer 
Signed-off-by: Ram Pai 
Acked-by: Dave Hansen 
---
 tools/testing/selftests/vm/protection_keys.c |   19 +++
 1 files changed, 19 insertions(+), 0 deletions(-)

diff --git a/tools/testing/selftests/vm/protection_keys.c 
b/tools/testing/selftests/vm/protection_keys.c
index e8ad970..04d0249 100644
--- a/tools/testing/selftests/vm/protection_keys.c
+++ b/tools/testing/selftests/vm/protection_keys.c
@@ -1024,6 +1024,24 @@ void test_read_of_access_disabled_region(int *ptr, u16 
pkey)
dprintf1("*ptr: %d\n", ptr_contents);
expected_pkey_fault(pkey);
 }
+
+void test_read_of_access_disabled_region_with_page_already_mapped(int *ptr,
+   u16 pkey)
+{
+   int ptr_contents;
+
+   dprintf1("disabling access to PKEY[%02d], doing read @ %p\n",
+   pkey, ptr);
+   ptr_contents = read_ptr(ptr);
+   dprintf1("reading ptr before disabling the read : %d\n",
+   ptr_contents);
+   read_pkey_reg();
+   pkey_access_deny(pkey);
+   ptr_contents = read_ptr(ptr);
+   dprintf1("*ptr: %d\n", ptr_contents);
+   expected_pkey_fault(pkey);
+}
+
 void test_write_of_write_disabled_region(int *ptr, u16 pkey)
 {
dprintf1("disabling write access to PKEY[%02d], doing write\n", pkey);
@@ -1402,6 +1420,7 @@ void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 
pkey)
 void (*pkey_tests[])(int *ptr, u16 pkey) = {
test_read_of_write_disabled_region,
test_read_of_access_disabled_region,
+   test_read_of_access_disabled_region_with_page_already_mapped,
test_write_of_write_disabled_region,
test_write_of_access_disabled_region,
test_kernel_write_of_access_disabled_region,
-- 
1.7.1

[PATCH v13 18/24] selftests/vm: fix an assertion in test_pkey_alloc_exhaust()

2018-06-13 Thread Ram Pai

The maximum number of keys that can be allocated has to
take into consideration, that some keys are reserved by
the architecture for   specific   purpose. Hence cannot
be allocated.

Fix the assertion in test_pkey_alloc_exhaust()

cc: Dave Hansen 
cc: Florian Weimer 
Signed-off-by: Ram Pai 
---
 tools/testing/selftests/vm/protection_keys.c |   13 +
 1 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/tools/testing/selftests/vm/protection_keys.c 
b/tools/testing/selftests/vm/protection_keys.c
index cb81a47..e8ad970 100644
--- a/tools/testing/selftests/vm/protection_keys.c
+++ b/tools/testing/selftests/vm/protection_keys.c
@@ -1175,15 +1175,12 @@ void test_pkey_alloc_exhaust(int *ptr, u16 pkey)
pkey_assert(i < NR_PKEYS*2);
 
/*
-* There are 16 pkeys supported in hardware.  Three are
-* allocated by the time we get here:
-*   1. The default key (0)
-*   2. One possibly consumed by an execute-only mapping.
-*   3. One allocated by the test code and passed in via
-*  'pkey' to this function.
-* Ensure that we can allocate at least another 13 (16-3).
+* There are NR_PKEYS pkeys supported in hardware. arch_reserved_keys()
+* are reserved. One of which is the default key(0). One can be taken
+* up by an execute-only mapping.
+* Ensure that we can allocate at least the remaining.
 */
-   pkey_assert(i >= NR_PKEYS-3);
+   pkey_assert(i >= (NR_PKEYS-arch_reserved_keys()-1));
 
for (i = 0; i < nr_allocated_pkeys; i++) {
err = sys_pkey_free(allocated_pkeys[i]);
-- 
1.7.1

[PATCH v13 17/24] selftests/vm: powerpc implementation to check support for pkey

2018-06-13 Thread Ram Pai

pkey subsystem is supported if the hardware and kernel has support.
We determine that by checking if allocation of a key succeeds or not.

cc: Dave Hansen 
cc: Florian Weimer 
Signed-off-by: Ram Pai 
---
 tools/testing/selftests/vm/pkey-helpers.h|2 ++
 tools/testing/selftests/vm/pkey-powerpc.h|   14 --
 tools/testing/selftests/vm/pkey-x86.h|8 
 tools/testing/selftests/vm/protection_keys.c |9 +
 4 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/tools/testing/selftests/vm/pkey-helpers.h 
b/tools/testing/selftests/vm/pkey-helpers.h
index 321bbbd..288ccff 100644
--- a/tools/testing/selftests/vm/pkey-helpers.h
+++ b/tools/testing/selftests/vm/pkey-helpers.h
@@ -76,6 +76,8 @@ static inline void sigsafe_printf(const char *format, ...)
 
 __attribute__((noinline)) int read_ptr(int *ptr);
 void expected_pkey_fault(int pkey);
+int sys_pkey_alloc(unsigned long flags, u64 init_val);
+int sys_pkey_free(unsigned long pkey);
 
 #if defined(__i386__) || defined(__x86_64__) /* arch */
 #include "pkey-x86.h"
diff --git a/tools/testing/selftests/vm/pkey-powerpc.h 
b/tools/testing/selftests/vm/pkey-powerpc.h
index ec6f5d7..957f6f6 100644
--- a/tools/testing/selftests/vm/pkey-powerpc.h
+++ b/tools/testing/selftests/vm/pkey-powerpc.h
@@ -62,9 +62,19 @@ static inline void __write_pkey_reg(pkey_reg_t pkey_reg)
pkey_reg);
 }
 
-static inline int cpu_has_pku(void)
+static inline bool is_pkey_supported(void)
 {
-   return 1;
+   /*
+* No simple way to determine this.
+* Lets try allocating a key and see if it succeeds.
+*/
+   int ret = sys_pkey_alloc(0, 0);
+
+   if (ret > 0) {
+   sys_pkey_free(ret);
+   return true;
+   }
+   return false;
 }
 
 static inline int arch_reserved_keys(void)
diff --git a/tools/testing/selftests/vm/pkey-x86.h 
b/tools/testing/selftests/vm/pkey-x86.h
index 95ee952..6820c10 100644
--- a/tools/testing/selftests/vm/pkey-x86.h
+++ b/tools/testing/selftests/vm/pkey-x86.h
@@ -105,7 +105,7 @@ static inline void __cpuid(unsigned int *eax, unsigned int 
*ebx,
 #define X86_FEATURE_PKU(1<<3) /* Protection Keys for Userspace */
 #define X86_FEATURE_OSPKE  (1<<4) /* OS Protection Keys Enable */
 
-static inline int cpu_has_pku(void)
+static inline bool is_pkey_supported(void)
 {
unsigned int eax;
unsigned int ebx;
@@ -118,13 +118,13 @@ static inline int cpu_has_pku(void)
 
if (!(ecx & X86_FEATURE_PKU)) {
dprintf2("cpu does not have PKU\n");
-   return 0;
+   return false;
}
if (!(ecx & X86_FEATURE_OSPKE)) {
dprintf2("cpu does not have OSPKE\n");
-   return 0;
+   return false;
}
-   return 1;
+   return true;
 }
 
 #define XSTATE_PKEY_BIT(9)
diff --git a/tools/testing/selftests/vm/protection_keys.c 
b/tools/testing/selftests/vm/protection_keys.c
index ba184ca..cb81a47 100644
--- a/tools/testing/selftests/vm/protection_keys.c
+++ b/tools/testing/selftests/vm/protection_keys.c
@@ -1393,8 +1393,8 @@ void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 
pkey)
int size = PAGE_SIZE;
int sret;
 
-   if (cpu_has_pku()) {
-   dprintf1("SKIP: %s: no CPU support\n", __func__);
+   if (is_pkey_supported()) {
+   dprintf1("SKIP: %s: no CPU/kernel support\n", __func__);
return;
}
 
@@ -1458,12 +1458,13 @@ void run_tests_once(void)
 int main(void)
 {
int nr_iterations = 22;
+   int pkey_supported = is_pkey_supported();
 
setup_handlers();
 
-   printf("has pkey: %d\n", cpu_has_pku());
+   printf("has pkey: %s\n", pkey_supported ? "Yes" : "No");
 
-   if (!cpu_has_pku()) {
+   if (!pkey_supported) {
int size = PAGE_SIZE;
int *ptr;
 
-- 
1.7.1

[PATCH v13 15/24] selftests/vm: powerpc implementation for generic abstraction

2018-06-13 Thread Ram Pai

Introduce powerpc implementation for the various
abstractions.

cc: Dave Hansen 
cc: Florian Weimer 
Signed-off-by: Ram Pai 
Signed-off-by: Thiago Jung Bauermann 
---
 tools/testing/selftests/vm/pkey-helpers.h|   16 -
 tools/testing/selftests/vm/pkey-powerpc.h|   91 ++
 tools/testing/selftests/vm/pkey-x86.h|   15 
 tools/testing/selftests/vm/protection_keys.c |   62 ++
 4 files changed, 156 insertions(+), 28 deletions(-)
 create mode 100644 tools/testing/selftests/vm/pkey-powerpc.h

diff --git a/tools/testing/selftests/vm/pkey-helpers.h 
b/tools/testing/selftests/vm/pkey-helpers.h
index 52a1152..321bbbd 100644
--- a/tools/testing/selftests/vm/pkey-helpers.h
+++ b/tools/testing/selftests/vm/pkey-helpers.h
@@ -74,8 +74,13 @@ static inline void sigsafe_printf(const char *format, ...)
}   \
 } while (0)
 
+__attribute__((noinline)) int read_ptr(int *ptr);
+void expected_pkey_fault(int pkey);
+
 #if defined(__i386__) || defined(__x86_64__) /* arch */
 #include "pkey-x86.h"
+#elif defined(__powerpc64__) /* arch */
+#include "pkey-powerpc.h"
 #else /* arch */
 #error Architecture not supported
 #endif /* arch */
@@ -186,7 +191,16 @@ static inline int open_hugepage_file(int flag)
 
 static inline int get_start_key(void)
 {
-   return 1;
+   return 0;
+}
+
+static inline u32 *siginfo_get_pkey_ptr(siginfo_t *si)
+{
+#ifdef si_pkey
+   return >si_pkey;
+#else
+   return (u32 *)(((u8 *)si) + si_pkey_offset);
+#endif
 }
 
 #endif /* _PKEYS_HELPER_H */
diff --git a/tools/testing/selftests/vm/pkey-powerpc.h 
b/tools/testing/selftests/vm/pkey-powerpc.h
new file mode 100644
index 000..ec6f5d7
--- /dev/null
+++ b/tools/testing/selftests/vm/pkey-powerpc.h
@@ -0,0 +1,91 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _PKEYS_POWERPC_H
+#define _PKEYS_POWERPC_H
+
+#ifndef SYS_mprotect_key
+# define SYS_mprotect_key  386
+#endif
+#ifndef SYS_pkey_alloc
+# define SYS_pkey_alloc384
+# define SYS_pkey_free 385
+#endif
+#define REG_IP_IDX PT_NIP
+#define REG_TRAPNO PT_TRAP
+#define gregs  gp_regs
+#define fpregs fp_regs
+#define si_pkey_offset 0x20
+
+#ifndef PKEY_DISABLE_ACCESS
+# define PKEY_DISABLE_ACCESS   0x3  /* disable read and write */
+#endif
+
+#ifndef PKEY_DISABLE_WRITE
+# define PKEY_DISABLE_WRITE0x2
+#endif
+
+#define NR_PKEYS   32
+#define NR_RESERVED_PKEYS_4K   26
+#define NR_RESERVED_PKEYS_64K  3
+#define PKEY_BITS_PER_PKEY 2
+#define HPAGE_SIZE (1UL << 24)
+#define PAGE_SIZE  (1UL << 16)
+#define pkey_reg_t u64
+#define PKEY_REG_FMT   "%016lx"
+#define HUGEPAGE_FILE  
"/sys/kernel/mm/hugepages/hugepages-16384kB/nr_hugepages"
+
+static inline u32 pkey_bit_position(int pkey)
+{
+   return (NR_PKEYS - pkey - 1) * PKEY_BITS_PER_PKEY;
+}
+
+static inline pkey_reg_t __read_pkey_reg(void)
+{
+   pkey_reg_t pkey_reg;
+
+   asm volatile("mfspr %0, 0xd" : "=r" (pkey_reg));
+
+   return pkey_reg;
+}
+
+static inline void __write_pkey_reg(pkey_reg_t pkey_reg)
+{
+   pkey_reg_t eax = pkey_reg;
+
+   dprintf4("%s() changing "PKEY_REG_FMT" to "PKEY_REG_FMT"\n",
+__func__, __read_pkey_reg(), pkey_reg);
+
+   asm volatile("mtspr 0xd, %0" : : "r" ((unsigned long)(eax)) : "memory");
+
+   dprintf4("%s() pkey register after changing "PKEY_REG_FMT" to "
+   PKEY_REG_FMT"\n", __func__, __read_pkey_reg(),
+   pkey_reg);
+}
+
+static inline int cpu_has_pku(void)
+{
+   return 1;
+}
+
+static inline int arch_reserved_keys(void)
+{
+   if (sysconf(_SC_PAGESIZE) == 4096)
+   return NR_RESERVED_PKEYS_4K;
+   else
+   return NR_RESERVED_PKEYS_64K;
+}
+
+void expect_fault_on_read_execonly_key(void *p1, u16 pkey)
+{
+   /* powerpc does not allow userspace to change permissions of exec-only
+* keys since those keys are not allocated by userspace. The signal
+* handler wont be able to reset the permissions, which means the code
+* will infinitely continue to segfault here.
+*/
+   return;
+}
+
+/* 8-bytes of instruction * 16384bytes = 1 page */
+#define __page_o_noops() asm(".rept 16384 ; nop; .endr")
+
+#endif /* _PKEYS_POWERPC_H */
diff --git a/tools/testing/selftests/vm/pkey-x86.h 
b/tools/testing/selftests/vm/pkey-x86.h
index d5fa299..95ee952 100644
--- a/tools/testing/selftests/vm/pkey-x86.h
+++ b/tools/testing/selftests/vm/pkey-x86.h
@@ -42,6 +42,7 @@
 #endif
 
 #define NR_PKEYS   16
+#define NR_RESERVED_PKEYS  1
 #define PKEY_BITS_PER_PKEY 2
 #define HPAGE_SIZE (1UL<<21)
 #define PAGE_SIZE  4096
@@ -161,4 +162,18 @@ int pkey_reg_xstate_offset(void)
return xstate_offset;
 }
 
+static inline int arch_reserved_keys(void)
+{
+

[PATCH v13 16/24] selftests/vm: clear the bits in shadow reg when a pkey is freed.

2018-06-13 Thread Ram Pai

When a key is freed, the  key  is  no  more  effective.
Clear the bits corresponding to the pkey in the shadow
register. Otherwise  it  will carry some spurious bits
which can trigger false-positive asserts.

cc: Dave Hansen 
cc: Florian Weimer 
Signed-off-by: Ram Pai 
---
 tools/testing/selftests/vm/protection_keys.c |3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/tools/testing/selftests/vm/protection_keys.c 
b/tools/testing/selftests/vm/protection_keys.c
index 88dfa40..ba184ca 100644
--- a/tools/testing/selftests/vm/protection_keys.c
+++ b/tools/testing/selftests/vm/protection_keys.c
@@ -577,7 +577,8 @@ int sys_pkey_free(unsigned long pkey)
int ret = syscall(SYS_pkey_free, pkey);
 
if (!ret)
-   shadow_pkey_reg &= clear_pkey_flags(pkey, PKEY_DISABLE_ACCESS);
+   shadow_pkey_reg &= clear_pkey_flags(pkey,
+   PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE);
dprintf1("%s(pkey=%ld) syscall ret: %d\n", __func__, pkey, ret);
return ret;
 }
-- 
1.7.1

[PATCH v13 14/24] selftests/vm: generic cleanup

2018-06-13 Thread Ram Pai

cleanup the code to satisfy coding styles.

cc: Dave Hansen 
cc: Florian Weimer 
Signed-off-by: Ram Pai 
---
 tools/testing/selftests/vm/protection_keys.c |   64 +
 1 files changed, 43 insertions(+), 21 deletions(-)

diff --git a/tools/testing/selftests/vm/protection_keys.c 
b/tools/testing/selftests/vm/protection_keys.c
index adcae4a..f43a319 100644
--- a/tools/testing/selftests/vm/protection_keys.c
+++ b/tools/testing/selftests/vm/protection_keys.c
@@ -4,7 +4,7 @@
  *
  * There are examples in here of:
  *  * how to set protection keys on memory
- *  * how to set/clear bits in pkey registers (the rights register)
+ *  * how to set/clear bits in Protection Key registers (the rights register)
  *  * how to handle SEGV_PKUERR signals and extract pkey-relevant
  *information from the siginfo
  *
@@ -13,13 +13,18 @@
  * prefault pages in at malloc, or not
  * protect MPX bounds tables with protection keys?
  * make sure VMA splitting/merging is working correctly
- * OOMs can destroy mm->mmap (see exit_mmap()), so make sure it is immune 
to pkeys
- * look for pkey "leaks" where it is still set on a VMA but "freed" back 
to the kernel
- * do a plain mprotect() to a mprotect_pkey() area and make sure the pkey 
sticks
+ * OOMs can destroy mm->mmap (see exit_mmap()),
+ * so make sure it is immune to pkeys
+ * look for pkey "leaks" where it is still set on a VMA
+ *  but "freed" back to the kernel
+ * do a plain mprotect() to a mprotect_pkey() area and make
+ *  sure the pkey sticks
  *
  * Compile like this:
- * gcc  -o protection_keys-O2 -g -std=gnu99 -pthread -Wall 
protection_keys.c -lrt -ldl -lm
- * gcc -m32 -o protection_keys_32 -O2 -g -std=gnu99 -pthread -Wall 
protection_keys.c -lrt -ldl -lm
+ * gcc  -o protection_keys-O2 -g -std=gnu99
+ *  -pthread -Wall protection_keys.c -lrt -ldl -lm
+ * gcc -m32 -o protection_keys_32 -O2 -g -std=gnu99
+ *  -pthread -Wall protection_keys.c -lrt -ldl -lm
  */
 #define _GNU_SOURCE
 #include 
@@ -263,10 +268,12 @@ void signal_handler(int signum, siginfo_t *si, void 
*vucontext)
__read_pkey_reg());
dprintf1("pkey from siginfo: %jx\n", siginfo_pkey);
*(u64 *)pkey_reg_ptr = 0x;
-   dprintf1("WARNING: set PRKU=0 to allow faulting instruction to 
continue\n");
+   dprintf1("WARNING: set PKEY_REG=0 to allow faulting instruction "
+   "to continue\n");
pkey_faults++;
dprintf1("==\n");
dprint_in_signal = 0;
+   return;
 }
 
 int wait_all_children(void)
@@ -384,7 +391,7 @@ void pkey_disable_set(int pkey, int flags)
 {
unsigned long syscall_flags = 0;
int ret;
-   int pkey_rights;
+   u32 pkey_rights;
pkey_reg_t orig_pkey_reg = read_pkey_reg();
 
dprintf1("START->%s(%d, 0x%x)\n", __func__,
@@ -487,9 +494,10 @@ int sys_mprotect_pkey(void *ptr, size_t size, unsigned 
long orig_prot,
return sret;
 }
 
-int sys_pkey_alloc(unsigned long flags, unsigned long init_val)
+int sys_pkey_alloc(unsigned long flags, u64 init_val)
 {
int ret = syscall(SYS_pkey_alloc, flags, init_val);
+
dprintf1("%s(flags=%lx, init_val=%lx) syscall ret: %d errno: %d\n",
__func__, flags, init_val, ret, errno);
return ret;
@@ -513,7 +521,7 @@ void pkey_set_shadow(u32 key, u64 init_val)
 int alloc_pkey(void)
 {
int ret;
-   unsigned long init_val = 0x0;
+   u64 init_val = 0x0;
 
dprintf1("%s()::%d, pkey_reg: "PKEY_REG_FMT" shadow: "PKEY_REG_FMT"\n",
__func__, __LINE__, __read_pkey_reg(), shadow_pkey_reg);
@@ -672,7 +680,9 @@ void record_pkey_malloc(void *ptr, long size, int prot)
/* every record is full */
size_t old_nr_records = nr_pkey_malloc_records;
size_t new_nr_records = (nr_pkey_malloc_records * 2 + 1);
-   size_t new_size = new_nr_records * sizeof(struct 
pkey_malloc_record);
+   size_t new_size = new_nr_records *
+   sizeof(struct pkey_malloc_record);
+
dprintf2("new_nr_records: %zd\n", new_nr_records);
dprintf2("new_size: %zd\n", new_size);
pkey_malloc_records = realloc(pkey_malloc_records, new_size);
@@ -698,9 +708,11 @@ void free_pkey_malloc(void *ptr)
 {
long i;
int ret;
+
dprintf3("%s(%p)\n", __func__, ptr);
for (i = 0; i < nr_pkey_malloc_records; i++) {
struct pkey_malloc_record *rec = _malloc_records[i];
+
dprintf4("looking for ptr %p at record[%ld/%p]: {%p, %ld}\n",
ptr, i, rec, rec->ptr, rec->size);
if ((ptr <  rec->ptr) ||
@@ -781,11 +793,13 @@ void

[PATCH v13 13/24] selftests/vm: pkey register should match shadow pkey

2018-06-13 Thread Ram Pai

expected_pkey_fault() is comparing the contents of pkey
register with 0. This may not be true all the time. There
could be bits set by default by the architecture
which can never be changed. Hence compare the value against
shadow pkey register, which is supposed to track the bits
accurately all throughout

cc: Dave Hansen 
cc: Florian Weimer 
Signed-off-by: Ram Pai 
---
 tools/testing/selftests/vm/protection_keys.c |4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/vm/protection_keys.c 
b/tools/testing/selftests/vm/protection_keys.c
index 9afe894..adcae4a 100644
--- a/tools/testing/selftests/vm/protection_keys.c
+++ b/tools/testing/selftests/vm/protection_keys.c
@@ -916,10 +916,10 @@ void expected_pkey_fault(int pkey)
pkey_assert(last_si_pkey == pkey);
 
/*
-* The signal handler shold have cleared out PKEY register to let the
+* The signal handler should have cleared out pkey-register to let the
 * test program continue.  We now have to restore it.
 */
-   if (__read_pkey_reg() != 0)
+   if (__read_pkey_reg() != shadow_pkey_reg)
pkey_assert(0);
 
__write_pkey_reg(shadow_pkey_reg);
-- 
1.7.1

[PATCH v13 12/24] selftests/vm: introduce two arch independent abstraction

2018-06-13 Thread Ram Pai

open_hugepage_file() <- opens the huge page file
get_start_key() <--  provides the first non-reserved key.

cc: Dave Hansen 
cc: Florian Weimer 
Signed-off-by: Ram Pai 
Signed-off-by: Thiago Jung Bauermann 
Reviewed-by: Dave Hansen 
---
 tools/testing/selftests/vm/pkey-helpers.h|   10 ++
 tools/testing/selftests/vm/pkey-x86.h|1 +
 tools/testing/selftests/vm/protection_keys.c |6 +++---
 3 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/vm/pkey-helpers.h 
b/tools/testing/selftests/vm/pkey-helpers.h
index ada0146..52a1152 100644
--- a/tools/testing/selftests/vm/pkey-helpers.h
+++ b/tools/testing/selftests/vm/pkey-helpers.h
@@ -179,4 +179,14 @@ static inline void __pkey_write_allow(int pkey, int 
do_allow_write)
 #define __stringify_1(x...) #x
 #define __stringify(x...)   __stringify_1(x)
 
+static inline int open_hugepage_file(int flag)
+{
+   return open(HUGEPAGE_FILE, flag);
+}
+
+static inline int get_start_key(void)
+{
+   return 1;
+}
+
 #endif /* _PKEYS_HELPER_H */
diff --git a/tools/testing/selftests/vm/pkey-x86.h 
b/tools/testing/selftests/vm/pkey-x86.h
index 2b3780d..d5fa299 100644
--- a/tools/testing/selftests/vm/pkey-x86.h
+++ b/tools/testing/selftests/vm/pkey-x86.h
@@ -48,6 +48,7 @@
 #define MB (1<<20)
 #define pkey_reg_t u32
 #define PKEY_REG_FMT   "%016x"
+#define HUGEPAGE_FILE  
"/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages"
 
 static inline u32 pkey_bit_position(int pkey)
 {
diff --git a/tools/testing/selftests/vm/protection_keys.c 
b/tools/testing/selftests/vm/protection_keys.c
index c5f9776..9afe894 100644
--- a/tools/testing/selftests/vm/protection_keys.c
+++ b/tools/testing/selftests/vm/protection_keys.c
@@ -791,7 +791,7 @@ void setup_hugetlbfs(void)
 * Now go make sure that we got the pages and that they
 * are 2M pages.  Someone might have made 1G the default.
 */
-   fd = open("/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages", 
O_RDONLY);
+   fd = open_hugepage_file(O_RDONLY);
if (fd < 0) {
perror("opening sysfs 2M hugetlb config");
return;
@@ -1078,10 +1078,10 @@ void test_kernel_gup_write_to_write_disabled_region(int 
*ptr, u16 pkey)
 void test_pkey_syscalls_on_non_allocated_pkey(int *ptr, u16 pkey)
 {
int err;
-   int i;
+   int i = get_start_key();
 
/* Note: 0 is the default pkey, so don't mess with it */
-   for (i = 1; i < NR_PKEYS; i++) {
+   for (; i < NR_PKEYS; i++) {
if (pkey == i)
continue;
 
-- 
1.7.1

[PATCH v13 11/24] selftests/vm: fix alloc_random_pkey() to make it really random

2018-06-13 Thread Ram Pai

alloc_random_pkey() was allocating the same pkey every time.
Not all pkeys were geting tested. fixed it.

cc: Dave Hansen 
cc: Florian Weimer 
Signed-off-by: Ram Pai 
---
 tools/testing/selftests/vm/protection_keys.c |6 +-
 1 files changed, 5 insertions(+), 1 deletions(-)

diff --git a/tools/testing/selftests/vm/protection_keys.c 
b/tools/testing/selftests/vm/protection_keys.c
index 42a91c7..c5f9776 100644
--- a/tools/testing/selftests/vm/protection_keys.c
+++ b/tools/testing/selftests/vm/protection_keys.c
@@ -24,6 +24,7 @@
 #define _GNU_SOURCE
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -576,13 +577,15 @@ int alloc_random_pkey(void)
int alloced_pkeys[NR_PKEYS];
int nr_alloced = 0;
int random_index;
+
memset(alloced_pkeys, 0, sizeof(alloced_pkeys));
+   srand((unsigned int)time(NULL));
 
/* allocate every possible key and make a note of which ones we got */
max_nr_pkey_allocs = NR_PKEYS;
-   max_nr_pkey_allocs = 1;
for (i = 0; i < max_nr_pkey_allocs; i++) {
int new_pkey = alloc_pkey();
+
if (new_pkey < 0)
break;
alloced_pkeys[nr_alloced++] = new_pkey;
@@ -598,6 +601,7 @@ int alloc_random_pkey(void)
/* go through the allocated ones that we did not want and free them */
for (i = 0; i < nr_alloced; i++) {
int free_ret;
+
if (!alloced_pkeys[i])
continue;
free_ret = sys_pkey_free(alloced_pkeys[i]);
-- 
1.7.1

[PATCH v13 10/24] selftests/vm: clear the bits in shadow reg when a pkey is freed.

2018-06-13 Thread Ram Pai

When a key is freed, the  key  is  no  more  effective.
Clear the bits corresponding to the pkey in the shadow
register. Otherwise  it  will carry some spurious bits
which can trigger false-positive asserts.

cc: Dave Hansen 
cc: Florian Weimer 
Signed-off-by: Ram Pai 
---
 tools/testing/selftests/vm/protection_keys.c |3 +++
 1 files changed, 3 insertions(+), 0 deletions(-)

diff --git a/tools/testing/selftests/vm/protection_keys.c 
b/tools/testing/selftests/vm/protection_keys.c
index da4f5d5..42a91c7 100644
--- a/tools/testing/selftests/vm/protection_keys.c
+++ b/tools/testing/selftests/vm/protection_keys.c
@@ -556,6 +556,9 @@ int alloc_pkey(void)
 int sys_pkey_free(unsigned long pkey)
 {
int ret = syscall(SYS_pkey_free, pkey);
+
+   if (!ret)
+   shadow_pkey_reg &= clear_pkey_flags(pkey, PKEY_DISABLE_ACCESS);
dprintf1("%s(pkey=%ld) syscall ret: %d\n", __func__, pkey, ret);
return ret;
 }
-- 
1.7.1

[PATCH v13 09/24] selftests/vm: fixed bugs in pkey_disable_clear()

2018-06-13 Thread Ram Pai

instead of clearing the bits, pkey_disable_clear() was setting
the bits. Fixed it.

Also fixed a wrong assertion in that function. When bits are
cleared, the resulting bit value will be less than the original.

This hasn't been a problem so far because this code isn't currently
used.

cc: Dave Hansen 
cc: Florian Weimer 
Signed-off-by: Ram Pai 
---
 tools/testing/selftests/vm/protection_keys.c |4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/vm/protection_keys.c 
b/tools/testing/selftests/vm/protection_keys.c
index 5fcccdb..da4f5d5 100644
--- a/tools/testing/selftests/vm/protection_keys.c
+++ b/tools/testing/selftests/vm/protection_keys.c
@@ -433,7 +433,7 @@ void pkey_disable_clear(int pkey, int flags)
pkey, pkey, pkey_rights);
pkey_assert(pkey_rights >= 0);
 
-   pkey_rights |= flags;
+   pkey_rights &= ~flags;
 
ret = hw_pkey_set(pkey, pkey_rights, 0);
shadow_pkey_reg &= clear_pkey_flags(pkey, flags);
@@ -446,7 +446,7 @@ void pkey_disable_clear(int pkey, int flags)
dprintf1("%s(%d) pkey_reg: 0x"PKEY_REG_FMT"\n", __func__,
pkey, read_pkey_reg());
if (flags)
-   assert(read_pkey_reg() > orig_pkey_reg);
+   assert(read_pkey_reg() < orig_pkey_reg);
 }
 
 void pkey_write_allow(int pkey)
-- 
1.7.1

[PATCH v13 08/24] selftests/vm: fix the wrong assert in pkey_disable_set()

2018-06-13 Thread Ram Pai

If the flag is 0, no bits will be set. Hence we cant expect
the resulting bitmap to have a higher value than what it
was earlier.

cc: Dave Hansen 
cc: Florian Weimer 
Signed-off-by: Ram Pai 
---
 tools/testing/selftests/vm/protection_keys.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/tools/testing/selftests/vm/protection_keys.c 
b/tools/testing/selftests/vm/protection_keys.c
index 57340b3..5fcccdb 100644
--- a/tools/testing/selftests/vm/protection_keys.c
+++ b/tools/testing/selftests/vm/protection_keys.c
@@ -415,7 +415,7 @@ void pkey_disable_set(int pkey, int flags)
dprintf1("%s(%d) pkey_reg: 0x"PKEY_REG_FMT"\n",
__func__, pkey, read_pkey_reg());
if (flags)
-   pkey_assert(read_pkey_reg() > orig_pkey_reg);
+   pkey_assert(read_pkey_reg() >= orig_pkey_reg);
dprintf1("END<---%s(%d, 0x%x)\n", __func__,
pkey, flags);
 }
-- 
1.7.1

[PATCH v13 07/24] selftests/vm: generic function to handle shadow key register

2018-06-13 Thread Ram Pai

helper functions to handler shadow pkey register

cc: Dave Hansen 
cc: Florian Weimer 
Signed-off-by: Ram Pai 
Signed-off-by: Thiago Jung Bauermann 
---
 tools/testing/selftests/vm/pkey-helpers.h|   29 ++
 tools/testing/selftests/vm/pkey-x86.h|5 
 tools/testing/selftests/vm/protection_keys.c |   34 -
 3 files changed, 56 insertions(+), 12 deletions(-)

diff --git a/tools/testing/selftests/vm/pkey-helpers.h 
b/tools/testing/selftests/vm/pkey-helpers.h
index 2a1a024..ada0146 100644
--- a/tools/testing/selftests/vm/pkey-helpers.h
+++ b/tools/testing/selftests/vm/pkey-helpers.h
@@ -80,6 +80,35 @@ static inline void sigsafe_printf(const char *format, ...)
 #error Architecture not supported
 #endif /* arch */
 
+static inline pkey_reg_t clear_pkey_flags(int pkey, pkey_reg_t flags)
+{
+   u32 shift = pkey_bit_position(pkey);
+
+   return ~(flags << shift);
+}
+
+/*
+ * Takes pkey flags and puts them at the right bit position for the given key 
so
+ * that the result can be ORed into the register.
+ */
+static inline pkey_reg_t left_shift_bits(int pkey, pkey_reg_t bits)
+{
+   u32 shift = pkey_bit_position(pkey);
+
+   return (bits << shift);
+}
+
+/*
+ * Takes pkey register values and puts the flags for the given pkey at the 
least
+ * significant bits of the returned value.
+ */
+static inline pkey_reg_t right_shift_bits(int pkey, pkey_reg_t bits)
+{
+   u32 shift = pkey_bit_position(pkey);
+
+   return (bits >> shift);
+}
+
 extern pkey_reg_t shadow_pkey_reg;
 
 static inline pkey_reg_t _read_pkey_reg(int line)
diff --git a/tools/testing/selftests/vm/pkey-x86.h 
b/tools/testing/selftests/vm/pkey-x86.h
index 5f40901..2b3780d 100644
--- a/tools/testing/selftests/vm/pkey-x86.h
+++ b/tools/testing/selftests/vm/pkey-x86.h
@@ -49,6 +49,11 @@
 #define pkey_reg_t u32
 #define PKEY_REG_FMT   "%016x"
 
+static inline u32 pkey_bit_position(int pkey)
+{
+   return pkey * PKEY_BITS_PER_PKEY;
+}
+
 static inline void __page_o_noops(void)
 {
/* 8-bytes of instruction * 512 bytes = 1 page */
diff --git a/tools/testing/selftests/vm/protection_keys.c 
b/tools/testing/selftests/vm/protection_keys.c
index 90b3a41..57340b3 100644
--- a/tools/testing/selftests/vm/protection_keys.c
+++ b/tools/testing/selftests/vm/protection_keys.c
@@ -342,7 +342,7 @@ static u32 hw_pkey_get(int pkey, unsigned long flags)
__func__, pkey, flags, 0, 0);
dprintf2("%s() raw pkey_reg: "PKEY_REG_FMT"\n", __func__, pkey_reg);
 
-   shifted_pkey_reg = (pkey_reg >> (pkey * PKEY_BITS_PER_PKEY));
+   shifted_pkey_reg = right_shift_bits(pkey, pkey_reg);
dprintf2("%s() shifted_pkey_reg: "PKEY_REG_FMT"\n", __func__,
shifted_pkey_reg);
masked_pkey_reg = shifted_pkey_reg & mask;
@@ -366,9 +366,9 @@ static int hw_pkey_set(int pkey, unsigned long rights, 
unsigned long flags)
/* copy old pkey_reg */
new_pkey_reg = old_pkey_reg;
/* mask out bits from pkey in old value: */
-   new_pkey_reg &= ~(mask << (pkey * PKEY_BITS_PER_PKEY));
+   new_pkey_reg &= clear_pkey_flags(pkey, mask);
/* OR in new bits for pkey: */
-   new_pkey_reg |= (rights << (pkey * PKEY_BITS_PER_PKEY));
+   new_pkey_reg |= left_shift_bits(pkey, rights);
 
__write_pkey_reg(new_pkey_reg);
 
@@ -402,7 +402,7 @@ void pkey_disable_set(int pkey, int flags)
ret = hw_pkey_set(pkey, pkey_rights, syscall_flags);
assert(!ret);
/*pkey_reg and flags have the same format */
-   shadow_pkey_reg |= flags << (pkey * 2);
+   shadow_pkey_reg |= left_shift_bits(pkey, flags);
dprintf1("%s(%d) shadow: 0x"PKEY_REG_FMT"\n",
__func__, pkey, shadow_pkey_reg);
 
@@ -436,7 +436,7 @@ void pkey_disable_clear(int pkey, int flags)
pkey_rights |= flags;
 
ret = hw_pkey_set(pkey, pkey_rights, 0);
-   shadow_pkey_reg &= ~(flags << (pkey * 2));
+   shadow_pkey_reg &= clear_pkey_flags(pkey, flags);
pkey_assert(ret >= 0);
 
pkey_rights = hw_pkey_get(pkey, syscall_flags);
@@ -494,6 +494,21 @@ int sys_pkey_alloc(unsigned long flags, unsigned long 
init_val)
return ret;
 }
 
+void pkey_setup_shadow(void)
+{
+   shadow_pkey_reg = __read_pkey_reg();
+}
+
+void pkey_reset_shadow(u32 key)
+{
+   shadow_pkey_reg &= clear_pkey_flags(key, 0x3);
+}
+
+void pkey_set_shadow(u32 key, u64 init_val)
+{
+   shadow_pkey_reg |=  left_shift_bits(key, init_val);
+}
+
 int alloc_pkey(void)
 {
int ret;
@@ -512,7 +527,7 @@ int alloc_pkey(void)
shadow_pkey_reg);
if (ret) {
/* clear both the bits: */
-   shadow_pkey_reg &= ~(0x3  << (ret * 2));
+   pkey_reset_shadow(ret);
dprintf4("%s()::%d, ret: %d pkey_reg: 0x"PKEY_REG_FMT
" shadow: 0x"PKEY_REG_FMT"\n",

[PATCH v13 06/24] selftests/vm: typecast the pkey register

2018-06-13 Thread Ram Pai

This is in preparation to accomadate a differing size register
across architectures.

cc: Dave Hansen 
cc: Florian Weimer 
Signed-off-by: Ram Pai 
Signed-off-by: Thiago Jung Bauermann 
---
 tools/testing/selftests/vm/pkey-helpers.h|   23 ---
 tools/testing/selftests/vm/pkey-x86.h|   16 +++--
 tools/testing/selftests/vm/protection_keys.c |   87 +++--
 3 files changed, 73 insertions(+), 53 deletions(-)

diff --git a/tools/testing/selftests/vm/pkey-helpers.h 
b/tools/testing/selftests/vm/pkey-helpers.h
index 7f18a82..2a1a024 100644
--- a/tools/testing/selftests/vm/pkey-helpers.h
+++ b/tools/testing/selftests/vm/pkey-helpers.h
@@ -80,13 +80,14 @@ static inline void sigsafe_printf(const char *format, ...)
 #error Architecture not supported
 #endif /* arch */
 
-extern unsigned int shadow_pkey_reg;
+extern pkey_reg_t shadow_pkey_reg;
 
-static inline unsigned int _read_pkey_reg(int line)
+static inline pkey_reg_t _read_pkey_reg(int line)
 {
-   unsigned int pkey_reg = __read_pkey_reg();
+   pkey_reg_t pkey_reg = __read_pkey_reg();
 
-   dprintf4("read_pkey_reg(line=%d) pkey_reg: %x shadow: %x\n",
+   dprintf4("read_pkey_reg(line=%d) pkey_reg: "PKEY_REG_FMT
+   " shadow: "PKEY_REG_FMT"\n",
line, pkey_reg, shadow_pkey_reg);
assert(pkey_reg == shadow_pkey_reg);
 
@@ -95,15 +96,15 @@ static inline unsigned int _read_pkey_reg(int line)
 
 #define read_pkey_reg() _read_pkey_reg(__LINE__)
 
-static inline void write_pkey_reg(unsigned int pkey_reg)
+static inline void write_pkey_reg(pkey_reg_t pkey_reg)
 {
-   dprintf4("%s() changing %08x to %08x\n", __func__,
+   dprintf4("%s() changing "PKEY_REG_FMT" to "PKEY_REG_FMT"\n", __func__,
__read_pkey_reg(), pkey_reg);
/* will do the shadow check for us: */
read_pkey_reg();
__write_pkey_reg(pkey_reg);
shadow_pkey_reg = pkey_reg;
-   dprintf4("%s(%08x) pkey_reg: %08x\n", __func__,
+   dprintf4("%s("PKEY_REG_FMT") pkey_reg: "PKEY_REG_FMT"\n", __func__,
pkey_reg, __read_pkey_reg());
 }
 
@@ -113,7 +114,7 @@ static inline void write_pkey_reg(unsigned int pkey_reg)
  */
 static inline void __pkey_access_allow(int pkey, int do_allow)
 {
-   unsigned int pkey_reg = read_pkey_reg();
+   pkey_reg_t pkey_reg = read_pkey_reg();
int bit = pkey * 2;
 
if (do_allow)
@@ -121,13 +122,13 @@ static inline void __pkey_access_allow(int pkey, int 
do_allow)
else
pkey_reg |= (1<>>>===SIGSEGV\n");
-   dprintf1("%s()::%d, pkey_reg: 0x%x shadow: %x\n", __func__, __LINE__,
+   dprintf1("%s()::%d, pkey_reg: "PKEY_REG_FMT" shadow: "PKEY_REG_FMT"\n",
+   __func__, __LINE__,
__read_pkey_reg(), shadow_pkey_reg);
 
trapno = uctxt->uc_mcontext.gregs[REG_TRAPNO];
@@ -213,8 +214,9 @@ void signal_handler(int signum, siginfo_t *si, void 
*vucontext)
fpregset = uctxt->uc_mcontext.fpregs;
fpregs = (void *)fpregset;
 
-   dprintf2("%s() trapno: %d ip: 0x%lx info->si_code: %s/%d\n", __func__,
-   trapno, ip, si_code_str(si->si_code), si->si_code);
+   dprintf2("%s() trapno: %d ip: 0x%016lx info->si_code: %s/%d\n",
+   __func__, trapno, ip, si_code_str(si->si_code),
+   si->si_code);
 #ifdef __i386__
/*
 * 32-bit has some extra padding so that userspace can tell whether
@@ -251,12 +253,13 @@ void signal_handler(int signum, siginfo_t *si, void 
*vucontext)
pkey_assert(siginfo_pkey < NR_PKEYS);
last_si_pkey = siginfo_pkey;
 
-   dprintf1("signal pkey_reg from xsave: %08x\n", *pkey_reg_ptr);
+   dprintf1("signal pkey_reg from xsave: "PKEY_REG_FMT"\n", *pkey_reg_ptr);
/*
 * need __read_pkey_reg() version so we do not do shadow_pkey_reg
 * checking
 */
-   dprintf1("signal pkey_reg from  pkey_reg: %08x\n", __read_pkey_reg());
+   dprintf1("signal pkey_reg from  pkey_reg: "PKEY_REG_FMT"\n",
+   __read_pkey_reg());
dprintf1("pkey from siginfo: %jx\n", siginfo_pkey);
*(u64 *)pkey_reg_ptr = 0x;
dprintf1("WARNING: set PRKU=0 to allow faulting instruction to 
continue\n");
@@ -331,16 +334,17 @@ pid_t fork_lazy_child(void)
 static u32 hw_pkey_get(int pkey, unsigned long flags)
 {
u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE);
-   u32 pkey_reg = __read_pkey_reg();
-   u32 shifted_pkey_reg;
+   pkey_reg_t pkey_reg = __read_pkey_reg();
+   pkey_reg_t shifted_pkey_reg;
u32 masked_pkey_reg;
 
dprintf1("%s(pkey=%d, flags=%lx) = %x / %d\n",
__func__, pkey, flags, 0, 0);
-   dprintf2("%s() raw pkey_reg: %x\n", __func__, pkey_reg);
+   dprintf2("%s() raw pkey_reg: "PKEY_REG_FMT"\n", __func__, pkey_reg);

[PATCH v13 05/24] selftests/vm: Make gcc check arguments of sigsafe_printf()

2018-06-13 Thread Ram Pai

From: Thiago Jung Bauermann 

This will help us ensure we print pkey_reg_t values correctly in different
architectures.

Signed-off-by: Thiago Jung Bauermann 
---
 tools/testing/selftests/vm/pkey-helpers.h |4 
 1 files changed, 4 insertions(+), 0 deletions(-)

diff --git a/tools/testing/selftests/vm/pkey-helpers.h 
b/tools/testing/selftests/vm/pkey-helpers.h
index 3ed2f02..7f18a82 100644
--- a/tools/testing/selftests/vm/pkey-helpers.h
+++ b/tools/testing/selftests/vm/pkey-helpers.h
@@ -27,6 +27,10 @@
 #define DPRINT_IN_SIGNAL_BUF_SIZE 4096
 extern int dprint_in_signal;
 extern char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE];
+
+#ifdef __GNUC__
+__attribute__((format(printf, 1, 2)))
+#endif
 static inline void sigsafe_printf(const char *format, ...)
 {
va_list ap;
-- 
1.7.1

[PATCH v13 04/24] selftests/vm: move arch-specific definitions to arch-specific header

2018-06-13 Thread Ram Pai

From: Thiago Jung Bauermann 

In preparation for multi-arch support, move definitions which have
arch-specific values to x86-specific header.

cc: Dave Hansen 
cc: Florian Weimer 
Signed-off-by: Ram Pai 
Signed-off-by: Thiago Jung Bauermann 
---
 tools/testing/selftests/vm/pkey-helpers.h|  111 +-
 tools/testing/selftests/vm/pkey-x86.h|  156 ++
 tools/testing/selftests/vm/protection_keys.c |   47 
 3 files changed, 162 insertions(+), 152 deletions(-)
 create mode 100644 tools/testing/selftests/vm/pkey-x86.h

diff --git a/tools/testing/selftests/vm/pkey-helpers.h 
b/tools/testing/selftests/vm/pkey-helpers.h
index 6ad1bd5..3ed2f02 100644
--- a/tools/testing/selftests/vm/pkey-helpers.h
+++ b/tools/testing/selftests/vm/pkey-helpers.h
@@ -21,9 +21,6 @@
 
 #define PTR_ERR_ENOTSUP ((void *)-ENOTSUP)
 
-#define NR_PKEYS 16
-#define PKEY_BITS_PER_PKEY 2
-
 #ifndef DEBUG_LEVEL
 #define DEBUG_LEVEL 0
 #endif
@@ -73,19 +70,13 @@ static inline void sigsafe_printf(const char *format, ...)
}   \
 } while (0)
 
+#if defined(__i386__) || defined(__x86_64__) /* arch */
+#include "pkey-x86.h"
+#else /* arch */
+#error Architecture not supported
+#endif /* arch */
+
 extern unsigned int shadow_pkey_reg;
-static inline unsigned int __read_pkey_reg(void)
-{
-   unsigned int eax, edx;
-   unsigned int ecx = 0;
-   unsigned int pkey_reg;
-
-   asm volatile(".byte 0x0f,0x01,0xee\n\t"
-: "=a" (eax), "=d" (edx)
-: "c" (ecx));
-   pkey_reg = eax;
-   return pkey_reg;
-}
 
 static inline unsigned int _read_pkey_reg(int line)
 {
@@ -100,19 +91,6 @@ static inline unsigned int _read_pkey_reg(int line)
 
 #define read_pkey_reg() _read_pkey_reg(__LINE__)
 
-static inline void __write_pkey_reg(unsigned int pkey_reg)
-{
-   unsigned int eax = pkey_reg;
-   unsigned int ecx = 0;
-   unsigned int edx = 0;
-
-   dprintf4("%s() changing %08x to %08x\n", __func__,
-   __read_pkey_reg(), pkey_reg);
-   asm volatile(".byte 0x0f,0x01,0xef\n\t"
-: : "a" (eax), "c" (ecx), "d" (edx));
-   assert(pkey_reg == __read_pkey_reg());
-}
-
 static inline void write_pkey_reg(unsigned int pkey_reg)
 {
dprintf4("%s() changing %08x to %08x\n", __func__,
@@ -157,83 +135,6 @@ static inline void __pkey_write_allow(int pkey, int 
do_allow_write)
dprintf4("pkey_reg now: %08x\n", read_pkey_reg());
 }
 
-#define PAGE_SIZE 4096
-#define MB (1<<20)
-
-static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
-   unsigned int *ecx, unsigned int *edx)
-{
-   /* ecx is often an input as well as an output. */
-   asm volatile(
-   "cpuid;"
-   : "=a" (*eax),
- "=b" (*ebx),
- "=c" (*ecx),
- "=d" (*edx)
-   : "0" (*eax), "2" (*ecx));
-}
-
-/* Intel-defined CPU features, CPUID level 0x0007:0 (ecx) */
-#define X86_FEATURE_PKU(1<<3) /* Protection Keys for Userspace */
-#define X86_FEATURE_OSPKE  (1<<4) /* OS Protection Keys Enable */
-
-static inline int cpu_has_pku(void)
-{
-   unsigned int eax;
-   unsigned int ebx;
-   unsigned int ecx;
-   unsigned int edx;
-
-   eax = 0x7;
-   ecx = 0x0;
-   __cpuid(, , , );
-
-   if (!(ecx & X86_FEATURE_PKU)) {
-   dprintf2("cpu does not have PKU\n");
-   return 0;
-   }
-   if (!(ecx & X86_FEATURE_OSPKE)) {
-   dprintf2("cpu does not have OSPKE\n");
-   return 0;
-   }
-   return 1;
-}
-
-#define XSTATE_PKEY_BIT(9)
-#define XSTATE_PKEY0x200
-
-int pkey_reg_xstate_offset(void)
-{
-   unsigned int eax;
-   unsigned int ebx;
-   unsigned int ecx;
-   unsigned int edx;
-   int xstate_offset;
-   int xstate_size;
-   unsigned long XSTATE_CPUID = 0xd;
-   int leaf;
-
-   /* assume that XSTATE_PKEY is set in XCR0 */
-   leaf = XSTATE_PKEY_BIT;
-   {
-   eax = XSTATE_CPUID;
-   ecx = leaf;
-   __cpuid(, , , );
-
-   if (leaf == XSTATE_PKEY_BIT) {
-   xstate_offset = ebx;
-   xstate_size = eax;
-   }
-   }
-
-   if (xstate_size == 0) {
-   printf("could not find size/offset of PKEY in xsave state\n");
-   return 0;
-   }
-
-   return xstate_offset;
-}
-
 #define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
 #define ALIGN_UP(x, align_to)  (((x) + ((align_to)-1)) & ~((align_to)-1))
 #define ALIGN_DOWN(x, align_to) ((x) & ~((align_to)-1))
diff --git a/tools/testing/selftests/vm/pkey-x86.h 
b/tools/testing/selftests/vm/pkey-x86.h
new file mode 100644
index 000..2f04ade
--- /dev/null
+++ b/tools/testing/selftests/vm/pkey-x86.h
@@ -0,0 +1,156 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef

[PATCH v13 03/24] selftests/vm: move generic definitions to header file

2018-06-13 Thread Ram Pai

Moved all the generic definition and helper functions to the header file.

cc: Dave Hansen 
cc: Florian Weimer 
Signed-off-by: Ram Pai 
Signed-off-by: Thiago Jung Bauermann 
---
 tools/testing/selftests/vm/pkey-helpers.h|   35 ++---
 tools/testing/selftests/vm/protection_keys.c |   27 
 2 files changed, 30 insertions(+), 32 deletions(-)

diff --git a/tools/testing/selftests/vm/pkey-helpers.h 
b/tools/testing/selftests/vm/pkey-helpers.h
index d5779be..6ad1bd5 100644
--- a/tools/testing/selftests/vm/pkey-helpers.h
+++ b/tools/testing/selftests/vm/pkey-helpers.h
@@ -13,6 +13,14 @@
 #include 
 #include 
 
+/* Define some kernel-like types */
+#define  u8 uint8_t
+#define u16 uint16_t
+#define u32 uint32_t
+#define u64 uint64_t
+
+#define PTR_ERR_ENOTSUP ((void *)-ENOTSUP)
+
 #define NR_PKEYS 16
 #define PKEY_BITS_PER_PKEY 2
 
@@ -53,6 +61,18 @@ static inline void sigsafe_printf(const char *format, ...)
 #define dprintf3(args...) dprintf_level(3, args)
 #define dprintf4(args...) dprintf_level(4, args)
 
+extern void abort_hooks(void);
+#define pkey_assert(condition) do {\
+   if (!(condition)) { \
+   dprintf0("assert() at %s::%d test_nr: %d iteration: %d\n", \
+   __FILE__, __LINE__, \
+   test_nr, iteration_nr); \
+   dprintf0("errno at assert: %d", errno); \
+   abort_hooks();  \
+   exit(__LINE__); \
+   }   \
+} while (0)
+
 extern unsigned int shadow_pkey_reg;
 static inline unsigned int __read_pkey_reg(void)
 {
@@ -137,11 +157,6 @@ static inline void __pkey_write_allow(int pkey, int 
do_allow_write)
dprintf4("pkey_reg now: %08x\n", read_pkey_reg());
 }
 
-#define PROT_PKEY0 0x10/* protection key value (bit 0) */
-#define PROT_PKEY1 0x20/* protection key value (bit 1) */
-#define PROT_PKEY2 0x40/* protection key value (bit 2) */
-#define PROT_PKEY3 0x80/* protection key value (bit 3) */
-
 #define PAGE_SIZE 4096
 #define MB (1<<20)
 
@@ -219,4 +234,14 @@ int pkey_reg_xstate_offset(void)
return xstate_offset;
 }
 
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
+#define ALIGN_UP(x, align_to)  (((x) + ((align_to)-1)) & ~((align_to)-1))
+#define ALIGN_DOWN(x, align_to) ((x) & ~((align_to)-1))
+#define ALIGN_PTR_UP(p, ptr_align_to)  \
+   ((typeof(p))ALIGN_UP((unsigned long)(p), ptr_align_to))
+#define ALIGN_PTR_DOWN(p, ptr_align_to)\
+   ((typeof(p))ALIGN_DOWN((unsigned long)(p), ptr_align_to))
+#define __stringify_1(x...) #x
+#define __stringify(x...)   __stringify_1(x)
+
 #endif /* _PKEYS_HELPER_H */
diff --git a/tools/testing/selftests/vm/protection_keys.c 
b/tools/testing/selftests/vm/protection_keys.c
index 9f373cc..cad52dc 100644
--- a/tools/testing/selftests/vm/protection_keys.c
+++ b/tools/testing/selftests/vm/protection_keys.c
@@ -51,31 +51,10 @@
 unsigned int shadow_pkey_reg;
 
 #define HPAGE_SIZE (1UL<<21)
-#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
-#define ALIGN_UP(x, align_to)  (((x) + ((align_to)-1)) & ~((align_to)-1))
-#define ALIGN_DOWN(x, align_to) ((x) & ~((align_to)-1))
-#define ALIGN_PTR_UP(p, ptr_align_to)  ((typeof(p))ALIGN_UP((unsigned 
long)(p),ptr_align_to))
-#define ALIGN_PTR_DOWN(p, ptr_align_to)
((typeof(p))ALIGN_DOWN((unsigned long)(p),  ptr_align_to))
-#define __stringify_1(x...) #x
-#define __stringify(x...)   __stringify_1(x)
-
-#define PTR_ERR_ENOTSUP ((void *)-ENOTSUP)
 
 int dprint_in_signal;
 char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE];
 
-extern void abort_hooks(void);
-#define pkey_assert(condition) do {\
-   if (!(condition)) { \
-   dprintf0("assert() at %s::%d test_nr: %d iteration: %d\n", \
-   __FILE__, __LINE__, \
-   test_nr, iteration_nr); \
-   dprintf0("errno at assert: %d", errno); \
-   abort_hooks();  \
-   exit(__LINE__); \
-   }   \
-} while (0)
-
 void cat_into_file(char *str, char *file)
 {
int fd = open(file, O_RDWR);
@@ -186,12 +165,6 @@ void lots_o_noops_around_write(int *write_to_me)
dprintf3("%s() done\n", __func__);
 }
 
-/* Define some kernel-like types */
-#define  u8 uint8_t
-#define u16 uint16_t
-#define u32 uint32_t
-#define u64 uint64_t
-
 #ifdef __i386__
 
 #ifndef SYS_mprotect_key
-- 
1.7.1

[PATCH v13 02/24] selftests/vm: rename all references to pkru to a generic name

2018-06-13 Thread Ram Pai

some pkru references are named to pkey_reg
and some prku references are renamed to pkey

cc: Dave Hansen 
cc: Florian Weimer 
Signed-off-by: Ram Pai 
Signed-off-by: Thiago Jung Bauermann 
Reviewed-by: Dave Hansen 
---
 tools/testing/selftests/vm/pkey-helpers.h|   85 +-
 tools/testing/selftests/vm/protection_keys.c |  238 ++
 2 files changed, 169 insertions(+), 154 deletions(-)

diff --git a/tools/testing/selftests/vm/pkey-helpers.h 
b/tools/testing/selftests/vm/pkey-helpers.h
index 254e543..d5779be 100644
--- a/tools/testing/selftests/vm/pkey-helpers.h
+++ b/tools/testing/selftests/vm/pkey-helpers.h
@@ -14,7 +14,7 @@
 #include 
 
 #define NR_PKEYS 16
-#define PKRU_BITS_PER_PKEY 2
+#define PKEY_BITS_PER_PKEY 2
 
 #ifndef DEBUG_LEVEL
 #define DEBUG_LEVEL 0
@@ -53,85 +53,88 @@ static inline void sigsafe_printf(const char *format, ...)
 #define dprintf3(args...) dprintf_level(3, args)
 #define dprintf4(args...) dprintf_level(4, args)
 
-extern unsigned int shadow_pkru;
-static inline unsigned int __rdpkru(void)
+extern unsigned int shadow_pkey_reg;
+static inline unsigned int __read_pkey_reg(void)
 {
unsigned int eax, edx;
unsigned int ecx = 0;
-   unsigned int pkru;
+   unsigned int pkey_reg;
 
asm volatile(".byte 0x0f,0x01,0xee\n\t"
 : "=a" (eax), "=d" (edx)
 : "c" (ecx));
-   pkru = eax;
-   return pkru;
+   pkey_reg = eax;
+   return pkey_reg;
 }
 
-static inline unsigned int _rdpkru(int line)
+static inline unsigned int _read_pkey_reg(int line)
 {
-   unsigned int pkru = __rdpkru();
+   unsigned int pkey_reg = __read_pkey_reg();
 
-   dprintf4("rdpkru(line=%d) pkru: %x shadow: %x\n",
-   line, pkru, shadow_pkru);
-   assert(pkru == shadow_pkru);
+   dprintf4("read_pkey_reg(line=%d) pkey_reg: %x shadow: %x\n",
+   line, pkey_reg, shadow_pkey_reg);
+   assert(pkey_reg == shadow_pkey_reg);
 
-   return pkru;
+   return pkey_reg;
 }
 
-#define rdpkru() _rdpkru(__LINE__)
+#define read_pkey_reg() _read_pkey_reg(__LINE__)
 
-static inline void __wrpkru(unsigned int pkru)
+static inline void __write_pkey_reg(unsigned int pkey_reg)
 {
-   unsigned int eax = pkru;
+   unsigned int eax = pkey_reg;
unsigned int ecx = 0;
unsigned int edx = 0;
 
-   dprintf4("%s() changing %08x to %08x\n", __func__, __rdpkru(), pkru);
+   dprintf4("%s() changing %08x to %08x\n", __func__,
+   __read_pkey_reg(), pkey_reg);
asm volatile(".byte 0x0f,0x01,0xef\n\t"
 : : "a" (eax), "c" (ecx), "d" (edx));
-   assert(pkru == __rdpkru());
+   assert(pkey_reg == __read_pkey_reg());
 }
 
-static inline void wrpkru(unsigned int pkru)
+static inline void write_pkey_reg(unsigned int pkey_reg)
 {
-   dprintf4("%s() changing %08x to %08x\n", __func__, __rdpkru(), pkru);
+   dprintf4("%s() changing %08x to %08x\n", __func__,
+   __read_pkey_reg(), pkey_reg);
/* will do the shadow check for us: */
-   rdpkru();
-   __wrpkru(pkru);
-   shadow_pkru = pkru;
-   dprintf4("%s(%08x) pkru: %08x\n", __func__, pkru, __rdpkru());
+   read_pkey_reg();
+   __write_pkey_reg(pkey_reg);
+   shadow_pkey_reg = pkey_reg;
+   dprintf4("%s(%08x) pkey_reg: %08x\n", __func__,
+   pkey_reg, __read_pkey_reg());
 }
 
 /*
  * These are technically racy. since something could
- * change PKRU between the read and the write.
+ * change PKEY register between the read and the write.
  */
 static inline void __pkey_access_allow(int pkey, int do_allow)
 {
-   unsigned int pkru = rdpkru();
+   unsigned int pkey_reg = read_pkey_reg();
int bit = pkey * 2;
 
if (do_allow)
-   pkru &= (1<>>>===SIGSEGV\n");
-   dprintf1("%s()::%d, pkru: 0x%x shadow: %x\n", __func__, __LINE__,
-   __rdpkru(), shadow_pkru);
+   dprintf1("%s()::%d, pkey_reg: 0x%x shadow: %x\n", __func__, __LINE__,
+   __read_pkey_reg(), shadow_pkey_reg);
 
trapno = uctxt->uc_mcontext.gregs[REG_TRAPNO];
ip = uctxt->uc_mcontext.gregs[REG_IP_IDX];
@@ -289,19 +289,19 @@ void signal_handler(int signum, siginfo_t *si, void 
*vucontext)
 */
fpregs += 0x70;
 #endif
-   pkru_offset = pkru_xstate_offset();
-   pkru_ptr = (void *)([pkru_offset]);
+   pkey_reg_offset = pkey_reg_xstate_offset();
+   pkey_reg_ptr = (void *)([pkey_reg_offset]);
 
dprintf1("siginfo: %p\n", si);
dprintf1(" fpregs: %p\n", fpregs);
/*
-* If we got a PKRU fault, we *HAVE* to have at least one bit set in
+* If we got a PKEY fault, we *HAVE* to have at least one bit set in
 * here.
 */
-   dprintf1("pkru_xstate_offset: %d\n", pkru_xstate_offset());
+

[PATCH v13 01/24] selftests/x86: Move protecton key selftest to arch neutral directory

2018-06-13 Thread Ram Pai

cc: Dave Hansen 
cc: Florian Weimer 
Signed-off-by: Ram Pai 
Signed-off-by: Thiago Jung Bauermann 
Acked-by: Ingo Molnar 
---
 tools/testing/selftests/vm/.gitignore |1 +
 tools/testing/selftests/vm/Makefile   |1 +
 tools/testing/selftests/vm/pkey-helpers.h |  219 
 tools/testing/selftests/vm/protection_keys.c  | 1485 +
 tools/testing/selftests/x86/.gitignore|1 -
 tools/testing/selftests/x86/pkey-helpers.h|  219 
 tools/testing/selftests/x86/protection_keys.c | 1485 -
 7 files changed, 1706 insertions(+), 1705 deletions(-)
 create mode 100644 tools/testing/selftests/vm/pkey-helpers.h
 create mode 100644 tools/testing/selftests/vm/protection_keys.c
 delete mode 100644 tools/testing/selftests/x86/pkey-helpers.h
 delete mode 100644 tools/testing/selftests/x86/protection_keys.c

diff --git a/tools/testing/selftests/vm/.gitignore 
b/tools/testing/selftests/vm/.gitignore
index 342c7bc..0214fbf 100644
--- a/tools/testing/selftests/vm/.gitignore
+++ b/tools/testing/selftests/vm/.gitignore
@@ -12,3 +12,4 @@ mlock-random-test
 virtual_address_range
 gup_benchmark
 va_128TBswitch
+protection_keys
diff --git a/tools/testing/selftests/vm/Makefile 
b/tools/testing/selftests/vm/Makefile
index fdefa22..9788a58 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -20,6 +20,7 @@ TEST_GEN_FILES += transhuge-stress
 TEST_GEN_FILES += userfaultfd
 TEST_GEN_FILES += va_128TBswitch
 TEST_GEN_FILES += virtual_address_range
+TEST_GEN_FILES += protection_keys
 
 TEST_PROGS := run_vmtests
 
diff --git a/tools/testing/selftests/vm/pkey-helpers.h 
b/tools/testing/selftests/vm/pkey-helpers.h
new file mode 100644
index 000..254e543
--- /dev/null
+++ b/tools/testing/selftests/vm/pkey-helpers.h
@@ -0,0 +1,219 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _PKEYS_HELPER_H
+#define _PKEYS_HELPER_H
+#define _GNU_SOURCE
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define NR_PKEYS 16
+#define PKRU_BITS_PER_PKEY 2
+
+#ifndef DEBUG_LEVEL
+#define DEBUG_LEVEL 0
+#endif
+#define DPRINT_IN_SIGNAL_BUF_SIZE 4096
+extern int dprint_in_signal;
+extern char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE];
+static inline void sigsafe_printf(const char *format, ...)
+{
+   va_list ap;
+
+   if (!dprint_in_signal) {
+   va_start(ap, format);
+   vprintf(format, ap);
+   va_end(ap);
+   } else {
+   int ret;
+   /*
+* No printf() functions are signal-safe.
+* They deadlock easily. Write the format
+* string to get some output, even if
+* incomplete.
+*/
+   ret = write(1, format, strlen(format));
+   if (ret < 0)
+   exit(1);
+   }
+}
+#define dprintf_level(level, args...) do { \
+   if (level <= DEBUG_LEVEL)   \
+   sigsafe_printf(args);   \
+} while (0)
+#define dprintf0(args...) dprintf_level(0, args)
+#define dprintf1(args...) dprintf_level(1, args)
+#define dprintf2(args...) dprintf_level(2, args)
+#define dprintf3(args...) dprintf_level(3, args)
+#define dprintf4(args...) dprintf_level(4, args)
+
+extern unsigned int shadow_pkru;
+static inline unsigned int __rdpkru(void)
+{
+   unsigned int eax, edx;
+   unsigned int ecx = 0;
+   unsigned int pkru;
+
+   asm volatile(".byte 0x0f,0x01,0xee\n\t"
+: "=a" (eax), "=d" (edx)
+: "c" (ecx));
+   pkru = eax;
+   return pkru;
+}
+
+static inline unsigned int _rdpkru(int line)
+{
+   unsigned int pkru = __rdpkru();
+
+   dprintf4("rdpkru(line=%d) pkru: %x shadow: %x\n",
+   line, pkru, shadow_pkru);
+   assert(pkru == shadow_pkru);
+
+   return pkru;
+}
+
+#define rdpkru() _rdpkru(__LINE__)
+
+static inline void __wrpkru(unsigned int pkru)
+{
+   unsigned int eax = pkru;
+   unsigned int ecx = 0;
+   unsigned int edx = 0;
+
+   dprintf4("%s() changing %08x to %08x\n", __func__, __rdpkru(), pkru);
+   asm volatile(".byte 0x0f,0x01,0xef\n\t"
+: : "a" (eax), "c" (ecx), "d" (edx));
+   assert(pkru == __rdpkru());
+}
+
+static inline void wrpkru(unsigned int pkru)
+{
+   dprintf4("%s() changing %08x to %08x\n", __func__, __rdpkru(), pkru);
+   /* will do the shadow check for us: */
+   rdpkru();
+   __wrpkru(pkru);
+   shadow_pkru = pkru;
+   dprintf4("%s(%08x) pkru: %08x\n", __func__, pkru, __rdpkru());
+}
+
+/*
+ * These are technically racy. since something could
+ * change PKRU between the read and the write.
+ */
+static inline void __pkey_access_allow(int pkey, int do_allow)
+{
+   unsigned int pkru = rdpkru();
+   int bit = pkey * 2;
+
+   if (do_allow)
+   pkru &=

[PATCH v13 00/24] selftests, powerpc, x86 : Memory Protection Keys

2018-06-13 Thread Ram Pai

Memory protection keys enables an application to protect its address space from
inadvertent access by its own code.

This feature is now enabled on powerpc architecture and integrated in
4.16-rc1.  The patches move the selftests to arch neutral directory
and enhance their test coverage.

Test

Verified for correctness on powerpc. Need help verifying on x86.
Compiles on x86.

History:
---

version 13:
(1) Incorporated comments for Dave Hansen.
(2)   Added one more test for correct pkey-0 behavior.

version 12:
(1) fixed the offset of pkey field in the siginfo structure for
x86_64 and powerpc. And tries to use the actual field
if the headers have it defined.

version 11:
(1) fixed a deadlock in the ptrace testcase.

version 10 and prior:
(1) moved the testcase to arch neutral directory
(2) split the changes into incremental patches.


Ram Pai (22):
  selftests/x86: Move protecton key selftest to arch neutral directory
  selftests/vm: rename all references to pkru to a generic name
  selftests/vm: move generic definitions to header file
  selftests/vm: typecast the pkey register
  selftests/vm: generic function to handle shadow key register
  selftests/vm: fix the wrong assert in pkey_disable_set()
  selftests/vm: fixed bugs in pkey_disable_clear()
  selftests/vm: clear the bits in shadow reg when a pkey is freed.
  selftests/vm: fix alloc_random_pkey() to make it really random
  selftests/vm: introduce two arch independent abstraction
  selftests/vm: pkey register should match shadow pkey
  selftests/vm: generic cleanup
  selftests/vm: powerpc implementation for generic abstraction
  selftests/vm: clear the bits in shadow reg when a pkey is freed.
  selftests/vm: powerpc implementation to check support for pkey
  selftests/vm: fix an assertion in test_pkey_alloc_exhaust()
  selftests/vm: associate key on a mapped page and detect access
violation
  selftests/vm: associate key on a mapped page and detect write
violation
  selftests/vm: detect write violation on a mapped access-denied-key
page
  selftests/vm: testcases must restore pkey-permissions
  selftests/vm: sub-page allocator
  selftests/vm: test correct behavior of pkey-0

Thiago Jung Bauermann (2):
  selftests/vm: move arch-specific definitions to arch-specific header
  selftests/vm: Make gcc check arguments of sigsafe_printf()

 tools/testing/selftests/vm/.gitignore |1 +
 tools/testing/selftests/vm/Makefile   |1 +
 tools/testing/selftests/vm/pkey-helpers.h |  214 
 tools/testing/selftests/vm/pkey-powerpc.h |  126 ++
 tools/testing/selftests/vm/pkey-x86.h |  184 +++
 tools/testing/selftests/vm/protection_keys.c  | 1598 +
 tools/testing/selftests/x86/.gitignore|1 -
 tools/testing/selftests/x86/pkey-helpers.h|  219 
 tools/testing/selftests/x86/protection_keys.c | 1485 ---
 9 files changed, 2124 insertions(+), 1705 deletions(-)
 create mode 100644 tools/testing/selftests/vm/pkey-helpers.h
 create mode 100644 tools/testing/selftests/vm/pkey-powerpc.h
 create mode 100644 tools/testing/selftests/vm/pkey-x86.h
 create mode 100644 tools/testing/selftests/vm/protection_keys.c
 delete mode 100644 tools/testing/selftests/x86/pkey-helpers.h
 delete mode 100644 tools/testing/selftests/x86/protection_keys.c

Re: [PATCH] powerpc/64s/radix: Fix MADV_[FREE|DONTNEED] TLB flush miss problem with THP

2018-06-13 Thread kbuild test robot

Hi Nicholas,

I love your patch! Yet something to improve:

[auto build test ERROR on powerpc/next]
[also build test ERROR on next-20180613]
[cannot apply to v4.17]
[if your patch is applied to the wrong git tree, please drop us a note to help 
improve the system]

url:
https://github.com/0day-ci/linux/commits/Nicholas-Piggin/powerpc-64s-radix-Fix-MADV_-FREE-DONTNEED-TLB-flush-miss-problem-with-THP/20180613-180928
base:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
config: powerpc-cell_defconfig (attached as .config)
compiler: powerpc64-linux-gnu-gcc (Debian 7.2.0-11) 7.2.0
reproduce:
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
GCC_VERSION=7.2.0 make.cross ARCH=powerpc 

All error/warnings (new ones prefixed by >>):

   In file included from include/asm-generic/bug.h:5:0,
from arch/powerpc/include/asm/bug.h:128,
from include/linux/bug.h:5,
from include/linux/mmdebug.h:5,
from include/linux/mm.h:9,
from arch/powerpc/mm/tlb-radix.c:12:
   In function '__radix__flush_tlb_range',
   inlined from 'radix__tlb_flush' at arch/powerpc/mm/tlb-radix.c:898:3:
>> include/linux/compiler.h:339:38: error: call to '__compiletime_assert_745' 
>> declared with attribute error: BUILD_BUG failed
 _compiletime_assert(condition, msg, __compiletime_assert_, __LINE__)
 ^
   include/linux/compiler.h:319:4: note: in definition of macro 
'__compiletime_assert'
   prefix ## suffix();\
   ^~
   include/linux/compiler.h:339:2: note: in expansion of macro 
'_compiletime_assert'
 _compiletime_assert(condition, msg, __compiletime_assert_, __LINE__)
 ^~~
   include/linux/build_bug.h:45:37: note: in expansion of macro 
'compiletime_assert'
#define BUILD_BUG_ON_MSG(cond, msg) compiletime_assert(!(cond), msg)
^~
   include/linux/build_bug.h:79:21: note: in expansion of macro 
'BUILD_BUG_ON_MSG'
#define BUILD_BUG() BUILD_BUG_ON_MSG(1, "BUILD_BUG failed")
^~~~
>> include/linux/huge_mm.h:251:27: note: in expansion of macro 'BUILD_BUG'
#define HPAGE_PMD_SIZE ({ BUILD_BUG(); 0; })
  ^
>> arch/powerpc/mm/tlb-radix.c:745:22: note: in expansion of macro 
>> 'HPAGE_PMD_SIZE'
   hstart = (start + HPAGE_PMD_SIZE - 1) & HPAGE_PMD_MASK;
 ^~
>> include/linux/compiler.h:339:38: error: call to '__compiletime_assert_745' 
>> declared with attribute error: BUILD_BUG failed
 _compiletime_assert(condition, msg, __compiletime_assert_, __LINE__)
 ^
   include/linux/compiler.h:319:4: note: in definition of macro 
'__compiletime_assert'
   prefix ## suffix();\
   ^~
   include/linux/compiler.h:339:2: note: in expansion of macro 
'_compiletime_assert'
 _compiletime_assert(condition, msg, __compiletime_assert_, __LINE__)
 ^~~
   include/linux/build_bug.h:45:37: note: in expansion of macro 
'compiletime_assert'
#define BUILD_BUG_ON_MSG(cond, msg) compiletime_assert(!(cond), msg)
^~
   include/linux/build_bug.h:79:21: note: in expansion of macro 
'BUILD_BUG_ON_MSG'
#define BUILD_BUG() BUILD_BUG_ON_MSG(1, "BUILD_BUG failed")
^~~~
   include/linux/huge_mm.h:250:27: note: in expansion of macro 'BUILD_BUG'
#define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })
  ^
>> arch/powerpc/mm/tlb-radix.c:745:44: note: in expansion of macro 
>> 'HPAGE_PMD_MASK'
   hstart = (start + HPAGE_PMD_SIZE - 1) & HPAGE_PMD_MASK;
   ^~
   include/linux/compiler.h:339:38: error: call to '__compiletime_assert_746' 
declared with attribute error: BUILD_BUG failed
 _compiletime_assert(condition, msg, __compiletime_assert_, __LINE__)
 ^
   include/linux/compiler.h:319:4: note: in definition of macro 
'__compiletime_assert'
   prefix ## suffix();\
   ^~
   include/linux/compiler.h:339:2: note: in expansion of macro 
'_compiletime_assert'
 _compiletime_assert(condition, msg, __compiletime_assert_, __LINE__)
 ^~~
   include/linux/build_bug.h:45:37: note: in expansion of macro 
'compiletime_assert'
#define BUILD_BUG_ON_MSG(cond, msg) compiletime_assert(!(cond), msg)
^~
   include/linux/build_bug.h:79:21: note: in expansion of

[PATCH v2 6/6] powerpc/pkeys: Deny read/write/execute by default

2018-06-13 Thread Ram Pai

Deny all permissions on all keys, with some exceptions.  pkey-0 must
allow all permissions, or else everything comes to a screaching halt.
Execute-only key must allow execute permission.

Signed-off-by: Ram Pai 
---
 arch/powerpc/mm/pkeys.c |   10 --
 1 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/mm/pkeys.c b/arch/powerpc/mm/pkeys.c
index 9098605..cec990c 100644
--- a/arch/powerpc/mm/pkeys.c
+++ b/arch/powerpc/mm/pkeys.c
@@ -128,13 +128,11 @@ int pkey_initialize(void)
 
/* register mask is in BE format */
pkey_amr_mask = ~0x0ul;
-   pkey_iamr_mask = ~0x0ul;
+   pkey_amr_mask &= ~(0x3ul << pkeyshift(PKEY_0));
 
-   for (i = 0; i < (pkeys_total - os_reserved); i++) {
-   pkey_amr_mask &= ~(0x3ul << pkeyshift(i));
-   pkey_iamr_mask &= ~(0x1ul << pkeyshift(i));
-   }
-   pkey_amr_mask |= (AMR_RD_BIT|AMR_WR_BIT) << pkeyshift(EXECUTE_ONLY_KEY);
+   pkey_iamr_mask = ~0x0ul;
+   pkey_iamr_mask &= ~(0x3ul << pkeyshift(PKEY_0));
+   pkey_iamr_mask &= ~(0x3ul << pkeyshift(EXECUTE_ONLY_KEY));
 
pkey_uamor_mask = ~0x0ul;
pkey_uamor_mask &= ~(0x3ul << pkeyshift(PKEY_0));
-- 
1.7.1

[PATCH v2 5/6] powerpc/pkeys: make protection key 0 less special

2018-06-13 Thread Ram Pai

Applications need the ability to associate an address-range with some
key and latter revert to its initial default key. Pkey-0 comes close to
providing this function but falls short, because the current
implementation disallows applications to explicitly associate pkey-0 to
the address range.

Lets make pkey-0 less special and treat it almost like any other key.
Thus it can be explicitly associated with any address range, and can be
freed. This gives the application more flexibility and power.  The
ability to free pkey-0 must be used responsibily, since pkey-0 is
associated with almost all address-range by default.

Even with this change pkey-0 continues to be slightly more special
from the following point of view.
(a) it is implicitly allocated.
(b) it is the default key assigned to any address-range.
(c) its permissions cannot be modified by userspace.

NOTE: (c) is specific to powerpc only. pkey-0 is associated by default
with all pages including kernel pages, and pkeys are also active in
kernel mode. If any permission is denied on pkey-0, the kernel running
in the context of the application will be unable to operate.

Tested on powerpc.

cc: Thomas Gleixner 
cc: Dave Hansen 
cc: Michael Ellermen 
cc: Ingo Molnar 
cc: Andrew Morton 
cc: Thiago Jung Bauermann 
cc: Michal Such谩nek 
---
History:
v5: . no changes since version.

v4: . introduced PKEY_0 macro.  No bug fixes. Code
re-arrangement to save a few cycles.

v3: . Corrected a comment in arch_set_user_pkey_access().  .
Clarified the header, to capture the notion that pkey-0
permissions cannot be modified by userspace on powerpc.
-- comment from Thiago

v2: . mm_pkey_is_allocated() continued to treat pkey-0 special.
fixed it.
---
 arch/powerpc/include/asm/pkeys.h |   29 +++--
 arch/powerpc/mm/pkeys.c  |   13 ++---
 2 files changed, 29 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/include/asm/pkeys.h b/arch/powerpc/include/asm/pkeys.h
index 5ba80cf..c824528 100644
--- a/arch/powerpc/include/asm/pkeys.h
+++ b/arch/powerpc/include/asm/pkeys.h
@@ -13,7 +13,10 @@
 
 DECLARE_STATIC_KEY_TRUE(pkey_disabled);
 extern int pkeys_total; /* total pkeys as per device tree */
-extern u32 initial_allocation_mask; /* bits set for reserved keys */
+extern u32 initial_allocation_mask; /*  bits set for the initially allocated 
keys */
+extern u32 reserved_allocation_mask; /* bits set for reserved keys */
+
+#define PKEY_0 0
 
 #define ARCH_VM_PKEY_FLAGS (VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2 | \
VM_PKEY_BIT3 | VM_PKEY_BIT4)
@@ -83,15 +86,19 @@ static inline u16 pte_to_pkey_bits(u64 pteflags)
 #define __mm_pkey_is_allocated(mm, pkey)   \
(mm_pkey_allocation_map(mm) & pkey_alloc_mask(pkey))
 
-#define __mm_pkey_is_reserved(pkey) (initial_allocation_mask & \
+#define __mm_pkey_is_reserved(pkey) (reserved_allocation_mask & \
   pkey_alloc_mask(pkey))
 
 static inline bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey)
 {
-   /* A reserved key is never considered as 'explicitly allocated' */
-   return ((pkey < arch_max_pkey()) &&
-   !__mm_pkey_is_reserved(pkey) &&
-   __mm_pkey_is_allocated(mm, pkey));
+   if (pkey < 0 || pkey >= arch_max_pkey())
+   return false;
+
+   /* Reserved keys are never allocated. */
+   if (__mm_pkey_is_reserved(pkey))
+   return false;
+
+   return __mm_pkey_is_allocated(mm, pkey);
 }
 
 extern void __arch_activate_pkey(int pkey);
@@ -187,6 +194,16 @@ static inline int arch_set_user_pkey_access(struct 
task_struct *tsk, int pkey,
 {
if (static_branch_likely(_disabled))
return -EINVAL;
+
+   /*
+* userspace should not change pkey-0 permissions.
+* pkey-0 is associated with every page in the kernel.
+* If userspace denies any permission on pkey-0, the
+* kernel cannot operate.
+*/
+   if (pkey == PKEY_0)
+   return init_val ? -EINVAL : 0;
+
return __arch_set_user_pkey_access(tsk, pkey, init_val);
 }
 
diff --git a/arch/powerpc/mm/pkeys.c b/arch/powerpc/mm/pkeys.c
index 1f2389f..9098605 100644
--- a/arch/powerpc/mm/pkeys.c
+++ b/arch/powerpc/mm/pkeys.c
@@ -14,7 +14,8 @@
 bool pkey_execute_disable_supported;
 int  pkeys_total;  /* Total pkeys as per device tree */
 bool pkeys_devtree_defined;/* pkey property exported by device tree */
-u32  initial_allocation_mask;  /* Bits set for reserved keys */
+u32  initial_allocation_mask;   /* Bits set for the initially allocated keys */
+u32  reserved_allocation_mask;  /* Bits set for reserved keys */
 u64  pkey_amr_mask;/* Bits in AMR not to be touched */
 u64  pkey_iamr_mask;   /* Bits in AMR not to be touched

[PATCH v2 4/6] powerpc/pkeys: Preallocate execute-only key

2018-06-13 Thread Ram Pai

execute-only key is allocated dynamically. This is a problem.  When a
thread implicitly creates a execute-only key, and resets UAMOR for that
key, the UAMOR value does not percolate to all the other threads.  Any
other thread may ignorantly change the permissions on the key.  This can
cause the key to be not execute-only for that thread.

Preallocate the execute-only key and ensure that no thread can change
the permission of the key, by resetting the corresponding bit in UAMOR.

CC: Andy Lutomirski 
CC: Florian Weimer 
CC: Thiago Jung Bauermann 
CC: Michael Ellerman 
Signed-off-by: Ram Pai 
---
 arch/powerpc/mm/pkeys.c |   53 +++---
 1 files changed, 8 insertions(+), 45 deletions(-)

diff --git a/arch/powerpc/mm/pkeys.c b/arch/powerpc/mm/pkeys.c
index b681bec..1f2389f 100644
--- a/arch/powerpc/mm/pkeys.c
+++ b/arch/powerpc/mm/pkeys.c
@@ -25,6 +25,7 @@
 #define IAMR_EX_BIT 0x1UL
 #define PKEY_REG_BITS (sizeof(u64)*8)
 #define pkeyshift(pkey) (PKEY_REG_BITS - ((pkey+1) * AMR_BITS_PER_PKEY))
+#define EXECUTE_ONLY_KEY 2
 
 static void scan_pkey_feature(void)
 {
@@ -120,7 +121,8 @@ int pkey_initialize(void)
 #else
os_reserved = 0;
 #endif
-   initial_allocation_mask  = (0x1 << 0) | (0x1 << 1);
+   initial_allocation_mask  = (0x1 << 0) | (0x1 << 1) |
+   (0x1 << EXECUTE_ONLY_KEY);
 
/* register mask is in BE format */
pkey_amr_mask = ~0x0ul;
@@ -130,9 +132,12 @@ int pkey_initialize(void)
pkey_amr_mask &= ~(0x3ul << pkeyshift(i));
pkey_iamr_mask &= ~(0x1ul << pkeyshift(i));
}
+   pkey_amr_mask |= (AMR_RD_BIT|AMR_WR_BIT) << pkeyshift(EXECUTE_ONLY_KEY);
 
pkey_uamor_mask = ~0x0ul;
pkey_uamor_mask &= ~(0x3ul << pkeyshift(0));
+   pkey_uamor_mask &= ~(0x3ul << pkeyshift(EXECUTE_ONLY_KEY));
+
for (i = (pkeys_total - os_reserved); i < pkeys_total; i++)
pkey_uamor_mask &= ~(0x3ul << pkeyshift(i));
 
@@ -146,8 +151,7 @@ void pkey_mm_init(struct mm_struct *mm)
if (static_branch_likely(_disabled))
return;
mm_pkey_allocation_map(mm) = initial_allocation_mask;
-   /* -1 means unallocated or invalid */
-   mm->context.execute_only_pkey = -1;
+   mm->context.execute_only_pkey = EXECUTE_ONLY_KEY;
 }
 
 static inline u64 read_amr(void)
@@ -326,48 +330,7 @@ static inline bool pkey_allows_readwrite(int pkey)
 
 int __execute_only_pkey(struct mm_struct *mm)
 {
-   bool need_to_set_mm_pkey = false;
-   int execute_only_pkey = mm->context.execute_only_pkey;
-   int ret;
-
-   /* Do we need to assign a pkey for mm's execute-only maps? */
-   if (execute_only_pkey == -1) {
-   /* Go allocate one to use, which might fail */
-   execute_only_pkey = mm_pkey_alloc(mm);
-   if (execute_only_pkey < 0)
-   return -1;
-   need_to_set_mm_pkey = true;
-   }
-
-   /*
-* We do not want to go through the relatively costly dance to set AMR
-* if we do not need to. Check it first and assume that if the
-* execute-only pkey is readwrite-disabled than we do not have to set it
-* ourselves.
-*/
-   if (!need_to_set_mm_pkey && !pkey_allows_readwrite(execute_only_pkey))
-   return execute_only_pkey;
-
-   /*
-* Set up AMR so that it denies access for everything other than
-* execution.
-*/
-   ret = __arch_set_user_pkey_access(current, execute_only_pkey,
- PKEY_DISABLE_ACCESS |
- PKEY_DISABLE_WRITE);
-   /*
-* If the AMR-set operation failed somehow, just return 0 and
-* effectively disable execute-only support.
-*/
-   if (ret) {
-   mm_pkey_free(mm, execute_only_pkey);
-   return -1;
-   }
-
-   /* We got one, store it and use it from here on out */
-   if (need_to_set_mm_pkey)
-   mm->context.execute_only_pkey = execute_only_pkey;
-   return execute_only_pkey;
+   return mm->context.execute_only_pkey;
 }
 
 static inline bool vma_is_pkey_exec_only(struct vm_area_struct *vma)
-- 
1.7.1

[PATCH v2 3/6] powerpc/pkeys: fix calculation of total pkeys.

2018-06-13 Thread Ram Pai

Total number of pkeys calculation is off by 1. Fix it.

CC: Florian Weimer 
CC: Michael Ellerman 
CC: Thiago Jung Bauermann 
Signed-off-by: Ram Pai 
---
 arch/powerpc/mm/pkeys.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/arch/powerpc/mm/pkeys.c b/arch/powerpc/mm/pkeys.c
index 6529f4e..b681bec 100644
--- a/arch/powerpc/mm/pkeys.c
+++ b/arch/powerpc/mm/pkeys.c
@@ -92,7 +92,7 @@ int pkey_initialize(void)
 * arch-neutral code.
 */
pkeys_total = min_t(int, pkeys_total,
-   (ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT));
+   ((ARCH_VM_PKEY_FLAGS >> VM_PKEY_SHIFT)+1));
 
if (!pkey_mmu_enabled() || radix_enabled() || !pkeys_total)
static_branch_enable(_disabled);
-- 
1.7.1

[PATCH v2 2/6] powerpc/pkeys: Save the pkey registers before fork

2018-06-13 Thread Ram Pai

When a thread forks the contents of AMR, IAMR, UAMOR registers in the
newly forked thread are not inherited.

Save the registers before forking, for content of those
registers to be automatically copied into the new thread.

CC: Michael Ellerman 
CC: Florian Weimer 
CC: Andy Lutomirski 
CC: Thiago Jung Bauermann 
Signed-off-by: Ram Pai 
---
 arch/powerpc/kernel/process.c |1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 9ef4aea..991d097 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -583,6 +583,7 @@ static void save_all(struct task_struct *tsk)
__giveup_spe(tsk);
 
msr_check_and_clear(msr_all_available);
+   thread_pkey_regs_save(>thread);
 }
 
 void flush_all_to_thread(struct task_struct *tsk)
-- 
1.7.1

[PATCH v2 1/6] powerpc/pkeys: Enable all user-allocatable pkeys at init.

2018-06-13 Thread Ram Pai

In a multithreaded application, a key allocated by one thread must
be activate and usable on all threads.

Currently this is not the case, because the UAMOR bits for all keys are
disabled by default. When a new key is allocated in one thread, though
the corresponding UAMOR bits for that thread get enabled, the UAMOR bits
for all other existing threads continue to have their bits disabled.
Other threads have no way to set permissions on the key, effectively
making the key useless.

Enable the UAMOR bits for all keys, at process creation. Since the
contents of UAMOR are inherited at fork, all threads are capable of
modifying the permissions on any key.

BTW: changing the permission on unallocated keys has no effect, till
those keys are not associated with any PTEs. The kernel will anyway
disallow to association of unallocated keys with PTEs.

CC: Andy Lutomirski 
CC: Florian Weimer 
CC: Thiago Jung Bauermann 
CC: Michael Ellerman 
Signed-off-by: Ram Pai 
---
 arch/powerpc/mm/pkeys.c |   40 ++--
 1 files changed, 22 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/mm/pkeys.c b/arch/powerpc/mm/pkeys.c
index e6f500f..6529f4e 100644
--- a/arch/powerpc/mm/pkeys.c
+++ b/arch/powerpc/mm/pkeys.c
@@ -15,8 +15,9 @@
 int  pkeys_total;  /* Total pkeys as per device tree */
 bool pkeys_devtree_defined;/* pkey property exported by device tree */
 u32  initial_allocation_mask;  /* Bits set for reserved keys */
-u64  pkey_amr_uamor_mask;  /* Bits in AMR/UMOR not to be touched */
+u64  pkey_amr_mask;/* Bits in AMR not to be touched */
 u64  pkey_iamr_mask;   /* Bits in AMR not to be touched */
+u64  pkey_uamor_mask;  /* Bits in UMOR not to be touched */
 
 #define AMR_BITS_PER_PKEY 2
 #define AMR_RD_BIT 0x1UL
@@ -119,20 +120,22 @@ int pkey_initialize(void)
 #else
os_reserved = 0;
 #endif
-   initial_allocation_mask = ~0x0;
-   pkey_amr_uamor_mask = ~0x0ul;
+   initial_allocation_mask  = (0x1 << 0) | (0x1 << 1);
+
+   /* register mask is in BE format */
+   pkey_amr_mask = ~0x0ul;
pkey_iamr_mask = ~0x0ul;
-   /*
-* key 0, 1 are reserved.
-* key 0 is the default key, which allows read/write/execute.
-* key 1 is recommended not to be used. PowerISA(3.0) page 1015,
-* programming note.
-*/
-   for (i = 2; i < (pkeys_total - os_reserved); i++) {
-   initial_allocation_mask &= ~(0x1 << i);
-   pkey_amr_uamor_mask &= ~(0x3ul << pkeyshift(i));
+
+   for (i = 0; i < (pkeys_total - os_reserved); i++) {
+   pkey_amr_mask &= ~(0x3ul << pkeyshift(i));
pkey_iamr_mask &= ~(0x1ul << pkeyshift(i));
}
+
+   pkey_uamor_mask = ~0x0ul;
+   pkey_uamor_mask &= ~(0x3ul << pkeyshift(0));
+   for (i = (pkeys_total - os_reserved); i < pkeys_total; i++)
+   pkey_uamor_mask &= ~(0x3ul << pkeyshift(i));
+
return 0;
 }
 
@@ -289,9 +292,6 @@ void thread_pkey_regs_restore(struct thread_struct 
*new_thread,
if (static_branch_likely(_disabled))
return;
 
-   /*
-* TODO: Just set UAMOR to zero if @new_thread hasn't used any keys yet.
-*/
if (old_thread->amr != new_thread->amr)
write_amr(new_thread->amr);
if (old_thread->iamr != new_thread->iamr)
@@ -305,9 +305,13 @@ void thread_pkey_regs_init(struct thread_struct *thread)
if (static_branch_likely(_disabled))
return;
 
-   thread->amr = read_amr() & pkey_amr_uamor_mask;
-   thread->iamr = read_iamr() & pkey_iamr_mask;
-   thread->uamor = read_uamor() & pkey_amr_uamor_mask;
+   thread->amr = pkey_amr_mask;
+   thread->iamr = pkey_iamr_mask;
+   thread->uamor = pkey_uamor_mask;
+
+   write_uamor(pkey_uamor_mask);
+   write_amr(pkey_amr_mask);
+   write_iamr(pkey_iamr_mask);
 }
 
 static inline bool pkey_allows_readwrite(int pkey)
-- 
1.7.1

[PATCH v2 0/6] powerpc/pkeys: fixes to pkeys

2018-06-13 Thread Ram Pai

Assortment of fixes to pkey.

Patch 1  makes pkey consumable in multithreaded applications.

Patch 2  fixes fork behavior to inherit the key attributes.

Patch 3  A off-by-one bug made one key unusable. Fixes it.

Patch 4  Execute-only key is preallocated.

Patch 5  Makes pkey-0 less special.

Patch 6  Deny by default permissions on all unallocated keys.

Passes all selftests on powerpc. Also behavior verified to be correct
by Florian.

Changelog:

v2: . fixed merge conflict with upstream code.
. Add patch 6. Makes the behavior consistent
  with that on x86.

Ram Pai (6):
  powerpc/pkeys: Enable all user-allocatable pkeys at init.
  powerpc/pkeys: Save the pkey registers before fork
  powerpc/pkeys: fix calculation of total pkeys.
  powerpc/pkeys: Preallocate execute-only key
  powerpc/pkeys: make protection key 0 less special
  powerpc/pkeys: Deny read/write/execute by default

 arch/powerpc/include/asm/pkeys.h |   29 +--
 arch/powerpc/kernel/process.c|1 +
 arch/powerpc/mm/pkeys.c  |  102 -
 3 files changed, 57 insertions(+), 75 deletions(-)

Re: [RFC,00/12] Deal with TM on kernel entry and exit

2018-06-13 Thread Breno Leitao

Hi Cyril,

On 02/19/2018 09:22 PM, Cyril Bur wrote:
> This is very much a proof of concept and if it isn't clear from the
> commit names, still a work in progress.

> I believe I have something that works - all the powerpc selftests
> pass. I would like to get some eyes on it to a) see if I've missed
> anything big and b) some opinions on if it is looking like a net
> improvement.

I started to look at this patchset. The patchset apply cleanly on top of the
current kernel and I started to run some tests.

It seems there is a different behavior when there is a trap (as a illegal
instruction or a 'trap' instruction) inside the transaction.

In this case, the signal handler does not seem to be called, and and the task
segfaults. On current upstream, the signal handler is called and the program
can continue.

4.17 pristine
-
$ ./illegal
Failure

4.17 plus your patches
--
$ ./illegal
[1]2504 segmentation fault  ./illegal

Here is a minimal example that is able to recreate this behaviour:

#include 
#include 

int htm(){
asm goto ("tbegin.  \n\t"
  "beq %l[failure]  \n\t"
  "li 3, 3  \n\t"
  "trap \n\t"
  "tend.\n\t"
  : : : : failure);

return 0;
failure:
printf("Failure\n");
return 1;
}

void signal_handler(int signo, siginfo_t *si, void *data) {
// Do nothing
}

int main(){
struct sigaction sa;
sa.sa_flags = SA_SIGINFO;
sa.sa_sigaction = signal_handler;

sigaction(SIGTRAP, , NULL);
sigaction(SIGILL,  , NULL);

return htm();
}

powerpc: use time64_t in read_persistent_clock

2018-06-13 Thread Mathieu Malaterre

Arnd,

In 5bfd643583b2e I can see that you changed:

$ git show 5bfd643583b2e -- arch/powerpc/platforms/powermac/time.c
[...]
 #ifdef CONFIG_ADB_PMU
-static unsigned long pmu_get_time(void)
+static time64_t pmu_get_time(void)
 {
struct adb_request req;
-   unsigned int now;
+   time64_t now;

if (pmu_request(, NULL, 1, PMU_READ_RTC) < 0)
return 0;
@@ -160,10 +151,10 @@ static unsigned long pmu_get_time(void)
   req.reply_len);
now = (req.reply[0] << 24) + (req.reply[1] << 16)
+ (req.reply[2] << 8) + req.reply[3];
-   return ((unsigned long)now) - RTC_OFFSET;
+   return now - RTC_OFFSET;
 }
[...]

As far as I can tell the old function would never return a negative
value and rely on unsigned long roll over. Now I am seeing negative
value being returned and it seems to break later on in the rtc
library.

Could you comment why you removed the cast to (unsigned long) in your commit ?

Thanks

Re: [PATCH] powerpc/64s: Report SLB multi-hit rather than parity error

2018-06-13 Thread Nicholas Piggin

On Wed, 13 Jun 2018 23:24:14 +1000
Michael Ellerman  wrote:

> When we take an SLB multi-hit on bare metal, we see both the multi-hit
> and parity error bits set in DSISR. The user manuals indicates this is
> expected to always happen on Power8, whereas on Power9 it says a
> multi-hit will "usually" also cause a parity error.
> 
> We decide what to do based on the various error tables in mce_power.c,
> and because we process them in order and only report the first, we
> currently always report a parity error but not the multi-hit, eg:
> 
>   Severe Machine check interrupt [Recovered]
> Initiator: CPU
> Error type: SLB [Parity]
>   Effective address: c00d4300
> 
> Although this is correct, it leaves the user wondering why they got a
> parity error. It would be clearer instead if we reported the
> multi-hit because that is more likely to be simply a software bug,
> whereas a true parity error is possibly an indication of a bad core.
> 
> We can do that simply by reordering the error tables so that multi-hit
> appears before parity. That doesn't affect the error recovery at all,
> because we flush the SLB either way.

Yeah this is a good idea. I wonder if there are any other conditions
like this that should be reordered.

I think the i-side should not have to be changed here because it
matches the value not bits, so that shouldn't matter.

A bit of a shame we don't report i/d side, and ideally we'd be able
to report multiple conditions. The reporting APIs really want to be
massaged a bit, but for now this is a good step.

Reviewed-by: Nicholas Piggin 


> 
> Signed-off-by: Michael Ellerman 
> ---
>  arch/powerpc/kernel/mce_power.c | 36 ++--
>  1 file changed, 18 insertions(+), 18 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
> index 38c5b4764bfe..1e450d0c4f72 100644
> --- a/arch/powerpc/kernel/mce_power.c
> +++ b/arch/powerpc/kernel/mce_power.c
> @@ -140,12 +140,12 @@ static const struct mce_ierror_table 
> mce_p7_ierror_table[] = {
>  { 0x001c, 0x0004, true,
>MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_IFETCH,
>MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
> +{ 0x001c, 0x000c, true,
> +  MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT,/* Before PARITY */
> +  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
>  { 0x001c, 0x0008, true,
>MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY,
>MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
> -{ 0x001c, 0x000c, true,
> -  MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT,
> -  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
>  { 0x001c, 0x0010, true,
>MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_INDETERMINATE, /* BOTH */
>MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
> @@ -164,12 +164,12 @@ static const struct mce_ierror_table 
> mce_p8_ierror_table[] = {
>  { 0x081c, 0x0004, true,
>MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_IFETCH,
>MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
> +{ 0x081c, 0x000c, true,
> +  MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT,/* Before PARITY */
> +  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
>  { 0x081c, 0x0008, true,
>MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY,
>MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
> -{ 0x081c, 0x000c, true,
> -  MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT,
> -  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
>  { 0x081c, 0x0010, true,
>MCE_ERROR_TYPE_ERAT,MCE_ERAT_ERROR_MULTIHIT,
>MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
> @@ -194,12 +194,12 @@ static const struct mce_ierror_table 
> mce_p9_ierror_table[] = {
>  { 0x081c, 0x0004, true,
>MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_IFETCH,
>MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
> +{ 0x081c, 0x000c, true,
> +  MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT,/* Before PARITY */
> +  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
>  { 0x081c, 0x0008, true,
>MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY,
>MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
> -{ 0x081c, 0x000c, true,
> -  MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT,
> -  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
>  { 0x081c, 0x0010, true,
>MCE_ERROR_TYPE_ERAT,MCE_ERAT_ERROR_MULTIHIT,
>MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
> @@ -257,12 +257,12 @@ static const struct mce_derror_table 
> mce_p7_derror_table[] = {
>  { 0x0400, true,
>MCE_ERROR_TYPE_TLB,  MCE_TLB_ERROR_MULTIHIT,
>MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
> +{ 0x0080, true,
> +  MCE_ERROR_TYPE_SLB,  MCE_SLB_ERROR_MULTIHIT,   /* Before PARITY */
> +  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
>  { 0x0100, true,
>MCE_ERROR_TYPE_SLB,  MCE_SLB_ERROR_PARITY,
>

Re: [PATCH] powerpc/64s: Fix DT CPU features Power9 DD2.1 logic

2018-06-13 Thread Nicholas Piggin

On Wed, 13 Jun 2018 23:23:56 +1000
Michael Ellerman  wrote:

> In the device tree CPU features quirk code we want to set
> CPU_FTR_POWER9_DD2_1 on all Power9s that aren't DD2.0 or earlier. But
> we got the logic wrong and instead set it on all CPUs that aren't
> Power9 DD2.0 or earlier, ie. including Power8.
> 
> Fix it by making sure we're on a Power9. This isn't a bug in practice
> because the only code that checks the feature is Power9 only to begin
> with. But we'll backport it anyway to avoid confusion.
> 
> Fixes: 9e9626ed3a4a ("powerpc/64s: Fix POWER9 DD2.2 and above in DT CPU 
> features")
> Cc: sta...@vger.kernel.org # v4.17+
> Reported-by: Paul Mackerras 
> Signed-off-by: Michael Ellerman 

Acked-by: Nicholas Piggin 

> ---
>  arch/powerpc/kernel/dt_cpu_ftrs.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c 
> b/arch/powerpc/kernel/dt_cpu_ftrs.c
> index 4be1c0de9406..96dd3d871986 100644
> --- a/arch/powerpc/kernel/dt_cpu_ftrs.c
> +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c
> @@ -711,7 +711,8 @@ static __init void cpufeatures_cpu_quirks(void)
>   cur_cpu_spec->cpu_features |= CPU_FTR_P9_TM_HV_ASSIST;
>   cur_cpu_spec->cpu_features |= CPU_FTR_P9_TM_XER_SO_BUG;
>   cur_cpu_spec->cpu_features |= CPU_FTR_POWER9_DD2_1;
> - } else /* DD2.1 and up have DD2_1 */
> + } else if ((version & 0x) == 0x004e)
> + /* DD2.1 and up have DD2_1 */
>   cur_cpu_spec->cpu_features |= CPU_FTR_POWER9_DD2_1;
>  
>   if ((version & 0x) == 0x004e) {

Re: [RFC V2] virtio: Add platform specific DMA API translation for virito devices

2018-06-13 Thread Michael S. Tsirkin

On Mon, Jun 11, 2018 at 01:34:50PM +1000, Benjamin Herrenschmidt wrote:
> On Mon, 2018-06-11 at 06:28 +0300, Michael S. Tsirkin wrote:
> > 
> > > However if the administrator
> > > ignores/forgets/deliberatey-decides/is-constrained to NOT enable the
> > > flag, virtio will not be able to pass control to the DMA ops associated
> > > with the virtio devices. Which means, we have no opportunity to share
> > > the I/O buffers with the hypervisor/qemu.
> > > 
> > > How do you suggest, we handle this case?
> > 
> > As step 1, ignore it as a user error.
> 
> Ugh ... not again. Ram, don't bring that subject back we ALREADY
> addressed it, and requiring the *user* to do special things is just
> utterly and completely wrong.
> 
> The *user* has no bloody idea what that stuff is, will never know to
> set whatver magic qemu flag etc... The user will just start a a VM
> normally and expect things to work. Requiring the *user* to know things
> like that iommu virtio flag is complete nonsense.

You should consider teaching QEMU that the platform has support for the
ultravisor thing so it can set flags for you.

That's true even if something else is done for virtio - if you don't
have the capability right now you will come to regret it, host userspace
is just fundamentally in charge of control-path enumeration in the KVM
stack, I understand you want to take some of it away for security but
trying to hide things from QEMU completely is a mistake IMHO.

Just my $.02.

> If by "user" you mean libvirt, then you are now requesting about 4 or 5
> different projects to be patched to add speical cases for something
> they know nothing about and is completely irrelevant, while it can be
> entirely addressed with a 1-liner in virtio kernel side to allow the
> arch to plumb alternate DMA ops.
> 
> So for some reason you seem to be dead set on a path that leads to
> mountain of user pain, changes to many different projects and overall
> havok while there is a much much simpler and elegant solution at hand
> which I described (again) in the response to Ram I sent about 5mn ago.

What you sent to Ram sounds OK to me - I only hope we do all mean
the same thing because the 3 paragraphs above are all
about the libvirt/QEMU split mess which linux or virtio
should not really care about.

And I'd like to stress that direct path isn't merely legacy.

It's a common sense approach that when you have a hypervisor doing
things like taking care of cache coherency, you do not want to do them
in the guest as well. And the same guest can use a pass-through device
where it does need to do all the coherency mess, so while it's quite
possible to build a platform-specific way to tell guests which devices
need which kind of treatment, people understandably chose to do it in a
portable way through the virtio device.

I understand you guys are working on a new project that is going to use
bounce buffers anyway so what do you care about optimizations but we
can't just slow everyone else down for your benefit.

> > Further you can for example add per-device quirks in virtio so it can be
> > switched to dma api. make extra decisions in platform code then.

Isn't above exactly what you sent to Ram?

-- 
MST

Re: [RFC V2] virtio: Add platform specific DMA API translation for virito devices

2018-06-13 Thread Michael S. Tsirkin

On Mon, Jun 11, 2018 at 01:29:18PM +1000, Benjamin Herrenschmidt wrote:
> On Sun, 2018-06-10 at 19:39 -0700, Ram Pai wrote:
> > 
> > However if the administrator
> > ignores/forgets/deliberatey-decides/is-constrained to NOT enable the
> > flag, virtio will not be able to pass control to the DMA ops associated
> > with the virtio devices. Which means, we have no opportunity to share
> > the I/O buffers with the hypervisor/qemu.
> > 
> > How do you suggest, we handle this case?
> 
> At the risk of repeating myself, let's just do the first pass which is
> to switch virtio over to always using the DMA API in the actual data
> flow code, with a hook at initialization time that replaces the DMA ops
> with some home cooked "direct" ops in the case where the IOMMU flag
> isn't set.

I'm not sure I understand all of the details, will have to
see the patch, but superficially it sounds good to me.

> This will be equivalent to what we have today but avoids having 2
> separate code path all over the driver.
> 
> Then a second stage, I think, is to replace this "hook" so that the
> architecture gets a say in the matter.
> 
> Basically, something like:
> 
>   arch_virtio_update_dma_ops(pci_dev, qemu_direct_mode).
> 
> IE, virtio would tell the arch whether the "other side" is in fact QEMU
> in a mode that bypasses the IOMMU and is cache coherent with the guest.
> This is our legacy "qemu special" mode. If the performance is
> sufficient we may want to deprecate it over time and have qemu enable
> the iommu by default but we still need it.
> 
> A weak implementation of the above will be provied that just puts in
> the direct ops when qemu_direct_mode is set, and thus provides today's
> behaviour on any arch that doesn't override it. If the flag is not set,
> the ops are left to whatever the arch PCI layer already set.
> 
> This will provide the opportunity for architectures that want to do
> something special, such as in our case, when we want to force even the
> "qemu_direct_mode" to go via bounce buffers, to put our own ops in
> there, while retaining the legacy behaviour otherwise.
> 
> It also means that the "gunk" is entirely localized in that one
> function, the rest of virtio just uses the DMA API normally.
> 
> Christoph, are you actually hacking "stage 1" above already or should
> we produce patches ?
> 
> Cheers,
> Ben.

Re: [RFC V2] virtio: Add platform specific DMA API translation for virito devices

2018-06-13 Thread Michael S. Tsirkin

On Wed, Jun 13, 2018 at 12:41:41AM -0700, Christoph Hellwig wrote:
> On Mon, Jun 11, 2018 at 01:29:18PM +1000, Benjamin Herrenschmidt wrote:
> > At the risk of repeating myself, let's just do the first pass which is
> > to switch virtio over to always using the DMA API in the actual data
> > flow code, with a hook at initialization time that replaces the DMA ops
> > with some home cooked "direct" ops in the case where the IOMMU flag
> > isn't set.
> > 
> > This will be equivalent to what we have today but avoids having 2
> > separate code path all over the driver.
> > 
> > Then a second stage, I think, is to replace this "hook" so that the
> > architecture gets a say in the matter.
> 
> I don't think we can actually use dma_direct_ops.  It still allows
> architectures to override parts of the dma setup, which virtio seems
> to blindly assume phys == dma and not cache flushing.
> 
> I think the right way forward is to either add a new
> VIRTIO_F_IS_PCI_DEVICE (or redefine the existing iommu flag if deemed
> possible).

Given this is exactly what happens now, this seems possible, but maybe
we want a non-PCI specific name.

>  And then make sure recent qemu always sets it.

I don't think that part is going to happen, sorry.

Hypervisors can set it when they *actually have* a real PCI device.
People emulate systems which have a bunch of overhead in the DMA API
which is required for real DMA. Your proposal would double that overhead
by first doing it in guest then re-doing it in host.

I don't think it's justified when 99% of the world doesn't need it.
-- 
MST

Re: [RFC V2] virtio: Add platform specific DMA API translation for virito devices

2018-06-13 Thread Michael S. Tsirkin

On Thu, Jun 07, 2018 at 11:36:55PM -0700, Christoph Hellwig wrote:
> > This seems to be what was being asked for in this thread,
> > with comments claiming IOMMU flag adds too much overhead.
> 
> Right now it means implementing a virtual iommu, which I agree is
> way too much overhead.

Well not really. The flag in question will have a desired effect without
a virtual iommu.

> > SEV guys report that they just set the iommu flag and then it all works.
> > I guess if there's translation we can think of this as a kind of iommu.
> > Maybe we should rename PLATFORM_IOMMU to PLARTFORM_TRANSLATION?
> 
> VIRTIO_F_BEHAVES_LIKE_A_REAL_PCI_DEVICE_DONT_TRY_TO_OUTSMART_ME
> 
> as said it's not just translations, it is cache coherence as well.

Well it's only for DMA. So maybe PLATFORM_DMA.

I suspect people will then come and complain that they
do *not* want cache coherence tricks because virtio is
running on a CPU, but we'll see.

> > And apparently some people complain that just setting that flag makes
> > qemu check translation on each access with an unacceptable performance
> > overhead.  Forcing same behaviour for everyone on general principles
> > even without the flag is unlikely to make them happy.
> 
> That sounds like a qemu implementation bug.  If qemu knowns that
> guest physiscall == guest dma space there is no need to check.

Possibly. Or it's possible it's all just theoretical, no one
posted any numbers.

-- 
MST

[PATCH] powerpc/64s: Report SLB multi-hit rather than parity error

2018-06-13 Thread Michael Ellerman

When we take an SLB multi-hit on bare metal, we see both the multi-hit
and parity error bits set in DSISR. The user manuals indicates this is
expected to always happen on Power8, whereas on Power9 it says a
multi-hit will "usually" also cause a parity error.

We decide what to do based on the various error tables in mce_power.c,
and because we process them in order and only report the first, we
currently always report a parity error but not the multi-hit, eg:

  Severe Machine check interrupt [Recovered]
Initiator: CPU
Error type: SLB [Parity]
  Effective address: c00d4300

Although this is correct, it leaves the user wondering why they got a
parity error. It would be clearer instead if we reported the
multi-hit because that is more likely to be simply a software bug,
whereas a true parity error is possibly an indication of a bad core.

We can do that simply by reordering the error tables so that multi-hit
appears before parity. That doesn't affect the error recovery at all,
because we flush the SLB either way.

Signed-off-by: Michael Ellerman 
---
 arch/powerpc/kernel/mce_power.c | 36 ++--
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index 38c5b4764bfe..1e450d0c4f72 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -140,12 +140,12 @@ static const struct mce_ierror_table 
mce_p7_ierror_table[] = {
 { 0x001c, 0x0004, true,
   MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_IFETCH,
   MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x001c, 0x000c, true,
+  MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT,  /* Before PARITY */
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
 { 0x001c, 0x0008, true,
   MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY,
   MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
-{ 0x001c, 0x000c, true,
-  MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT,
-  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
 { 0x001c, 0x0010, true,
   MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_INDETERMINATE, /* BOTH */
   MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
@@ -164,12 +164,12 @@ static const struct mce_ierror_table 
mce_p8_ierror_table[] = {
 { 0x081c, 0x0004, true,
   MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_IFETCH,
   MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x081c, 0x000c, true,
+  MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT,  /* Before PARITY */
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
 { 0x081c, 0x0008, true,
   MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY,
   MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
-{ 0x081c, 0x000c, true,
-  MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT,
-  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
 { 0x081c, 0x0010, true,
   MCE_ERROR_TYPE_ERAT,MCE_ERAT_ERROR_MULTIHIT,
   MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
@@ -194,12 +194,12 @@ static const struct mce_ierror_table 
mce_p9_ierror_table[] = {
 { 0x081c, 0x0004, true,
   MCE_ERROR_TYPE_UE,  MCE_UE_ERROR_IFETCH,
   MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
+{ 0x081c, 0x000c, true,
+  MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT,  /* Before PARITY */
+  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
 { 0x081c, 0x0008, true,
   MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY,
   MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
-{ 0x081c, 0x000c, true,
-  MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT,
-  MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
 { 0x081c, 0x0010, true,
   MCE_ERROR_TYPE_ERAT,MCE_ERAT_ERROR_MULTIHIT,
   MCE_INITIATOR_CPU,  MCE_SEV_ERROR_SYNC, },
@@ -257,12 +257,12 @@ static const struct mce_derror_table 
mce_p7_derror_table[] = {
 { 0x0400, true,
   MCE_ERROR_TYPE_TLB,  MCE_TLB_ERROR_MULTIHIT,
   MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x0080, true,
+  MCE_ERROR_TYPE_SLB,  MCE_SLB_ERROR_MULTIHIT, /* Before PARITY */
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
 { 0x0100, true,
   MCE_ERROR_TYPE_SLB,  MCE_SLB_ERROR_PARITY,
   MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
-{ 0x0080, true,
-  MCE_ERROR_TYPE_SLB,  MCE_SLB_ERROR_MULTIHIT,
-  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
 { 0x0040, true,
   MCE_ERROR_TYPE_SLB,  MCE_SLB_ERROR_INDETERMINATE, /* BOTH */
   MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
@@ -290,12 +290,12 @@ static const struct mce_derror_table 
mce_p8_derror_table[] = {
 { 0x0200, true,
   MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT, /* SECONDARY ERAT */
   MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
+{ 0x0080, true,
+  MCE_ERROR_TYPE_SLB,  MCE_SLB_ERROR_MULTIHIT, /* Before PARITY */
+  MCE_INITIATOR_CPU,   MCE_SEV_ERROR_SYNC, },
 { 0x0100, true,
   MCE_ERROR_TYPE_SLB,  MCE_SLB_ERROR_PARITY,

[PATCH] powerpc/64s: Fix DT CPU features Power9 DD2.1 logic

2018-06-13 Thread Michael Ellerman

In the device tree CPU features quirk code we want to set
CPU_FTR_POWER9_DD2_1 on all Power9s that aren't DD2.0 or earlier. But
we got the logic wrong and instead set it on all CPUs that aren't
Power9 DD2.0 or earlier, ie. including Power8.

Fix it by making sure we're on a Power9. This isn't a bug in practice
because the only code that checks the feature is Power9 only to begin
with. But we'll backport it anyway to avoid confusion.

Fixes: 9e9626ed3a4a ("powerpc/64s: Fix POWER9 DD2.2 and above in DT CPU 
features")
Cc: sta...@vger.kernel.org # v4.17+
Reported-by: Paul Mackerras 
Signed-off-by: Michael Ellerman 
---
 arch/powerpc/kernel/dt_cpu_ftrs.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c 
b/arch/powerpc/kernel/dt_cpu_ftrs.c
index 4be1c0de9406..96dd3d871986 100644
--- a/arch/powerpc/kernel/dt_cpu_ftrs.c
+++ b/arch/powerpc/kernel/dt_cpu_ftrs.c
@@ -711,7 +711,8 @@ static __init void cpufeatures_cpu_quirks(void)
cur_cpu_spec->cpu_features |= CPU_FTR_P9_TM_HV_ASSIST;
cur_cpu_spec->cpu_features |= CPU_FTR_P9_TM_XER_SO_BUG;
cur_cpu_spec->cpu_features |= CPU_FTR_POWER9_DD2_1;
-   } else /* DD2.1 and up have DD2_1 */
+   } else if ((version & 0x) == 0x004e)
+   /* DD2.1 and up have DD2_1 */
cur_cpu_spec->cpu_features |= CPU_FTR_POWER9_DD2_1;
 
if ((version & 0x) == 0x004e) {
-- 
2.14.1

Re: [RFC V2] virtio: Add platform specific DMA API translation for virito devices

2018-06-13 Thread Benjamin Herrenschmidt

On Wed, 2018-06-13 at 22:25 +1000, Benjamin Herrenschmidt wrote:
> On Wed, 2018-06-13 at 00:41 -0700, Christoph Hellwig wrote:
> > On Mon, Jun 11, 2018 at 01:29:18PM +1000, Benjamin Herrenschmidt wrote:
> > > At the risk of repeating myself, let's just do the first pass which is
> > > to switch virtio over to always using the DMA API in the actual data
> > > flow code, with a hook at initialization time that replaces the DMA ops
> > > with some home cooked "direct" ops in the case where the IOMMU flag
> > > isn't set.
> > > 
> > > This will be equivalent to what we have today but avoids having 2
> > > separate code path all over the driver.
> > > 
> > > Then a second stage, I think, is to replace this "hook" so that the
> > > architecture gets a say in the matter.
> > 
> > I don't think we can actually use dma_direct_ops.  It still allows
> > architectures to override parts of the dma setup, which virtio seems
> > to blindly assume phys == dma and not cache flushing.
> 
> By direct ops I didn't mean *the* dma_direct_ops but a virtio-local
> variants that effectively reproduces the existing expectations (ie,
> virtio-dma-legacy-ops or something).

Actually ... the stuff in lib/dma-direct.c seems to be just it, no ?

There's no cache flushing and there's no architecture hooks that I can
see other than the AMD security stuff which is probably fine.

Or am I missing something ?

Cheers,
Ben.
> 
> > I think the right way forward is to either add a new
> > VIRTIO_F_IS_PCI_DEVICE (or redefine the existing iommu flag if deemed
> > possible).  And then make sure recent qemu always sets it.
> 
> Ben.

Re: [patchV2 2/2] pci/shpchp: no claim on pcie port device

2018-06-13 Thread Bjorn Helgaas

On Wed, Jun 13, 2018 at 02:29:57PM +0800, Pingfan Liu wrote:
> The Linux Device Driver Model allows a physical device to be handled
> by only a single driver. But at present, both shpchp and portdrv_pci
> claim PCI_CLASS_BRIDGE_PCI, and touch devices_kset when the drivers are
> loaded. This causes a few problems, one is the wrong shutdown seq of
> devices, owing to the broken devices_kset. This patch keeps shpchp away
> from pcie port device, using the extra matching method.

As Christoph pointed out, something seems wrong if we add to
devices_kset even when the probe fails.  We will call shpc_probe() for
the device below, but there's no SHPC capability, so the probe should
fail when shpc_init() fails.

I do think the current structure where portdrv and shpchp are both
"drivers" that claim bridges is broken.  This has caused problems
before, e.g., some PCIe switches have performance counters, and a
driver for those switches ought to be able to claim the device and use
the counters, but portdrv is in the way.

I think it would be better if portdrv (and maybe shpchp; not sure
about that) were integrated into the PCIe core and did not register as
a driver.  But this is a big project, far beyond the scope of this
current issue.

> Note:
> I hit this bug on a Power9 machine, when "kexec -e", and see a ata-disk
> behind a bridge can not write back buffer in flight due to the former
> shutdown of the bridge which clears the BusMaster bit.
> 
> the device involved:
> 0004:00:00.0 PCI bridge: IBM Device 04c1 (prog-if 00 [Normal decode])
> Control: I/O+ Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- ParErr- 
> Stepping- SERR+ FastB2B- DisINTx-
> Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast >TAbort- 
> SERR-  Latency: 0
> NUMA node: 0
> Bus: primary=00, secondary=01, subordinate=12, sec-latency=0
> I/O behind bridge: -0fff
> Memory behind bridge: 8000-ffef
> Prefetchable memory behind bridge: 00060240-0006027f7fff
> Secondary status: 66MHz- FastB2B- ParErr- DEVSEL=fast >TAbort- 
>  BridgeCtl: Parity- SERR+ NoISA- VGA- MAbort- >Reset- FastB2B-
> PriDiscTmr- SecDiscTmr- DiscTmrStat- DiscTmrSERREn-
> Capabilities: [40] Power Management version 3
> Flags: PMEClk- DSI- D1- D2- AuxCurrent=0mA 
> PME(D0+,D1-,D2-,D3hot+,D3cold+)
> Status: D0 NoSoftRst- PME-Enable- DSel=0 DScale=0 PME-
> Capabilities: [48] Express (v2) Root Port (Slot-), MSI 00
> DevCap: MaxPayload 512 bytes, PhantFunc 0
> ExtTag- RBE+
> DevCtl: Report errors: Correctable- Non-Fatal- Fatal- 
> Unsupported-
> RlxdOrd- ExtTag- PhantFunc- AuxPwr- NoSnoop-
> MaxPayload 256 bytes, MaxReadReq 128 bytes
> DevSta: CorrErr- UncorrErr- FatalErr- UnsuppReq- AuxPwr- 
> TransPend-
> LnkCap: Port #0, Speed 8GT/s, Width x8, ASPM not supported, 
> Exit Latency L0s <64ns, L1 <1us
> ClockPM- Surprise- LLActRep+ BwNot+ ASPMOptComp-
> LnkCtl: ASPM Disabled; RCB 128 bytes Disabled- CommClk-
> ExtSynch- ClockPM- AutWidDis- BWInt- AutBWInt-
> LnkSta: Speed 8GT/s, Width x4, TrErr- Train- SlotClk- 
> DLActive+ BWMgmt- ABWMgmt+
> RootCtl: ErrCorrectable- ErrNon-Fatal- ErrFatal- PMEIntEna- 
> CRSVisible-
> RootCap: CRSVisible-
> RootSta: PME ReqID , PMEStatus- PMEPending-
> DevCap2: Completion Timeout: Range ABCD, TimeoutDis+, LTR-, 
> OBFF Not Supported ARIFwd+
> DevCtl2: Completion Timeout: 16ms to 55ms, TimeoutDis+, LTR-, 
> OBFF Disabled ARIFwd+
> LnkCtl2: Target Link Speed: 8GT/s, EnterCompliance- SpeedDis-
>  Transmit Margin: Normal Operating Range, 
> EnterModifiedCompliance- ComplianceSOS-
>  Compliance De-emphasis: -6dB
> LnkSta2: Current De-emphasis Level: -3.5dB, 
> EqualizationComplete+, EqualizationPhase1+
>  EqualizationPhase2+, EqualizationPhase3+, 
> LinkEqualizationRequest-
> Capabilities: [100 v1] Advanced Error Reporting
> UESta:  DLP- SDES- TLP- FCP- CmpltTO- CmpltAbrt- UnxCmplt- 
> RxOF- MalfTLP- ECRC- UnsupReq- ACSViol-
> UEMsk:  DLP- SDES- TLP+ FCP- CmpltTO+ CmpltAbrt+ UnxCmplt- 
> RxOF- MalfTLP- ECRC+ UnsupReq- ACSViol-
> UESvrt: DLP- SDES- TLP- FCP- CmpltTO- CmpltAbrt- UnxCmplt- 
> RxOF- MalfTLP- ECRC- UnsupReq- ACSViol-
> CESta:  RxErr- BadTLP- BadDLLP- Rollover- Timeout- 
> NonFatalErr-
> CEMsk:  RxErr- BadTLP- BadDLLP- Rollover- Timeout- 
> NonFatalErr-
> AERCap: First Error Pointer: 00, GenCap+ CGenEn+ ChkCap+ 
> ChkEn+
> Capabilities: [148 v1] #19
> 
> Signed-off-by:

Re: [RFC V2] virtio: Add platform specific DMA API translation for virito devices

2018-06-13 Thread Benjamin Herrenschmidt

On Wed, 2018-06-13 at 00:41 -0700, Christoph Hellwig wrote:
> On Mon, Jun 11, 2018 at 01:29:18PM +1000, Benjamin Herrenschmidt wrote:
> > At the risk of repeating myself, let's just do the first pass which is
> > to switch virtio over to always using the DMA API in the actual data
> > flow code, with a hook at initialization time that replaces the DMA ops
> > with some home cooked "direct" ops in the case where the IOMMU flag
> > isn't set.
> > 
> > This will be equivalent to what we have today but avoids having 2
> > separate code path all over the driver.
> > 
> > Then a second stage, I think, is to replace this "hook" so that the
> > architecture gets a say in the matter.
> 
> I don't think we can actually use dma_direct_ops.  It still allows
> architectures to override parts of the dma setup, which virtio seems
> to blindly assume phys == dma and not cache flushing.

By direct ops I didn't mean *the* dma_direct_ops but a virtio-local
variants that effectively reproduces the existing expectations (ie,
virtio-dma-legacy-ops or something).

> I think the right way forward is to either add a new
> VIRTIO_F_IS_PCI_DEVICE (or redefine the existing iommu flag if deemed
> possible).  And then make sure recent qemu always sets it.

Ben.

Re: [RFC PATCH 12/23] kernel/watchdog: Introduce a struct for NMI watchdog operations

2018-06-13 Thread Nicholas Piggin

On Wed, 13 Jun 2018 11:26:49 +0200 (CEST)
Thomas Gleixner  wrote:

> On Wed, 13 Jun 2018, Peter Zijlstra wrote:
> > On Wed, Jun 13, 2018 at 05:41:41PM +1000, Nicholas Piggin wrote:  
> > > On Tue, 12 Jun 2018 17:57:32 -0700
> > > Ricardo Neri  wrote:
> > >   
> > > > Instead of exposing individual functions for the operations of the NMI
> > > > watchdog, define a common interface that can be used across multiple
> > > > implementations.
> > > > 
> > > > The struct nmi_watchdog_ops is defined for such operations. These 
> > > > initial
> > > > definitions include the enable, disable, start, stop, and cleanup
> > > > operations.
> > > > 
> > > > Only a single NMI watchdog can be used in the system. The operations of
> > > > this NMI watchdog are accessed via the new variable nmi_wd_ops. This
> > > > variable is set to point the operations of the first NMI watchdog that
> > > > initializes successfully. Even though at this moment, the only available
> > > > NMI watchdog is the perf-based hardlockup detector. More implementations
> > > > can be added in the future.  
> > > 
> > > Cool, this looks pretty nice at a quick glance. sparc and powerpc at
> > > least have their own NMI watchdogs, it would be good to have those
> > > converted as well.  
> > 
> > Yeah, agreed, this looks like half a patch.  
> 
> Though I'm not seeing the advantage of it. That kind of NMI watchdogs are
> low level architecture details so having yet another 'ops' data structure
> with a gazillion of callbacks, checks and indirections does not provide
> value over the currently available weak stubs.

The other way to go of course is librify the perf watchdog and make an
x86 watchdog that selects between perf and hpet... I also probably
prefer that for code such as this, but I wouldn't strongly object to
ops struct if I'm not writing the code. It's not that bad is it?

Thanks,
Nick

Re: UBSAN: Undefined behaviour in ../include/linux/percpu_counter.h:137:13

2018-06-13 Thread Mathieu Malaterre

On Wed, Jun 13, 2018 at 10:43 AM Mathieu Malaterre  wrote:
>
> On Wed, Jun 13, 2018 at 3:43 AM Michael Ellerman  wrote:
> >
> > Mathieu Malaterre  writes:
> >
> > > Hi there,
> > >
> > > I have a reproducible UBSAN appearing in dmesg after a while on my G4
> > > (*). Could anyone suggest a way to diagnose the actual root issue here
> > > (or is it just a false positive) ?
> >
> > It looks like a real overflow, I guess the question is why are we seeing it.
> >
> > The first thing to work out would be what exactly is overflowing.
> >
> > Is it in here?
> >
> > cfqg_stats_update_completion(cfqq->cfqg, rq->start_time_ns,
> >  rq->io_start_time_ns, rq->cmd_flags);
> >
> >
> > If so that would suggest something is taking multiple hours to complete,
> > which seems unlikely. Is time going backward?
>
> There is also something suspicious in the kern.log file:
>
> Jun 12 20:09:04 debian kernel: [5.504182]
> 
> Jun 12 20:09:04 debian kernel: [5.508945] UBSAN: Undefined
> behaviour in ../drivers/rtc/rtc-lib.c:87:22
> Jun 12 20:09:04 debian kernel: [5.513658] signed integer overflow:
> Jun 12 20:09:04 debian kernel: [5.518211] 1193024 * 3600 cannot be
> represented in type 'int'
> Jun 12 20:09:04 debian kernel: [5.522866] CPU: 0 PID: 1 Comm:
> swapper Not tainted 4.17.0+ #1
> Jun 12 20:09:04 debian kernel: [5.527567] Call Trace:
> Jun 12 20:09:04 debian kernel: [5.532200] [df4e7b00] [c0481074]
> ubsan_epilogue+0x18/0x4c (unreliable)
> Jun 12 20:09:04 debian kernel: [5.537019] [df4e7b10] [c0481a14]
> handle_overflow+0xbc/0xdc
> Jun 12 20:09:04 debian kernel: [5.541832] [df4e7b90] [c060d698]
> rtc_time64_to_tm+0x344/0x388
> Jun 12 20:09:04 debian kernel: [5.546655] [df4e7bd0] [c001076c]
> rtc_generic_get_time+0x2c/0x40
> Jun 12 20:09:04 debian kernel: [5.551477] [df4e7be0] [c06113d4]
> __rtc_read_time+0x70/0x13c
> Jun 12 20:09:04 debian kernel: [5.556288] [df4e7c00] [c061150c]
> rtc_read_time+0x6c/0x130
> Jun 12 20:09:04 debian kernel: [5.561088] [df4e7c30] [c061271c]
> __rtc_read_alarm+0x34/0x684
> Jun 12 20:09:04 debian kernel: [5.565884] [df4e7ce0] [c060f234]
> rtc_device_register+0x88/0x218
> Jun 12 20:09:04 debian kernel: [5.570695] [df4e7d40] [c060f428]
> devm_rtc_device_register+0x64/0xc4
> Jun 12 20:09:04 debian kernel: [5.575528] [df4e7d60] [c09d15d4]
> generic_rtc_probe+0x50/0x78
> Jun 12 20:09:04 debian kernel: [5.580359] [df4e7d70] [c055e4a4]
> platform_drv_probe+0xa8/0x128
> Jun 12 20:09:04 debian kernel: [5.585210] [df4e7d90] [c0559d28]
> driver_probe_device+0x354/0x6fc
> Jun 12 20:09:04 debian kernel: [5.590064] [df4e7dd0] [c055a270]
> __driver_attach+0x1a0/0x22c
> Jun 12 20:09:04 debian kernel: [5.594917] [df4e7df0] [c0555b70]
> bus_for_each_dev+0x84/0xdc
> Jun 12 20:09:04 debian kernel: [5.599750] [df4e7e20] [c0558420]
> bus_add_driver+0x188/0x348
> Jun 12 20:09:04 debian kernel: [5.604584] [df4e7e40] [c055b7b4]
> driver_register+0xa0/0x18c
> Jun 12 20:09:04 debian kernel: [5.609433] [df4e7e50] [c055e950]
> __platform_driver_probe+0x8c/0x198
> Jun 12 20:09:04 debian kernel: [5.614330] [df4e7e70] [c0005800]
> do_one_initcall+0x64/0x280
> Jun 12 20:09:04 debian kernel: [5.619237] [df4e7ee0] [c0997c04]
> kernel_init_freeable+0x3a4/0x444
> Jun 12 20:09:04 debian kernel: [5.624145] [df4e7f30] [c00049f8]
> kernel_init+0x24/0x118
> Jun 12 20:09:04 debian kernel: [5.629029] [df4e7f40] [c001b1c4]
> ret_from_kernel_thread+0x14/0x1c
> Jun 12 20:09:04 debian kernel: [5.633878]
> 
>
>
> Grep-ing all leads to:
>
> $ grep  "cannot be represented" kern.log | colrm 1 45|sort -u
>  1193022 * 3600 cannot be represented in type 'int'
>  1193024 * 3600 cannot be represented in type 'int'
>  1193032 * 3600 cannot be represented in type 'int'
>  1193033 * 3600 cannot be represented in type 'int'
>  1193034 * 3600 cannot be represented in type 'int'
>  1193035 * 3600 cannot be represented in type 'int'
>
> How come tm_hour can store a value of 1193035 ?

It appears that I am getting a negative value for  time64_t :

[5.495470] devices_kset: Moving rtc-generic to end of list
[5.502584]  mm DBG: TIME64_T -2068738348
[5.507205] WARNING: CPU: 0 PID: 1 at ../drivers/rtc/rtc-lib.c:60
rtc_time64_to_tm+0x48/0x38c
[5.511843] Modules linked in:
[5.516306] CPU: 0 PID: 1 Comm: swapper Not tainted 4.17.0+ #6
[5.520879] NIP:  c060d498 LR: c060d494 CTR: c00d49dc
[5.525467] REGS: df4e7ad0 TRAP: 0700   Not tainted  (4.17.0+)
[5.530115] MSR:  00029032   CR: 88008282  XER: 
[5.534865]
   GPR00: c060d494 df4e7b80 df4e8000 0019 0001
032e 0400 
   GPR08: 0003 0001 c0aa4b38  24008882
 c09dfc4c c09dfc28
   GPR16:

[PATCH v3 09/12] macintosh/via-pmu: Replace via-pmu68k driver with via-pmu driver

2018-06-13 Thread Finn Thain

Now that the PowerMac via-pmu driver supports m68k PowerBooks,
switch over to that driver and remove the via-pmu68k driver.

Cc: Geert Uytterhoeven 
Tested-by: Stan Johnson 
Signed-off-by: Finn Thain 
---
 arch/m68k/configs/mac_defconfig   |   2 +-
 arch/m68k/configs/multi_defconfig |   2 +-
 arch/m68k/mac/config.c|   2 +-
 arch/m68k/mac/misc.c  |  48 +--
 drivers/macintosh/Kconfig |  13 +-
 drivers/macintosh/Makefile|   1 -
 drivers/macintosh/adb.c   |   2 +-
 drivers/macintosh/via-pmu68k.c| 846 --
 include/uapi/linux/pmu.h  |   2 +-
 9 files changed, 14 insertions(+), 904 deletions(-)
 delete mode 100644 drivers/macintosh/via-pmu68k.c

diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig
index 390d4a87441c..ee63f1242e9a 100644
--- a/arch/m68k/configs/mac_defconfig
+++ b/arch/m68k/configs/mac_defconfig
@@ -370,7 +370,7 @@ CONFIG_TCM_PSCSI=m
 CONFIG_ADB=y
 CONFIG_ADB_MACII=y
 CONFIG_ADB_IOP=y
-CONFIG_ADB_PMU68K=y
+CONFIG_ADB_PMU=y
 CONFIG_ADB_CUDA=y
 CONFIG_INPUT_ADBHID=y
 CONFIG_MAC_EMUMOUSEBTN=y
diff --git a/arch/m68k/configs/multi_defconfig 
b/arch/m68k/configs/multi_defconfig
index 77be97d82dc3..6421a3da616c 100644
--- a/arch/m68k/configs/multi_defconfig
+++ b/arch/m68k/configs/multi_defconfig
@@ -403,7 +403,7 @@ CONFIG_TCM_PSCSI=m
 CONFIG_ADB=y
 CONFIG_ADB_MACII=y
 CONFIG_ADB_IOP=y
-CONFIG_ADB_PMU68K=y
+CONFIG_ADB_PMU=y
 CONFIG_ADB_CUDA=y
 CONFIG_INPUT_ADBHID=y
 CONFIG_MAC_EMUMOUSEBTN=y
diff --git a/arch/m68k/mac/config.c b/arch/m68k/mac/config.c
index e522307db47c..92e80cf0d8aa 100644
--- a/arch/m68k/mac/config.c
+++ b/arch/m68k/mac/config.c
@@ -891,7 +891,7 @@ static void __init mac_identify(void)
 #ifdef CONFIG_ADB_CUDA
find_via_cuda();
 #endif
-#ifdef CONFIG_ADB_PMU68K
+#ifdef CONFIG_ADB_PMU
find_via_pmu();
 #endif
 }
diff --git a/arch/m68k/mac/misc.c b/arch/m68k/mac/misc.c
index 7ccb799eeb57..28090a44fa09 100644
--- a/arch/m68k/mac/misc.c
+++ b/arch/m68k/mac/misc.c
@@ -85,7 +85,7 @@ static void cuda_write_pram(int offset, __u8 data)
 }
 #endif /* CONFIG_ADB_CUDA */
 
-#ifdef CONFIG_ADB_PMU68K
+#ifdef CONFIG_ADB_PMU
 static long pmu_read_time(void)
 {
struct adb_request req;
@@ -136,7 +136,7 @@ static void pmu_write_pram(int offset, __u8 data)
while (!req.complete)
pmu_poll();
 }
-#endif /* CONFIG_ADB_PMU68K */
+#endif /* CONFIG_ADB_PMU */
 
 /*
  * VIA PRAM/RTC access routines
@@ -367,38 +367,6 @@ static void cuda_shutdown(void)
 }
 #endif /* CONFIG_ADB_CUDA */
 
-#ifdef CONFIG_ADB_PMU68K
-
-void pmu_restart(void)
-{
-   struct adb_request req;
-   if (pmu_request(, NULL,
-   2, PMU_SET_INTR_MASK, PMU_INT_ADB|PMU_INT_TICK) < 0)
-   return;
-   while (!req.complete)
-   pmu_poll();
-   if (pmu_request(, NULL, 1, PMU_RESET) < 0)
-   return;
-   while (!req.complete)
-   pmu_poll();
-}
-
-void pmu_shutdown(void)
-{
-   struct adb_request req;
-   if (pmu_request(, NULL,
-   2, PMU_SET_INTR_MASK, PMU_INT_ADB|PMU_INT_TICK) < 0)
-   return;
-   while (!req.complete)
-   pmu_poll();
-   if (pmu_request(, NULL, 5, PMU_SHUTDOWN, 'M', 'A', 'T', 'T') < 0)
-   return;
-   while (!req.complete)
-   pmu_poll();
-}
-
-#endif
-
 /*
  *---
  * Below this point are the generic routines; they'll dispatch to the
@@ -423,7 +391,7 @@ void mac_pram_read(int offset, __u8 *buffer, int len)
func = cuda_read_pram;
break;
 #endif
-#ifdef CONFIG_ADB_PMU68K
+#ifdef CONFIG_ADB_PMU
case MAC_ADB_PB2:
func = pmu_read_pram;
break;
@@ -453,7 +421,7 @@ void mac_pram_write(int offset, __u8 *buffer, int len)
func = cuda_write_pram;
break;
 #endif
-#ifdef CONFIG_ADB_PMU68K
+#ifdef CONFIG_ADB_PMU
case MAC_ADB_PB2:
func = pmu_write_pram;
break;
@@ -477,7 +445,7 @@ void mac_poweroff(void)
   macintosh_config->adb_type == MAC_ADB_CUDA) {
cuda_shutdown();
 #endif
-#ifdef CONFIG_ADB_PMU68K
+#ifdef CONFIG_ADB_PMU
} else if (macintosh_config->adb_type == MAC_ADB_PB2) {
pmu_shutdown();
 #endif
@@ -518,7 +486,7 @@ void mac_reset(void)
   macintosh_config->adb_type == MAC_ADB_CUDA) {
cuda_restart();
 #endif
-#ifdef CONFIG_ADB_PMU68K
+#ifdef CONFIG_ADB_PMU
} else if (macintosh_config->adb_type == MAC_ADB_PB2) {
pmu_restart();
 #endif
@@ -670,7 +638,7 @@ int mac_hwclk(int op, struct rtc_time *t)
now = cuda_read_time();
break;
 #endif
-#ifdef CONFIG_ADB_PMU68K
+#ifdef CONFIG_ADB_PMU
case MAC_ADB_PB2:
now =

[PATCH v3 05/12] macintosh/via-pmu: Replace via pointer with via1 and via2 pointers

2018-06-13 Thread Finn Thain

On most PowerPC Macs, the PMU driver uses the shift register and
IO port B from a single VIA chip.

On 68k and early PowerPC PowerBooks, the driver uses the shift register
from one VIA chip together with IO port B from another.

Replace via with via1 and via2 to accommodate this. For the
CONFIG_PPC_PMAC case, set via1 = via2 so there is no change.

Tested-by: Stan Johnson 
Signed-off-by: Finn Thain 
---
 drivers/macintosh/via-pmu.c | 142 +---
 1 file changed, 69 insertions(+), 73 deletions(-)

diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c
index c4c324fb5fa6..2e09137410f6 100644
--- a/drivers/macintosh/via-pmu.c
+++ b/drivers/macintosh/via-pmu.c
@@ -76,7 +76,6 @@
 #define BATTERY_POLLING_COUNT  2
 
 static DEFINE_MUTEX(pmu_info_proc_mutex);
-static volatile unsigned char __iomem *via;
 
 /* VIA registers - spaced 0x200 bytes apart */
 #define RS 0x200   /* skip between registers */
@@ -145,6 +144,8 @@ static struct device_node *vias;
 static int pmu_kind = PMU_UNKNOWN;
 static int pmu_fully_inited;
 static int pmu_has_adb;
+static volatile unsigned char __iomem *via1;
+static volatile unsigned char __iomem *via2;
 static struct device_node *gpio_node;
 static unsigned char __iomem *gpio_reg;
 static int gpio_irq = 0;
@@ -340,14 +341,14 @@ int __init find_via_pmu(void)
} else
pmu_kind = PMU_UNKNOWN;
 
-   via = ioremap(taddr, 0x2000);
-   if (via == NULL) {
+   via1 = via2 = ioremap(taddr, 0x2000);
+   if (via1 == NULL) {
printk(KERN_ERR "via-pmu: Can't map address !\n");
goto fail_via_remap;
}

-   out_8([IER], IER_CLR | 0x7f);   /* disable all intrs */
-   out_8([IFR], 0x7f); /* clear IFR */
+   out_8([IER], IER_CLR | 0x7f);  /* disable all intrs */
+   out_8([IFR], 0x7f);/* clear IFR */
 
pmu_state = idle;
 
@@ -362,8 +363,8 @@ int __init find_via_pmu(void)
return 1;
 
  fail_init:
-   iounmap(via);
-   via = NULL;
+   iounmap(via1);
+   via1 = via2 = NULL;
  fail_via_remap:
iounmap(gpio_reg);
gpio_reg = NULL;
@@ -437,7 +438,7 @@ static int __init via_pmu_start(void)
}
 
/* Enable interrupts */
-   out_8([IER], IER_SET | SR_INT | CB1_INT);
+   out_8([IER], IER_SET | SR_INT | CB1_INT);
 
pmu_fully_inited = 1;
 
@@ -533,8 +534,8 @@ init_pmu(void)
struct adb_request req;
 
/* Negate TREQ. Set TACK to input and TREQ to output. */
-   out_8([B], in_8([B]) | TREQ);
-   out_8([DIRB], (in_8([DIRB]) | TREQ) & ~TACK);
+   out_8([B], in_8([B]) | TREQ);
+   out_8([DIRB], (in_8([DIRB]) | TREQ) & ~TACK);
 
pmu_request(, NULL, 2, PMU_SET_INTR_MASK, pmu_intr_mask);
timeout =  10;
@@ -1174,7 +1175,7 @@ wait_for_ack(void)
 * reported
 */
int timeout = 4000;
-   while ((in_8([B]) & TACK) == 0) {
+   while ((in_8([B]) & TACK) == 0) {
if (--timeout < 0) {
printk(KERN_ERR "PMU not responding (!ack)\n");
return;
@@ -1188,23 +1189,19 @@ wait_for_ack(void)
 static inline void
 send_byte(int x)
 {
-   volatile unsigned char __iomem *v = via;
-
-   out_8([ACR], in_8([ACR]) | SR_OUT | SR_EXT);
-   out_8([SR], x);
-   out_8([B], in_8([B]) & ~TREQ);  /* assert TREQ */
-   (void)in_8([B]);
+   out_8([ACR], in_8([ACR]) | SR_OUT | SR_EXT);
+   out_8([SR], x);
+   out_8([B], in_8([B]) & ~TREQ);/* assert TREQ */
+   (void)in_8([B]);
 }
 
 static inline void
 recv_byte(void)
 {
-   volatile unsigned char __iomem *v = via;
-
-   out_8([ACR], (in_8([ACR]) & ~SR_OUT) | SR_EXT);
-   in_8([SR]);   /* resets SR */
-   out_8([B], in_8([B]) & ~TREQ);
-   (void)in_8([B]);
+   out_8([ACR], (in_8([ACR]) & ~SR_OUT) | SR_EXT);
+   in_8([SR]);/* resets SR */
+   out_8([B], in_8([B]) & ~TREQ);
+   (void)in_8([B]);
 }
 
 static inline void
@@ -1307,7 +1304,7 @@ pmu_suspend(void)
if (!adb_int_pending && pmu_state == idle && 
!req_awaiting_reply) {
if (gpio_irq >= 0)
disable_irq_nosync(gpio_irq);
-   out_8([IER], CB1_INT | IER_CLR);
+   out_8([IER], CB1_INT | IER_CLR);
spin_unlock_irqrestore(_lock, flags);
break;
}
@@ -1331,7 +1328,7 @@ pmu_resume(void)
adb_int_pending = 1;
if (gpio_irq >= 0)
enable_irq(gpio_irq);
-   out_8([IER], CB1_INT | IER_SET);
+   out_8([IER], CB1_INT | IER_SET);
spin_unlock_irqrestore(_lock, flags);
pmu_poll();
 }
@@ -1456,20 +1453,20 @@ pmu_sr_intr(void)
struct adb_request *req;
int bite = 0;
 
-   if (in_8([B]) &

[PATCH v3 07/12] macintosh/via-pmu: Make CONFIG_PPC_PMAC Kconfig deps explicit

2018-06-13 Thread Finn Thain

At present, CONFIG_ADB_PMU depends on CONFIG_PPC_PMAC. When this gets
relaxed to CONFIG_PPC_PMAC || CONFIG_MAC, those Kconfig symbols with
implicit deps on PPC_PMAC will need explicit deps. Add them now.
No functional change.

Tested-by: Stan Johnson 
Signed-off-by: Finn Thain 
---
 drivers/macintosh/Kconfig | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/macintosh/Kconfig b/drivers/macintosh/Kconfig
index 9c6452b38c36..26abae4c899d 100644
--- a/drivers/macintosh/Kconfig
+++ b/drivers/macintosh/Kconfig
@@ -79,7 +79,7 @@ config ADB_PMU
 
 config ADB_PMU_LED
bool "Support for the Power/iBook front LED"
-   depends on ADB_PMU
+   depends on PPC_PMAC && ADB_PMU
select NEW_LEDS
select LEDS_CLASS
help
@@ -122,7 +122,7 @@ config PMAC_MEDIABAY
 
 config PMAC_BACKLIGHT
bool "Backlight control for LCD screens"
-   depends on ADB_PMU && FB = y && (BROKEN || !PPC64)
+   depends on PPC_PMAC && ADB_PMU && FB = y && (BROKEN || !PPC64)
select FB_BACKLIGHT
help
  Say Y here to enable Macintosh specific extensions of the generic
-- 
2.16.4

[PATCH v3 06/12] macintosh/via-pmu: Add support for m68k PowerBooks

2018-06-13 Thread Finn Thain

Put #ifdefs around the Open Firmware, xmon, interrupt dispatch,
battery and suspend code. Add the necessary interrupt handling to
support m68k PowerBooks.

The pmu_kind value is available to userspace using the
PMU_IOC_GET_MODEL ioctl. It is not clear yet what hardware classes
are be needed to describe m68k PowerBook models, so pmu_kind is given
the provisional value PMU_UNKNOWN.

To find out about the hardware, user programs can use /proc/bootinfo
or /proc/hardware, or send the PMU_GET_VERSION command using /dev/adb.

Tested-by: Stan Johnson 
Signed-off-by: Finn Thain 
---
 drivers/macintosh/Kconfig   |   2 +-
 drivers/macintosh/via-pmu.c | 101 +++-
 2 files changed, 91 insertions(+), 12 deletions(-)

diff --git a/drivers/macintosh/Kconfig b/drivers/macintosh/Kconfig
index 97a420c11eed..9c6452b38c36 100644
--- a/drivers/macintosh/Kconfig
+++ b/drivers/macintosh/Kconfig
@@ -65,7 +65,7 @@ config ADB_CUDA
  If unsure say Y.
 
 config ADB_PMU
-   bool "Support for PMU  based PowerMacs"
+   bool "Support for PMU based PowerMacs and PowerBooks"
depends on PPC_PMAC
help
  On PowerBooks, iBooks, and recent iMacs and Power Macintoshes, the
diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c
index 2e09137410f6..22cb7d94e3ce 100644
--- a/drivers/macintosh/via-pmu.c
+++ b/drivers/macintosh/via-pmu.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
- * Device driver for the via-pmu on Apple Powermacs.
+ * Device driver for the PMU in Apple PowerBooks and PowerMacs.
  *
  * The VIA (versatile interface adapter) interfaces to the PMU,
  * a 6805 microprocessor core whose primary function is to control
@@ -49,20 +49,26 @@
 #include 
 #include 
 #include 
-#include 
+#include 
 #include 
 #include 
 #include 
 #include 
 #include 
+#ifdef CONFIG_PPC_PMAC
 #include 
 #include 
 #include 
-#include 
+#include 
 #include 
 #include 
 #include 
 #include 
+#else
+#include 
+#include 
+#include 
+#endif
 
 #include "via-pmu-event.h"
 
@@ -97,8 +103,13 @@ static DEFINE_MUTEX(pmu_info_proc_mutex);
 #define ANH(15*RS) /* A-side data, no handshake */
 
 /* Bits in B data register: both active low */
+#ifdef CONFIG_PPC_PMAC
 #define TACK   0x08/* Transfer acknowledge (input) */
 #define TREQ   0x10/* Transfer request (output) */
+#else
+#define TACK   0x02
+#define TREQ   0x04
+#endif
 
 /* Bits in ACR */
 #define SR_CTRL0x1c/* Shift register control bits 
*/
@@ -140,13 +151,15 @@ static int data_index;
 static int data_len;
 static volatile int adb_int_pending;
 static volatile int disable_poll;
-static struct device_node *vias;
 static int pmu_kind = PMU_UNKNOWN;
 static int pmu_fully_inited;
 static int pmu_has_adb;
+#ifdef CONFIG_PPC_PMAC
 static volatile unsigned char __iomem *via1;
 static volatile unsigned char __iomem *via2;
+static struct device_node *vias;
 static struct device_node *gpio_node;
+#endif
 static unsigned char __iomem *gpio_reg;
 static int gpio_irq = 0;
 static int gpio_irq_enabled = -1;
@@ -273,6 +286,7 @@ static char *pbook_type[] = {
 
 int __init find_via_pmu(void)
 {
+#ifdef CONFIG_PPC_PMAC
u64 taddr;
const u32 *reg;
 
@@ -355,9 +369,6 @@ int __init find_via_pmu(void)
if (!init_pmu())
goto fail_init;
 
-   printk(KERN_INFO "PMU driver v%d initialized for %s, firmware: %02x\n",
-  PMU_DRIVER_VERSION, pbook_type[pmu_kind], pmu_version);
-  
sys_ctrler = SYS_CTRLER_PMU;

return 1;
@@ -373,6 +384,30 @@ int __init find_via_pmu(void)
vias = NULL;
pmu_state = uninitialized;
return 0;
+#else
+   if (macintosh_config->adb_type != MAC_ADB_PB2)
+   return 0;
+
+   pmu_kind = PMU_UNKNOWN;
+
+   spin_lock_init(_lock);
+
+   pmu_has_adb = 1;
+
+   pmu_intr_mask = PMU_INT_PCEJECT |
+   PMU_INT_SNDBRT |
+   PMU_INT_ADB |
+   PMU_INT_TICK;
+
+   pmu_state = idle;
+
+   if (!init_pmu()) {
+   pmu_state = uninitialized;
+   return 0;
+   }
+
+   return 1;
+#endif /* !CONFIG_PPC_PMAC */
 }
 
 #ifdef CONFIG_ADB
@@ -396,13 +431,14 @@ static int pmu_init(void)
  */
 static int __init via_pmu_start(void)
 {
-   unsigned int irq;
+   unsigned int __maybe_unused irq;
 
if (pmu_state == uninitialized)
return -ENODEV;
 
batt_req.complete = 1;
 
+#ifdef CONFIG_PPC_PMAC
irq = irq_of_parse_and_map(vias, 0);
if (!irq) {
printk(KERN_ERR "via-pmu: can't map interrupt\n");
@@ -439,6 +475,19 @@ static int __init via_pmu_start(void)
 
/* Enable interrupts */
out_8([IER], IER_SET | SR_INT | CB1_INT);
+#else
+   if (request_irq(IRQ_MAC_ADB_SR, via_pmu_interrupt, IRQF_NO_SUSPEND,
+

[PATCH v3 10/12] macintosh: Use common code to access RTC

2018-06-13 Thread Finn Thain

Now that the 68k Mac port has adopted the via-pmu driver, it must access
the PMU RTC using the appropriate command format. The same code can now
be used for both m68k and powerpc.

Replace the RTC code that's duplicated in arch/powerpc and arch/m68k
with common RTC accessors for Cuda and PMU devices.

Cc: Geert Uytterhoeven 
Cc: Paul Mackerras ,
Cc: Michael Ellerman 
Tested-by: Stan Johnson 
Signed-off-by: Finn Thain 
---
 arch/m68k/mac/misc.c   | 64 ++---
 arch/powerpc/platforms/powermac/time.c | 74 +-
 drivers/macintosh/via-cuda.c   | 34 
 drivers/macintosh/via-pmu.c| 32 +++
 include/linux/cuda.h   |  3 ++
 include/linux/pmu.h|  3 ++
 6 files changed, 78 insertions(+), 132 deletions(-)

diff --git a/arch/m68k/mac/misc.c b/arch/m68k/mac/misc.c
index 28090a44fa09..397f9f942a9f 100644
--- a/arch/m68k/mac/misc.c
+++ b/arch/m68k/mac/misc.c
@@ -33,34 +33,6 @@
 static void (*rom_reset)(void);
 
 #ifdef CONFIG_ADB_CUDA
-static long cuda_read_time(void)
-{
-   struct adb_request req;
-   long time;
-
-   if (cuda_request(, NULL, 2, CUDA_PACKET, CUDA_GET_TIME) < 0)
-   return 0;
-   while (!req.complete)
-   cuda_poll();
-
-   time = (req.reply[3] << 24) | (req.reply[4] << 16) |
-  (req.reply[5] << 8) | req.reply[6];
-   return time - RTC_OFFSET;
-}
-
-static void cuda_write_time(long data)
-{
-   struct adb_request req;
-
-   data += RTC_OFFSET;
-   if (cuda_request(, NULL, 6, CUDA_PACKET, CUDA_SET_TIME,
-(data >> 24) & 0xFF, (data >> 16) & 0xFF,
-(data >> 8) & 0xFF, data & 0xFF) < 0)
-   return;
-   while (!req.complete)
-   cuda_poll();
-}
-
 static __u8 cuda_read_pram(int offset)
 {
struct adb_request req;
@@ -86,34 +58,6 @@ static void cuda_write_pram(int offset, __u8 data)
 #endif /* CONFIG_ADB_CUDA */
 
 #ifdef CONFIG_ADB_PMU
-static long pmu_read_time(void)
-{
-   struct adb_request req;
-   long time;
-
-   if (pmu_request(, NULL, 1, PMU_READ_RTC) < 0)
-   return 0;
-   while (!req.complete)
-   pmu_poll();
-
-   time = (req.reply[1] << 24) | (req.reply[2] << 16) |
-  (req.reply[3] << 8) | req.reply[4];
-   return time - RTC_OFFSET;
-}
-
-static void pmu_write_time(long data)
-{
-   struct adb_request req;
-
-   data += RTC_OFFSET;
-   if (pmu_request(, NULL, 5, PMU_SET_RTC,
-   (data >> 24) & 0xFF, (data >> 16) & 0xFF,
-   (data >> 8) & 0xFF, data & 0xFF) < 0)
-   return;
-   while (!req.complete)
-   pmu_poll();
-}
-
 static __u8 pmu_read_pram(int offset)
 {
struct adb_request req;
@@ -635,12 +579,12 @@ int mac_hwclk(int op, struct rtc_time *t)
 #ifdef CONFIG_ADB_CUDA
case MAC_ADB_EGRET:
case MAC_ADB_CUDA:
-   now = cuda_read_time();
+   now = cuda_get_time();
break;
 #endif
 #ifdef CONFIG_ADB_PMU
case MAC_ADB_PB2:
-   now = pmu_read_time();
+   now = pmu_get_time();
break;
 #endif
default:
@@ -671,12 +615,12 @@ int mac_hwclk(int op, struct rtc_time *t)
 #ifdef CONFIG_ADB_CUDA
case MAC_ADB_EGRET:
case MAC_ADB_CUDA:
-   cuda_write_time(now);
+   cuda_set_time(now);
break;
 #endif
 #ifdef CONFIG_ADB_PMU
case MAC_ADB_PB2:
-   pmu_write_time(now);
+   pmu_set_time(now);
break;
 #endif
default:
diff --git a/arch/powerpc/platforms/powermac/time.c 
b/arch/powerpc/platforms/powermac/time.c
index 274af6fa388e..e9c1f3dafe2f 100644
--- a/arch/powerpc/platforms/powermac/time.c
+++ b/arch/powerpc/platforms/powermac/time.c
@@ -42,9 +42,6 @@
 #define DBG(x...)
 #endif
 
-/* Apparently the RTC stores seconds since 1 Jan 1904 */
-#define RTC_OFFSET 2082844800
-
 /*
  * Calibrate the decrementer frequency with the VIA timer 1.
  */
@@ -103,43 +100,8 @@ static unsigned long from_rtc_time(struct rtc_time *tm)
 #endif
 
 #ifdef CONFIG_ADB_CUDA
-static unsigned long cuda_get_time(void)
-{
-   struct adb_request req;
-   unsigned int now;
-
-   if (cuda_request(, NULL, 2, CUDA_PACKET, CUDA_GET_TIME) < 0)
-   return 0;
-   while (!req.complete)
-   cuda_poll();
-   if (req.reply_len != 7)
-   printk(KERN_ERR "cuda_get_time: got %d byte reply\n",
-  req.reply_len);
-   now = (req.reply[3] << 24) + (req.reply[4] << 16)
-   + (req.reply[5] << 8) + req.reply[6];
-   return ((unsigned long)now) -

[PATCH v3 12/12] macintosh/via-pmu: Disambiguate interrupt statistics

2018-06-13 Thread Finn Thain

Some of the event counters are overloaded which makes it very
difficult to interpret their values.

Counter 0 is supposed to report CB1 interrupts but it can also count
PMU_INT_WAITING_CHARGER events.

Counter 1 is supposed to report GPIO interrupts but it can also count
other events (depending upon the value of the PMU_INT_ADB bit).

Disambiguate these statistics with dedicated counters for GPIO and
CB1 interrupts.

Comments in the MkLinux source code say that the type 0 and type 1
interrupts are model-specific. Label them as "unknown".

This change to the contents of /proc/pmu/interrupts is by necessity
visible in userland. However, packages which interact with the PMU
(that is, pbbuttonsd, pmac-utils and pmud) don't open this file.
AFAIK, user software has no need to poll these counters.

Tested-by: Stan Johnson 
Signed-off-by: Finn Thain 
---
The file now looks like this,

  0:  0 (Unknown interrupt (type 0))
  1:  0 (Unknown interrupt (type 1))
  2:  0 (PC-Card eject button)
  3: 23 (Sound/Brightness button)
  4: 74 (ADB message)
  5:  0 (Battery state change)
  6:  0 (Environment interrupt)
  7:121 (Tick timer)
  8:  0 (Ghost interrupt (zero len))
  9:  1 (Empty interrupt (empty mask))
 10:  2 (Max irqs in a row)
 11:194 (Total CB1 triggered events)
 12:  0 (Total GPIO1 triggered events)

rather than this,

  0:194 (Total CB1 triggered events)
  1:  0 (Total GPIO1 triggered events)
  2:  0 (PC-Card eject button)
  3: 23 (Sound/Brightness button)
  4: 74 (ADB message)
  5:  0 (Battery state change)
  6:  0 (Environment interrupt)
  7:121 (Tick timer)
  8:  0 (Ghost interrupt (zero len))
  9:  1 (Empty interrupt (empty mask))
 10:  2 (Max irqs in a row)

If some parser exists for this file, and if this change is problematic,
we could increment the driver version number in /proc/pmu/info, to
correspond with the format change.
---
 drivers/macintosh/via-pmu.c | 20 
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c
index 730c10f7fbb7..44919b3b56e0 100644
--- a/drivers/macintosh/via-pmu.c
+++ b/drivers/macintosh/via-pmu.c
@@ -172,7 +172,9 @@ static int drop_interrupts;
 static int option_lid_wakeup = 1;
 #endif /* CONFIG_SUSPEND && CONFIG_PPC32 */
 static unsigned long async_req_locks;
-static unsigned int pmu_irq_stats[11];
+
+#define NUM_IRQ_STATS 13
+static unsigned int pmu_irq_stats[NUM_IRQ_STATS];
 
 static struct proc_dir_entry *proc_pmu_root;
 static struct proc_dir_entry *proc_pmu_info;
@@ -884,9 +886,9 @@ static const struct file_operations pmu_info_proc_fops = {
 static int pmu_irqstats_proc_show(struct seq_file *m, void *v)
 {
int i;
-   static const char *irq_names[] = {
-   "Total CB1 triggered events",
-   "Total GPIO1 triggered events",
+   static const char *irq_names[NUM_IRQ_STATS] = {
+   "Unknown interrupt (type 0)",
+   "Unknown interrupt (type 1)",
"PC-Card eject button",
"Sound/Brightness button",
"ADB message",
@@ -895,10 +897,12 @@ static int pmu_irqstats_proc_show(struct seq_file *m, 
void *v)
"Tick timer",
"Ghost interrupt (zero len)",
"Empty interrupt (empty mask)",
-   "Max irqs in a row"
+   "Max irqs in a row",
+   "Total CB1 triggered events",
+   "Total GPIO1 triggered events",
 };
 
-   for (i=0; i<11; i++) {
+   for (i = 0; i < NUM_IRQ_STATS; i++) {
seq_printf(m, " %2u: %10u (%s)\n",
 i, pmu_irq_stats[i], irq_names[i]);
}
@@ -1659,7 +1663,7 @@ via_pmu_interrupt(int irq, void *arg)
}
if (intr & CB1_INT) {
adb_int_pending = 1;
-   pmu_irq_stats[0]++;
+   pmu_irq_stats[11]++;
}
if (intr & SR_INT) {
req = pmu_sr_intr();
@@ -1746,7 +1750,7 @@ gpio1_interrupt(int irq, void *arg)
disable_irq_nosync(gpio_irq);
gpio_irq_enabled = 0;
}
-   pmu_irq_stats[1]++;
+   pmu_irq_stats[12]++;
adb_int_pending = 1;
spin_unlock_irqrestore(_lock, flags);
via_pmu_interrupt(0, NULL);
-- 
2.16.4

[PATCH v3 08/12] macintosh/via-pmu68k: Don't load driver on unsupported hardware

2018-06-13 Thread Finn Thain

Don't load the via-pmu68k driver on early PowerBooks. The M50753 PMU
device found in those models was never supported by this driver.
Attempting to load the driver usually causes a boot hang.

Cc: Geert Uytterhoeven 
Signed-off-by: Finn Thain 
Reviewed-by: Michael Schmitz 
---
 arch/m68k/mac/misc.c   | 6 ++
 drivers/macintosh/via-pmu68k.c | 4 
 include/uapi/linux/pmu.h   | 2 +-
 3 files changed, 3 insertions(+), 9 deletions(-)

diff --git a/arch/m68k/mac/misc.c b/arch/m68k/mac/misc.c
index c68054361615..7ccb799eeb57 100644
--- a/arch/m68k/mac/misc.c
+++ b/arch/m68k/mac/misc.c
@@ -478,8 +478,7 @@ void mac_poweroff(void)
cuda_shutdown();
 #endif
 #ifdef CONFIG_ADB_PMU68K
-   } else if (macintosh_config->adb_type == MAC_ADB_PB1
-   || macintosh_config->adb_type == MAC_ADB_PB2) {
+   } else if (macintosh_config->adb_type == MAC_ADB_PB2) {
pmu_shutdown();
 #endif
}
@@ -520,8 +519,7 @@ void mac_reset(void)
cuda_restart();
 #endif
 #ifdef CONFIG_ADB_PMU68K
-   } else if (macintosh_config->adb_type == MAC_ADB_PB1
-   || macintosh_config->adb_type == MAC_ADB_PB2) {
+   } else if (macintosh_config->adb_type == MAC_ADB_PB2) {
pmu_restart();
 #endif
} else if (CPU_IS_030) {
diff --git a/drivers/macintosh/via-pmu68k.c b/drivers/macintosh/via-pmu68k.c
index d545ed45e482..bec8e1837d7d 100644
--- a/drivers/macintosh/via-pmu68k.c
+++ b/drivers/macintosh/via-pmu68k.c
@@ -175,9 +175,6 @@ static s8 pmu_data_len[256][2] = {
 int __init find_via_pmu(void)
 {
switch (macintosh_config->adb_type) {
-   case MAC_ADB_PB1:
-   pmu_kind = PMU_68K_V1;
-   break;
case MAC_ADB_PB2:
pmu_kind = PMU_68K_V2;
break;
@@ -785,7 +782,6 @@ pmu_enable_backlight(int on)
/* first call: get current backlight value */
if (backlight_level < 0) {
switch(pmu_kind) {
-   case PMU_68K_V1:
case PMU_68K_V2:
pmu_request(, NULL, 3, PMU_READ_NVRAM, 0x14, 0xe);
while (!req.complete)
diff --git a/include/uapi/linux/pmu.h b/include/uapi/linux/pmu.h
index 89cb1acea93a..e128f609281a 100644
--- a/include/uapi/linux/pmu.h
+++ b/include/uapi/linux/pmu.h
@@ -93,7 +93,7 @@ enum {
PMU_HEATHROW_BASED, /* PowerBook G3 series */
PMU_PADDINGTON_BASED,   /* 1999 PowerBook G3 */
PMU_KEYLARGO_BASED, /* Core99 motherboard (PMU99) */
-   PMU_68K_V1, /* 68K PMU, version 1 */
+   PMU_68K_V1, /* Unused/deprecated */
PMU_68K_V2, /* 68K PMU, version 2 */
 };
 
-- 
2.16.4

[PATCH v3 11/12] macintosh/via-pmu: Clean up interrupt statistics

2018-06-13 Thread Finn Thain

Replace an open-coded ffs() with the function call.
Simplify an if-else cascade using a switch statement.
Correct a typo and an indentation issue.

Tested-by: Stan Johnson 
Signed-off-by: Finn Thain 
Reviewed-by: Geert Uytterhoeven 
---
 drivers/macintosh/via-pmu.c | 39 ++-
 1 file changed, 22 insertions(+), 17 deletions(-)

diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c
index 38d7dd0bdb28..730c10f7fbb7 100644
--- a/drivers/macintosh/via-pmu.c
+++ b/drivers/macintosh/via-pmu.c
@@ -1392,7 +1392,8 @@ pmu_resume(void)
 static void
 pmu_handle_data(unsigned char *data, int len)
 {
-   unsigned char ints, pirq;
+   unsigned char ints;
+   int idx;
int i = 0;
 
asleep = 0;
@@ -1414,25 +1415,24 @@ pmu_handle_data(unsigned char *data, int len)
ints &= ~(PMU_INT_ADB_AUTO | PMU_INT_AUTO_SRQ_POLL);
 
 next:
-
if (ints == 0) {
if (i > pmu_irq_stats[10])
pmu_irq_stats[10] = i;
return;
}
-
-   for (pirq = 0; pirq < 8; pirq++)
-   if (ints & (1 << pirq))
-   break;
-   pmu_irq_stats[pirq]++;
i++;
-   ints &= ~(1 << pirq);
+
+   idx = ffs(ints) - 1;
+   ints &= ~BIT(idx);
+
+   pmu_irq_stats[idx]++;
 
/* Note: for some reason, we get an interrupt with len=1,
 * data[0]==0 after each normal ADB interrupt, at least
 * on the Pismo. Still investigating...  --BenH
 */
-   if ((1 << pirq) & PMU_INT_ADB) {
+   switch (BIT(idx)) {
+   case PMU_INT_ADB:
if ((data[0] & PMU_INT_ADB_AUTO) == 0) {
struct adb_request *req = req_awaiting_reply;
if (req == 0) {
@@ -1470,25 +1470,28 @@ pmu_handle_data(unsigned char *data, int len)
adb_input(data+1, len-1, 1);
 #endif /* CONFIG_ADB */
}
-   }
+   break;
+
/* Sound/brightness button pressed */
-   else if ((1 << pirq) & PMU_INT_SNDBRT) {
+   case PMU_INT_SNDBRT:
 #ifdef CONFIG_PMAC_BACKLIGHT
if (len == 3)
pmac_backlight_set_legacy_brightness_pmu(data[1] >> 4);
 #endif
-   }
+   break;
+
/* Tick interrupt */
-   else if ((1 << pirq) & PMU_INT_TICK) {
-   /* Environement or tick interrupt, query batteries */
+   case PMU_INT_TICK:
+   /* Environment or tick interrupt, query batteries */
if (pmu_battery_count) {
if ((--query_batt_timer) == 0) {
query_battery_state();
query_batt_timer = BATTERY_POLLING_COUNT;
}
}
-}
-   else if ((1 << pirq) & PMU_INT_ENVIRONMENT) {
+   break;
+
+   case PMU_INT_ENVIRONMENT:
if (pmu_battery_count)
query_battery_state();
pmu_pass_intr(data, len);
@@ -1498,7 +1501,9 @@ pmu_handle_data(unsigned char *data, int len)
via_pmu_event(PMU_EVT_POWER, !!(data[1]&8));
via_pmu_event(PMU_EVT_LID, data[1]&1);
}
-   } else {
+   break;
+
+   default:
   pmu_pass_intr(data, len);
}
goto next;
-- 
2.16.4

[PATCH v3 04/12] macintosh/via-pmu: Enhance state machine with new 'uninitialized' state

2018-06-13 Thread Finn Thain

On 68k Macs, the via/vias pointer can't be used to determine whether
the PMU driver has been initialized. For portability, add a new state
to indicate that via_find_pmu() succeeded.

After via_find_pmu() executes, testing vias == NULL is equivalent to
testing via == NULL. Replace these tests with pmu_state == uninitialized
which is simpler and more consistent. No functional change.

Tested-by: Stan Johnson 
Signed-off-by: Finn Thain 
---
 drivers/macintosh/via-pmu.c | 44 ++--
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c
index 4c1bae5380c2..c4c324fb5fa6 100644
--- a/drivers/macintosh/via-pmu.c
+++ b/drivers/macintosh/via-pmu.c
@@ -114,6 +114,7 @@ static volatile unsigned char __iomem *via;
 #define CB1_INT0x10/* transition on CB1 input */
 
 static volatile enum pmu_state {
+   uninitialized = 0,
idle,
sending,
intack,
@@ -274,7 +275,7 @@ int __init find_via_pmu(void)
u64 taddr;
const u32 *reg;
 
-   if (via != 0)
+   if (pmu_state != uninitialized)
return 1;
vias = of_find_node_by_name(NULL, "via-pmu");
if (vias == NULL)
@@ -369,20 +370,19 @@ int __init find_via_pmu(void)
  fail:
of_node_put(vias);
vias = NULL;
+   pmu_state = uninitialized;
return 0;
 }
 
 #ifdef CONFIG_ADB
 static int pmu_probe(void)
 {
-   return vias == NULL? -ENODEV: 0;
+   return pmu_state == uninitialized ? -ENODEV : 0;
 }
 
 static int pmu_init(void)
 {
-   if (vias == NULL)
-   return -ENODEV;
-   return 0;
+   return pmu_state == uninitialized ? -ENODEV : 0;
 }
 #endif /* CONFIG_ADB */
 
@@ -397,7 +397,7 @@ static int __init via_pmu_start(void)
 {
unsigned int irq;
 
-   if (vias == NULL)
+   if (pmu_state == uninitialized)
return -ENODEV;
 
batt_req.complete = 1;
@@ -463,7 +463,7 @@ arch_initcall(via_pmu_start);
  */
 static int __init via_pmu_dev_init(void)
 {
-   if (vias == NULL)
+   if (pmu_state == uninitialized)
return -ENODEV;
 
 #ifdef CONFIG_PMAC_BACKLIGHT
@@ -966,7 +966,7 @@ static int pmu_send_request(struct adb_request *req, int 
sync)
 {
int i, ret;
 
-   if ((vias == NULL) || (!pmu_fully_inited)) {
+   if (pmu_state == uninitialized || !pmu_fully_inited) {
req->complete = 1;
return -ENXIO;
}
@@ -1060,7 +1060,7 @@ static int __pmu_adb_autopoll(int devs)
 
 static int pmu_adb_autopoll(int devs)
 {
-   if ((vias == NULL) || (!pmu_fully_inited) || !pmu_has_adb)
+   if (pmu_state == uninitialized || !pmu_fully_inited || !pmu_has_adb)
return -ENXIO;
 
adb_dev_map = devs;
@@ -1073,7 +1073,7 @@ static int pmu_adb_reset_bus(void)
struct adb_request req;
int save_autopoll = adb_dev_map;
 
-   if ((vias == NULL) || (!pmu_fully_inited) || !pmu_has_adb)
+   if (pmu_state == uninitialized || !pmu_fully_inited || !pmu_has_adb)
return -ENXIO;
 
/* anyone got a better idea?? */
@@ -1109,7 +1109,7 @@ pmu_request(struct adb_request *req, void (*done)(struct 
adb_request *),
va_list list;
int i;
 
-   if (vias == NULL)
+   if (pmu_state == uninitialized)
return -ENXIO;
 
if (nbytes < 0 || nbytes > 32) {
@@ -1134,7 +1134,7 @@ pmu_queue_request(struct adb_request *req)
unsigned long flags;
int nsend;
 
-   if (via == NULL) {
+   if (pmu_state == uninitialized) {
req->complete = 1;
return -ENXIO;
}
@@ -1247,7 +1247,7 @@ pmu_start(void)
 void
 pmu_poll(void)
 {
-   if (!via)
+   if (pmu_state == uninitialized)
return;
if (disable_poll)
return;
@@ -1257,7 +1257,7 @@ pmu_poll(void)
 void
 pmu_poll_adb(void)
 {
-   if (!via)
+   if (pmu_state == uninitialized)
return;
if (disable_poll)
return;
@@ -1272,7 +1272,7 @@ pmu_poll_adb(void)
 void
 pmu_wait_complete(struct adb_request *req)
 {
-   if (!via)
+   if (pmu_state == uninitialized)
return;
while((pmu_state != idle && pmu_state != locked) || !req->complete)
via_pmu_interrupt(0, NULL);
@@ -1288,7 +1288,7 @@ pmu_suspend(void)
 {
unsigned long flags;
 
-   if (!via)
+   if (pmu_state == uninitialized)
return;

spin_lock_irqsave(_lock, flags);
@@ -1319,7 +1319,7 @@ pmu_resume(void)
 {
unsigned long flags;
 
-   if (!via || (pmu_suspended < 1))
+   if (pmu_state == uninitialized || pmu_suspended < 1)
return;
 
spin_lock_irqsave(_lock, flags);
@@ -1681,7 +1681,7 @@ pmu_enable_irled(int on)
 {
struct adb_request req;
 
-   if (vias == NULL)
+   if (pmu_state ==

[PATCH v3 01/12] macintosh/via-pmu: Fix section mismatch warning

2018-06-13 Thread Finn Thain

The pmu_init() function has the __init qualifier, but the ops struct
that holds a pointer to it does not. This causes a build warning.
The driver works fine because the pointer is only dereferenced early.

The function is so small that there's negligible benefit from using
the __init qualifier. Remove it to fix the warning, consistent with
the other ADB drivers.

Tested-by: Stan Johnson 
Signed-off-by: Finn Thain 
Reviewed-by: Geert Uytterhoeven 
---
 drivers/macintosh/via-pmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c
index 433dbeddfcf9..fd3c5640d586 100644
--- a/drivers/macintosh/via-pmu.c
+++ b/drivers/macintosh/via-pmu.c
@@ -378,7 +378,7 @@ static int pmu_probe(void)
return vias == NULL? -ENODEV: 0;
 }
 
-static int __init pmu_init(void)
+static int pmu_init(void)
 {
if (vias == NULL)
return -ENODEV;
-- 
2.16.4

[PATCH v3 03/12] macintosh/via-pmu: Don't clear shift register interrupt flag twice

2018-06-13 Thread Finn Thain

The shift register interrupt flag gets cleared in via_pmu_interrupt()
and once again in pmu_sr_intr(). Fix this theoretical race condition.

Tested-by: Stan Johnson 
Signed-off-by: Finn Thain 
Reviewed-by: Geert Uytterhoeven 
---
 drivers/macintosh/via-pmu.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c
index 74065ea410bd..4c1bae5380c2 100644
--- a/drivers/macintosh/via-pmu.c
+++ b/drivers/macintosh/via-pmu.c
@@ -1458,7 +1458,6 @@ pmu_sr_intr(void)
 
if (in_8([B]) & TREQ) {
printk(KERN_ERR "PMU: spurious SR intr (%x)\n", in_8([B]));
-   out_8([IFR], SR_INT);
return NULL;
}
/* The ack may not yet be low when we get the interrupt */
-- 
2.16.4

[PATCH v3 00/12] macintosh: Resolve various PMU driver problems

2018-06-13 Thread Finn Thain

This series of patches has the following aims.

1) Eliminate duplicated code. Linux presently has two drivers for
   the 68HC05-based PMU devices found in Macs: via-pmu and via-pmu68k.
   There's no value in having separate PMU drivers for each architecture.

2) Avoid further work on via-pmu68k that's not needed for via-pmu.

3) Fix some bugs in the via-pmu driver.

4) Enable the /dev/pmu and /proc/pmu/* userspace APIs on m68k Macs
   by adopting via-pmu.

5) Improve stability on early 100-series PowerBooks by loading no PMU
   driver at all. Neither via-pmu nor via-pmu68k supports the early
   M50753-based PMU device found in these models.

6) Eliminate duplicated RTC accessors for PMU and Cuda. Presently these
   can be found under both arch/m68k and arch/powerpc.

7) Assist the out-of-tree NuBus PowerMac port to support PMU designs
   shared with the m68k Mac port (e.g. PowerBooks 190 and 5300).

This patch series has been regression tested on various PowerBooks
(190, 520, 3400, Pismo G3) and PowerMacs (Beige G3, G5). These patches
did not affect userland utilities. (Note that there is a userland-
visible change to the contents of /proc/pmu/interrupts.)

Changed since v1:
1) Added blank lines after 'break' statements in patch 10.
2) Improved patch description for patch 3.
3) Added reviewed-by tags.
4) Split patch 8 to make code review easier.

Changed since v2:
1) Added reviewed-by tag.
2) Retained PMU_68K_V1 and PMU_68K_V2 symbols.


Finn Thain (12):
  macintosh/via-pmu: Fix section mismatch warning
  macintosh/via-pmu: Add missing mmio accessors
  macintosh/via-pmu: Don't clear shift register interrupt flag twice
  macintosh/via-pmu: Enhance state machine with new 'uninitialized'
state
  macintosh/via-pmu: Replace via pointer with via1 and via2 pointers
  macintosh/via-pmu: Add support for m68k PowerBooks
  macintosh/via-pmu: Make CONFIG_PPC_PMAC Kconfig deps explicit
  macintosh/via-pmu68k: Don't load driver on unsupported hardware
  macintosh/via-pmu: Replace via-pmu68k driver with via-pmu driver
  macintosh: Use common code to access RTC
  macintosh/via-pmu: Clean up interrupt statistics
  macintosh/via-pmu: Disambiguate interrupt statistics

 arch/m68k/configs/mac_defconfig|   2 +-
 arch/m68k/configs/multi_defconfig  |   2 +-
 arch/m68k/mac/config.c |   2 +-
 arch/m68k/mac/misc.c   | 118 +
 arch/powerpc/platforms/powermac/time.c |  74 +--
 drivers/macintosh/Kconfig  |  19 +-
 drivers/macintosh/Makefile |   1 -
 drivers/macintosh/adb.c|   2 +-
 drivers/macintosh/via-cuda.c   |  34 ++
 drivers/macintosh/via-pmu.c| 378 ++-
 drivers/macintosh/via-pmu68k.c | 850 -
 include/linux/cuda.h   |   3 +
 include/linux/pmu.h|   3 +
 include/uapi/linux/pmu.h   |   4 +-
 14 files changed, 313 insertions(+), 1179 deletions(-)
 delete mode 100644 drivers/macintosh/via-pmu68k.c

-- 
2.16.4

[PATCH v3 02/12] macintosh/via-pmu: Add missing mmio accessors

2018-06-13 Thread Finn Thain

Add missing in_8() accessors to init_pmu() and pmu_sr_intr().

This fixes several sparse warnings:
drivers/macintosh/via-pmu.c:536:29: warning: dereference of noderef expression
drivers/macintosh/via-pmu.c:537:33: warning: dereference of noderef expression
drivers/macintosh/via-pmu.c:1455:17: warning: dereference of noderef expression
drivers/macintosh/via-pmu.c:1456:69: warning: dereference of noderef expression

Tested-by: Stan Johnson 
Signed-off-by: Finn Thain 
Reviewed-by: Geert Uytterhoeven 
---
 drivers/macintosh/via-pmu.c | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c
index fd3c5640d586..74065ea410bd 100644
--- a/drivers/macintosh/via-pmu.c
+++ b/drivers/macintosh/via-pmu.c
@@ -532,8 +532,9 @@ init_pmu(void)
int timeout;
struct adb_request req;
 
-   out_8([B], via[B] | TREQ);  /* negate TREQ */
-   out_8([DIRB], (via[DIRB] | TREQ) & ~TACK);  /* TACK in, TREQ out */
+   /* Negate TREQ. Set TACK to input and TREQ to output. */
+   out_8([B], in_8([B]) | TREQ);
+   out_8([DIRB], (in_8([DIRB]) | TREQ) & ~TACK);
 
pmu_request(, NULL, 2, PMU_SET_INTR_MASK, pmu_intr_mask);
timeout =  10;
@@ -1455,8 +1456,8 @@ pmu_sr_intr(void)
struct adb_request *req;
int bite = 0;
 
-   if (via[B] & TREQ) {
-   printk(KERN_ERR "PMU: spurious SR intr (%x)\n", via[B]);
+   if (in_8([B]) & TREQ) {
+   printk(KERN_ERR "PMU: spurious SR intr (%x)\n", in_8([B]));
out_8([IFR], SR_INT);
return NULL;
}
-- 
2.16.4

Re: [RFC PATCH 03/23] genirq: Introduce IRQF_DELIVER_AS_NMI

2018-06-13 Thread Julien Thierry





On 13/06/18 10:57, Thomas Gleixner wrote:

On Wed, 13 Jun 2018, Julien Thierry wrote:

On 13/06/18 10:20, Thomas Gleixner wrote:

Adding NMI delivery support at low level architecture irq chip level is
perfectly fine, but the exposure of that needs to be restricted very
much. Adding it to the generic interrupt control interfaces is not going to
happen. That's doomed to begin with and a complete abuse of the interface
as the handler can not ever be used for that.



Understood, however the need would be to provide a way for a driver to request
an interrupt to be delivered as an NMI (if irqchip supports it).


s/driver/specialized code written by people who know what they are doing/


But from your response this would be out of the question (in the
interrupt/irq/irqchip definitions).


Adding some magic to the irq chip is fine, because that's where the low
level integration needs to be done, but exposing it through the generic
interrupt subsystem is a NONO for obvious reasons.


Or somehow the concerned irqchip informs the arch it supports NMI delivery and
it is up to the interested drivers to query the arch whether NMI delivery is
supported by the system?


Yes, we need some infrastructure for that, but that needs to be separate
and with very limited exposure.



Right, makes sense. I'll check with Marc how such an infrastructure 
should be introduced.


Thanks,

--
Julien Thierry

Re: [RFC PATCH 03/23] genirq: Introduce IRQF_DELIVER_AS_NMI

2018-06-13 Thread Marc Zyngier

On 13/06/18 10:20, Thomas Gleixner wrote:
> On Wed, 13 Jun 2018, Julien Thierry wrote:
>> On 13/06/18 09:34, Peter Zijlstra wrote:
>>> On Tue, Jun 12, 2018 at 05:57:23PM -0700, Ricardo Neri wrote:
 diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
 index 5426627..dbc5e02 100644
 --- a/include/linux/interrupt.h
 +++ b/include/linux/interrupt.h
 @@ -61,6 +61,8 @@
*interrupt handler after suspending interrupts. For
 system
*wakeup devices users need to implement wakeup
 detection in
*their interrupt handlers.
 + * IRQF_DELIVER_AS_NMI - Configure interrupt to be delivered as
 non-maskable, if
 + *supported by the chip.
*/
>>>
>>> NAK on the first 6 patches. You really _REALLY_ don't want to expose
>>> NMIs to this level.
>>>
>>
>> I've been working on something similar on arm64 side, and effectively the one
>> thing that might be common to arm64 and intel is the interface to set an
>> interrupt as NMI. So I guess it would be nice to agree on the right approach
>> for this.
>>
>> The way I did it was by introducing a new irq_state and let the irqchip 
>> driver
>> handle most of the work (if it supports that state):
>>
>> https://lkml.org/lkml/2018/5/25/181
>>
>> This has not been ACKed nor NAKed. So I am just asking whether this is a more
>> suitable approach, and if not, is there any suggestions on how to do this?
> 
> I really didn't pay attention to that as it's burried in the GIC/ARM series
> which is usually Marc's playground.

I'm working my way through it ATM now that I have some brain cycles back.

> Adding NMI delivery support at low level architecture irq chip level is
> perfectly fine, but the exposure of that needs to be restricted very
> much. Adding it to the generic interrupt control interfaces is not going to
> happen. That's doomed to begin with and a complete abuse of the interface
> as the handler can not ever be used for that.

I can only agree with that. Allowing random driver to use request_irq()
to make anything an NMI ultimately turns it into a complete mess ("hey,
NMI is *faster*, let's use that"), and a potential source of horrible
deadlocks.

What I'd find more palatable is a way for an irqchip to be able to
prioritize some interrupts based on a set of architecturally-defined
requirements, and a separate NMI requesting/handling framework that is
separate from the IRQ API, as the overall requirements are likely to
completely different.

It shouldn't have to be nearly as complex as the IRQ API, and require
much stricter requirements in terms of what you can do there (flow
handling should definitely be different).

Thanks,

M.
-- 
Jazz is not dead. It just smells funny...

[PATCH] powerpc/64s/radix: Fix MADV_[FREE|DONTNEED] TLB flush miss problem with THP

2018-06-13 Thread Nicholas Piggin

The patch 99baac21e4 ("mm: fix MADV_[FREE|DONTNEED] TLB flush miss
problem") added a force flush mode to the mmu_gather flush, which
unconditionally flushes the entire address range being invalidated
(even if actual ptes only covered a smaller range), to solve a problem
with concurrent threads invalidating the same PTEs causing them to
miss TLBs that need flushing.

This does not work with powerpc that invalidates mmu_gather batches
according to page size. Have powerpc flush all possible page sizes in
the range if it encounters the concurrency condition.

Hash does not have a problem because it invalidates TLBs inside the
page table locks.

Reported-by: Aneesh Kumar K.V 
Signed-off-by: Nicholas Piggin 
---
Since RFC:
- Account for hugetlb pages that can be mixed with the tlb_flush
  range.

 arch/powerpc/mm/tlb-radix.c | 85 -
 1 file changed, 66 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index 67a6e86d3e7e..9dbccda651d7 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -689,22 +689,17 @@ EXPORT_SYMBOL(radix__flush_tlb_kernel_range);
 static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
 static unsigned long tlb_local_single_page_flush_ceiling __read_mostly = 
POWER9_TLB_SETS_RADIX * 2;
 
-void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
-unsigned long end)
+static inline void __radix__flush_tlb_range(struct mm_struct *mm,
+   unsigned long start, unsigned long end,
+   bool flush_all_sizes)
 
 {
-   struct mm_struct *mm = vma->vm_mm;
unsigned long pid;
unsigned int page_shift = mmu_psize_defs[mmu_virtual_psize].shift;
unsigned long page_size = 1UL << page_shift;
unsigned long nr_pages = (end - start) >> page_shift;
bool local, full;
 
-#ifdef CONFIG_HUGETLB_PAGE
-   if (is_vm_hugetlb_page(vma))
-   return radix__flush_hugetlb_tlb_range(vma, start, end);
-#endif
-
pid = mm->context.id;
if (unlikely(pid == MMU_NO_CONTEXT))
return;
@@ -738,18 +733,27 @@ void radix__flush_tlb_range(struct vm_area_struct *vma, 
unsigned long start,
_tlbie_pid(pid, RIC_FLUSH_TLB);
}
} else {
-   bool hflush = false;
+   bool hflush = flush_all_sizes;
+   bool gflush = flush_all_sizes;
unsigned long hstart, hend;
+   unsigned long gstart, gend;
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-   hstart = (start + HPAGE_PMD_SIZE - 1) >> HPAGE_PMD_SHIFT;
-   hend = end >> HPAGE_PMD_SHIFT;
-   if (hstart < hend) {
-   hstart <<= HPAGE_PMD_SHIFT;
-   hend <<= HPAGE_PMD_SHIFT;
+   if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
hflush = true;
+
+   if (hflush) {
+   hstart = (start + HPAGE_PMD_SIZE - 1) & HPAGE_PMD_MASK;
+   hend = end & HPAGE_PMD_MASK;
+   if (hstart == hend)
+   hflush = false;
+   }
+
+   if (gflush) {
+   gstart = (start + HPAGE_PUD_SIZE - 1) & HPAGE_PUD_MASK;
+   gend = end & HPAGE_PUD_MASK;
+   if (gstart == gend)
+   gflush = false;
}
-#endif
 
asm volatile("ptesync": : :"memory");
if (local) {
@@ -757,18 +761,36 @@ void radix__flush_tlb_range(struct vm_area_struct *vma, 
unsigned long start,
if (hflush)
__tlbiel_va_range(hstart, hend, pid,
HPAGE_PMD_SIZE, MMU_PAGE_2M);
+   if (gflush)
+   __tlbiel_va_range(gstart, gend, pid,
+   HPAGE_PUD_SIZE, MMU_PAGE_1G);
asm volatile("ptesync": : :"memory");
} else {
__tlbie_va_range(start, end, pid, page_size, 
mmu_virtual_psize);
if (hflush)
__tlbie_va_range(hstart, hend, pid,
HPAGE_PMD_SIZE, MMU_PAGE_2M);
+   if (gflush)
+   __tlbie_va_range(gstart, gend, pid,
+   HPAGE_PUD_SIZE, MMU_PAGE_1G);
fixup_tlbie();
asm volatile("eieio; tlbsync; ptesync": : :"memory");
}
}
preempt_enable();
 }
+
+void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
+unsigned long end)
+
+{
+#ifdef

Re: [RFC PATCH 03/23] genirq: Introduce IRQF_DELIVER_AS_NMI

2018-06-13 Thread Thomas Gleixner

On Wed, 13 Jun 2018, Julien Thierry wrote:
> On 13/06/18 10:20, Thomas Gleixner wrote:
> > Adding NMI delivery support at low level architecture irq chip level is
> > perfectly fine, but the exposure of that needs to be restricted very
> > much. Adding it to the generic interrupt control interfaces is not going to
> > happen. That's doomed to begin with and a complete abuse of the interface
> > as the handler can not ever be used for that.
> > 
> 
> Understood, however the need would be to provide a way for a driver to request
> an interrupt to be delivered as an NMI (if irqchip supports it).

s/driver/specialized code written by people who know what they are doing/

> But from your response this would be out of the question (in the
> interrupt/irq/irqchip definitions).

Adding some magic to the irq chip is fine, because that's where the low
level integration needs to be done, but exposing it through the generic
interrupt subsystem is a NONO for obvious reasons.

> Or somehow the concerned irqchip informs the arch it supports NMI delivery and
> it is up to the interested drivers to query the arch whether NMI delivery is
> supported by the system?

Yes, we need some infrastructure for that, but that needs to be separate
and with very limited exposure.

Thanks,

tglx

Re: [RFC PATCH 03/23] genirq: Introduce IRQF_DELIVER_AS_NMI

2018-06-13 Thread Julien Thierry





On 13/06/18 10:36, Julien Thierry wrote:



On 13/06/18 10:20, Thomas Gleixner wrote:

On Wed, 13 Jun 2018, Julien Thierry wrote:

On 13/06/18 09:34, Peter Zijlstra wrote:

On Tue, Jun 12, 2018 at 05:57:23PM -0700, Ricardo Neri wrote:

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 5426627..dbc5e02 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -61,6 +61,8 @@
    *    interrupt handler after suspending interrupts. 
For

system
    *    wakeup devices users need to implement wakeup
detection in
    *    their interrupt handlers.
+ * IRQF_DELIVER_AS_NMI - Configure interrupt to be delivered as
non-maskable, if
+ *    supported by the chip.
    */


NAK on the first 6 patches. You really _REALLY_ don't want to expose
NMIs to this level.



I've been working on something similar on arm64 side, and effectively 
the one

thing that might be common to arm64 and intel is the interface to set an
interrupt as NMI. So I guess it would be nice to agree on the right 
approach

for this.

The way I did it was by introducing a new irq_state and let the 
irqchip driver

handle most of the work (if it supports that state):

https://lkml.org/lkml/2018/5/25/181

This has not been ACKed nor NAKed. So I am just asking whether this 
is a more
suitable approach, and if not, is there any suggestions on how to do 
this?


I really didn't pay attention to that as it's burried in the GIC/ARM 
series

which is usually Marc's playground.

Adding NMI delivery support at low level architecture irq chip level is
perfectly fine, but the exposure of that needs to be restricted very
much. Adding it to the generic interrupt control interfaces is not 
going to

happen. That's doomed to begin with and a complete abuse of the interface
as the handler can not ever be used for that.



Understood, however the need would be to provide a way for a driver to 
request an interrupt to be delivered as an NMI (if irqchip supports it).


But from your response this would be out of the question (in the 
interrupt/irq/irqchip definitions).


Or somehow the concerned irqchip informs the arch it supports NMI 
delivery and it is up to the interested drivers to query the arch 
whether NMI delivery is supported by the system?


Actually scratch that last part, it is also missing a way for the driver 
to actually communicate to the irqchip that its interrupt should be 
treated as an NMI, so it wouldn't work...


--
Julien Thierry

Re: [RFC PATCH 20/23] watchdog/hardlockup/hpet: Rotate interrupt among all monitored CPUs

2018-06-13 Thread Thomas Gleixner

On Tue, 12 Jun 2018, Ricardo Neri wrote:
> + /* There are no CPUs to monitor. */
> + if (!cpumask_weight(>monitored_mask))
> + return NMI_HANDLED;
> +
>   inspect_for_hardlockups(regs);
>  
> + /*
> +  * Target a new CPU. Keep trying until we find a monitored CPU. CPUs
> +  * are addded and removed to this mask at cpu_up() and cpu_down(),
> +  * respectively. Thus, the interrupt should be able to be moved to
> +  * the next monitored CPU.
> +  */
> + spin_lock(_data->lock);

Yuck. Taking a spinlock from NMI ... 

> + for_each_cpu_wrap(cpu, >monitored_mask, smp_processor_id() + 1) {
> + if (!irq_set_affinity(hld_data->irq, cpumask_of(cpu)))
> + break;

... and then calling into generic interrupt code which will take even more
locks is completely broken.

Guess what happens when the NMI hits a section where one of those locks is
held? Then you need another watchdog to decode the lockup you just ran into.

Thanks,

tglx

Re: [RFC PATCH 17/23] watchdog/hardlockup/hpet: Convert the timer's interrupt to NMI

2018-06-13 Thread Thomas Gleixner

On Tue, 12 Jun 2018, Ricardo Neri wrote:
> @@ -183,6 +184,8 @@ static irqreturn_t hardlockup_detector_irq_handler(int 
> irq, void *data)
>   if (!(hdata->flags & HPET_DEV_PERI_CAP))
>   kick_timer(hdata);
>  
> + pr_err("This interrupt should not have happened. Ensure delivery mode 
> is NMI.\n");

Eeew.

>  /**
> + * hardlockup_detector_nmi_handler() - NMI Interrupt handler
> + * @val: Attribute associated with the NMI. Not used.
> + * @regs:Register values as seen when the NMI was asserted
> + *
> + * When an NMI is issued, look for hardlockups. If the timer is not periodic,
> + * kick it. The interrupt is always handled when if delivered via the
> + * Front-Side Bus.
> + *
> + * Returns:
> + *
> + * NMI_DONE if the HPET timer did not cause the interrupt. NMI_HANDLED
> + * otherwise.
> + */
> +static int hardlockup_detector_nmi_handler(unsigned int val,
> +struct pt_regs *regs)
> +{
> + struct hpet_hld_data *hdata = hld_data;
> + unsigned int use_fsb;
> +
> + /*
> +  * If FSB delivery mode is used, the timer interrupt is programmed as
> +  * edge-triggered and there is no need to check the ISR register.
> +  */
> + use_fsb = hdata->flags & HPET_DEV_FSB_CAP;
> +
> + if (!use_fsb && !is_hpet_wdt_interrupt(hdata))
> + return NMI_DONE;

So for 'use_fsb == True' every single NMI will fall through into the
watchdog code below.

> + inspect_for_hardlockups(regs);
> +
> + if (!(hdata->flags & HPET_DEV_PERI_CAP))
> + kick_timer(hdata);

And in case that the HPET does not support periodic mode this reprogramms
the timer on every NMI which means that while perf is running the watchdog
will never ever detect anything.

Aside of that, reading TWO HPET registers for every NMI is insane. HPET
access is horribly slow, so any high frequency perf monitoring will take a
massive performance hit.

Thanks,

tglx

Re: [RFC PATCH 03/23] genirq: Introduce IRQF_DELIVER_AS_NMI

2018-06-13 Thread Thomas Gleixner

On Wed, 13 Jun 2018, Julien Thierry wrote:
> On 13/06/18 09:34, Peter Zijlstra wrote:
> > On Tue, Jun 12, 2018 at 05:57:23PM -0700, Ricardo Neri wrote:
> > > diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
> > > index 5426627..dbc5e02 100644
> > > --- a/include/linux/interrupt.h
> > > +++ b/include/linux/interrupt.h
> > > @@ -61,6 +61,8 @@
> > >*interrupt handler after suspending interrupts. For
> > > system
> > >*wakeup devices users need to implement wakeup
> > > detection in
> > >*their interrupt handlers.
> > > + * IRQF_DELIVER_AS_NMI - Configure interrupt to be delivered as
> > > non-maskable, if
> > > + *supported by the chip.
> > >*/
> > 
> > NAK on the first 6 patches. You really _REALLY_ don't want to expose
> > NMIs to this level.
> > 
> 
> I've been working on something similar on arm64 side, and effectively the one
> thing that might be common to arm64 and intel is the interface to set an
> interrupt as NMI. So I guess it would be nice to agree on the right approach
> for this.
> 
> The way I did it was by introducing a new irq_state and let the irqchip driver
> handle most of the work (if it supports that state):
> 
> https://lkml.org/lkml/2018/5/25/181
>
> This has not been ACKed nor NAKed. So I am just asking whether this is a more
> suitable approach, and if not, is there any suggestions on how to do this?

I really didn't pay attention to that as it's burried in the GIC/ARM series
which is usually Marc's playground.

Adding NMI delivery support at low level architecture irq chip level is
perfectly fine, but the exposure of that needs to be restricted very
much. Adding it to the generic interrupt control interfaces is not going to
happen. That's doomed to begin with and a complete abuse of the interface
as the handler can not ever be used for that.

Thanks,

tglx

Re: [RFC PATCH 12/23] kernel/watchdog: Introduce a struct for NMI watchdog operations

2018-06-13 Thread Thomas Gleixner

On Wed, 13 Jun 2018, Peter Zijlstra wrote:
> On Wed, Jun 13, 2018 at 05:41:41PM +1000, Nicholas Piggin wrote:
> > On Tue, 12 Jun 2018 17:57:32 -0700
> > Ricardo Neri  wrote:
> > 
> > > Instead of exposing individual functions for the operations of the NMI
> > > watchdog, define a common interface that can be used across multiple
> > > implementations.
> > > 
> > > The struct nmi_watchdog_ops is defined for such operations. These initial
> > > definitions include the enable, disable, start, stop, and cleanup
> > > operations.
> > > 
> > > Only a single NMI watchdog can be used in the system. The operations of
> > > this NMI watchdog are accessed via the new variable nmi_wd_ops. This
> > > variable is set to point the operations of the first NMI watchdog that
> > > initializes successfully. Even though at this moment, the only available
> > > NMI watchdog is the perf-based hardlockup detector. More implementations
> > > can be added in the future.
> > 
> > Cool, this looks pretty nice at a quick glance. sparc and powerpc at
> > least have their own NMI watchdogs, it would be good to have those
> > converted as well.
> 
> Yeah, agreed, this looks like half a patch.

Though I'm not seeing the advantage of it. That kind of NMI watchdogs are
low level architecture details so having yet another 'ops' data structure
with a gazillion of callbacks, checks and indirections does not provide
value over the currently available weak stubs.

> > Is hpet a cross platform thing, or just x86? We should avoid
> > proliferation of files under kernel/ I think, so with these watchdog
> > driver structs then maybe implementations could go in drivers/ or
> > arch/
> 
> HPET is mostly an x86 thing (altough it can be found elsewhere), but the

On ia64 and I doubt that anyone wants to take on the task of underwater
welding it to Itanic.

> whole thing relies on the x86 NMI mechanism and is thus firmly arch/
> material (like the sparc and ppc thing).

Right. Trying to make this 'generic' is not really solving anything.

Thanks,

tglx

Re: [RFC PATCH 17/23] watchdog/hardlockup/hpet: Convert the timer's interrupt to NMI

2018-06-13 Thread Peter Zijlstra

On Tue, Jun 12, 2018 at 05:57:37PM -0700, Ricardo Neri wrote:

+static bool is_hpet_wdt_interrupt(struct hpet_hld_data *hdata)
+{
+   unsigned long this_isr;
+   unsigned int lvl_trig;
+
+   this_isr = hpet_readl(HPET_STATUS) & BIT(hdata->num);
+
+   lvl_trig = hpet_readl(HPET_Tn_CFG(hdata->num)) & HPET_TN_LEVEL;
+
+   if (lvl_trig && this_isr)
+   return true;
+
+   return false;
+}

> +static int hardlockup_detector_nmi_handler(unsigned int val,
> +struct pt_regs *regs)
> +{
> + struct hpet_hld_data *hdata = hld_data;
> + unsigned int use_fsb;
> +
> + /*
> +  * If FSB delivery mode is used, the timer interrupt is programmed as
> +  * edge-triggered and there is no need to check the ISR register.
> +  */
> + use_fsb = hdata->flags & HPET_DEV_FSB_CAP;

Please do explain.. That FSB thing basically means MSI. But there's only
a single NMI vector. How do we know this NMI came from the HPET?

> +
> + if (!use_fsb && !is_hpet_wdt_interrupt(hdata))

So you add _2_ HPET reads for every single NMI that gets triggered...
and IIRC HPET reads are _slloww_.

> + return NMI_DONE;
> +
> + inspect_for_hardlockups(regs);
> +
> + if (!(hdata->flags & HPET_DEV_PERI_CAP))
> + kick_timer(hdata);
> +
> + /* Acknowledge interrupt if in level-triggered mode */
> + if (!use_fsb)
> + hpet_writel(BIT(hdata->num), HPET_STATUS);
> +
> + return NMI_HANDLED;

So if I read this right, when in FSB/MSI mode, we'll basically _always_
claim every single NMI as handled?

That's broken.

> +}

Re: [RFC PATCH 03/23] genirq: Introduce IRQF_DELIVER_AS_NMI

2018-06-13 Thread Julien Thierry


Hi Peter, Ricardo,

On 13/06/18 09:34, Peter Zijlstra wrote:

On Tue, Jun 12, 2018 at 05:57:23PM -0700, Ricardo Neri wrote:

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 5426627..dbc5e02 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -61,6 +61,8 @@
   *interrupt handler after suspending interrupts. For system
   *wakeup devices users need to implement wakeup detection in
   *their interrupt handlers.
+ * IRQF_DELIVER_AS_NMI - Configure interrupt to be delivered as non-maskable, 
if
+ *supported by the chip.
   */


NAK on the first 6 patches. You really _REALLY_ don't want to expose
NMIs to this level.



I've been working on something similar on arm64 side, and effectively 
the one thing that might be common to arm64 and intel is the interface 
to set an interrupt as NMI. So I guess it would be nice to agree on the 
right approach for this.


The way I did it was by introducing a new irq_state and let the irqchip 
driver handle most of the work (if it supports that state):


https://lkml.org/lkml/2018/5/25/181

This has not been ACKed nor NAKed. So I am just asking whether this is a 
more suitable approach, and if not, is there any suggestions on how to 
do this?


Thanks,

--
Julien Thierry

Re: [RFC PATCH 14/23] watchdog/hardlockup: Decouple the hardlockup detector from perf

2018-06-13 Thread Peter Zijlstra

On Tue, Jun 12, 2018 at 05:57:34PM -0700, Ricardo Neri wrote:
> The current default implementation of the hardlockup detector assumes that
> it is implemented using perf events.

The sparc and powerpc things are very much not using perf.

Re: UBSAN: Undefined behaviour in ../include/linux/percpu_counter.h:137:13

2018-06-13 Thread Mathieu Malaterre

On Wed, Jun 13, 2018 at 3:43 AM Michael Ellerman  wrote:
>
> Mathieu Malaterre  writes:
>
> > Hi there,
> >
> > I have a reproducible UBSAN appearing in dmesg after a while on my G4
> > (*). Could anyone suggest a way to diagnose the actual root issue here
> > (or is it just a false positive) ?
>
> It looks like a real overflow, I guess the question is why are we seeing it.
>
> The first thing to work out would be what exactly is overflowing.
>
> Is it in here?
>
> cfqg_stats_update_completion(cfqq->cfqg, rq->start_time_ns,
>  rq->io_start_time_ns, rq->cmd_flags);
>
>
> If so that would suggest something is taking multiple hours to complete,
> which seems unlikely. Is time going backward?

There is also something suspicious in the kern.log file:

Jun 12 20:09:04 debian kernel: [5.504182]

Jun 12 20:09:04 debian kernel: [5.508945] UBSAN: Undefined
behaviour in ../drivers/rtc/rtc-lib.c:87:22
Jun 12 20:09:04 debian kernel: [5.513658] signed integer overflow:
Jun 12 20:09:04 debian kernel: [5.518211] 1193024 * 3600 cannot be
represented in type 'int'
Jun 12 20:09:04 debian kernel: [5.522866] CPU: 0 PID: 1 Comm:
swapper Not tainted 4.17.0+ #1
Jun 12 20:09:04 debian kernel: [5.527567] Call Trace:
Jun 12 20:09:04 debian kernel: [5.532200] [df4e7b00] [c0481074]
ubsan_epilogue+0x18/0x4c (unreliable)
Jun 12 20:09:04 debian kernel: [5.537019] [df4e7b10] [c0481a14]
handle_overflow+0xbc/0xdc
Jun 12 20:09:04 debian kernel: [5.541832] [df4e7b90] [c060d698]
rtc_time64_to_tm+0x344/0x388
Jun 12 20:09:04 debian kernel: [5.546655] [df4e7bd0] [c001076c]
rtc_generic_get_time+0x2c/0x40
Jun 12 20:09:04 debian kernel: [5.551477] [df4e7be0] [c06113d4]
__rtc_read_time+0x70/0x13c
Jun 12 20:09:04 debian kernel: [5.556288] [df4e7c00] [c061150c]
rtc_read_time+0x6c/0x130
Jun 12 20:09:04 debian kernel: [5.561088] [df4e7c30] [c061271c]
__rtc_read_alarm+0x34/0x684
Jun 12 20:09:04 debian kernel: [5.565884] [df4e7ce0] [c060f234]
rtc_device_register+0x88/0x218
Jun 12 20:09:04 debian kernel: [5.570695] [df4e7d40] [c060f428]
devm_rtc_device_register+0x64/0xc4
Jun 12 20:09:04 debian kernel: [5.575528] [df4e7d60] [c09d15d4]
generic_rtc_probe+0x50/0x78
Jun 12 20:09:04 debian kernel: [5.580359] [df4e7d70] [c055e4a4]
platform_drv_probe+0xa8/0x128
Jun 12 20:09:04 debian kernel: [5.585210] [df4e7d90] [c0559d28]
driver_probe_device+0x354/0x6fc
Jun 12 20:09:04 debian kernel: [5.590064] [df4e7dd0] [c055a270]
__driver_attach+0x1a0/0x22c
Jun 12 20:09:04 debian kernel: [5.594917] [df4e7df0] [c0555b70]
bus_for_each_dev+0x84/0xdc
Jun 12 20:09:04 debian kernel: [5.599750] [df4e7e20] [c0558420]
bus_add_driver+0x188/0x348
Jun 12 20:09:04 debian kernel: [5.604584] [df4e7e40] [c055b7b4]
driver_register+0xa0/0x18c
Jun 12 20:09:04 debian kernel: [5.609433] [df4e7e50] [c055e950]
__platform_driver_probe+0x8c/0x198
Jun 12 20:09:04 debian kernel: [5.614330] [df4e7e70] [c0005800]
do_one_initcall+0x64/0x280
Jun 12 20:09:04 debian kernel: [5.619237] [df4e7ee0] [c0997c04]
kernel_init_freeable+0x3a4/0x444
Jun 12 20:09:04 debian kernel: [5.624145] [df4e7f30] [c00049f8]
kernel_init+0x24/0x118
Jun 12 20:09:04 debian kernel: [5.629029] [df4e7f40] [c001b1c4]
ret_from_kernel_thread+0x14/0x1c
Jun 12 20:09:04 debian kernel: [5.633878]



Grep-ing all leads to:

$ grep  "cannot be represented" kern.log | colrm 1 45|sort -u
 1193022 * 3600 cannot be represented in type 'int'
 1193024 * 3600 cannot be represented in type 'int'
 1193032 * 3600 cannot be represented in type 'int'
 1193033 * 3600 cannot be represented in type 'int'
 1193034 * 3600 cannot be represented in type 'int'
 1193035 * 3600 cannot be represented in type 'int'

How come tm_hour can store a value of 1193035 ?

> cheers
>
> > (*)
> > [41877.514338] 
> > 
> > [41877.514364] UBSAN: Undefined behaviour in
> > ../include/linux/percpu_counter.h:137:13
> > [41877.514373] signed integer overflow:
> > [41877.514378] 9223352809007201260 + 41997676517838 cannot be
> > represented in type 'long long int'
> > [41877.514389] CPU: 0 PID: 0 Comm: swapper Not tainted 4.17.0+ #54
> > [41877.514394] Call Trace:
> > [41877.514411] [dffedd30] [c047a5f8] ubsan_epilogue+0x18/0x4c (unreliable)
> > [41877.514422] [dffedd40] [c047af98] handle_overflow+0xbc/0xdc
> > [41877.514437] [dffeddc0] [c043aaa8] cfq_completed_request+0x560/0x1234
> > [41877.514446] [dffede40] [c03f595c] __blk_put_request+0xb0/0x2dc
> > [41877.514460] [dffede80] [c05aa41c] scsi_end_request+0x19c/0x344
> > [41877.514469] [dffedeb0] [c05abba0] scsi_io_completion+0x4b4/0x854
> > [41877.514482] [dffedf10] [c040604c] blk_done_softirq+0xe4/0x1e0
> > [41877.514496] [dffedf60] [c07eef84]

Re: [RFC PATCH 12/23] kernel/watchdog: Introduce a struct for NMI watchdog operations

2018-06-13 Thread Peter Zijlstra

On Wed, Jun 13, 2018 at 05:41:41PM +1000, Nicholas Piggin wrote:
> On Tue, 12 Jun 2018 17:57:32 -0700
> Ricardo Neri  wrote:
> 
> > Instead of exposing individual functions for the operations of the NMI
> > watchdog, define a common interface that can be used across multiple
> > implementations.
> > 
> > The struct nmi_watchdog_ops is defined for such operations. These initial
> > definitions include the enable, disable, start, stop, and cleanup
> > operations.
> > 
> > Only a single NMI watchdog can be used in the system. The operations of
> > this NMI watchdog are accessed via the new variable nmi_wd_ops. This
> > variable is set to point the operations of the first NMI watchdog that
> > initializes successfully. Even though at this moment, the only available
> > NMI watchdog is the perf-based hardlockup detector. More implementations
> > can be added in the future.
> 
> Cool, this looks pretty nice at a quick glance. sparc and powerpc at
> least have their own NMI watchdogs, it would be good to have those
> converted as well.

Yeah, agreed, this looks like half a patch.

> Is hpet a cross platform thing, or just x86? We should avoid
> proliferation of files under kernel/ I think, so with these watchdog
> driver structs then maybe implementations could go in drivers/ or
> arch/

HPET is mostly an x86 thing (altough it can be found elsewhere), but the
whole thing relies on the x86 NMI mechanism and is thus firmly arch/
material (like the sparc and ppc thing).

Re: [RFC PATCH 03/23] genirq: Introduce IRQF_DELIVER_AS_NMI

2018-06-13 Thread Peter Zijlstra

On Tue, Jun 12, 2018 at 05:57:23PM -0700, Ricardo Neri wrote:
> diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
> index 5426627..dbc5e02 100644
> --- a/include/linux/interrupt.h
> +++ b/include/linux/interrupt.h
> @@ -61,6 +61,8 @@
>   *interrupt handler after suspending interrupts. For system
>   *wakeup devices users need to implement wakeup detection in
>   *their interrupt handlers.
> + * IRQF_DELIVER_AS_NMI - Configure interrupt to be delivered as 
> non-maskable, if
> + *supported by the chip.
>   */

NAK on the first 6 patches. You really _REALLY_ don't want to expose
NMIs to this level.

Re: [RFC PATCH 3/4] powerpc/64s: Wire up arch_trigger_cpumask_backtrace()

2018-06-13 Thread Christophe LEROY


Hi Michael,

It looks like this commit generates the following error:

stacktrace.c:(.text+0x1b0): undefined reference to `.smp_send_safe_nmi_ipi'
make[1]: *** [vmlinux] Error 1
make: *** [sub-make] Error 2

See http://kisskb.ellerman.id.au/kisskb/buildresult/13395345/ for details

Seems like that function only exists when CONFIG_NMI_IPI is defined.

Christophe

Le 02/05/2018 à 15:07, Michael Ellerman a écrit :

This allows eg. the RCU stall detector, or the soft/hardlockup
detectors to trigger a backtrace on all CPUs.

We implement this by sending a "safe" NMI, which will actually only
send an IPI. Unfortunately the generic code prints "NMI", so that's a
little confusing but we can probably live with it.

If one of the CPUs doesn't respond to the IPI, we then print some info
from it's paca and do a backtrace based on its saved_r1.

Example output:

   INFO: rcu_sched detected stalls on CPUs/tasks:
2-...0: (0 ticks this GP) idle=1be/1/4611686018427387904 
softirq=1055/1055 fqs=25735
(detected by 4, t=58847 jiffies, g=58, c=57, q=1258)
   Sending NMI from CPU 4 to CPUs 2:
   CPU 2 didn't respond to backtrace IPI, inspecting paca.
   irq_soft_mask: 0x01 in_mce: 0 in_nmi: 0 current: 3623 (bash)
   Back trace of paca->saved_r1 (0xc000e1c83ba0) (possibly stale):
   Call Trace:
   [c000e1c83ba0] [0014] 0x14 (unreliable)
   [c000e1c83bc0] [c0765798] lkdtm_do_action+0x48/0x80
   [c000e1c83bf0] [c0765a40] direct_entry+0x110/0x1b0
   [c000e1c83c90] [c058e650] full_proxy_write+0x90/0xe0
   [c000e1c83ce0] [c03aae3c] __vfs_write+0x6c/0x1f0
   [c000e1c83d80] [c03ab214] vfs_write+0xd4/0x240
   [c000e1c83dd0] [c03ab5cc] ksys_write+0x6c/0x110
   [c000e1c83e30] [c000b860] system_call+0x58/0x6c

Signed-off-by: Michael Ellerman 
---
  arch/powerpc/include/asm/nmi.h   |  4 
  arch/powerpc/kernel/stacktrace.c | 51 
  2 files changed, 55 insertions(+)

diff --git a/arch/powerpc/include/asm/nmi.h b/arch/powerpc/include/asm/nmi.h
index 9c80939b4d14..e97f58689ca7 100644
--- a/arch/powerpc/include/asm/nmi.h
+++ b/arch/powerpc/include/asm/nmi.h
@@ -4,6 +4,10 @@
  
  #ifdef CONFIG_PPC_WATCHDOG

  extern void arch_touch_nmi_watchdog(void);
+extern void arch_trigger_cpumask_backtrace(const cpumask_t *mask,
+  bool exclude_self);
+#define arch_trigger_cpumask_backtrace arch_trigger_cpumask_backtrace
+
  #else
  static inline void arch_touch_nmi_watchdog(void) {}
  #endif
diff --git a/arch/powerpc/kernel/stacktrace.c b/arch/powerpc/kernel/stacktrace.c
index d534ed901538..cf4652d5df80 100644
--- a/arch/powerpc/kernel/stacktrace.c
+++ b/arch/powerpc/kernel/stacktrace.c
@@ -11,12 +11,15 @@
   */
  
  #include 

+#include 
  #include 
  #include 
  #include 
  #include 
  #include 
  
+#include 

+
  /*
   * Save stack-backtrace addresses into a stack_trace buffer.
   */
@@ -76,3 +79,51 @@ save_stack_trace_regs(struct pt_regs *regs, struct 
stack_trace *trace)
save_context_stack(trace, regs->gpr[1], current, 0);
  }
  EXPORT_SYMBOL_GPL(save_stack_trace_regs);
+
+#ifdef CONFIG_PPC_BOOK3S_64
+static void handle_backtrace_ipi(struct pt_regs *regs)
+{
+   nmi_cpu_backtrace(regs);
+}
+
+static void raise_backtrace_ipi(cpumask_t *mask)
+{
+   unsigned int cpu;
+
+   for_each_cpu(cpu, mask) {
+   if (cpu == smp_processor_id())
+   handle_backtrace_ipi(NULL);
+   else
+   smp_send_safe_nmi_ipi(cpu, handle_backtrace_ipi, 5 * 
USEC_PER_SEC);
+   }
+
+   for_each_cpu(cpu, mask) {
+   struct paca_struct *p = paca_ptrs[cpu];
+
+   cpumask_clear_cpu(cpu, mask);
+
+   pr_warn("CPU %d didn't respond to backtrace IPI, inspecting 
paca.\n", cpu);
+   if (!virt_addr_valid(p)) {
+   pr_warn("paca pointer appears corrupt? (%px)\n", p);
+   continue;
+   }
+
+   pr_warn("irq_soft_mask: 0x%02x in_mce: %d in_nmi: %d",
+   p->irq_soft_mask, p->in_mce, p->in_nmi);
+
+   if (virt_addr_valid(p->__current))
+   pr_cont(" current: %d (%s)\n", p->__current->pid,
+   p->__current->comm);
+   else
+   pr_cont(" current pointer corrupt? (%px)\n", 
p->__current);
+
+   pr_warn("Back trace of paca->saved_r1 (0x%016llx) (possibly 
stale):\n", p->saved_r1);
+   show_stack(p->__current, (unsigned long *)p->saved_r1);
+   }
+}
+
+void arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self)
+{
+   nmi_trigger_cpumask_backtrace(mask, exclude_self, raise_backtrace_ipi);
+}
+#endif /* CONFIG_PPC64 */

Re: [RFC V2] virtio: Add platform specific DMA API translation for virito devices

2018-06-13 Thread Christoph Hellwig

On Mon, Jun 11, 2018 at 01:29:18PM +1000, Benjamin Herrenschmidt wrote:
> At the risk of repeating myself, let's just do the first pass which is
> to switch virtio over to always using the DMA API in the actual data
> flow code, with a hook at initialization time that replaces the DMA ops
> with some home cooked "direct" ops in the case where the IOMMU flag
> isn't set.
> 
> This will be equivalent to what we have today but avoids having 2
> separate code path all over the driver.
> 
> Then a second stage, I think, is to replace this "hook" so that the
> architecture gets a say in the matter.

I don't think we can actually use dma_direct_ops.  It still allows
architectures to override parts of the dma setup, which virtio seems
to blindly assume phys == dma and not cache flushing.

I think the right way forward is to either add a new
VIRTIO_F_IS_PCI_DEVICE (or redefine the existing iommu flag if deemed
possible).  And then make sure recent qemu always sets it.

Re: [RFC PATCH 12/23] kernel/watchdog: Introduce a struct for NMI watchdog operations

2018-06-13 Thread Nicholas Piggin

On Tue, 12 Jun 2018 17:57:32 -0700
Ricardo Neri  wrote:

> Instead of exposing individual functions for the operations of the NMI
> watchdog, define a common interface that can be used across multiple
> implementations.
> 
> The struct nmi_watchdog_ops is defined for such operations. These initial
> definitions include the enable, disable, start, stop, and cleanup
> operations.
> 
> Only a single NMI watchdog can be used in the system. The operations of
> this NMI watchdog are accessed via the new variable nmi_wd_ops. This
> variable is set to point the operations of the first NMI watchdog that
> initializes successfully. Even though at this moment, the only available
> NMI watchdog is the perf-based hardlockup detector. More implementations
> can be added in the future.

Cool, this looks pretty nice at a quick glance. sparc and powerpc at
least have their own NMI watchdogs, it would be good to have those
converted as well.

Is hpet a cross platform thing, or just x86? We should avoid
proliferation of files under kernel/ I think, so with these watchdog
driver structs then maybe implementations could go in drivers/ or
arch/

Thanks,
Nick

Re: 4.17.0-10146-gf0dc7f9c6dd9: hw csum failure on powerpc+sungem

2018-06-13 Thread Mathieu Malaterre

Hi all,

On Tue, Jun 12, 2018 at 10:15 AM Mathieu Malaterre  wrote:
>
> Hi Balbir,
>
> On Tue, Jun 12, 2018 at 9:39 AM Balbir Singh  wrote:
> >
> >
> > On 12/06/18 06:20, Mathieu Malaterre wrote:
> >
> > > Hi Meelis,
> > >
> > > On Mon, Jun 11, 2018 at 1:21 PM Meelis Roos  wrote:
> > >> I am seeing this on PowerMac G4 with sungem ethernet driver. 4.17 was
> > >> OK, 4.17.0-10146-gf0dc7f9c6dd9 is problematic.
> > > Same here.
> > >
> > >> [  140.518664] eth0: hw csum failure
> > >> [  140.518699] CPU: 0 PID: 1237 Comm: postconf Not tainted 
> > >> 4.17.0-10146-gf0dc7f9c6dd9 #83
> > >> [  140.518707] Call Trace:
> > >> [  140.518734] [effefd90] [c03d6db8] __skb_checksum_complete+0xd8/0xdc 
> > >> (unreliable)
> > >> [  140.518759] [effefdb0] [c04c1284] icmpv6_rcv+0x248/0x4ec
> > >> [  140.518775] [effefdd0] [c049a448] 
> > >> ip6_input_finish.constprop.0+0x11c/0x5f4
> > >> [  140.518786] [effefe10] [c049b1c0] ip6_mc_input+0xcc/0x100
> > >> [  140.518807] [effefe20] [c03e110c] __netif_receive_skb_core+0x310/0x944
> > >> [  140.518820] [effefe70] [c03e76ec] napi_gro_receive+0xd0/0xe8
> > >> [  140.518845] [effefe80] [f3e1f66c] gem_poll+0x618/0x1274 [sungem]
> > >> [  140.518856] [effeff30] [c03e6f0c] net_rx_action+0x198/0x374
> > >> [  140.518872] [effeff90] [c0501a88] __do_softirq+0x120/0x278
> > >> [  140.518890] [effeffe0] [c0036188] irq_exit+0xd8/0xdc
> > >> [  140.518908] [effefff0] [c000f478] call_do_irq+0x24/0x3c
> > >> [  140.518925] [d05a5d30] [c0007120] do_IRQ+0x74/0xf0
> > >> [  140.518941] [d05a5d50] [c0012474] ret_from_except+0x0/0x14
> > >> [  140.518960] --- interrupt: 501 at copy_page+0x40/0x90
> > >>LR = copy_user_page+0x18/0x30
> > >> [  140.518973] [d05a5e10] [d058cd80] 0xd058cd80 (unreliable)
> > >> [  140.518989] [d05a5e20] [c00fa2bc] wp_page_copy+0xec/0x654
> > >> [  140.519002] [d05a5e60] [c00fd3a4] do_wp_page+0xa8/0x5b4
> > >> [  140.519013] [d05a5e90] [c00fe934] handle_mm_fault+0x564/0xa84
> > >> [  140.519025] [d05a5f00] [c0016230] do_page_fault+0x1bc/0x7e8
> > >> [  140.519037] [d05a5f40] [c0012300] handle_page_fault+0x14/0x40
> > >> [  140.519048] --- interrupt: 301 at 0xb78b6864
> > >>LR = 0xb78b6c54
> > >>
> > > For some reason if I do a git bisect it returns that:
> > >
> > > $ git bisect good
> > > 3036bc45364f98515a2c446d7fac2c34dcfbeff4 is the first bad commit
> > >
> > > Could you also check on your side please.
> >
> > Don't have a G4, but the related commits look like
> >
> > 373e098e1e788d7b89ec0f31765a6c08e2ea0f7c and 
> > e9c4943a107b56696e4872cdffdba6b0c7277c77 Balbir
> >
>
> Indeed that makes more sense. I must have messed up during my
> git-bisect operation. Will try a simple git-revert on those and report
> back.

Here is what I get.

Current git HEAD is: 0c14e43a42e4e44f70963f8ccf89461290c4e4da if I do:

$ git revert e9c4943a107b56696e4872cdffdba6b0c7277c77.
$ git revert 373e098e1e788d7b89ec0f31765a6c08e2ea0f7c
$ rm -rf g4
$ make O=g4 ARCH=powerpc g4_defconfig
$ make -j8 O=g4 ARCH=powerpc CROSS_COMPILE=powerpc-linux-gnu- bindeb-pkg

-> I can still see the error in dmesg.

> Thanks

[PATCH V3 2/2] crypto/nx: Initialize 842 high and normal RxFIFO control registers

2018-06-13 Thread Haren Myneni



NX increments readOffset by FIFO size in receive FIFO control register
when CRB is read. But the index in RxFIFO has to match with the
corresponding entry in FIFO maintained by VAS in kernel. Otherwise NX
may be processing incorrect CRBs and can cause CRB timeout.

VAS FIFO offset is 0 when the receive window is opened during
initialization. When the module is reloaded or in kexec boot, readOffset
in FIFO control register may not match with VAS entry. This patch adds
nx_coproc_init OPAL call to reset readOffset and queued entries in FIFO
control register for both high and normal FIFOs.

Signed-off-by: Haren Myneni 
---
Changlog:
V2: Execute nx_coproc_init OPAL call per NX without depending on
FIFO priority [Stewart Smith]

V3: Verify OPAL call availability before using NX coproc init
[Michael Ellerman]

diff --git a/arch/powerpc/include/asm/opal-api.h 
b/arch/powerpc/include/asm/opal-api.h
index d886a5b..ff61e4b 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -206,7 +206,8 @@
 #define OPAL_NPU_TL_SET161
 #define OPAL_PCI_GET_PBCQ_TUNNEL_BAR   164
 #define OPAL_PCI_SET_PBCQ_TUNNEL_BAR   165
-#define OPAL_LAST  165
+#defineOPAL_NX_COPROC_INIT 167
+#define OPAL_LAST  167
 
 /* Device tree flags */
 
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 7159e1a..d79eb82 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -288,6 +288,7 @@ int64_t opal_imc_counters_init(uint32_t type, uint64_t 
address,
 int opal_get_power_shift_ratio(u32 handle, int token, u32 *psr);
 int opal_set_power_shift_ratio(u32 handle, int token, u32 psr);
 int opal_sensor_group_clear(u32 group_hndl, int token);
+int opal_nx_coproc_init(uint32_t chip_id, uint32_t ct);
 
 s64 opal_signal_system_reset(s32 cpu);
 
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S 
b/arch/powerpc/platforms/powernv/opal-wrappers.S
index 3da30c2..c7541a9 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -325,3 +325,4 @@ OPAL_CALL(opal_npu_spa_clear_cache, 
OPAL_NPU_SPA_CLEAR_CACHE);
 OPAL_CALL(opal_npu_tl_set, OPAL_NPU_TL_SET);
 OPAL_CALL(opal_pci_get_pbcq_tunnel_bar,
OPAL_PCI_GET_PBCQ_TUNNEL_BAR);
 OPAL_CALL(opal_pci_set_pbcq_tunnel_bar,
OPAL_PCI_SET_PBCQ_TUNNEL_BAR);
+OPAL_CALL(opal_nx_coproc_init, OPAL_NX_COPROC_INIT);
diff --git a/arch/powerpc/platforms/powernv/opal.c 
b/arch/powerpc/platforms/powernv/opal.c
index 463eab8..0e479a6 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -1037,3 +1037,5 @@ void powernv_set_nmmu_ptcr(unsigned long ptcr)
 EXPORT_SYMBOL_GPL(opal_int_set_mfrr);
 EXPORT_SYMBOL_GPL(opal_int_eoi);
 EXPORT_SYMBOL_GPL(opal_error_code);
+/* Export the below symbol for NX compression */
+EXPORT_SYMBOL(opal_nx_coproc_init);
diff --git a/drivers/crypto/nx/nx-842-powernv.c 
b/drivers/crypto/nx/nx-842-powernv.c
index 1e87637..a401055 100644
--- a/drivers/crypto/nx/nx-842-powernv.c
+++ b/drivers/crypto/nx/nx-842-powernv.c
@@ -24,6 +24,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Dan Streetman ");
@@ -753,7 +755,7 @@ static int nx842_open_percpu_txwins(void)
 }
 
 static int __init vas_cfg_coproc_info(struct device_node *dn, int chip_id,
-   int vasid)
+   int vasid, int *ct)
 {
struct vas_window *rxwin = NULL;
struct vas_rx_win_attr rxattr;
@@ -837,6 +839,15 @@ static int __init vas_cfg_coproc_info(struct device_node 
*dn, int chip_id,
coproc->vas.id = vasid;
nx842_add_coprocs_list(coproc, chip_id);
 
+   /*
+* (lpid, pid, tid) combination has to be unique for each
+* coprocessor instance in the system. So to make it
+* unique, skiboot uses coprocessor type such as 842 or
+* GZIP for pid and provides this value to kernel in pid
+* device-tree property.
+*/
+   *ct = pid;
+
return 0;
 
 err_out:
@@ -848,7 +859,7 @@ static int __init vas_cfg_coproc_info(struct device_node 
*dn, int chip_id,
 static int __init nx842_powernv_probe_vas(struct device_node *pn)
 {
struct device_node *dn;
-   int chip_id, vasid, ret = 0;
+   int chip_id, vasid, ct, ret = 0;
int nx_fifo_found = 0;
 
chip_id = of_get_ibm_chip_id(pn);
@@ -865,7 +876,7 @@ static int __init nx842_powernv_probe_vas(struct 
device_node *pn)
 
for_each_child_of_node(pn, dn) {
if (of_device_is_compatible(dn, "ibm,p9-nx-842")) {
-   ret = vas_cfg_coproc_info(dn, chip_id, vasid);
+   ret = vas_cfg_coproc_info(dn, chip_id, vasid, );

[PATCH V3 1/2] powerpc/powernv: Export opal_check_token symbol

2018-06-13 Thread Haren Myneni



Export opal_check_token symbol for modules to check the availability
of OPAL calls before using them.

Signed-off-by: Haren Myneni 

diff --git a/arch/powerpc/platforms/powernv/opal.c 
b/arch/powerpc/platforms/powernv/opal.c
index 48fbb41..838b79b 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -923,6 +923,7 @@ void opal_shutdown(void)
 EXPORT_SYMBOL_GPL(opal_flash_write);
 EXPORT_SYMBOL_GPL(opal_flash_erase);
 EXPORT_SYMBOL_GPL(opal_prd_msg);
+EXPORT_SYMBOL_GPL(opal_check_token);
 
 /* Convert a region of vmalloc memory to an opal sg list */
 struct opal_sg_list *opal_vmalloc_to_sg_list(void *vmalloc_addr,

Re: [patchV2 1/2] pci: introduce an extra method for matching in pci_driver

2018-06-13 Thread Pingfan Liu

On Wed, Jun 13, 2018 at 2:37 PM Christoph Hellwig  wrote:
>
> If we successfull call ->probe twice on a device we have a much
> deeper problem than we can fix in the PCI layer or shpchp.
>
OK, see your point.

> Please explain the actual race or other condition that can lead to
> this and report it to the driver core maintainer.
Oh, then it should be. I will add more debug info to narrow down the
problem, and report it to driver core maintainer.

Thanks for your kindly review.

Regards,
Pingfan

Re: [PATCH v5 2/4] resource: Use list_head to link sibling resource

2018-06-13 Thread Baoquan He

On 06/12/18 at 05:10pm, Julia Lawall wrote:
> This looks wrong.  After a list iterator, the index variable points to a
> dummy structure.
> 
> julia
> 
> url:
> https://github.com/0day-ci/linux/commits/Baoquan-He/resource-Use-list_head-to-link-sibling-resource/20180612-113600
> :: branch date: 7 hours ago
> :: commit date: 7 hours ago
> 
> >> kernel/resource.c:265:17-20: ERROR: invalid reference to the index 
> >> variable of the iterator on line 253
> 
> # 
> https://github.com/0day-ci/linux/commit/e906f15906750a86913ba2b1f08bad99129d3dfc
> git remote add linux-review https://github.com/0day-ci/linux
> git remote update linux-review
> git checkout e906f15906750a86913ba2b1f08bad99129d3dfc
> vim +265 kernel/resource.c
> 
> ^1da177e4 Linus Torvalds 2005-04-16  247
> 5eeec0ec9 Yinghai Lu 2009-12-22  248  static void 
> __release_child_resources(struct resource *r)
> 5eeec0ec9 Yinghai Lu 2009-12-22  249  {
> e906f1590 Baoquan He 2018-06-12  250  struct resource *tmp, *next;
> 5eeec0ec9 Yinghai Lu 2009-12-22  251  resource_size_t size;
> 5eeec0ec9 Yinghai Lu 2009-12-22  252
> e906f1590 Baoquan He 2018-06-12 @253  list_for_each_entry_safe(tmp, 
> next, >child, sibling) {
> 5eeec0ec9 Yinghai Lu 2009-12-22  254  tmp->parent = NULL;
> e906f1590 Baoquan He 2018-06-12  255  
> INIT_LIST_HEAD(>sibling);


list_del_init(>sibling);

Thanks, Julia. Here I should use list_del_init(>list) to
replace INIT_LIST_HEAD(>sibling). 

> 5eeec0ec9 Yinghai Lu 2009-12-22  256  
> __release_child_resources(tmp);
> 5eeec0ec9 Yinghai Lu 2009-12-22  257
> 5eeec0ec9 Yinghai Lu 2009-12-22  258  printk(KERN_DEBUG 
> "release child resource %pR\n", tmp);
> 5eeec0ec9 Yinghai Lu 2009-12-22  259  /* need to restore 
> size, and keep flags */
> 5eeec0ec9 Yinghai Lu 2009-12-22  260  size = 
> resource_size(tmp);
> 5eeec0ec9 Yinghai Lu 2009-12-22  261  tmp->start = 0;
> 5eeec0ec9 Yinghai Lu 2009-12-22  262  tmp->end = size - 1;
> 5eeec0ec9 Yinghai Lu 2009-12-22  263  }
> e906f1590 Baoquan He 2018-06-12  264
> e906f1590 Baoquan He 2018-06-12 @265  INIT_LIST_HEAD(>child);
> 5eeec0ec9 Yinghai Lu 2009-12-22  266  }
> 5eeec0ec9 Yinghai Lu 2009-12-22  267
> 
> ---
> 0-DAY kernel test infrastructureOpen Source Technology Center
> https://lists.01.org/pipermail/kbuild-all   Intel Corporation

Re: [patchV2 1/2] pci: introduce an extra method for matching in pci_driver

2018-06-13 Thread Christoph Hellwig

If we successfull call ->probe twice on a device we have a much
deeper problem than we can fix in the PCI layer or shpchp.

Please explain the actual race or other condition that can lead to
this and report it to the driver core maintainer.

[patchV2 2/2] pci/shpchp: no claim on pcie port device

2018-06-13 Thread Pingfan Liu

The Linux Device Driver Model allows a physical device to be handled
by only a single driver. But at present, both shpchp and portdrv_pci
claim PCI_CLASS_BRIDGE_PCI, and touch devices_kset when the drivers are
loaded. This causes a few problems, one is the wrong shutdown seq of
devices, owing to the broken devices_kset. This patch keeps shpchp away
from pcie port device, using the extra matching method.

Note:
I hit this bug on a Power9 machine, when "kexec -e", and see a ata-disk
behind a bridge can not write back buffer in flight due to the former
shutdown of the bridge which clears the BusMaster bit.

the device involved:
0004:00:00.0 PCI bridge: IBM Device 04c1 (prog-if 00 [Normal decode])
Control: I/O+ Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- ParErr- 
Stepping- SERR+ FastB2B- DisINTx-
Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast >TAbort- SERR- TAbort- Reset- FastB2B-
PriDiscTmr- SecDiscTmr- DiscTmrStat- DiscTmrSERREn-
Capabilities: [40] Power Management version 3
Flags: PMEClk- DSI- D1- D2- AuxCurrent=0mA 
PME(D0+,D1-,D2-,D3hot+,D3cold+)
Status: D0 NoSoftRst- PME-Enable- DSel=0 DScale=0 PME-
Capabilities: [48] Express (v2) Root Port (Slot-), MSI 00
DevCap: MaxPayload 512 bytes, PhantFunc 0
ExtTag- RBE+
DevCtl: Report errors: Correctable- Non-Fatal- Fatal- 
Unsupported-
RlxdOrd- ExtTag- PhantFunc- AuxPwr- NoSnoop-
MaxPayload 256 bytes, MaxReadReq 128 bytes
DevSta: CorrErr- UncorrErr- FatalErr- UnsuppReq- AuxPwr- 
TransPend-
LnkCap: Port #0, Speed 8GT/s, Width x8, ASPM not supported, 
Exit Latency L0s <64ns, L1 <1us
ClockPM- Surprise- LLActRep+ BwNot+ ASPMOptComp-
LnkCtl: ASPM Disabled; RCB 128 bytes Disabled- CommClk-
ExtSynch- ClockPM- AutWidDis- BWInt- AutBWInt-
LnkSta: Speed 8GT/s, Width x4, TrErr- Train- SlotClk- DLActive+ 
BWMgmt- ABWMgmt+
RootCtl: ErrCorrectable- ErrNon-Fatal- ErrFatal- PMEIntEna- 
CRSVisible-
RootCap: CRSVisible-
RootSta: PME ReqID , PMEStatus- PMEPending-
DevCap2: Completion Timeout: Range ABCD, TimeoutDis+, LTR-, 
OBFF Not Supported ARIFwd+
DevCtl2: Completion Timeout: 16ms to 55ms, TimeoutDis+, LTR-, 
OBFF Disabled ARIFwd+
LnkCtl2: Target Link Speed: 8GT/s, EnterCompliance- SpeedDis-
 Transmit Margin: Normal Operating Range, 
EnterModifiedCompliance- ComplianceSOS-
 Compliance De-emphasis: -6dB
LnkSta2: Current De-emphasis Level: -3.5dB, 
EqualizationComplete+, EqualizationPhase1+
 EqualizationPhase2+, EqualizationPhase3+, 
LinkEqualizationRequest-
Capabilities: [100 v1] Advanced Error Reporting
UESta:  DLP- SDES- TLP- FCP- CmpltTO- CmpltAbrt- UnxCmplt- 
RxOF- MalfTLP- ECRC- UnsupReq- ACSViol-
UEMsk:  DLP- SDES- TLP+ FCP- CmpltTO+ CmpltAbrt+ UnxCmplt- 
RxOF- MalfTLP- ECRC+ UnsupReq- ACSViol-
UESvrt: DLP- SDES- TLP- FCP- CmpltTO- CmpltAbrt- UnxCmplt- 
RxOF- MalfTLP- ECRC- UnsupReq- ACSViol-
CESta:  RxErr- BadTLP- BadDLLP- Rollover- Timeout- NonFatalErr-
CEMsk:  RxErr- BadTLP- BadDLLP- Rollover- Timeout- NonFatalErr-
AERCap: First Error Pointer: 00, GenCap+ CGenEn+ ChkCap+ ChkEn+
Capabilities: [148 v1] #19

Signed-off-by: Pingfan Liu 
---
 drivers/pci/hotplug/shpchp_core.c | 13 +
 1 file changed, 13 insertions(+)

diff --git a/drivers/pci/hotplug/shpchp_core.c 
b/drivers/pci/hotplug/shpchp_core.c
index 1f0f969..85b4665 100644
--- a/drivers/pci/hotplug/shpchp_core.c
+++ b/drivers/pci/hotplug/shpchp_core.c
@@ -336,6 +336,18 @@ static void shpc_remove(struct pci_dev *dev)
kfree(ctrl);
 }
 
+static int shpc_extra_match(struct pci_dev *pdev)
+{
+   /* do not claim pcie port device */
+   if (pci_is_pcie(pdev) &&
+   (pci_pcie_type(pdev) == PCI_EXP_TYPE_ROOT_PORT ||
+pci_pcie_type(pdev) == PCI_EXP_TYPE_UPSTREAM ||
+pci_pcie_type(pdev) == PCI_EXP_TYPE_DOWNSTREAM))
+   return -ENODEV;
+
+   return 0;
+}
+
 static const struct pci_device_id shpcd_pci_tbl[] = {
{PCI_DEVICE_CLASS(((PCI_CLASS_BRIDGE_PCI << 8) | 0x00), ~0)},
{ /* end: all zeroes */ }
@@ -345,6 +357,7 @@ MODULE_DEVICE_TABLE(pci, shpcd_pci_tbl);
 static struct pci_driver shpc_driver = {
.name = SHPC_MODULE_NAME,
.id_table = shpcd_pci_tbl,
+   .extra_match =  shpc_extra_match,
.probe =shpc_probe,
.remove =   shpc_remove,
 };
-- 
2.7.4

[patchV2 1/2] pci: introduce an extra method for matching in pci_driver

2018-06-13 Thread Pingfan Liu

In __driver_attach(), if a driver matches a device, the device will
be appended to the tail of devices_kset, no matter what the probing
result of the device. Hence in order to prevent a driver to append
a probed device to devices_kset, it requires a correct matching.

As for pci, pci driver uses pci_device_id to match a device. But it may
be not enough, since there is need for extra info such as pcie, which
can not be provided in pci_device_id. Such info is driver specific, and
this patch introduces a new method "extra_match" in pci_driver.
This is used by next patch.

Signed-off-by: Pingfan Liu 
---
 drivers/pci/pci-driver.c | 4 
 include/linux/pci.h  | 2 ++
 2 files changed, 6 insertions(+)

diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index b9a1311..151865f 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -273,6 +273,10 @@ static const struct pci_device_id *pci_match_device(struct 
pci_driver *drv,
if (!found_id && dev->driver_override)
found_id = _device_id_any;
 
+   /* more strictly matching besides id_table */
+   if (drv->extra_match && drv->extra_match(dev) < 0)
+   found_id = NULL;
+
return found_id;
 }
 
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 73178a2..6ed960d 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -738,10 +738,12 @@ struct pci_error_handlers {
 
 
 struct module;
+typedef int (*ematch)(struct pci_dev *pdev);
 struct pci_driver {
struct list_headnode;
const char  *name;
const struct pci_device_id *id_table;   /* Must be non-NULL for probe 
to be called */
+   ematch extra_match; /* more strictly matching besides id_table */
int  (*probe)(struct pci_dev *dev, const struct pci_device_id *id); 
/* New device inserted */
void (*remove)(struct pci_dev *dev);/* Device removed (NULL if not 
a hot-plug capable driver) */
int  (*suspend)(struct pci_dev *dev, pm_message_t state);   /* 
Device suspended */
-- 
2.7.4

[patchV2 0/2] shpchp: no claim on pcie port device

2018-06-13 Thread Pingfan Liu

The Linux Device Driver Model allows a physical device to be handled
by only a single driver. But at present, both shpchp and portdrv_pci
claim PCI_CLASS_BRIDGE_PCI. This cause problems, such as the wrong
shutdown seq. This series keeps shpchp driver away from pcie port device.

V1 -> V2:
implement the checking in the process of matching, instead of probing,
which is too late.

Pingfan Liu (2):
  pci: introduce an extra method for matching in pci_driver
  pci/shpchp: no claim on pcie port device

 drivers/pci/hotplug/shpchp_core.c | 13 +
 drivers/pci/pci-driver.c  |  4 
 include/linux/pci.h   |  2 ++
 3 files changed, 19 insertions(+)

-- 
2.7.4

1 2 >

1 - 100 of 102 matches

Mail list logo