date:20200504

Reviewed-by: Alistair Popple 

On Friday, 1 May 2020 1:42:16 PM AEST Jordan Niethe wrote:
> Expand the code-patching self-tests to includes tests for patching
> prefixed instructions.
> 
> Signed-off-by: Jordan Niethe 
> ---
> v6: New to series
> ---
>  arch/powerpc/lib/Makefile |  2 +-
>  arch/powerpc/lib/code-patching.c  | 21 +
>  arch/powerpc/lib/test_code-patching.S | 19 +++
>  3 files changed, 41 insertions(+), 1 deletion(-)
>  create mode 100644 arch/powerpc/lib/test_code-patching.S
> 
> diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
> index 546591848219..5e994cda8e40 100644
> --- a/arch/powerpc/lib/Makefile
> +++ b/arch/powerpc/lib/Makefile
> @@ -16,7 +16,7 @@ CFLAGS_code-patching.o += -DDISABLE_BRANCH_PROFILING
>  CFLAGS_feature-fixups.o += -DDISABLE_BRANCH_PROFILING
>  endif
> 
> -obj-y += alloc.o code-patching.o feature-fixups.o pmem.o inst.o
> +obj-y += alloc.o code-patching.o feature-fixups.o pmem.o inst.o
> test_code-patching.o
> 
>  ifndef CONFIG_KASAN
>  obj-y+=  string.o memcmp_$(BITS).o
> diff --git a/arch/powerpc/lib/code-patching.c
> b/arch/powerpc/lib/code-patching.c index b32fa707725e..7107c6d01261 100644
> --- a/arch/powerpc/lib/code-patching.c
> +++ b/arch/powerpc/lib/code-patching.c
> @@ -699,6 +699,24 @@ static void __init test_translate_branch(void)
>   vfree(buf);
>  }
> 
> +#ifdef __powerpc64__
> +static void __init test_prefixed_patching(void)
> +{
> + extern unsigned int code_patching_test1[];
> + extern unsigned int code_patching_test1_expected[];
> + extern unsigned int end_code_patching_test1[];
> +
> + __patch_instruction((struct ppc_inst *)code_patching_test1,
> + ppc_inst_prefix(1 << 26, 0x),
> + (struct ppc_inst *)code_patching_test1);
> +
> + check(!memcmp(code_patching_test1,
> +   code_patching_test1_expected,
> +   sizeof(unsigned int) *
> +   (end_code_patching_test1 - code_patching_test1)));
> +}
> +#endif
> +
>  static int __init test_code_patching(void)
>  {
>   printk(KERN_DEBUG "Running code patching self-tests ...\n");
> @@ -707,6 +725,9 @@ static int __init test_code_patching(void)
>   test_branch_bform();
>   test_create_function_call();
>   test_translate_branch();
> +#ifdef __powerpc64__
> + test_prefixed_patching();
> +#endif
> 
>   return 0;
>  }
> diff --git a/arch/powerpc/lib/test_code-patching.S
> b/arch/powerpc/lib/test_code-patching.S new file mode 100644
> index ..91aab208a804
> --- /dev/null
> +++ b/arch/powerpc/lib/test_code-patching.S
> @@ -0,0 +1,19 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Copyright (C) 2020 IBM Corporation
> + */
> +
> + .text
> +
> +#define globl(x) \
> + .globl x;   \
> +x:
> +
> +globl(code_patching_test1)
> + nop
> + nop
> +globl(end_code_patching_test1)
> +
> +globl(code_patching_test1_expected)
> + .long 1 << 26
> + .long 0x000

Re: [PATCH v7 23/28] powerpc: Add prefixed instructions to instruction data type

When reviewing earlier patches in this series I assumed the data type would 
eventually change size (on PPC64 at least) so I was looking for any possible 
side effects this may cause, but I didn't notice any so I think this should be 
ok:

Reviewed-by: Alistair Popple 

However I haven't dug deeply enough into the optprobes code to fully 
understand/comment on the changes there (although they look correct afaict).

On Friday, 1 May 2020 1:42:15 PM AEST Jordan Niethe wrote:
> For powerpc64, redefine the ppc_inst type so both word and prefixed
> instructions can be represented. On powerpc32 the type will remain the
> same.  Update places which had assumed instructions to be 4 bytes long.
> 
> Signed-off-by: Jordan Niethe 
> ---
> v4: New to series
> v5:  - Distinguish normal instructions from prefixed instructions with a
>0xff marker for the suffix.
>  - __patch_instruction() using std for prefixed instructions
> v6:  - Return false instead of 0 in ppc_inst_prefixed()
>  - Fix up types for ppc32 so it compiles
>  - remove ppc_inst_write()
>  - __patching_instruction(): move flush out of condition
> ---
>  arch/powerpc/include/asm/inst.h  | 68 +---
>  arch/powerpc/include/asm/kprobes.h   |  2 +-
>  arch/powerpc/include/asm/uaccess.h   | 32 -
>  arch/powerpc/include/asm/uprobes.h   |  2 +-
>  arch/powerpc/kernel/optprobes.c  | 42 +
>  arch/powerpc/kernel/optprobes_head.S |  3 ++
>  arch/powerpc/lib/code-patching.c | 13 --
>  arch/powerpc/lib/feature-fixups.c|  5 +-
>  arch/powerpc/lib/inst.c  | 40 
>  arch/powerpc/lib/sstep.c |  4 +-
>  arch/powerpc/xmon/xmon.c |  4 +-
>  arch/powerpc/xmon/xmon_bpts.S|  2 +
>  12 files changed, 180 insertions(+), 37 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/inst.h
> b/arch/powerpc/include/asm/inst.h index 2f3c9d5bcf7c..1e743635c214 100644
> --- a/arch/powerpc/include/asm/inst.h
> +++ b/arch/powerpc/include/asm/inst.h
> @@ -8,23 +8,72 @@
> 
>  struct ppc_inst {
>   u32 val;
> +#ifdef __powerpc64__
> + u32 suffix;
> +#endif /* __powerpc64__ */
>  } __packed;
> 
> -#define ppc_inst(x) ((struct ppc_inst){ .val = x })
> -
>  static inline u32 ppc_inst_val(struct ppc_inst x)
>  {
>   return x.val;
>  }
> 
> -static inline int ppc_inst_len(struct ppc_inst x)
> +static inline int ppc_inst_primary_opcode(struct ppc_inst x)
>  {
> - return sizeof(struct ppc_inst);
> + return ppc_inst_val(x) >> 26;
>  }
> 
> -static inline int ppc_inst_primary_opcode(struct ppc_inst x)
> +#ifdef __powerpc64__
> +#define ppc_inst(x) ((struct ppc_inst){ .val = (x), .suffix = 0xff })
> +
> +#define ppc_inst_prefix(x, y) ((struct ppc_inst){ .val = (x), .suffix = (y)
> }) +
> +static inline u32 ppc_inst_suffix(struct ppc_inst x)
>  {
> - return ppc_inst_val(x) >> 26;
> + return x.suffix;
> +}
> +
> +static inline bool ppc_inst_prefixed(struct ppc_inst x)
> +{
> + return (ppc_inst_primary_opcode(x) == 1) && ppc_inst_suffix(x) != 0xff;
> +}
> +
> +static inline struct ppc_inst ppc_inst_swab(struct ppc_inst x)
> +{
> + return ppc_inst_prefix(swab32(ppc_inst_val(x)),
> +swab32(ppc_inst_suffix(x)));
> +}
> +
> +static inline struct ppc_inst ppc_inst_read(const struct ppc_inst *ptr)
> +{
> + u32 val, suffix;
> +
> + val = *(u32 *)ptr;
> + if ((val >> 26) == 1) {
> + suffix = *((u32 *)ptr + 1);
> + return ppc_inst_prefix(val, suffix);
> + } else {
> + return ppc_inst(val);
> + }
> +}
> +
> +static inline bool ppc_inst_equal(struct ppc_inst x, struct ppc_inst y)
> +{
> + return *(u64 *)&x == *(u64 *)&y;
> +}
> +
> +#else
> +
> +#define ppc_inst(x) ((struct ppc_inst){ .val = x })
> +
> +static inline bool ppc_inst_prefixed(struct ppc_inst x)
> +{
> + return false;
> +}
> +
> +static inline u32 ppc_inst_suffix(struct ppc_inst x)
> +{
> + return 0;
>  }
> 
>  static inline struct ppc_inst ppc_inst_swab(struct ppc_inst x)
> @@ -42,6 +91,13 @@ static inline bool ppc_inst_equal(struct ppc_inst x,
> struct ppc_inst y) return ppc_inst_val(x) == ppc_inst_val(y);
>  }
> 
> +#endif /* __powerpc64__ */
> +
> +static inline int ppc_inst_len(struct ppc_inst x)
> +{
> + return (ppc_inst_prefixed(x)) ? 8  : 4;
> +}
> +
>  int probe_user_read_inst(struct ppc_inst *inst,
>struct ppc_inst *nip);
>  int probe_kernel_read_inst(struct ppc_inst *inst,
> diff --git a/arch/powerpc/include/asm/kprobes.h
> b/arch/powerpc/include/asm/kprobes.h index 66b3f2983b22..4fc0e15e23a5
> 100644
> --- a/arch/powerpc/include/asm/kprobes.h
> +++ b/arch/powerpc/include/asm/kprobes.h
> @@ -43,7 +43,7 @@ extern kprobe_opcode_t optprobe_template_ret[];
>  extern kprobe_opcode_t optprobe_template_end[];
> 
>  /* Fixed instruction size for powerpc */
> -#define MAX_INSN_SIZE1
> +#define MAX_INSN_SIZE2
>  #de

Re: [PATCH 1/4] dma-mapping: move the remaining DMA API calls out of line

2020-05-04 Thread Alexey Kardashevskiy




On 17/04/2020 17:58, Christoph Hellwig wrote:
> On Wed, Apr 15, 2020 at 09:21:37PM +1000, Alexey Kardashevskiy wrote:
>> And the fact they were exported leaves possibility that there is a
>> driver somewhere relying on these symbols or distro kernel won't build
>> because the symbol disappeared from exports (I do not know what KABI
>> guarantees or if mainline kernel cares).
> 
> We absolutely do not care.  In fact for abuses of APIs that drivers
> should not use we almost care to make them private and break people
> abusing them.

ok :)

>> I do not care in particular but
>> some might, a line separated with empty lines in the commit log would do.
> 
> I'll add a blurb for the next version.


Has it gone anywhere? Thanks,


-- 
Alexey

Re: [PATCH v4 0/7] clean up redundant 'kvm_run' parameters

2020-05-04 Thread Tianjia Zhang


Paolo Bonzini, any opinion on this?

Thanks and best,
Tianjia

On 2020/4/27 12:35, Tianjia Zhang wrote:

In the current kvm version, 'kvm_run' has been included in the 'kvm_vcpu'
structure. For historical reasons, many kvm-related function parameters
retain the 'kvm_run' and 'kvm_vcpu' parameters at the same time. This
patch does a unified cleanup of these remaining redundant parameters.

This series of patches has completely cleaned the architecture of
arm64, mips, ppc, and s390 (no such redundant code on x86). Due to
the large number of modified codes, a separate patch is made for each
platform. On the ppc platform, there is also a redundant structure
pointer of 'kvm_run' in 'vcpu_arch', which has also been cleaned
separately.

---
v4 change:
   mips: fixes two errors in entry.c.

v3 change:
   Keep the existing `vcpu->run` in the function body unchanged.

v2 change:
   s390 retains the original variable name and minimizes modification.

Tianjia Zhang (7):
   KVM: s390: clean up redundant 'kvm_run' parameters
   KVM: arm64: clean up redundant 'kvm_run' parameters
   KVM: PPC: Remove redundant kvm_run from vcpu_arch
   KVM: PPC: clean up redundant 'kvm_run' parameters
   KVM: PPC: clean up redundant kvm_run parameters in assembly
   KVM: MIPS: clean up redundant 'kvm_run' parameters
   KVM: MIPS: clean up redundant kvm_run parameters in assembly

  arch/arm64/include/asm/kvm_coproc.h  |  12 +--
  arch/arm64/include/asm/kvm_host.h|  11 +--
  arch/arm64/include/asm/kvm_mmu.h |   2 +-
  arch/arm64/kvm/handle_exit.c |  36 +++
  arch/arm64/kvm/sys_regs.c|  13 ++-
  arch/mips/include/asm/kvm_host.h |  32 +--
  arch/mips/kvm/emulate.c  |  59 
  arch/mips/kvm/entry.c|  21 ++---
  arch/mips/kvm/mips.c |  14 +--
  arch/mips/kvm/trap_emul.c| 114 ++-
  arch/mips/kvm/vz.c   |  26 ++
  arch/powerpc/include/asm/kvm_book3s.h|  16 ++--
  arch/powerpc/include/asm/kvm_host.h  |   1 -
  arch/powerpc/include/asm/kvm_ppc.h   |  27 +++---
  arch/powerpc/kvm/book3s.c|   4 +-
  arch/powerpc/kvm/book3s.h|   2 +-
  arch/powerpc/kvm/book3s_64_mmu_hv.c  |  12 +--
  arch/powerpc/kvm/book3s_64_mmu_radix.c   |   4 +-
  arch/powerpc/kvm/book3s_emulate.c|  10 +-
  arch/powerpc/kvm/book3s_hv.c |  64 ++---
  arch/powerpc/kvm/book3s_hv_nested.c  |  12 +--
  arch/powerpc/kvm/book3s_interrupts.S |  17 ++--
  arch/powerpc/kvm/book3s_paired_singles.c |  72 +++---
  arch/powerpc/kvm/book3s_pr.c |  33 ---
  arch/powerpc/kvm/booke.c |  39 
  arch/powerpc/kvm/booke.h |   8 +-
  arch/powerpc/kvm/booke_emulate.c |   2 +-
  arch/powerpc/kvm/booke_interrupts.S  |   9 +-
  arch/powerpc/kvm/bookehv_interrupts.S|  10 +-
  arch/powerpc/kvm/e500_emulate.c  |  15 ++-
  arch/powerpc/kvm/emulate.c   |  10 +-
  arch/powerpc/kvm/emulate_loadstore.c |  32 +++
  arch/powerpc/kvm/powerpc.c   |  72 +++---
  arch/powerpc/kvm/trace_hv.h  |   6 +-
  arch/s390/kvm/kvm-s390.c |  23 +++--
  virt/kvm/arm/arm.c   |   6 +-
  virt/kvm/arm/mmio.c  |  11 ++-
  virt/kvm/arm/mmu.c   |   5 +-
  38 files changed, 392 insertions(+), 470 deletions(-)

Re: [PATCH 3/3] mm/hugetlb: Introduce HAVE_ARCH_CLEAR_HUGEPAGE_FLAGS

2020-05-04 Thread Anshuman Khandual




On 04/26/2020 08:31 AM, Andrew Morton wrote:
> On Sun, 26 Apr 2020 08:13:17 +0530 Anshuman Khandual 
>  wrote:
> 
>>
>>
>> On 04/26/2020 06:25 AM, Andrew Morton wrote:
>>> On Tue, 14 Apr 2020 17:14:30 +0530 Anshuman Khandual 
>>>  wrote:
>>>
>>>> There are multiple similar definitions for arch_clear_hugepage_flags() on
>>>> various platforms. This introduces HAVE_ARCH_CLEAR_HUGEPAGE_FLAGS for those
>>>> platforms that need to define their own arch_clear_hugepage_flags() while
>>>> also providing a generic fallback definition for others to use. This help
>>>> reduce code duplication.
>>>>
>>>> ...
>>>>
>>>> --- a/include/linux/hugetlb.h
>>>> +++ b/include/linux/hugetlb.h
>>>> @@ -544,6 +544,10 @@ static inline int is_hugepage_only_range(struct 
>>>> mm_struct *mm,
>>>>  }
>>>>  #endif
>>>>  
>>>> +#ifndef HAVE_ARCH_CLEAR_HUGEPAGE_FLAGS
>>>> +static inline void arch_clear_hugepage_flags(struct page *page) { }
>>>> +#endif
>>>> +
>>>>  #ifndef arch_make_huge_pte
>>>>  static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct 
>>>> *vma,
>>>>   struct page *page, int writable)
>>>
>>> This is the rather old-school way of doing it.  The Linus-suggested way is
>>>
>>> #ifndef arch_clear_hugepage_flags
>>> static inline void arch_clear_hugepage_flags(struct page *page)
>>> {
>>> }
>>> #define arch_clear_hugepage_flags arch_clear_hugepage_flags
>>
>> Do we need that above line here ? Is not that implicit.
> 
> It depends if other header files want to test whether
> arch_clear_hugepage_flags is already defined.  If the header heorarchy
> is well-defined and working properly, they shouldn't need to, because
> we're reliably indluding the relevant arch header before (or early
> within) include/linux/hugetlb.h.
> 
> It would be nice if
> 
> #define arch_clear_hugepage_flags arch_clear_hugepage_flags
> #define arch_clear_hugepage_flags arch_clear_hugepage_flags
> 
> were to generate an compiler error but it doesn't.  If it did we could
> detect these incorrect inclusion orders.
> 
>>> #endif
>>>
>>> And the various arch headers do
>>>
>>> static inline void arch_clear_hugepage_flags(struct page *page)
>>> {
>>> 
>>> }
>>> #define arch_clear_hugepage_flags arch_clear_hugepage_flags
>>>
>>> It's a small difference - mainly to avoid adding two variables to the
>>> overall namespace where one would do.
>>
>> Understood, will change and resend.
> 
> That's OK - I've queued up that fix.
>

Hello Andrew,

I might not have searched all the relevant trees or might have just searched
earlier than required. But I dont see these patches (or your proposed fixes)
either in mmotm (2020-04-29-23-04) or in next-20200504. Wondering if you are
waiting on a V2 for this series accommodating the changes you had proposed.

- Anshuman

Re: [PATCH v7 22/28] powerpc: Define new SRR1 bits for a future ISA version

Reviewed-by: Alistair Popple 

On Friday, 1 May 2020 1:42:14 PM AEST Jordan Niethe wrote:
> Add the BOUNDARY SRR1 bit definition for when the cause of an alignment
> exception is a prefixed instruction that crosses a 64-byte boundary.
> Add the PREFIXED SRR1 bit definition for exceptions caused by prefixed
> instructions.
> 
> Bit 35 of SRR1 is called SRR1_ISI_N_OR_G. This name comes from it being
> used to indicate that an ISI was due to the access being no-exec or
> guarded. A future ISA version adds another purpose. It is also set if
> there is an access in a cache-inhibited location for prefixed
> instruction.  Rename from SRR1_ISI_N_OR_G to SRR1_ISI_N_G_OR_CIP.
> 
> Signed-off-by: Jordan Niethe 
> ---
> v2: Combined all the commits concerning SRR1 bits.
> ---
>  arch/powerpc/include/asm/reg.h  | 4 +++-
>  arch/powerpc/kvm/book3s_hv_nested.c | 2 +-
>  arch/powerpc/kvm/book3s_hv_rm_mmu.c | 2 +-
>  3 files changed, 5 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
> index 773f76402392..f95eb8f97756 100644
> --- a/arch/powerpc/include/asm/reg.h
> +++ b/arch/powerpc/include/asm/reg.h
> @@ -762,7 +762,7 @@
>  #endif
> 
>  #define   SRR1_ISI_NOPT  0x4000 /* ISI: Not found in hash */
> -#define   SRR1_ISI_N_OR_G0x1000 /* ISI: Access is no-exec or G */
> +#define   SRR1_ISI_N_G_OR_CIP0x1000 /* ISI: Access is no-exec or 
> G or
> CI for a prefixed instruction */ #define   SRR1_ISI_PROT  
> 0x0800 /*
> ISI: Other protection fault */ #define   SRR1_WAKEMASK
> 0x0038 /*
> reason for wakeup */
>  #define   SRR1_WAKEMASK_P8   0x003c /* reason for wakeup on POWER8 and 9
> */ @@ -789,6 +789,8 @@
>  #define   SRR1_PROGADDR  0x0001 /* SRR0 contains subsequent 
> addr */
> 
>  #define   SRR1_MCE_MCP   0x0008 /* Machine check signal 
> caused 
interrupt
> */ +#define   SRR1_BOUNDARY   0x1000 /* Prefixed instruction 
> crosses
> 64-byte boundary */ +#define   SRR1_PREFIXED  0x2000 /* Exception
> caused by prefixed instruction */
> 
>  #define SPRN_HSRR0   0x13A   /* Save/Restore Register 0 */
>  #define SPRN_HSRR1   0x13B   /* Save/Restore Register 1 */
> diff --git a/arch/powerpc/kvm/book3s_hv_nested.c
> b/arch/powerpc/kvm/book3s_hv_nested.c index dc97e5be76f6..6ab685227574
> 100644
> --- a/arch/powerpc/kvm/book3s_hv_nested.c
> +++ b/arch/powerpc/kvm/book3s_hv_nested.c
> @@ -1169,7 +1169,7 @@ static int kvmhv_translate_addr_nested(struct kvm_vcpu
> *vcpu, } else if (vcpu->arch.trap == BOOK3S_INTERRUPT_H_INST_STORAGE) { /*
> Can we execute? */
>   if (!gpte_p->may_execute) {
> - flags |= SRR1_ISI_N_OR_G;
> + flags |= SRR1_ISI_N_G_OR_CIP;
>   goto forward_to_l1;
>   }
>   } else {
> diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
> b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 220305454c23..b53a9f1c1a46
> 100644
> --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
> +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
> @@ -1260,7 +1260,7 @@ long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu,
> unsigned long addr, status &= ~DSISR_NOHPTE;  /* DSISR_NOHPTE ==
> SRR1_ISI_NOPT */
>   if (!data) {
>   if (gr & (HPTE_R_N | HPTE_R_G))
> - return status | SRR1_ISI_N_OR_G;
> + return status | SRR1_ISI_N_G_OR_CIP;
>   if (!hpte_read_permission(pp, slb_v & key))
>   return status | SRR1_ISI_PROT;
>   } else if (status & DSISR_ISSTORE) {

Re: [PATCH v7 20/28] powerpc: Make test_translate_branch() independent of instruction length

I guess this could change if there were prefixed branch instructions, but there 
aren't so:

Reviewed-by: Alistair Popple 

On Friday, 1 May 2020 1:42:12 PM AEST Jordan Niethe wrote:
> test_translate_branch() uses two pointers to instructions within a
> buffer, p and q, to test patch_branch(). The pointer arithmetic done on
> them assumes a size of 4. This will not work if the instruction length
> changes. Instead do the arithmetic relative to the void * to the buffer.
> 
> Signed-off-by: Jordan Niethe 
> ---
> v4: New to series
> ---
>  arch/powerpc/lib/code-patching.c | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/powerpc/lib/code-patching.c
> b/arch/powerpc/lib/code-patching.c index 110f710500c8..5b2f66d06b1e 100644
> --- a/arch/powerpc/lib/code-patching.c
> +++ b/arch/powerpc/lib/code-patching.c
> @@ -569,7 +569,7 @@ static void __init test_branch_bform(void)
>  static void __init test_translate_branch(void)
>  {
>   unsigned long addr;
> - struct ppc_inst *p, *q;
> + void *p, *q;
>   struct ppc_inst instr;
>   void *buf;
> 
> @@ -583,7 +583,7 @@ static void __init test_translate_branch(void)
>   addr = (unsigned long)p;
>   patch_branch(p, addr, 0);
>   check(instr_is_branch_to_addr(p, addr));
> - q = p + 1;
> + q = p + 4;
>   translate_branch(&instr, q, p);
>   patch_instruction(q, instr);
>   check(instr_is_branch_to_addr(q, addr));
> @@ -639,7 +639,7 @@ static void __init test_translate_branch(void)
>   create_cond_branch(&instr, p, addr, 0);
>   patch_instruction(p, instr);
>   check(instr_is_branch_to_addr(p, addr));
> - q = p + 1;
> + q = buf + 4;
>   translate_branch(&instr, q, p);
>   patch_instruction(q, instr);
>   check(instr_is_branch_to_addr(q, addr));

Re: [PATCH v7 19/28] powerpc/xmon: Move insertion of breakpoint for xol'ing

I can't see any side-effects from patching both instructions at the same time.

Reviewed-by: Alistair Popple 

On Friday, 1 May 2020 1:42:11 PM AEST Jordan Niethe wrote:
> When a new breakpoint is created, the second instruction of that
> breakpoint is patched with a trap instruction. This assumes the length
> of the instruction is always the same. In preparation for prefixed
> instructions, remove this assumption. Insert the trap instruction at the
> same time the first instruction is inserted.
> 
> Signed-off-by: Jordan Niethe 
> ---
>  arch/powerpc/xmon/xmon.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
> index 1947821e425d..fb2563079046 100644
> --- a/arch/powerpc/xmon/xmon.c
> +++ b/arch/powerpc/xmon/xmon.c
> @@ -878,7 +878,6 @@ static struct bpt *new_breakpoint(unsigned long a)
>   if (!bp->enabled && atomic_read(&bp->ref_count) == 0) {
>   bp->address = a;
>   bp->instr = (void *)(bpt_table + ((bp - bpts) * 
> BPT_WORDS));
> - patch_instruction(bp->instr + 1, ppc_inst(bpinstr));
>   return bp;
>   }
>   }
> @@ -910,6 +909,7 @@ static void insert_bpts(void)
>   continue;
>   }
>   patch_instruction(bp->instr, instr);
> + patch_instruction((void *)bp->instr + ppc_inst_len(instr),
> ppc_inst(bpinstr)); if (bp->enabled & BP_CIABR)
>   continue;
>   if (patch_instruction((struct ppc_inst *)bp->address,

Re: [PATCH v7 18/28] powerpc/xmon: Use a function for reading instructions

Shouldn't change anything and will be correct once prefix instructions are 
defined.

Reviewed-by: Alistair Popple 

On Friday, 1 May 2020 1:42:10 PM AEST Jordan Niethe wrote:
> Currently in xmon, mread() is used for reading instructions. In
> preparation for prefixed instructions, create and use a new function,
> mread_instr(), especially for reading instructions.
> 
> Signed-off-by: Jordan Niethe 
> ---
> v5: New to series, seperated from "Add prefixed instructions to
> instruction data type"
> v6: mread_instr(): correctly return error status
> ---
>  arch/powerpc/xmon/xmon.c | 28 
>  1 file changed, 24 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
> index cde733a82366..1947821e425d 100644
> --- a/arch/powerpc/xmon/xmon.c
> +++ b/arch/powerpc/xmon/xmon.c
> @@ -122,6 +122,7 @@ static unsigned bpinstr = 0x7fe8; /* trap */
>  static int cmds(struct pt_regs *);
>  static int mread(unsigned long, void *, int);
>  static int mwrite(unsigned long, void *, int);
> +static int mread_instr(unsigned long, struct ppc_inst *);
>  static int handle_fault(struct pt_regs *);
>  static void byterev(unsigned char *, int);
>  static void memex(void);
> @@ -896,7 +897,7 @@ static void insert_bpts(void)
>   for (i = 0; i < NBPTS; ++i, ++bp) {
>   if ((bp->enabled & (BP_TRAP|BP_CIABR)) == 0)
>   continue;
> - if (mread(bp->address, &instr, 4) != 4) {
> + if (!mread_instr(bp->address, &instr)) {
>   printf("Couldn't read instruction at %lx, "
>  "disabling breakpoint there\n", bp->address);
>   bp->enabled = 0;
> @@ -946,7 +947,7 @@ static void remove_bpts(void)
>   for (i = 0; i < NBPTS; ++i, ++bp) {
>   if ((bp->enabled & (BP_TRAP|BP_CIABR)) != BP_TRAP)
>   continue;
> - if (mread(bp->address, &instr, 4) == 4
> + if (mread_instr(bp->address, &instr)
>   && ppc_inst_equal(instr, ppc_inst(bpinstr))
>   && patch_instruction(
>   (struct ppc_inst *)bp->address, 
> ppc_inst_read(bp->instr)) != 0)
> @@ -1162,7 +1163,7 @@ static int do_step(struct pt_regs *regs)
>   force_enable_xmon();
>   /* check we are in 64-bit kernel mode, translation enabled */
>   if ((regs->msr & (MSR_64BIT|MSR_PR|MSR_IR)) == (MSR_64BIT|MSR_IR)) {
> - if (mread(regs->nip, &instr, 4) == 4) {
> + if (mread_instr(regs->nip, &instr)) {
>   stepped = emulate_step(regs, instr);
>   if (stepped < 0) {
>   printf("Couldn't single-step %s instruction\n",
> @@ -1329,7 +1330,7 @@ static long check_bp_loc(unsigned long addr)
>   printf("Breakpoints may only be placed at kernel addresses\n");
>   return 0;
>   }
> - if (!mread(addr, &instr, sizeof(instr))) {
> + if (!mread_instr(addr, &instr)) {
>   printf("Can't read instruction at address %lx\n", addr);
>   return 0;
>   }
> @@ -2122,6 +2123,25 @@ mwrite(unsigned long adrs, void *buf, int size)
>   return n;
>  }
> 
> +static int
> +mread_instr(unsigned long adrs, struct ppc_inst *instr)
> +{
> + volatile int n;
> +
> + n = 0;
> + if (setjmp(bus_error_jmp) == 0) {
> + catch_memory_errors = 1;
> + sync();
> + *instr = ppc_inst_read((struct ppc_inst *)adrs);
> + sync();
> + /* wait a little while to see if we get a machine check */
> + __delay(200);
> + n = ppc_inst_len(*instr);
> + }
> + catch_memory_errors = 0;
> + return n;
> +}
> +
>  static int fault_type;
>  static int fault_except;
>  static char *fault_chars[] = { "--", "**", "##" };

Re: [PATCH v7 17/28] powerpc: Introduce a function for reporting instruction length

Looks good,

Reviewed-by: Alistair Popple 

On Friday, 1 May 2020 1:42:09 PM AEST Jordan Niethe wrote:
> Currently all instructions have the same length, but in preparation for
> prefixed instructions introduce a function for returning instruction
> length.
> 
> Signed-off-by: Jordan Niethe 
> ---
> v6: - feature-fixups.c: do_final_fixups(): use here
> - ppc_inst_len(): change return type from bool to int
> - uprobes: Use ppc_inst_read() before calling ppc_inst_len()
> ---
>  arch/powerpc/include/asm/inst.h   |  5 +
>  arch/powerpc/kernel/kprobes.c |  6 --
>  arch/powerpc/kernel/uprobes.c |  2 +-
>  arch/powerpc/lib/feature-fixups.c | 14 +++---
>  4 files changed, 17 insertions(+), 10 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/inst.h
> b/arch/powerpc/include/asm/inst.h index 0d581b332c20..2f3c9d5bcf7c 100644
> --- a/arch/powerpc/include/asm/inst.h
> +++ b/arch/powerpc/include/asm/inst.h
> @@ -17,6 +17,11 @@ static inline u32 ppc_inst_val(struct ppc_inst x)
>   return x.val;
>  }
> 
> +static inline int ppc_inst_len(struct ppc_inst x)
> +{
> + return sizeof(struct ppc_inst);
> +}
> +
>  static inline int ppc_inst_primary_opcode(struct ppc_inst x)
>  {
>   return ppc_inst_val(x) >> 26;
> diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
> index a72c8e1a42ad..33d54b091c70 100644
> --- a/arch/powerpc/kernel/kprobes.c
> +++ b/arch/powerpc/kernel/kprobes.c
> @@ -462,14 +462,16 @@ NOKPROBE_SYMBOL(trampoline_probe_handler);
>   */
>  int kprobe_post_handler(struct pt_regs *regs)
>  {
> + int len;
>   struct kprobe *cur = kprobe_running();
>   struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
> 
>   if (!cur || user_mode(regs))
>   return 0;
> 
> + len = ppc_inst_len(ppc_inst_read((struct ppc_inst *)cur->ainsn.insn));
>   /* make sure we got here for instruction we have a kprobe on */
> - if (((unsigned long)cur->ainsn.insn + 4) != regs->nip)
> + if (((unsigned long)cur->ainsn.insn + len) != regs->nip)
>   return 0;
> 
>   if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
> @@ -478,7 +480,7 @@ int kprobe_post_handler(struct pt_regs *regs)
>   }
> 
>   /* Adjust nip to after the single-stepped instruction */
> - regs->nip = (unsigned long)cur->addr + 4;
> + regs->nip = (unsigned long)cur->addr + len;
>   regs->msr |= kcb->kprobe_saved_msr;
> 
>   /*Restore back the original saved kprobes variables and continue. */
> diff --git a/arch/powerpc/kernel/uprobes.c b/arch/powerpc/kernel/uprobes.c
> index 6893d40a48c5..83e883e1a42d 100644
> --- a/arch/powerpc/kernel/uprobes.c
> +++ b/arch/powerpc/kernel/uprobes.c
> @@ -112,7 +112,7 @@ int arch_uprobe_post_xol(struct arch_uprobe *auprobe,
> struct pt_regs *regs) * support doesn't exist and have to fix-up the next
> instruction * to be executed.
>*/
> - regs->nip = utask->vaddr + MAX_UINSN_BYTES;
> + regs->nip = utask->vaddr + ppc_inst_len(ppc_inst_read(&auprobe->insn));
> 
>   user_disable_single_step(current);
>   return 0;
> diff --git a/arch/powerpc/lib/feature-fixups.c
> b/arch/powerpc/lib/feature-fixups.c index 13ec3264a565..f4845e740338 100644
> --- a/arch/powerpc/lib/feature-fixups.c
> +++ b/arch/powerpc/lib/feature-fixups.c
> @@ -390,20 +390,20 @@ void do_lwsync_fixups(unsigned long value, void
> *fixup_start, void *fixup_end) static void do_final_fixups(void)
>  {
>  #if defined(CONFIG_PPC64) && defined(CONFIG_RELOCATABLE)
> - struct ppc_inst *src, *dest;
> - unsigned long length;
> + struct ppc_inst inst, *src, *dest, *end;
> 
>   if (PHYSICAL_START == 0)
>   return;
> 
>   src = (struct ppc_inst *)(KERNELBASE + PHYSICAL_START);
>   dest = (struct ppc_inst *)KERNELBASE;
> - length = (__end_interrupts - _stext) / sizeof(struct ppc_inst);
> + end = (void *)src + (__end_interrupts - _stext);
> 
> - while (length--) {
> - raw_patch_instruction(dest, ppc_inst_read(src));
> - src++;
> - dest++;
> + while (src < end) {
> + inst = ppc_inst_read(src);
> + raw_patch_instruction(dest, inst);
> + src = (void *)src + ppc_inst_len(inst);
> + dest = (void *)dest + ppc_inst_len(inst);
>   }
>  #endif
>  }

Re: [PATCH v7 16/28] powerpc: Define and use __get_user_instr{, inatomic}()

Doesn't change any behaviour from what I can see.

Reviewed-by: Alistair Popple 

On Friday, 1 May 2020 1:42:08 PM AEST Jordan Niethe wrote:
> Define specific __get_user_instr() and __get_user_instr_inatomic()
> macros for reading instructions from user space.
> 
> Signed-off-by: Jordan Niethe 
> ---
>  arch/powerpc/include/asm/uaccess.h  | 5 +
>  arch/powerpc/kernel/align.c | 2 +-
>  arch/powerpc/kernel/hw_breakpoint.c | 2 +-
>  arch/powerpc/kernel/vecemu.c| 2 +-
>  4 files changed, 8 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/uaccess.h
> b/arch/powerpc/include/asm/uaccess.h index 2f500debae21..c0a35e4586a5
> 100644
> --- a/arch/powerpc/include/asm/uaccess.h
> +++ b/arch/powerpc/include/asm/uaccess.h
> @@ -105,6 +105,11 @@ static inline int __access_ok(unsigned long addr,
> unsigned long size, #define __put_user_inatomic(x, ptr) \
>   __put_user_nosleep((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr)))
> 
> +#define __get_user_instr(x, ptr) \
> + __get_user_nocheck((x).val, (u32 *)(ptr), sizeof(u32), true)
> +
> +#define __get_user_instr_inatomic(x, ptr) \
> + __get_user_nosleep((x).val, (u32 *)(ptr), sizeof(u32))
>  extern long __put_user_bad(void);
> 
>  /*
> diff --git a/arch/powerpc/kernel/align.c b/arch/powerpc/kernel/align.c
> index 9e66e6c62354..b8f56052c6fe 100644
> --- a/arch/powerpc/kernel/align.c
> +++ b/arch/powerpc/kernel/align.c
> @@ -304,7 +304,7 @@ int fix_alignment(struct pt_regs *regs)
>*/
>   CHECK_FULL_REGS(regs);
> 
> - if (unlikely(__get_user(instr.val, (unsigned int __user *)regs->nip)))
> + if (unlikely(__get_user_instr(instr, (void __user *)regs->nip)))
>   return -EFAULT;
>   if ((regs->msr & MSR_LE) != (MSR_KERNEL & MSR_LE)) {
>   /* We don't handle PPC little-endian any more... */
> diff --git a/arch/powerpc/kernel/hw_breakpoint.c
> b/arch/powerpc/kernel/hw_breakpoint.c index 2db9a7ac7bcb..423603c92c0f
> 100644
> --- a/arch/powerpc/kernel/hw_breakpoint.c
> +++ b/arch/powerpc/kernel/hw_breakpoint.c
> @@ -249,7 +249,7 @@ static bool stepping_handler(struct pt_regs *regs,
> struct perf_event *bp, struct instruction_op op;
>   unsigned long addr = info->address;
> 
> - if (__get_user_inatomic(instr.val, (unsigned int *)regs->nip))
> + if (__get_user_instr_inatomic(instr, (void __user *)regs->nip))
>   goto fail;
> 
>   ret = analyse_instr(&op, regs, instr);
> diff --git a/arch/powerpc/kernel/vecemu.c b/arch/powerpc/kernel/vecemu.c
> index bb262707fb5c..adcdba6d534e 100644
> --- a/arch/powerpc/kernel/vecemu.c
> +++ b/arch/powerpc/kernel/vecemu.c
> @@ -266,7 +266,7 @@ int emulate_altivec(struct pt_regs *regs)
>   unsigned int va, vb, vc, vd;
>   vector128 *vrs;
> 
> - if (get_user(instr.val, (unsigned int __user *) regs->nip))
> + if (__get_user_instr(instr, (void __user *) regs->nip))
>   return -EFAULT;
> 
>   word = ppc_inst_val(instr);

Re: [PATCH v7 15/28] powerpc/kprobes: Use patch_instruction()

Without CONFIG_STRICT_KERNEL_RWX this boils down to doing the same thing 
(although with a few more safety checks along the way), and with 
CONFIG_STRICT_KERNEL_RWX this should make it actually work (although perhaps 
there was some other mechanism that made it work anyway).

Reviewed-by: Alistair Popple 

On Friday, 1 May 2020 1:42:07 PM AEST Jordan Niethe wrote:
> Instead of using memcpy() and flush_icache_range() use
> patch_instruction() which not only accomplishes both of these steps but
> will also make it easier to add support for prefixed instructions.
> 
> Signed-off-by: Jordan Niethe 
> ---
> v6: New to series.
> ---
>  arch/powerpc/kernel/kprobes.c | 5 +
>  1 file changed, 1 insertion(+), 4 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
> index f64312dca84f..a72c8e1a42ad 100644
> --- a/arch/powerpc/kernel/kprobes.c
> +++ b/arch/powerpc/kernel/kprobes.c
> @@ -125,11 +125,8 @@ int arch_prepare_kprobe(struct kprobe *p)
>   }
> 
>   if (!ret) {
> - memcpy(p->ainsn.insn, p->addr,
> - MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
> + patch_instruction((struct ppc_inst *)p->ainsn.insn, insn);
>   p->opcode = ppc_inst_val(insn);
> - flush_icache_range((unsigned long)p->ainsn.insn,
> - (unsigned long)p->ainsn.insn + sizeof(kprobe_opcode_t));
>   }
> 
>   p->ainsn.boostable = 0;

Re: [PATCH V2 00/11] Subject: Remove duplicated kmap code

2020-05-04 Thread Ira Weiny

On Mon, May 04, 2020 at 10:02:25PM +0100, Al Viro wrote:
> On Mon, May 04, 2020 at 01:17:41PM -0700, Ira Weiny wrote:
> 
> > > || * arm: much, much worse.  We have several files that pull 
> > > linux/highmem.h:
> > > || arch/arm/mm/cache-feroceon-l2.c, arch/arm/mm/cache-xsc3l2.c,
> > > || arch/arm/mm/copypage-*.c, arch/arm/mm/dma-mapping.c, 
> > > arch/arm/mm/flush.c,
> > > || arch/arm/mm/highmem.c, arch/arm/probes/uprobes/core.c,
> > > || arch/arm/include/asm/kvm_mmu.h (kmap_atomic_pfn()).
> > > || Those are fine, but we also have this:
> > > || arch/arm/include/asm/pgtable.h:200:#define __pte_map(pmd)  
> > >  (pte_t *)kmap_atomic(pmd_page(*(pmd)))
> > > || arch/arm/include/asm/pgtable.h:208:#define pte_offset_map(pmd,addr)
> > >  (__pte_map(pmd) + pte_index(addr))
> > > || and sure as hell, asm/pgtable.h does *NOT* pull linux/highmem.h.
> > 
> > It does not pull asm/highmem.h either...
> 
> No, but the users of those macros need to be considered.

Agreed, I was trying to point out that highmem.h was being pulled from
somewhere else prior to my series, sorry.

> 
> > > || #define pte_offset_map(dir, addr)   \
> > > || ((pte_t *) kmap_atomic(pmd_page(*(dir))) + pte_index(addr))
> > > || One pte_offset_map user in arch/microblaze:
> > > || arch/microblaze/kernel/signal.c:207:ptep = pte_offset_map(pmdp, 
> > > address);
> > > || Messy, but doesn't require any changes (we have asm/pgalloc.h included
> > > || there, and that pull linux/highmem.h).
> > 
> > AFAICS asm/pgtable.h does not include asm/highmem.h here...
> > 
> > So looks like arch/microblaze/kernel/signal.c will need linux/highmem.h
> 
> See above - line 39 in there is
> #include 
> and line 14 in arch/microblaze/include/asm/pgalloc.h is
> #include 
> It's conditional upon CONFIG_MMU in there, but so's the use of
> pte_offset_map() in arch/microblaze/kernel/signal.c 
> 
> So it shouldn't be a problem.

Ah ok, I did not see that one.  Ok I'll drop that change and this series should
be good.

I was setting up to submit another version with 3 more patches you have
suggested:

kmap: Remove kmap_atomic_to_page()
parisc/kmap: Remove duplicate kmap code
sparc: Remove unnecessary includes

Would you like to see those folded in?  I submitted 2 of the above as a
separate series already.

> 
> > > || * xtensa: users in arch/xtensa/kernel/pci-dma.c, 
> > > arch/xtensa/mm/highmem.c,
> > > || arch/xtensa/mm/cache.c and arch/xtensa/platforms/iss/simdisk.c (all 
> > > pull
> > > || linux/highmem.h).
> > 
> > Actually
> > 
> > arch/xtensa/mm/cache.c gets linux/highmem.h from linux/pagemap.h
> > 
> > arch/xtensa/platforms/iss/simdisk.c may have an issue?
> > linux/blkdev.h -> CONFIG_BLOCK -> linux/pagemap.h -> linux/highmem.h
> > But simdisk.c requires BLK_DEV_SIMDISK -> CONFIG_BLOCK...
> > 
> 
> Yep - see above re major chain of indirect includes conditional upon 
> CONFIG_BLOCK
> and its uses in places that only build with such configs.  There's a plenty of
> similar considerations outside of arch/*, unfortunately...

Indeed.

FWIW the last 2 versions of this series have had no build failures with 0-day.

This series in particular just finished 164 configs without issue.

Would you like me to submit a new series?  With your additional patches?

Ira

Re: [PATCH v2 2/5] stats_fs API: create, add and remove stats_fs sources and values

2020-05-04 Thread Randy Dunlap

On 5/4/20 4:03 AM, Emanuele Giuseppe Esposito wrote:
> diff --git a/fs/Kconfig b/fs/Kconfig
> index f08fbbfafd9a..1b0de0f19e96 100644
> --- a/fs/Kconfig
> +++ b/fs/Kconfig
> @@ -328,4 +328,10 @@ source "fs/unicode/Kconfig"
>  config IO_WQ
>   bool
>  
> +config STATS_FS
> + bool "Statistics Filesystem"
> + help
> +   stats_fs is a virtual file system that provides counters and
> +   other statistics about the running kernel.
> +
>  endmenu

Hi,

This kconfig entry should be under (inside) "Pseudo filesystems",
i.e., between 'menu "Pseudo filesystems"' and its corresponding
"endmenu".

Thanks.
-- 
~Randy

[powerpc:next] BUILD SUCCESS 140777a3d8dfdb3d3f20ea7707c0f1c0ce1b0aa5

2020-05-04 Thread kbuild test robot

-a001-20200502
mips randconfig-a001-20200502
nds32randconfig-a001-20200502
alpharandconfig-a001-20200502
parisc   randconfig-a001-20200502
riscvrandconfig-a001-20200502
h8300randconfig-a001-20200503
nios2randconfig-a001-20200503
microblaze   randconfig-a001-20200503
c6x  randconfig-a001-20200503
sparc64  randconfig-a001-20200503
s390 randconfig-a001-20200504
xtensa   randconfig-a001-20200504
sh   randconfig-a001-20200504
openrisc randconfig-a001-20200504
csky randconfig-a001-20200504
s390 randconfig-a001-20200505
xtensa   randconfig-a001-20200505
sh   randconfig-a001-20200505
openrisc randconfig-a001-20200505
csky randconfig-a001-20200505
xtensa   randconfig-a001-20200503
sh   randconfig-a001-20200503
openrisc randconfig-a001-20200503
csky randconfig-a001-20200503
s390 randconfig-a001-20200430
xtensa   randconfig-a001-20200430
csky randconfig-a001-20200430
openrisc randconfig-a001-20200430
sh   randconfig-a001-20200430
i386 randconfig-b003-20200503
x86_64   randconfig-b002-20200503
i386 randconfig-b001-20200503
x86_64   randconfig-b003-20200503
x86_64   randconfig-b001-20200503
i386 randconfig-b002-20200503
x86_64   randconfig-c002-20200502
i386 randconfig-c002-20200502
i386 randconfig-c001-20200502
i386 randconfig-c003-20200502
i386 randconfig-d003-20200502
i386 randconfig-d001-20200502
x86_64   randconfig-d002-20200502
i386 randconfig-d002-20200502
x86_64   randconfig-e003-20200502
i386 randconfig-e003-20200502
x86_64   randconfig-e001-20200502
i386 randconfig-e002-20200502
i386 randconfig-e001-20200502
x86_64   randconfig-e003-20200503
x86_64   randconfig-e002-20200503
i386 randconfig-e003-20200503
x86_64   randconfig-e001-20200503
i386 randconfig-e002-20200503
i386 randconfig-e001-20200503
i386 randconfig-f003-20200505
x86_64   randconfig-f001-20200505
x86_64   randconfig-f003-20200505
i386 randconfig-f001-20200505
i386 randconfig-f002-20200505
x86_64   randconfig-a003-20200505
x86_64   randconfig-a001-20200505
i386 randconfig-a001-20200505
i386 randconfig-a003-20200505
i386 randconfig-a002-20200505
ia64 randconfig-a001-20200503
arm64randconfig-a001-20200503
arc  randconfig-a001-20200503
arm  randconfig-a001-20200503
sparcrandconfig-a001-20200503
ia64 randconfig-a001-20200505
arc  randconfig-a001-20200505
powerpc  randconfig-a001-20200505
arm  randconfig-a001-20200505
sparcrandconfig-a001-20200505
riscvallyesconfig
riscvnommu_virt_defconfig
riscv   defconfig
riscv  rv32_defconfig
riscvallmodconfig
s390   zfcpdump_defconfig
s390 allyesconfig
s390  allnoconfig
s390 alldefconfig
s390defconfig
sh  rsk7269_defconfig
sh   allmodconfig
shtitan_defconfig
sh  sh7785lcr_32bit_defconfig
shallnoconfig
sparc   defconfig
sparc64 defconfig
sparc64   allnoconfig
sparc64  allyesconfig
sparc64  allmodconfig
um   x86_64_defconfig
um i386_defconfig
um  defconfig
x86_64   rhel
x86_64   rhel-7.6
x86_64rhel-7.6-kselftests
x86_64 rhel-7.2-clear
x86_64lkp
x86_64  fedora-25
x86_64  kexec

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org

[RFC][PATCH 2/2] Add support for ima buffer pass using reserved memory arm64

2020-05-04 Thread Prakhar Srivastava

 Add support for ima buffer pass using reserved memory for
 arm64 kexec. Update the arch sepcific code path in kexec file load to store
 the ima buffer in the reserved memory. The same reserved memory is read on
 kexec or cold boot.

Signed-off-by: Prakhar Srivastava 
---
 arch/arm64/Kconfig |  1 +
 arch/arm64/include/asm/ima.h   | 22 +
 arch/arm64/include/asm/kexec.h |  5 ++
 arch/arm64/kernel/Makefile |  1 +
 arch/arm64/kernel/ima_kexec.c  | 64 ++
 arch/arm64/kernel/machine_kexec_file.c |  1 +
 arch/powerpc/include/asm/ima.h |  3 +-
 arch/powerpc/kexec/ima.c   | 14 +-
 security/integrity/ima/ima_kexec.c | 15 --
 9 files changed, 119 insertions(+), 7 deletions(-)
 create mode 100644 arch/arm64/include/asm/ima.h
 create mode 100644 arch/arm64/kernel/ima_kexec.c

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 40fb05d96c60..bc9e1a91686b 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1069,6 +1069,7 @@ config KEXEC
 config KEXEC_FILE
bool "kexec file based system call"
select KEXEC_CORE
+   select HAVE_IMA_KEXEC
help
  This is new version of kexec system call. This system call is
  file based and takes file descriptors as system call argument
diff --git a/arch/arm64/include/asm/ima.h b/arch/arm64/include/asm/ima.h
new file mode 100644
index ..58033b427e59
--- /dev/null
+++ b/arch/arm64/include/asm/ima.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_ARM64_IMA_H
+#define _ASM_ARM64_IMA_H
+
+struct kimage;
+
+int is_ima_memory_reserved(void);
+int ima_get_kexec_buffer(void **addr, size_t *size);
+int ima_free_kexec_buffer(void);
+
+#ifdef CONFIG_IMA_KEXEC
+int arch_ima_add_kexec_buffer(struct kimage *image, unsigned long load_addr,
+ void *buffer, size_t size);
+
+#else
+int arch_ima_add_kexec_buffer(struct kimage *image, unsigned long load_addr,
+ void *buffer, size_t size)
+{
+   return 0;
+}
+#endif /* CONFIG_IMA_KEXEC */
+#endif /* _ASM_ARM64_IMA_H */
diff --git a/arch/arm64/include/asm/kexec.h b/arch/arm64/include/asm/kexec.h
index d24b527e8c00..2bd19ccb6c43 100644
--- a/arch/arm64/include/asm/kexec.h
+++ b/arch/arm64/include/asm/kexec.h
@@ -100,6 +100,11 @@ struct kimage_arch {
void *elf_headers;
unsigned long elf_headers_mem;
unsigned long elf_headers_sz;
+
+#ifdef CONFIG_IMA_KEXEC
+   phys_addr_t ima_buffer_addr;
+   size_t ima_buffer_size;
+#endif
 };
 
 extern const struct kexec_file_ops kexec_image_ops;
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
index 4e5b8ee31442..cd3cb7690d51 100644
--- a/arch/arm64/kernel/Makefile
+++ b/arch/arm64/kernel/Makefile
@@ -55,6 +55,7 @@ obj-$(CONFIG_RANDOMIZE_BASE)  += kaslr.o
 obj-$(CONFIG_HIBERNATION)  += hibernate.o hibernate-asm.o
 obj-$(CONFIG_KEXEC_CORE)   += machine_kexec.o relocate_kernel.o
\
   cpu-reset.o
+obj-$(CONFIG_HAVE_IMA_KEXEC)   += ima_kexec.o
 obj-$(CONFIG_KEXEC_FILE)   += machine_kexec_file.o kexec_image.o
 obj-$(CONFIG_ARM64_RELOC_TEST) += arm64-reloc-test.o
 arm64-reloc-test-y := reloc_test_core.o reloc_test_syms.o
diff --git a/arch/arm64/kernel/ima_kexec.c b/arch/arm64/kernel/ima_kexec.c
new file mode 100644
index ..ff5649333c7c
--- /dev/null
+++ b/arch/arm64/kernel/ima_kexec.c
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 Microsoft Corporation.
+ *
+ * Authors:
+ * Prakhar Srivastava 
+ */
+
+#include 
+#include 
+
+
+/**
+ * is_ima_memory_reserved - check if memory is reserved via device
+ * tree.
+ * Return: negative or zero when memory is not reserved.
+ * positive number on success.
+ *
+ */
+int is_ima_memory_reserved(void)
+{
+   return of_is_ima_memory_reserved();
+}
+
+/**
+ * ima_get_kexec_buffer - get IMA buffer from the previous kernel
+ * @addr:  On successful return, set to point to the buffer contents.
+ * @size:  On successful return, set to the buffer size.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
+int ima_get_kexec_buffer(void **addr, size_t *size)
+{
+   return of_get_ima_buffer(addr, size);
+}
+
+/**
+ * ima_free_kexec_buffer - free memory used by the IMA buffer
+ *
+ * Return: 0 on success, negative errno on error.
+ */
+int ima_free_kexec_buffer(void)
+{
+   return of_remove_ima_buffer();
+}
+
+#ifdef CONFIG_IMA_KEXEC
+/**
+ * arch_ima_add_kexec_buffer - do arch-specific steps to add the IMA
+ * measurement log.
+ * @image: - pointer to the kimage, to store the address and size of the
+ * IMA measurement log.
+ * @load_addr: - the address where the IMA measurement log is stored.
+ * @size - size of the IMA measurement log.
+ *
+ * Return: 0 on success, negative er

[RFC][PATCH 0/2] Add support for using reserved memory for ima buffer pass

2020-05-04 Thread Prakhar Srivastava

IMA during kexec(kexec file load) verifies the kernel signature and measures
the signature of the kernel. The signature in the logs can be used to verfiy 
the 
authenticity of the kernel. The logs don not get carried over kexec and thus
remote attesation cannot verify the signature of the running kernel.

Introduce an ABI to carry forward the ima logs over kexec.
Memory reserved via device tree reservation can be used to store and read
via the of_* functions.

Reserved memory stores the size(sizeof(size_t)) of the buffer in the starting
address, followed by the IMA log contents.

Tested on:
  arm64 with Uboot

Prakhar Srivastava (2):
  Add a layer of abstraction to use the memory reserved by device tree
for ima buffer pass.
  Add support for ima buffer pass using reserved memory for arm64 kexec.
Update the arch sepcific code path in kexec file load to store the
ima buffer in the reserved memory. The same reserved memory is read
on kexec or cold boot.

 arch/arm64/Kconfig |   1 +
 arch/arm64/include/asm/ima.h   |  22 
 arch/arm64/include/asm/kexec.h |   5 +
 arch/arm64/kernel/Makefile |   1 +
 arch/arm64/kernel/ima_kexec.c  |  64 ++
 arch/arm64/kernel/machine_kexec_file.c |   1 +
 arch/powerpc/include/asm/ima.h |   3 +-
 arch/powerpc/kexec/ima.c   |  14 ++-
 drivers/of/Kconfig |   6 +
 drivers/of/Makefile|   1 +
 drivers/of/of_ima.c| 165 +
 include/linux/of.h |  34 +
 security/integrity/ima/ima_kexec.c |  15 ++-
 13 files changed, 325 insertions(+), 7 deletions(-)
 create mode 100644 arch/arm64/include/asm/ima.h
 create mode 100644 arch/arm64/kernel/ima_kexec.c
 create mode 100644 drivers/of/of_ima.c

-- 
2.25.1

[RFC][PATCH 1/2] Add a layer of abstraction to use the memory reserved by device tree for ima buffer pass.

2020-05-04 Thread Prakhar Srivastava

Introduce a device tree layer for to read and store ima buffer
from the reserved memory section of a device tree.

Signed-off-by: Prakhar Srivastava 
---
 drivers/of/Kconfig  |   6 ++
 drivers/of/Makefile |   1 +
 drivers/of/of_ima.c | 165 
 include/linux/of.h  |  34 +
 4 files changed, 206 insertions(+)
 create mode 100644 drivers/of/of_ima.c

diff --git a/drivers/of/Kconfig b/drivers/of/Kconfig
index d91618641be6..edb3c39740fb 100644
--- a/drivers/of/Kconfig
+++ b/drivers/of/Kconfig
@@ -107,4 +107,10 @@ config OF_DMA_DEFAULT_COHERENT
# arches should select this if DMA is coherent by default for OF devices
bool
 
+config OF_IMA
+   def_bool y
+   help
+ IMA related wrapper functions to add/remove ima measurement logs 
during
+ kexec_file_load call.
+
 endif # OF
diff --git a/drivers/of/Makefile b/drivers/of/Makefile
index 663a4af0cccd..b4caf083df4e 100644
--- a/drivers/of/Makefile
+++ b/drivers/of/Makefile
@@ -14,5 +14,6 @@ obj-$(CONFIG_OF_RESERVED_MEM) += of_reserved_mem.o
 obj-$(CONFIG_OF_RESOLVE)  += resolver.o
 obj-$(CONFIG_OF_OVERLAY) += overlay.o
 obj-$(CONFIG_OF_NUMA) += of_numa.o
+obj-$(CONFIG_OF_IMA) += of_ima.o
 
 obj-$(CONFIG_OF_UNITTEST) += unittest-data/
diff --git a/drivers/of/of_ima.c b/drivers/of/of_ima.c
new file mode 100644
index ..131f68d81e2e
--- /dev/null
+++ b/drivers/of/of_ima.c
@@ -0,0 +1,165 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2020 Microsoft Corporation.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+static bool dtb_status_enabled;
+static struct resource mem_res;
+static void *vaddr;
+
+
+/**
+ * of_is_ima_memory_reserved - check if memory is reserved via device
+ * tree.
+ * Return: zero when memory is not reserved.
+ * positive number on success.
+ *
+ */
+int of_is_ima_memory_reserved(void)
+{
+   return dtb_status_enabled;
+}
+
+/**
+ * of_ima_write_buffer - Write the ima buffer into the reserved memory.
+ *
+ * ima_buffer - buffer starting address.
+ * ima_buffer_size - size of segment.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
+int of_ima_write_buffer(void *ima_buffer, size_t ima_buffer_size)
+{
+   void *addr;
+
+   if (!dtb_status_enabled)
+   return -EOPNOTSUPP;
+
+   vaddr = memremap(mem_res.start, resource_size(&mem_res), MEMREMAP_WB);
+   pr_info("Mapped reserved memory, vaddr: 0x%0llX, paddr: 0x%0llX\n , 
size : %lld",
+   (u64)vaddr, mem_res.start, resource_size(&mem_res));
+
+   if (vaddr) {
+   memcpy(vaddr, &ima_buffer_size, sizeof(size_t));
+   addr =  vaddr + sizeof(size_t);
+   memcpy(addr, ima_buffer, ima_buffer_size);
+   memunmap(vaddr);
+   vaddr = NULL;
+   }
+
+   return 0;
+}
+
+/**
+ * of_remove_ima_buffer - Write 0(Zero length buffer to read)to the
+ *size location of the buffer.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
+int of_remove_ima_buffer(void)
+{
+   size_t empty_buffer_size = 0;
+
+   if (!dtb_status_enabled)
+   return -ENOTSUPP;
+
+   if (vaddr) {
+   memcpy(vaddr, &empty_buffer_size, sizeof(size_t));
+   memunmap(vaddr);
+   vaddr = NULL;
+   }
+
+   return 0;
+}
+
+/**
+ * of_ima_get_size_allocated - Get the usable buffer size thats allocated in
+ * the device-tree.
+ *
+ * Return: 0 on unavailable node, size of the memory block - (size_t)
+ */
+size_t of_ima_get_size_allocated(void)
+{
+   size_t size = 0;
+
+   if (!dtb_status_enabled)
+   return size;
+
+   size = resource_size(&mem_res) - sizeof(size_t);
+   return size;
+}
+
+/**
+ * of_get_ima_buffer - Get IMA buffer address.
+ *
+ * @addr:   On successful return, set to point to the buffer contents.
+ * @size:   On successful return, set to the buffer size.
+ *
+ * Return: 0 on success, negative errno on error.
+ */
+int of_get_ima_buffer(void **addr, size_t *size)
+{
+   if (!dtb_status_enabled)
+   return -ENOTSUPP;
+
+   vaddr = memremap(mem_res.start, resource_size(&mem_res), MEMREMAP_WB);
+   pr_info("Mapped reserved memory, vaddr: 0x%0llX, paddr: 0x%0llX,\n 
allocated size : %lld, ima_buffer_size: %ld ",
+   (u64)vaddr, mem_res.start, resource_size(&mem_res), *(size_t *)vaddr);
+
+   *size = *(size_t *)vaddr;
+   *addr = vaddr + sizeof(size_t);
+   return 0;
+}
+
+static const struct of_device_id ima_buffer_pass_ids[] = {
+   {
+   .compatible = "linux,ima_buffer_pass",
+   },
+   {}
+};
+
+static const struct of_device_id ima_buffer_pass_match[] = {
+   {
+   .name = "ima_buffer_pass",
+   },
+};
+MODULE_DEVICE_TABLE(of, ima_buffer_pass_match);
+
+static int __init i

Re: [PATCH 1/3] ASoC: fsl_esai: introduce SoC specific data

2020-05-04 Thread Nicolin Chen

On Fri, May 01, 2020 at 04:12:04PM +0800, Shengjiu Wang wrote:
> Introduce a SoC specific data structure which contains the
> differences between the different SoCs.
> This makes it easier to support more differences without having
> to introduce a new if/else each time.
> 
> Signed-off-by: Shengjiu Wang 

Though the 2nd patch is having comments to address, this one
looks fine to me and should be able to merge as long as Mark
is okay with this too:

Acked-by: Nicolin Chen

Re: [PATCH V2 00/11] Subject: Remove duplicated kmap code

2020-05-04 Thread Al Viro

On Mon, May 04, 2020 at 01:17:41PM -0700, Ira Weiny wrote:

> > || * arm: much, much worse.  We have several files that pull 
> > linux/highmem.h:
> > || arch/arm/mm/cache-feroceon-l2.c, arch/arm/mm/cache-xsc3l2.c,
> > || arch/arm/mm/copypage-*.c, arch/arm/mm/dma-mapping.c, arch/arm/mm/flush.c,
> > || arch/arm/mm/highmem.c, arch/arm/probes/uprobes/core.c,
> > || arch/arm/include/asm/kvm_mmu.h (kmap_atomic_pfn()).
> > || Those are fine, but we also have this:
> > || arch/arm/include/asm/pgtable.h:200:#define __pte_map(pmd)   
> > (pte_t *)kmap_atomic(pmd_page(*(pmd)))
> > || arch/arm/include/asm/pgtable.h:208:#define pte_offset_map(pmd,addr) 
> > (__pte_map(pmd) + pte_index(addr))
> > || and sure as hell, asm/pgtable.h does *NOT* pull linux/highmem.h.
> 
> It does not pull asm/highmem.h either...

No, but the users of those macros need to be considered.

> > || #define pte_offset_map(dir, addr)   \
> > || ((pte_t *) kmap_atomic(pmd_page(*(dir))) + pte_index(addr))
> > || One pte_offset_map user in arch/microblaze:
> > || arch/microblaze/kernel/signal.c:207:ptep = pte_offset_map(pmdp, 
> > address);
> > || Messy, but doesn't require any changes (we have asm/pgalloc.h included
> > || there, and that pull linux/highmem.h).
> 
> AFAICS asm/pgtable.h does not include asm/highmem.h here...
> 
> So looks like arch/microblaze/kernel/signal.c will need linux/highmem.h

See above - line 39 in there is
#include 
and line 14 in arch/microblaze/include/asm/pgalloc.h is
#include 
It's conditional upon CONFIG_MMU in there, but so's the use of
pte_offset_map() in arch/microblaze/kernel/signal.c 

So it shouldn't be a problem.

> > || * xtensa: users in arch/xtensa/kernel/pci-dma.c, 
> > arch/xtensa/mm/highmem.c,
> > || arch/xtensa/mm/cache.c and arch/xtensa/platforms/iss/simdisk.c (all pull
> > || linux/highmem.h).
> 
> Actually
> 
> arch/xtensa/mm/cache.c gets linux/highmem.h from linux/pagemap.h
> 
> arch/xtensa/platforms/iss/simdisk.c may have an issue?
>   linux/blkdev.h -> CONFIG_BLOCK -> linux/pagemap.h -> linux/highmem.h
>   But simdisk.c requires BLK_DEV_SIMDISK -> CONFIG_BLOCK...
>   

Yep - see above re major chain of indirect includes conditional upon 
CONFIG_BLOCK
and its uses in places that only build with such configs.  There's a plenty of
similar considerations outside of arch/*, unfortunately...

Re: [PATCH V2 11/11] drm: Remove drm specific kmap_atomic code

2020-05-04 Thread Ira Weiny

On Mon, May 04, 2020 at 01:18:51PM +0200, Daniel Vetter wrote:
> On Mon, May 4, 2020 at 3:09 AM  wrote:
> >
> > From: Ira Weiny 
> >
> > kmap_atomic_prot() is now exported by all architectures.  Use this
> > function rather than open coding a driver specific kmap_atomic.
> >
> > Reviewed-by: Christian König 
> > Reviewed-by: Christoph Hellwig 
> > Signed-off-by: Ira Weiny 
> 
> I'm assuming this lands through some other tree or a topic branch or whatever.

Yes I think Andrew queued this up before and so I hope he will continue to do
so with the subsequent versions.

Andrew, LMK if this is an issue.

Thanks,
Ira

> 
> Acked-by: Daniel Vetter 
> 
> Cheers, Daniel
> 
> > ---
> >  drivers/gpu/drm/ttm/ttm_bo_util.c| 56 ++--
> >  drivers/gpu/drm/vmwgfx/vmwgfx_blit.c | 16 
> >  include/drm/ttm/ttm_bo_api.h |  4 --
> >  3 files changed, 12 insertions(+), 64 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c 
> > b/drivers/gpu/drm/ttm/ttm_bo_util.c
> > index 52d2b71f1588..f09b096ba4fd 100644
> > --- a/drivers/gpu/drm/ttm/ttm_bo_util.c
> > +++ b/drivers/gpu/drm/ttm/ttm_bo_util.c
> > @@ -257,54 +257,6 @@ static int ttm_copy_io_page(void *dst, void *src, 
> > unsigned long page)
> > return 0;
> >  }
> >
> > -#ifdef CONFIG_X86
> > -#define __ttm_kmap_atomic_prot(__page, __prot) kmap_atomic_prot(__page, 
> > __prot)
> > -#define __ttm_kunmap_atomic(__addr) kunmap_atomic(__addr)
> > -#else
> > -#define __ttm_kmap_atomic_prot(__page, __prot) vmap(&__page, 1, 0,  __prot)
> > -#define __ttm_kunmap_atomic(__addr) vunmap(__addr)
> > -#endif
> > -
> > -
> > -/**
> > - * ttm_kmap_atomic_prot - Efficient kernel map of a single page with
> > - * specified page protection.
> > - *
> > - * @page: The page to map.
> > - * @prot: The page protection.
> > - *
> > - * This function maps a TTM page using the kmap_atomic api if available,
> > - * otherwise falls back to vmap. The user must make sure that the
> > - * specified page does not have an aliased mapping with a different caching
> > - * policy unless the architecture explicitly allows it. Also mapping and
> > - * unmapping using this api must be correctly nested. Unmapping should
> > - * occur in the reverse order of mapping.
> > - */
> > -void *ttm_kmap_atomic_prot(struct page *page, pgprot_t prot)
> > -{
> > -   if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL))
> > -   return kmap_atomic(page);
> > -   else
> > -   return __ttm_kmap_atomic_prot(page, prot);
> > -}
> > -EXPORT_SYMBOL(ttm_kmap_atomic_prot);
> > -
> > -/**
> > - * ttm_kunmap_atomic_prot - Unmap a page that was mapped using
> > - * ttm_kmap_atomic_prot.
> > - *
> > - * @addr: The virtual address from the map.
> > - * @prot: The page protection.
> > - */
> > -void ttm_kunmap_atomic_prot(void *addr, pgprot_t prot)
> > -{
> > -   if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL))
> > -   kunmap_atomic(addr);
> > -   else
> > -   __ttm_kunmap_atomic(addr);
> > -}
> > -EXPORT_SYMBOL(ttm_kunmap_atomic_prot);
> > -
> >  static int ttm_copy_io_ttm_page(struct ttm_tt *ttm, void *src,
> > unsigned long page,
> > pgprot_t prot)
> > @@ -316,13 +268,13 @@ static int ttm_copy_io_ttm_page(struct ttm_tt *ttm, 
> > void *src,
> > return -ENOMEM;
> >
> > src = (void *)((unsigned long)src + (page << PAGE_SHIFT));
> > -   dst = ttm_kmap_atomic_prot(d, prot);
> > +   dst = kmap_atomic_prot(d, prot);
> > if (!dst)
> > return -ENOMEM;
> >
> > memcpy_fromio(dst, src, PAGE_SIZE);
> >
> > -   ttm_kunmap_atomic_prot(dst, prot);
> > +   kunmap_atomic(dst);
> >
> > return 0;
> >  }
> > @@ -338,13 +290,13 @@ static int ttm_copy_ttm_io_page(struct ttm_tt *ttm, 
> > void *dst,
> > return -ENOMEM;
> >
> > dst = (void *)((unsigned long)dst + (page << PAGE_SHIFT));
> > -   src = ttm_kmap_atomic_prot(s, prot);
> > +   src = kmap_atomic_prot(s, prot);
> > if (!src)
> > return -ENOMEM;
> >
> > memcpy_toio(dst, src, PAGE_SIZE);
> >
> > -   ttm_kunmap_atomic_prot(src, prot);
> > +   kunmap_atomic(src);
> >
> > return 0;
> >  }
> > diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c 
> > b/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c
> > index bb46ca0c458f..94d456a1d1a9 100644
> > --- a/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c
> > +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c
> > @@ -374,12 +374,12 @@ static int vmw_bo_cpu_blit_line(struct 
> > vmw_bo_blit_line_data *d,
> > copy_size = min_t(u32, copy_size, PAGE_SIZE - 
> > src_page_offset);
> >
> > if (unmap_src) {
> > -   ttm_kunmap_atomic_prot(d->src_addr, d->src_prot);
> > +   kunmap_atomic(d->src_addr);
> > d->src_addr = NULL;
> > }
> >
> > if

Re: [PATCH V2 00/11] Subject: Remove duplicated kmap code

2020-05-04 Thread Ira Weiny

On Mon, May 04, 2020 at 06:33:57AM +0100, Al Viro wrote:
> On Sun, May 03, 2020 at 10:04:47PM -0700, Ira Weiny wrote:
> 
> > Grepping for 'asm/highmem.h' and investigations don't reveal any issues...  
> > But
> > you do have me worried.  That said 0-day has been crunching on multiple
> > versions of this series without issues such as this (save the mips issue
> > above).
> > 
> > I have to say it would be nice if the relation between linux/highmem.h and
> > asm/highmem.h was more straightforward.
> 
> IIRC, the headache was due to asm/pgtable.h on several architectures and
> asm/cacheflush.h on parisc.
> 
> 
> 
> || IOW, there's one in linux/highmem.h (conditional on 
> !CONFIG_HIGHMEM,
> || !ARCH_HAS_KMAP) and several per-architecture variants, usually declared in
> || their asm/highmem.h.  In three of those (microblaze, parisc and powerpc) 
> these
> || are inlines (parisc one identical to linux/highmem.h, lives in 
> asm/cacheflush.h,
> || powerpc and microblaze ones calling kmap_atomic_prot() which is defined in
> || arch/$ARCH/mm/highmem.c).
> || 
> || parisc case is weird - essentially, they want to call 
> || flush_kernel_dcache_page_addr() at the places where kunmap/kunmap_atomic
> || is done.  And they do so despite not selecting HIGHMEM, with definitions
> || in usual place.  They do have ARCH_HAS_KMAP defined, which prevents
> || getting buggered in linux/highmem.h.  ARCH_HAS_KMAP is parisc-unique,
> || BTW, and checked only in linux/highmem.h.
> || 
> || All genuine arch-specific variants are defined in (or call 
> functions
> || defined in) translation units that are only included CONFIG_HIGHMEM builds.

I agree with this statement.  But IMO additional confusion is caused by the
fact that some arch's condition the declarations on CONFIG_HIGHMEM within
asm/highmem.h (arc, arm, nds32) while others depend on linux/highmem.h (and
elsewhere) to do so (csky, microblaze, mips, powerpc, sparc, x86, xtensa).

Why?

I think (perhaps naive) over time asm/highmem.h needs to be isolated to being
included in linux/highmem.h.  But as you point out below that is not so easy.
I think that this series helps toward that goal.

> || 
> || It would be tempting to consolidate those, e.g. by adding 
> __kmap_atomic()
> || and __kmap_atomic_prot() without that boilerplate, with universal 
> kmap_atomic()
> || and kmap_atomic_prot() in linux/highmem.h.  Potential problem with that 
> would
> || be something that pulls ash/highmem.h (or asm/cacheflush.h in case of 
> parisc)
> || directly and uses kmap_atomic/kmap_atomic_prot.  There's not a lot places
> || pulling asm/highmem.h, and many of them are not even in includes:
> || 
> || arch/arm/include/asm/efi.h:13:#include 
> || arch/arm/mm/dma-mapping.c:31:#include 
> || arch/arm/mm/flush.c:14:#include 
> || arch/arm/mm/mmu.c:27:#include 
> || arch/mips/include/asm/pgtable-32.h:22:#include 
> || arch/mips/mm/cache.c:19:#include 
> || arch/mips/mm/fault.c:28:#include /* For 
> VMALLOC_END */
> || arch/nds32/include/asm/pgtable.h:60:#include 
> || arch/x86/kernel/setup_percpu.c:20:#include 
> || include/linux/highmem.h:35:#include 
> || 
> || Users of asm/cacheflush.h are rather more numerous; however, anything
> || outside of parisc-specific code has to pull linux/highmem.h, or it won't 
> see
> || the definitions of kmap_atomic/kmap_atomic_prot at all.  arch/parisc itself
> || has no callers of those.
> || 
> || Outside of arch/* there is a plenty of callers.  However, the following is
> || true: all instances of kmap_atomic or kmap_atomic_prot outside of arch/*
> || are either inside the linux/highmem.h or are preceded by include of
> || linux/highmem.h on any build that sees them (there is a common include
> || chain that is conditional upon CONFIG_BLOCK, but it's only needed in
> || drivers that are BLOCK-dependent).  It was not fun to verify, to put
> || it mildly...
> || 
> || So for parisc we have no problem - leaving 
> __kmap_atomic()/__kmap_atomic_prot()
> || in asm/cachefile.h and adding universal wrappers in linux/highmem.h will be
> || fine.  For other architectures the things might be trickier.

And the follow up series removes kmap_* from asm/cachefile.h in parisc which
should be cleaner going forward.

> || 
> || * arc: all users in arch/arc/ are within arch/arc/mm/{cache,highmem}.c;
> || both pull linux/highmem.h.  We are fine.

Still fine.

> || 
> || * arm: much, much worse.  We have several files that pull linux/highmem.h:
> || arch/arm/mm/cache-feroceon-l2.c, arch/arm/mm/cache-xsc3l2.c,
> || arch/arm/mm/copypage-*.c, arch/arm/mm/dma-mapping.c, arch/arm/mm/flush.c,
> || arch/arm/mm/highmem.c, arch/arm/probes/uprobes/core.c,
> || arch/arm/include/asm/kvm_mmu.h (kmap_atomic_pfn()).
> || Those are fine, but we also have this:
> || arch/arm/include/asm/pgtable.h:200:#define __pte_map(pmd)   
> (pte_t *)kmap_atomic(pmd_page(*(pmd)))
> || arch/arm/include/asm/pgtable.h:208:#define pte_offset_map(p

Re: [PATCH v2 17/20] mm: free_area_init: allow defining max_zone_pfn in descending order

2020-05-04 Thread Mike Rapoport

On Sun, May 03, 2020 at 11:43:00AM -0700, Guenter Roeck wrote:
> On Sun, May 03, 2020 at 10:41:38AM -0700, Guenter Roeck wrote:
> > Hi,
> > 
> > On Wed, Apr 29, 2020 at 03:11:23PM +0300, Mike Rapoport wrote:
> > > From: Mike Rapoport 
> > > 
> > > Some architectures (e.g. ARC) have the ZONE_HIGHMEM zone below the
> > > ZONE_NORMAL. Allowing free_area_init() parse max_zone_pfn array even it is
> > > sorted in descending order allows using free_area_init() on such
> > > architectures.
> > > 
> > > Add top -> down traversal of max_zone_pfn array in free_area_init() and 
> > > use
> > > the latter in ARC node/zone initialization.
> > > 
> > > Signed-off-by: Mike Rapoport 
> > 
> > This patch causes my microblazeel qemu boot test in linux-next to fail.
> > Reverting it fixes the problem.
> > 
> The same problem is seen with s390 emulations.

Yeah, this patch breaks some others as well :(

My assumption that max_zone_pfn defines architectural limit for maximal
PFN that can belong to a zone was over-optimistic. Several arches
actually do that, but others do

max_zone_pfn[ZONE_DMA] = MAX_DMA_PFN;
max_zone_pfn[ZONE_NORMAL] = max_pfn;

where MAX_DMA_PFN is build-time constrain and max_pfn is run time limit
for the current system.

So, when max_pfn is lower than MAX_DMA_PFN, the free_init_area() will
consider max_zone_pfn as descending and will wrongly calculate zone
extents.

That said, instead of trying to create a generic way to special case
ARC, I suggest to simply use the below patch instead.

diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c
index 41eb9be1653c..386959bac3d2 100644
--- a/arch/arc/mm/init.c
+++ b/arch/arc/mm/init.c
@@ -77,6 +77,11 @@ void __init early_init_dt_add_memory_arch(u64 base, u64 size)
base, TO_MB(size), !in_use ? "Not used":"");
 }
 
+bool arch_has_descending_max_zone_pfns(void)
+{
+   return true;
+}
+
 /*
  * First memory setup routine called from setup_arch()
  * 1. setup swapper's mm @init_mm
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b990e9734474..114f0e027144 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -7307,6 +7307,15 @@ static void check_for_memory(pg_data_t *pgdat, int nid)
}
 }
 
+/*
+ * Some architecturs, e.g. ARC may have ZONE_HIGHMEM below ZONE_NORMAL. For
+ * such cases we allow max_zone_pfn sorted in the descending order
+ */
+bool __weak arch_has_descending_max_zone_pfns(void)
+{
+   return false;
+}
+
 /**
  * free_area_init - Initialise all pg_data_t and zone data
  * @max_zone_pfn: an array of max PFNs for each zone
@@ -7324,7 +7333,7 @@ void __init free_area_init(unsigned long *max_zone_pfn)
 {
unsigned long start_pfn, end_pfn;
int i, nid, zone;
-   bool descending = false;
+   bool descending;
 
/* Record where the zone boundaries are */
memset(arch_zone_lowest_possible_pfn, 0,
@@ -7333,14 +7342,7 @@ void __init free_area_init(unsigned long *max_zone_pfn)
sizeof(arch_zone_highest_possible_pfn));
 
start_pfn = find_min_pfn_with_active_regions();
-
-   /*
-* Some architecturs, e.g. ARC may have ZONE_HIGHMEM below
-* ZONE_NORMAL. For such cases we allow max_zone_pfn sorted in the
-* descending order
-*/
-   if (MAX_NR_ZONES > 1 && max_zone_pfn[0] > max_zone_pfn[1])
-   descending = true;
+   descending = arch_has_descending_max_zone_pfns();
 
for (i = 0; i < MAX_NR_ZONES; i++) {
if (descending)

> Guenter
> 
> > qemu command line:
> > 
> > qemu-system-microblazeel -M petalogix-ml605 -m 256 \
> > -kernel arch/microblaze/boot/linux.bin -no-reboot \
> > -initrd rootfs.cpio \
> > -append 'panic=-1 slub_debug=FZPUA rdinit=/sbin/init 
> > console=ttyS0,115200' \
> > -monitor none -serial stdio -nographic
> > 
> > initrd:
> > 
> > https://github.com/groeck/linux-build-test/blob/master/rootfs/microblazeel/rootfs.cpio.gz
> > configuration:
> > 
> > https://github.com/groeck/linux-build-test/blob/master/rootfs/microblazeel/qemu_microblazeel_ml605_defconfig
> > 
> > Bisect log is below.
> > 
> > Guenter
> > 
> > ---
> > # bad: [fb9d670f57e3f6478602328bbbf71138be06ca4f] Add linux-next specific 
> > files for 20200501
> > # good: [6a8b55ed4056ea5559ebe4f6a4b247f627870d4c] Linux 5.7-rc3
> > git bisect start 'HEAD' 'v5.7-rc3'
> > # good: [068b80b68a670f0b17288c8a3d1ee751f35162ab] Merge remote-tracking 
> > branch 'drm/drm-next'
> > git bisect good 068b80b68a670f0b17288c8a3d1ee751f35162ab
> > # good: [46c70fc6a3ac35cd72ddad248dcbe4eee716d2a5] Merge remote-tracking 
> > branch 'drivers-x86/for-next'
> > git bisect good 46c70fc6a3ac35cd72ddad248dcbe4eee716d2a5
> > # good: [f39c4ad479a2f005f972a2b941b40efa6b9c9349] Merge remote-tracking 
> > branch 'rpmsg/for-next'
> > git bisect good f39c4ad479a2f005f972a2b941b40efa6b9c9349
> > # bad: [165d3ee0162fe28efc2c8180176633e33515df15] 
> > ipc-convert-ipcs_idr-to-xarray-update
> > git bisect ba

Re: [PATCH v2] misc: new driver sram_uapi for user level SRAM access

2020-05-04 Thread kbuild test robot

Hi Wang,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on char-misc/char-misc-testing]
[also build test WARNING on arm-soc/for-next linus/master linux/master v5.7-rc3 
next-20200501]
[if your patch is applied to the wrong git tree, please drop us a note to help
improve the system. BTW, we also suggest to use '--base' option to specify the
base tree in git format-patch, please see https://stackoverflow.com/a/37406982]

url:
https://github.com/0day-ci/linux/commits/Wang-Wenhu/misc-new-driver-sram_uapi-for-user-level-SRAM-access/20200421-105427
base:   https://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git 
ae83d0b416db002fe95601e7f97f64b59514d936
config: microblaze-allyesconfig (attached as .config)
compiler: microblaze-linux-gcc (GCC) 9.3.0
reproduce:
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day GCC_VERSION=9.3.0 make.cross 
ARCH=microblaze 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kbuild test robot 

Note: it may well be a FALSE warning. FWIW you are at least aware of it now.
http://gcc.gnu.org/wiki/Better_Uninitialized_Warnings

All warnings (new ones prefixed by >>):

   drivers/misc/sram_uapi.c: In function 'sram_uapi_ioctl':
>> drivers/misc/sram_uapi.c:100:6: warning: 'type' may be used uninitialized in 
>> this function [-Wmaybe-uninitialized]
 100 |   if (cur->type == type) {
 |  ^
   drivers/misc/sram_uapi.c:188:8: note: 'type' was declared here
 188 |  __u32 type;
 |^~~~

vim +/type +100 drivers/misc/sram_uapi.c

93  
94  static struct sram_api *get_sram_api_from_type(__u32 type)
95  {
96  struct sram_api *cur;
97  
98  mutex_lock(&sram_list_lock);
99  list_for_each_entry(cur, &sram_list, list) {
 > 100  if (cur->type == type) {
   101  kref_get(&cur->kref);
   102  mutex_unlock(&sram_list_lock);
   103  return cur;
   104  }
   105  }
   106  mutex_unlock(&sram_list_lock);
   107  
   108  return NULL;
   109  }
   110  

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org


.config.gz
Description: application/gzip

[powerpc:merge] BUILD SUCCESS 1bc92fe3175eb26ff37e580c0383d7a9abe06835

2020-05-04 Thread kbuild test robot

tree/branch: https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git  
merge
branch HEAD: 1bc92fe3175eb26ff37e580c0383d7a9abe06835  Automatic merge of 
branches 'master', 'next' and 'fixes' into merge

elapsed time: 5894m

configs tested: 248
configs skipped: 0

The following configs have been built successfully.
More configs may be tested in the coming days.

arm   efm32_defconfig
arm at91_dt_defconfig
armshmobile_defconfig
arm64   defconfig
armmulti_v5_defconfig
arm   sunxi_defconfig
armmulti_v7_defconfig
arm  exynos_defconfig
arm64allyesconfig
arm  allyesconfig
arm64allmodconfig
arm  allmodconfig
arm64 allnoconfig
arm   allnoconfig
sparcallyesconfig
powerpc defconfig
ia64defconfig
powerpc  chrp32_defconfig
c6xevmc6678_defconfig
xtensa   common_defconfig
ia64 alldefconfig
nds32 allnoconfig
mips   capcella_defconfig
riscvnommu_virt_defconfig
mips allmodconfig
s390   zfcpdump_defconfig
powerpc mpc512x_defconfig
sh  sh7785lcr_32bit_defconfig
xtensa  iss_defconfig
um  defconfig
sh  rsk7269_defconfig
mips  allnoconfig
cskydefconfig
i386  allnoconfig
i386 allyesconfig
i386 alldefconfig
i386defconfig
i386  debian-10.3
ia64 allmodconfig
ia64  allnoconfig
ia64generic_defconfig
ia64  tiger_defconfig
ia64 bigsur_defconfig
ia64 allyesconfig
m68k   m5475evb_defconfig
m68k allmodconfig
m68k   bvme6000_defconfig
m68k   sun3_defconfig
m68k  multi_defconfig
nios2 3c120_defconfig
nios2 10m50_defconfig
c6x  allyesconfig
openrisc simple_smp_defconfig
openriscor1ksim_defconfig
nds32   defconfig
alpha   defconfig
h8300   h8s-sim_defconfig
h8300 edosk2674_defconfig
h8300h8300h-sim_defconfig
arc defconfig
arc  allyesconfig
microblaze  mmu_defconfig
microblazenommu_defconfig
mips  fuloong2e_defconfig
mips  malta_kvm_defconfig
mipsar7_defconfig
mips allyesconfig
mips 64r6el_defconfig
mips   32r2_defconfig
mipsmalta_kvm_guest_defconfig
mips tb0287_defconfig
mips   ip32_defconfig
mips  decstation_64_defconfig
mips  loongson3_defconfig
mips  ath79_defconfig
mipsbcm63xx_defconfig
pariscallnoconfig
pariscgeneric-64bit_defconfig
pariscgeneric-32bit_defconfig
parisc   allyesconfig
parisc   allmodconfig
powerpc   holly_defconfig
powerpc   ppc64_defconfig
powerpc  rhel-kconfig
powerpc   allnoconfig
powerpc  mpc866_ads_defconfig
powerpcamigaone_defconfig
powerpcadder875_defconfig
powerpc ep8248e_defconfig
powerpc  g5_defconfig
parisc   randconfig-a001-20200430
mips randconfig-a001-20200430
m68k randconfig-a001-20200430
riscvrandconfig-a001-20200430
alpharandconfig-a001-20200430
nds32randconfig-a001-20200430
m68k randconfig-a001-20200502
mips randconfig-a001-20200502
nds32randconfig-a001-20200502
alpharandconfig-a001-20200502
parisc   randc

[PATCH] powerpc/64/signal: balance return predictor stack in signal trampoline

Returning from an interrupt or syscall to a signal handler currently
begins execution directly at the handler's entry point, with LR set to
the address of the sigreturn trampoline. When the signal handler
function returns, it runs the trampoline. It looks like this:

# interrupt at user address xyz
# kernel stuff... signal is raised
rfid
# void handler(int sig)
addis 2,12,.TOC.-.LCF0@ha
addi 2,2,.TOC.-.LCF0@l
mflr 0
std 0,16(1)
stdu 1,-96(1)
# handler stuff
ld 0,16(1)
mtlr 0
blr
# __kernel_sigtramp_rt64
addir1,r1,__SIGNAL_FRAMESIZE
li  r0,__NR_rt_sigreturn
sc
# kernel executes rt_sigreturn
rfid
# back to user address xyz

Note the blr with no matching bl. This can corrupt the return predictor.

Solve this by instead resuming execution at the signal trampoline which
then calls the signal handler. qtrace-tools link_stack checker confirms
the entire user/kernel/vdso cycle is balanced after this patch, whereas
it's not upstream.

Alan confirms the dwarf unwind info still looks good. gdb still
recognises the signal frame and can step into parent frames if it break
inside a signal handler.

Performance is pretty noisy, not a very significant change on a POWER9
here, but branch misses are consistently a lot lower on a microbenchmark:

 Performance counter stats for './signal':

 13,085.72 msec task-clock#1.000 CPUs utilized
45,024,760,101  cycles#3.441 GHz
65,102,895,542  instructions  #1.45  insn per cycle
11,271,673,787  branches  #  861.372 M/sec
59,468,979  branch-misses #0.53% of all branches

 12,989.09 msec task-clock#1.000 CPUs utilized
44,692,719,559  cycles#3.441 GHz
65,109,984,964  instructions  #1.46  insn per cycle
11,282,136,057  branches  #  868.585 M/sec
39,786,942  branch-misses #0.35% of all branches

Cc: Alan Modra 
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/ppc-opcode.h |  1 +
 arch/powerpc/kernel/signal_64.c   | 20 +++-
 arch/powerpc/kernel/vdso64/sigtramp.S | 13 +
 3 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/include/asm/ppc-opcode.h 
b/arch/powerpc/include/asm/ppc-opcode.h
index c1df75edde44..747b37f1ce09 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -329,6 +329,7 @@
 #define PPC_INST_BLR   0x4e800020
 #define PPC_INST_BLRL  0x4e800021
 #define PPC_INST_BCTR  0x4e800420
+#define PPC_INST_BCTRL 0x4e800421
 #define PPC_INST_MULLD 0x7c0001d2
 #define PPC_INST_MULLW 0x7c0001d6
 #define PPC_INST_MULHWU0x7c16
diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
index adfde59cf4ba..6c17e2456ccc 100644
--- a/arch/powerpc/kernel/signal_64.c
+++ b/arch/powerpc/kernel/signal_64.c
@@ -41,7 +41,7 @@
 #define FP_REGS_SIZE   sizeof(elf_fpregset_t)
 
 #define TRAMP_TRACEBACK3
-#define TRAMP_SIZE 6
+#define TRAMP_SIZE 7
 
 /*
  * When we have signals to deliver, we set up on the user stack,
@@ -603,13 +603,15 @@ static long setup_trampoline(unsigned int syscall, 
unsigned int __user *tramp)
int i;
long err = 0;
 
+   /* bctrl # call the handler */
+   err |= __put_user(PPC_INST_BCTRL, &tramp[0]);
/* addi r1, r1, __SIGNAL_FRAMESIZE  # Pop the dummy stackframe */
err |= __put_user(PPC_INST_ADDI | __PPC_RT(R1) | __PPC_RA(R1) |
- (__SIGNAL_FRAMESIZE & 0x), &tramp[0]);
+ (__SIGNAL_FRAMESIZE & 0x), &tramp[1]);
/* li r0, __NR_[rt_]sigreturn| */
-   err |= __put_user(PPC_INST_ADDI | (syscall & 0x), &tramp[1]);
+   err |= __put_user(PPC_INST_ADDI | (syscall & 0x), &tramp[2]);
/* sc */
-   err |= __put_user(PPC_INST_SC, &tramp[2]);
+   err |= __put_user(PPC_INST_SC, &tramp[3]);
 
/* Minimal traceback info */
for (i=TRAMP_TRACEBACK; i < TRAMP_SIZE ;i++)
@@ -867,12 +869,12 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t 
*set,
 
/* Set up to return from userspace. */
if (vdso64_rt_sigtramp && tsk->mm->context.vdso_base) {
-   regs->link = tsk->mm->context.vdso_base + vdso64_rt_sigtramp;
+   regs->nip = tsk->mm->context.vdso_base + vdso64_rt_sigtramp;
} else {
err |= setup_trampoline(__NR_rt_sigreturn, &frame->tramp[0]);
if (err)
goto badframe;
-   regs->link = (unsigned long) &frame->tramp[0];
+   regs->nip = (unsigned long) &frame->tramp[0];
}
 
/* Allocate a dummy call

Re: [PATCH 1/2] powerpc/spufs: fix copy_to_user while atomic

2020-05-04 Thread Christoph Hellwig

powerpc mantainers,

are you going to pick this up for the next -rc1?  I'm waiting for it to
hit upstream before resending the coredump series.

On Wed, Apr 29, 2020 at 03:03:02PM +0800, Jeremy Kerr wrote:
> Currently, we may perform a copy_to_user (through
> simple_read_from_buffer()) while holding a context's register_lock,
> while accessing the context save area.
> 
> This change uses a temporary buffer for the context save area data,
> which we then pass to simple_read_from_buffer.
> 
> Includes changes from Christoph Hellwig .
> 
> Fixes: bf1ab978be23 ("[POWERPC] coredump: Add SPU elf notes to coredump.")
> Signed-off-by: Jeremy Kerr 
> Reviewed-by: Arnd Bergmann 
> Reviewed-by: Christoph Hellwig 
> ---
>  arch/powerpc/platforms/cell/spufs/file.c | 113 +++
>  1 file changed, 75 insertions(+), 38 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/cell/spufs/file.c 
> b/arch/powerpc/platforms/cell/spufs/file.c
> index c0f950a3f4e1..b4e1ef650b40 100644
> --- a/arch/powerpc/platforms/cell/spufs/file.c
> +++ b/arch/powerpc/platforms/cell/spufs/file.c
> @@ -1978,8 +1978,9 @@ static ssize_t __spufs_mbox_info_read(struct 
> spu_context *ctx,
>  static ssize_t spufs_mbox_info_read(struct file *file, char __user *buf,
>  size_t len, loff_t *pos)
>  {
> - int ret;
>   struct spu_context *ctx = file->private_data;
> + u32 stat, data;
> + int ret;
>  
>   if (!access_ok(buf, len))
>   return -EFAULT;
> @@ -1988,11 +1989,16 @@ static ssize_t spufs_mbox_info_read(struct file 
> *file, char __user *buf,
>   if (ret)
>   return ret;
>   spin_lock(&ctx->csa.register_lock);
> - ret = __spufs_mbox_info_read(ctx, buf, len, pos);
> + stat = ctx->csa.prob.mb_stat_R;
> + data = ctx->csa.prob.pu_mb_R;
>   spin_unlock(&ctx->csa.register_lock);
>   spu_release_saved(ctx);
>  
> - return ret;
> + /* EOF if there's no entry in the mbox */
> + if (!(stat & 0xff))
> + return 0;
> +
> + return simple_read_from_buffer(buf, len, pos, &data, sizeof(data));
>  }
>  
>  static const struct file_operations spufs_mbox_info_fops = {
> @@ -2019,6 +2025,7 @@ static ssize_t spufs_ibox_info_read(struct file *file, 
> char __user *buf,
>  size_t len, loff_t *pos)
>  {
>   struct spu_context *ctx = file->private_data;
> + u32 stat, data;
>   int ret;
>  
>   if (!access_ok(buf, len))
> @@ -2028,11 +2035,16 @@ static ssize_t spufs_ibox_info_read(struct file 
> *file, char __user *buf,
>   if (ret)
>   return ret;
>   spin_lock(&ctx->csa.register_lock);
> - ret = __spufs_ibox_info_read(ctx, buf, len, pos);
> + stat = ctx->csa.prob.mb_stat_R;
> + data = ctx->csa.priv2.puint_mb_R;
>   spin_unlock(&ctx->csa.register_lock);
>   spu_release_saved(ctx);
>  
> - return ret;
> + /* EOF if there's no entry in the ibox */
> + if (!(stat & 0xff))
> + return 0;
> +
> + return simple_read_from_buffer(buf, len, pos, &data, sizeof(data));
>  }
>  
>  static const struct file_operations spufs_ibox_info_fops = {
> @@ -2041,6 +2053,11 @@ static const struct file_operations 
> spufs_ibox_info_fops = {
>   .llseek  = generic_file_llseek,
>  };
>  
> +static size_t spufs_wbox_info_cnt(struct spu_context *ctx)
> +{
> + return (4 - ((ctx->csa.prob.mb_stat_R & 0x00ff00) >> 8)) * sizeof(u32);
> +}
> +
>  static ssize_t __spufs_wbox_info_read(struct spu_context *ctx,
>   char __user *buf, size_t len, loff_t *pos)
>  {
> @@ -2049,7 +2066,7 @@ static ssize_t __spufs_wbox_info_read(struct 
> spu_context *ctx,
>   u32 wbox_stat;
>  
>   wbox_stat = ctx->csa.prob.mb_stat_R;
> - cnt = 4 - ((wbox_stat & 0x00ff00) >> 8);
> + cnt = spufs_wbox_info_cnt(ctx);
>   for (i = 0; i < cnt; i++) {
>   data[i] = ctx->csa.spu_mailbox_data[i];
>   }
> @@ -2062,7 +2079,8 @@ static ssize_t spufs_wbox_info_read(struct file *file, 
> char __user *buf,
>  size_t len, loff_t *pos)
>  {
>   struct spu_context *ctx = file->private_data;
> - int ret;
> + u32 data[ARRAY_SIZE(ctx->csa.spu_mailbox_data)];
> + int ret, count;
>  
>   if (!access_ok(buf, len))
>   return -EFAULT;
> @@ -2071,11 +2089,13 @@ static ssize_t spufs_wbox_info_read(struct file 
> *file, char __user *buf,
>   if (ret)
>   return ret;
>   spin_lock(&ctx->csa.register_lock);
> - ret = __spufs_wbox_info_read(ctx, buf, len, pos);
> + count = spufs_wbox_info_cnt(ctx);
> + memcpy(&data, &ctx->csa.spu_mailbox_data, sizeof(data));
>   spin_unlock(&ctx->csa.register_lock);
>   spu_release_saved(ctx);
>  
> - return ret;
> + return simple_read_from_buffer(buf, len, pos, &data,
> + count * sizeof(u32));
>  }
>  
>  static const struct file_operations spufs_wbox_info_fops

[PATCH] powerpc/64s/radix: Don't prefetch DAR in update_mmu_cache

The idea behind this prefetch was to kick off a page table walk before
returning from the fault, getting some pipelining advantage.

But this never showed up any noticable performance advantage, and in
fact with KUAP the prefetches are actually blocked and cause some
kind of micro-architectural fault. Removing this improves page fault
microbenchmark performance by about 9%.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/book3s/64/pgtable.h |  7 +--
 arch/powerpc/mm/book3s64/hash_utils.c|  5 -
 arch/powerpc/mm/book3s64/pgtable.c   | 13 -
 3 files changed, 5 insertions(+), 20 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h 
b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 368b136517e0..59ed15e43e89 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -1139,8 +1139,11 @@ extern pmd_t mk_pmd(struct page *page, pgprot_t pgprot);
 extern pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot);
 extern void set_pmd_at(struct mm_struct *mm, unsigned long addr,
   pmd_t *pmdp, pmd_t pmd);
-extern void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long 
addr,
-pmd_t *pmd);
+static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
+   unsigned long addr, pmd_t *pmd)
+{
+}
+
 extern int hash__has_transparent_hugepage(void);
 static inline int has_transparent_hugepage(void)
 {
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c 
b/arch/powerpc/mm/book3s64/hash_utils.c
index 845da1e8ca4f..2458615805ee 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -1672,11 +1672,6 @@ void update_mmu_cache(struct vm_area_struct *vma, 
unsigned long address,
unsigned long trap;
bool is_exec;
 
-   if (radix_enabled()) {
-   prefetch((void *)address);
-   return;
-   }
-
/* We only want HPTEs for linux PTEs that have _PAGE_ACCESSED set */
if (!pte_young(*ptep) || address >= TASK_SIZE)
return;
diff --git a/arch/powerpc/mm/book3s64/pgtable.c 
b/arch/powerpc/mm/book3s64/pgtable.c
index e0bb69c616e4..821b483a5ac3 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -146,19 +146,6 @@ pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
pmdv &= _HPAGE_CHG_MASK;
return pmd_set_protbits(__pmd(pmdv), newprot);
 }
-
-/*
- * This is called at the end of handling a user page fault, when the
- * fault has been handled by updating a HUGE PMD entry in the linux page 
tables.
- * We use it to preload an HPTE into the hash table corresponding to
- * the updated linux HUGE PMD entry.
- */
-void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmd)
-{
-   if (radix_enabled())
-   prefetch((void *)addr);
-}
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 /* For use by kexec */
-- 
2.23.0

Re: [PATCH V2 11/11] drm: Remove drm specific kmap_atomic code

2020-05-04 Thread Daniel Vetter

On Mon, May 4, 2020 at 3:09 AM  wrote:
>
> From: Ira Weiny 
>
> kmap_atomic_prot() is now exported by all architectures.  Use this
> function rather than open coding a driver specific kmap_atomic.
>
> Reviewed-by: Christian König 
> Reviewed-by: Christoph Hellwig 
> Signed-off-by: Ira Weiny 

I'm assuming this lands through some other tree or a topic branch or whatever.

Acked-by: Daniel Vetter 

Cheers, Daniel

> ---
>  drivers/gpu/drm/ttm/ttm_bo_util.c| 56 ++--
>  drivers/gpu/drm/vmwgfx/vmwgfx_blit.c | 16 
>  include/drm/ttm/ttm_bo_api.h |  4 --
>  3 files changed, 12 insertions(+), 64 deletions(-)
>
> diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c 
> b/drivers/gpu/drm/ttm/ttm_bo_util.c
> index 52d2b71f1588..f09b096ba4fd 100644
> --- a/drivers/gpu/drm/ttm/ttm_bo_util.c
> +++ b/drivers/gpu/drm/ttm/ttm_bo_util.c
> @@ -257,54 +257,6 @@ static int ttm_copy_io_page(void *dst, void *src, 
> unsigned long page)
> return 0;
>  }
>
> -#ifdef CONFIG_X86
> -#define __ttm_kmap_atomic_prot(__page, __prot) kmap_atomic_prot(__page, 
> __prot)
> -#define __ttm_kunmap_atomic(__addr) kunmap_atomic(__addr)
> -#else
> -#define __ttm_kmap_atomic_prot(__page, __prot) vmap(&__page, 1, 0,  __prot)
> -#define __ttm_kunmap_atomic(__addr) vunmap(__addr)
> -#endif
> -
> -
> -/**
> - * ttm_kmap_atomic_prot - Efficient kernel map of a single page with
> - * specified page protection.
> - *
> - * @page: The page to map.
> - * @prot: The page protection.
> - *
> - * This function maps a TTM page using the kmap_atomic api if available,
> - * otherwise falls back to vmap. The user must make sure that the
> - * specified page does not have an aliased mapping with a different caching
> - * policy unless the architecture explicitly allows it. Also mapping and
> - * unmapping using this api must be correctly nested. Unmapping should
> - * occur in the reverse order of mapping.
> - */
> -void *ttm_kmap_atomic_prot(struct page *page, pgprot_t prot)
> -{
> -   if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL))
> -   return kmap_atomic(page);
> -   else
> -   return __ttm_kmap_atomic_prot(page, prot);
> -}
> -EXPORT_SYMBOL(ttm_kmap_atomic_prot);
> -
> -/**
> - * ttm_kunmap_atomic_prot - Unmap a page that was mapped using
> - * ttm_kmap_atomic_prot.
> - *
> - * @addr: The virtual address from the map.
> - * @prot: The page protection.
> - */
> -void ttm_kunmap_atomic_prot(void *addr, pgprot_t prot)
> -{
> -   if (pgprot_val(prot) == pgprot_val(PAGE_KERNEL))
> -   kunmap_atomic(addr);
> -   else
> -   __ttm_kunmap_atomic(addr);
> -}
> -EXPORT_SYMBOL(ttm_kunmap_atomic_prot);
> -
>  static int ttm_copy_io_ttm_page(struct ttm_tt *ttm, void *src,
> unsigned long page,
> pgprot_t prot)
> @@ -316,13 +268,13 @@ static int ttm_copy_io_ttm_page(struct ttm_tt *ttm, 
> void *src,
> return -ENOMEM;
>
> src = (void *)((unsigned long)src + (page << PAGE_SHIFT));
> -   dst = ttm_kmap_atomic_prot(d, prot);
> +   dst = kmap_atomic_prot(d, prot);
> if (!dst)
> return -ENOMEM;
>
> memcpy_fromio(dst, src, PAGE_SIZE);
>
> -   ttm_kunmap_atomic_prot(dst, prot);
> +   kunmap_atomic(dst);
>
> return 0;
>  }
> @@ -338,13 +290,13 @@ static int ttm_copy_ttm_io_page(struct ttm_tt *ttm, 
> void *dst,
> return -ENOMEM;
>
> dst = (void *)((unsigned long)dst + (page << PAGE_SHIFT));
> -   src = ttm_kmap_atomic_prot(s, prot);
> +   src = kmap_atomic_prot(s, prot);
> if (!src)
> return -ENOMEM;
>
> memcpy_toio(dst, src, PAGE_SIZE);
>
> -   ttm_kunmap_atomic_prot(src, prot);
> +   kunmap_atomic(src);
>
> return 0;
>  }
> diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c 
> b/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c
> index bb46ca0c458f..94d456a1d1a9 100644
> --- a/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c
> +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c
> @@ -374,12 +374,12 @@ static int vmw_bo_cpu_blit_line(struct 
> vmw_bo_blit_line_data *d,
> copy_size = min_t(u32, copy_size, PAGE_SIZE - 
> src_page_offset);
>
> if (unmap_src) {
> -   ttm_kunmap_atomic_prot(d->src_addr, d->src_prot);
> +   kunmap_atomic(d->src_addr);
> d->src_addr = NULL;
> }
>
> if (unmap_dst) {
> -   ttm_kunmap_atomic_prot(d->dst_addr, d->dst_prot);
> +   kunmap_atomic(d->dst_addr);
> d->dst_addr = NULL;
> }
>
> @@ -388,8 +388,8 @@ static int vmw_bo_cpu_blit_line(struct 
> vmw_bo_blit_line_data *d,
> return -EINVAL;
>
> d->dst_addr =
> -   ttm_kmap_atomic_prot(d->dst_pages[dst_page],
> -

[PATCH v2 5/5] kvm_main: replace debugfs with stats_fs

Use stats_fs API instead of debugfs to create sources and add values.

This also requires to change all architecture files to replace the old
debugfs_entries with stats_fs_vcpu_entries and statsfs_vm_entries.

The files/folders name and organization is kept unchanged, and a symlink
in sys/kernel/debugfs/kvm is left for backward compatibility.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 arch/arm64/kvm/Kconfig  |   1 +
 arch/arm64/kvm/guest.c  |   2 +-
 arch/mips/kvm/Kconfig   |   1 +
 arch/mips/kvm/mips.c|   2 +-
 arch/powerpc/kvm/Kconfig|   1 +
 arch/powerpc/kvm/book3s.c   |   6 +-
 arch/powerpc/kvm/booke.c|   8 +-
 arch/s390/kvm/Kconfig   |   1 +
 arch/s390/kvm/kvm-s390.c|  16 +-
 arch/x86/include/asm/kvm_host.h |   2 +-
 arch/x86/kvm/Kconfig|   1 +
 arch/x86/kvm/Makefile   |   2 +-
 arch/x86/kvm/debugfs.c  |  64 ---
 arch/x86/kvm/stats_fs.c |  56 ++
 arch/x86/kvm/x86.c  |   6 +-
 include/linux/kvm_host.h|  39 +---
 virt/kvm/arm/arm.c  |   2 +-
 virt/kvm/kvm_main.c | 314 
 18 files changed, 142 insertions(+), 382 deletions(-)
 delete mode 100644 arch/x86/kvm/debugfs.c
 create mode 100644 arch/x86/kvm/stats_fs.c

diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index 449386d76441..8c125387b673 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -23,6 +23,7 @@ config KVM
depends on OF
# for TASKSTATS/TASK_DELAY_ACCT:
depends on NET && MULTIUSER
+   imply STATS_FS
select MMU_NOTIFIER
select PREEMPT_NOTIFIERS
select HAVE_KVM_CPU_RELAX_INTERCEPT
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index 8417b200bec9..235ed44e4353 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -29,7 +29,7 @@
 
 #include "trace.h"
 
-struct kvm_stats_debugfs_item debugfs_entries[] = {
+struct stats_fs_value stats_fs_vcpu_entries[] = {
VCPU_STAT("halt_successful_poll", halt_successful_poll),
VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig
index b91d145aa2d5..19d14e979e5f 100644
--- a/arch/mips/kvm/Kconfig
+++ b/arch/mips/kvm/Kconfig
@@ -19,6 +19,7 @@ config KVM
tristate "Kernel-based Virtual Machine (KVM) support"
depends on HAVE_KVM
depends on MIPS_FP_SUPPORT
+   imply STATS_FS
select EXPORT_UASM
select PREEMPT_NOTIFIERS
select KVM_GENERIC_DIRTYLOG_READ_PROTECT
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index fdf1c14d9205..a47d21f35444 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -39,7 +39,7 @@
 #define VECTORSPACING 0x100/* for EI/VI mode */
 #endif
 
-struct kvm_stats_debugfs_item debugfs_entries[] = {
+struct stats_fs_value stats_fs_vcpu_entries[] = {
VCPU_STAT("wait", wait_exits),
VCPU_STAT("cache", cache_exits),
VCPU_STAT("signal", signal_exits),
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 12885eda324e..feb5e110ebb0 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -19,6 +19,7 @@ if VIRTUALIZATION
 
 config KVM
bool
+   imply STATS_FS
select PREEMPT_NOTIFIERS
select HAVE_KVM_EVENTFD
select HAVE_KVM_VCPU_ASYNC_IOCTL
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 37508a356f28..76222ab148da 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -38,7 +38,7 @@
 
 /* #define EXIT_DEBUG */
 
-struct kvm_stats_debugfs_item debugfs_entries[] = {
+struct stats_fs_value stats_fs_vcpu_entries[] = {
VCPU_STAT("exits", sum_exits),
VCPU_STAT("mmio", mmio_exits),
VCPU_STAT("sig", signal_exits),
@@ -66,6 +66,10 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
VCPU_STAT("pthru_all", pthru_all),
VCPU_STAT("pthru_host", pthru_host),
VCPU_STAT("pthru_bad_aff", pthru_bad_aff),
+   { NULL }
+};
+
+struct stats_fs_value stats_fs_vm_entries[] = {
VM_STAT("largepages_2M", num_2M_pages, .mode = 0444),
VM_STAT("largepages_1G", num_1G_pages, .mode = 0444),
{ NULL }
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index c2984cb6dfa7..b14c07786cc8 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -35,7 +35,12 @@
 
 unsigned long kvmppc_booke_handlers;
 
-struct kvm_stats_debugfs_item debugfs_entries[] = {
+struct stats_fs_value stats_fs_vm_entries[] = {
+   VM_STAT("remote_tlb_flush", remote_tlb_flush),
+   { NULL }
+};
+
+struct stats_fs_value stats_fs_vcpu_entries[] = {
VCPU_STAT("mmio", mmio_exits),
VCPU_STAT("sig", signal_exits),
VCPU_STAT("itlb_r", itlb_real_miss_exits),
@@ -54,7 +59,6 @@ struct kvm_stats_debugfs_item debu

[PATCH v2 4/5] stats_fs fs: virtual fs to show stats to the end-user

Add virtual fs that maps stats_fs sources with directories, and values
(simple or aggregates) to files.

Every time a file is read/cleared, the fs internally invokes the stats_fs
API to get/set the requested value.

fs/stats_fs/inode.c is pretty much similar to what is done in
fs/debugfs/inode.c, with the exception that the API is only
composed by stats_fs_create_file, stats_fs_create_dir and stats_fs_remove.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 fs/stats_fs/Makefile   |   2 +-
 fs/stats_fs/inode.c| 337 +
 fs/stats_fs/internal.h |  15 ++
 fs/stats_fs/stats_fs.c | 163 ++
 include/linux/stats_fs.h   |  15 ++
 include/uapi/linux/magic.h |   1 +
 tools/lib/api/fs/fs.c  |  21 +++
 7 files changed, 553 insertions(+), 1 deletion(-)
 create mode 100644 fs/stats_fs/inode.c

diff --git a/fs/stats_fs/Makefile b/fs/stats_fs/Makefile
index 9db130fac6b6..ac12c27545f6 100644
--- a/fs/stats_fs/Makefile
+++ b/fs/stats_fs/Makefile
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0-only
-stats_fs-objs  := stats_fs.o
+stats_fs-objs  := inode.o stats_fs.o
 stats_fs-tests-objs:= stats_fs-tests.o
 
 obj-$(CONFIG_STATS_FS) += stats_fs.o
diff --git a/fs/stats_fs/inode.c b/fs/stats_fs/inode.c
new file mode 100644
index ..865ee91656ba
--- /dev/null
+++ b/fs/stats_fs/inode.c
@@ -0,0 +1,337 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *  inode.c - part of stats_fs, a tiny little stats_fs file system
+ *
+ *  Copyright (C) 2020 Emanuele Giuseppe Esposito 
+ *  Copyright (C) 2020 Redhat
+ */
+#define pr_fmt(fmt)"stats_fs: " fmt
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "internal.h"
+
+#define STATS_FS_DEFAULT_MODE  0700
+
+static struct simple_fs stats_fs;
+static bool stats_fs_registered;
+
+struct stats_fs_mount_opts {
+   kuid_t uid;
+   kgid_t gid;
+   umode_t mode;
+};
+
+enum {
+   Opt_uid,
+   Opt_gid,
+   Opt_mode,
+   Opt_err
+};
+
+static const match_table_t tokens = {
+   {Opt_uid, "uid=%u"},
+   {Opt_gid, "gid=%u"},
+   {Opt_mode, "mode=%o"},
+   {Opt_err, NULL}
+};
+
+struct stats_fs_fs_info {
+   struct stats_fs_mount_opts mount_opts;
+};
+
+static int stats_fs_parse_options(char *data, struct stats_fs_mount_opts *opts)
+{
+   substring_t args[MAX_OPT_ARGS];
+   int option;
+   int token;
+   kuid_t uid;
+   kgid_t gid;
+   char *p;
+
+   opts->mode = STATS_FS_DEFAULT_MODE;
+
+   while ((p = strsep(&data, ",")) != NULL) {
+   if (!*p)
+   continue;
+
+   token = match_token(p, tokens, args);
+   switch (token) {
+   case Opt_uid:
+   if (match_int(&args[0], &option))
+   return -EINVAL;
+   uid = make_kuid(current_user_ns(), option);
+   if (!uid_valid(uid))
+   return -EINVAL;
+   opts->uid = uid;
+   break;
+   case Opt_gid:
+   if (match_int(&args[0], &option))
+   return -EINVAL;
+   gid = make_kgid(current_user_ns(), option);
+   if (!gid_valid(gid))
+   return -EINVAL;
+   opts->gid = gid;
+   break;
+   case Opt_mode:
+   if (match_octal(&args[0], &option))
+   return -EINVAL;
+   opts->mode = option & S_IALLUGO;
+   break;
+   /*
+* We might like to report bad mount options here;
+* but traditionally stats_fs has ignored all mount options
+*/
+   }
+   }
+
+   return 0;
+}
+
+static int stats_fs_apply_options(struct super_block *sb)
+{
+   struct stats_fs_fs_info *fsi = sb->s_fs_info;
+   struct inode *inode = d_inode(sb->s_root);
+   struct stats_fs_mount_opts *opts = &fsi->mount_opts;
+
+   inode->i_mode &= ~S_IALLUGO;
+   inode->i_mode |= opts->mode;
+
+   inode->i_uid = opts->uid;
+   inode->i_gid = opts->gid;
+
+   return 0;
+}
+
+static int stats_fs_remount(struct super_block *sb, int *flags, char *data)
+{
+   int err;
+   struct stats_fs_fs_info *fsi = sb->s_fs_info;
+
+   sync_filesystem(sb);
+   err = stats_fs_parse_options(data, &fsi->mount_opts);
+   if (err)
+   goto fail;
+
+   stats_fs_apply_options(sb);
+
+fail:
+   return err;
+}
+
+static int stats_fs_show_options(struct seq_file *m, struct dentry *root)
+{
+   struct stats_fs_fs_info *fsi = root->d_sb->s_fs_info;
+   struct stats_fs_mount_opts *opts = &fsi->mount_opts;
+
+   if (!uid_eq(opts->uid, GLOBAL_ROOT_UID))
+   seq_pr

[PATCH v2 3/5] kunit: tests for stats_fs API

Add kunit tests to extensively test the stats_fs API functionality.

In order to run them, the kernel .config must set CONFIG_KUNIT=y
and a new .kunitconfig file must be created with CONFIG_STATS_FS=y
and CONFIG_STATS_FS_TEST=y

Tests can be then started by running the following command from the root
directory of the linux kernel source tree:
./tools/testing/kunit/kunit.py run --timeout=30 --jobs=`nproc --all`

Signed-off-by: Emanuele Giuseppe Esposito 
---
 fs/Kconfig   |6 +
 fs/stats_fs/Makefile |2 +
 fs/stats_fs/stats_fs-tests.c | 1088 ++
 3 files changed, 1096 insertions(+)
 create mode 100644 fs/stats_fs/stats_fs-tests.c

diff --git a/fs/Kconfig b/fs/Kconfig
index 1b0de0f19e96..0844e8defd22 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -334,4 +334,10 @@ config STATS_FS
  stats_fs is a virtual file system that provides counters and
  other statistics about the running kernel.
 
+config STATS_FS_TEST
+   bool "Tests for stats_fs"
+   depends on STATS_FS && KUNIT
+   help
+ tests for the stats_fs API.
+
 endmenu
diff --git a/fs/stats_fs/Makefile b/fs/stats_fs/Makefile
index 94fe52d590d5..9db130fac6b6 100644
--- a/fs/stats_fs/Makefile
+++ b/fs/stats_fs/Makefile
@@ -1,4 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
 stats_fs-objs  := stats_fs.o
+stats_fs-tests-objs:= stats_fs-tests.o
 
 obj-$(CONFIG_STATS_FS) += stats_fs.o
+obj-$(CONFIG_STATS_FS_TEST) += stats_fs-tests.o
diff --git a/fs/stats_fs/stats_fs-tests.c b/fs/stats_fs/stats_fs-tests.c
new file mode 100644
index ..46c3fb510ee9
--- /dev/null
+++ b/fs/stats_fs/stats_fs-tests.c
@@ -0,0 +1,1088 @@
+// SPDX-License-Identifier: GPL-2.0
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include "internal.h"
+
+#define STATS_FS_STAT(el, x, ...)  
\
+   {  \
+   .name = #x, .offset = offsetof(struct container, el.x),\
+   ##__VA_ARGS__  \
+   }
+
+#define ARR_SIZE(el) ((int)(sizeof(el) / sizeof(struct stats_fs_value) - 1))
+
+struct test_values_struct {
+   uint64_t u64;
+   int32_t s32;
+   bool bo;
+   uint8_t u8;
+   int16_t s16;
+};
+
+struct container {
+   struct test_values_struct vals;
+};
+
+struct stats_fs_value test_values[6] = {
+   STATS_FS_STAT(vals, u64, .type = STATS_FS_U64,
+ .aggr_kind = STATS_FS_NONE, .mode = 0),
+   STATS_FS_STAT(vals, s32, .type = STATS_FS_S32,
+ .aggr_kind = STATS_FS_NONE, .mode = 0),
+   STATS_FS_STAT(vals, bo, .type = STATS_FS_BOOL,
+ .aggr_kind = STATS_FS_NONE, .mode = 0),
+   STATS_FS_STAT(vals, u8, .type = STATS_FS_U8, .aggr_kind = STATS_FS_NONE,
+ .mode = 0),
+   STATS_FS_STAT(vals, s16, .type = STATS_FS_S16,
+ .aggr_kind = STATS_FS_NONE, .mode = 0),
+   { NULL },
+};
+
+struct stats_fs_value test_aggr[4] = {
+   STATS_FS_STAT(vals, s32, .type = STATS_FS_S32,
+ .aggr_kind = STATS_FS_MIN, .mode = 0),
+   STATS_FS_STAT(vals, bo, .type = STATS_FS_BOOL,
+ .aggr_kind = STATS_FS_MAX, .mode = 0),
+   STATS_FS_STAT(vals, u64, .type = STATS_FS_U64,
+ .aggr_kind = STATS_FS_SUM, .mode = 0),
+   { NULL },
+};
+
+struct stats_fs_value test_same_name[3] = {
+   STATS_FS_STAT(vals, s32, .type = STATS_FS_S32,
+ .aggr_kind = STATS_FS_NONE, .mode = 0),
+   STATS_FS_STAT(vals, s32, .type = STATS_FS_S32,
+ .aggr_kind = STATS_FS_MIN, .mode = 0),
+   { NULL },
+};
+
+struct stats_fs_value test_all_aggr[6] = {
+   STATS_FS_STAT(vals, s32, .type = STATS_FS_S32,
+ .aggr_kind = STATS_FS_MIN, .mode = 0),
+   STATS_FS_STAT(vals, bo, .type = STATS_FS_BOOL,
+ .aggr_kind = STATS_FS_COUNT_ZERO, .mode = 0),
+   STATS_FS_STAT(vals, u64, .type = STATS_FS_U64,
+ .aggr_kind = STATS_FS_SUM, .mode = 0),
+   STATS_FS_STAT(vals, u8, .type = STATS_FS_U8, .aggr_kind = STATS_FS_AVG,
+ .mode = 0),
+   STATS_FS_STAT(vals, s16, .type = STATS_FS_S16,
+ .aggr_kind = STATS_FS_MAX, .mode = 0),
+   { NULL },
+};
+
+#define def_u64 ((uint64_t)64)
+
+#define def_val_s32 ((int32_t)S32_MIN)
+#define def_val_bool ((bool)true)
+#define def_val_u8 ((uint8_t)127)
+#define def_val_s16 ((int16_t)1)
+
+#define def_val2_s32 ((int32_t)S16_MAX)
+#define def_val2_bool ((bool)false)
+#define def_val2_u8 ((uint8_t)255)
+#define def_val2_s16 ((int16_t)-2)
+
+struct container cont = {
+   .vals = {
+   .u64 = def_u64,
+   .s

[PATCH v2 0/5] Statsfs: a new ram-based file sytem for Linux kernel statistics

There is currently no common way for Linux kernel subsystems to expose
statistics to userspace shared throughout the Linux kernel; subsystems
have to take care of gathering and displaying statistics by themselves,
for example in the form of files in debugfs. For example KVM has its own
code section that takes care of this in virt/kvm/kvm_main.c, where it sets
up debugfs handlers for displaying values and aggregating them from
various subfolders to obtain information about the system state (i.e.
displaying the total number of exits, calculated by summing all exits of
all cpus of all running virtual machines).

Allowing each section of the kernel to do so has two disadvantages. First,
it will introduce redundant code. Second, debugfs is anyway not the right
place for statistics (for example it is affected by lockdown)

In this patch series I introduce statsfs, a synthetic ram-based virtual
filesystem that takes care of gathering and displaying statistics for the
Linux kernel subsystems.

The file system is mounted on /sys/kernel/stats and would be already used
by kvm. Statsfs was initially introduced by Paolo Bonzini [1].

Statsfs offers a generic and stable API, allowing any kind of
directory/file organization and supporting multiple kind of aggregations
(not only sum, but also average, max, min and count_zero) and data types
(all unsigned and signed types plus boolean). The implementation, which is
a generalization of KVM’s debugfs statistics code, takes care of gathering
and displaying information at run time; users only need to specify the
values to be included in each source.

Statsfs would also be a different mountpoint from debugfs, and would not
suffer from limited access due to the security lock down patches. Its main
function is to display each statistics as a file in the desired folder
hierarchy defined through the API. Statsfs files can be read, and possibly
cleared if their file mode allows it.

Statsfs has two main components: the public API defined by
include/linux/statsfs.h, and the virtual file system which should end up
in /sys/kernel/stats.

The API has two main elements, values and sources. Kernel subsystems like
KVM can use the API to create a source, add child
sources/values/aggregates and register it to the root source (that on the
virtual fs would be /sys/kernel/statsfs).

Sources are created via statsfs_source_create(), and each source becomes a
directory in the file system. Sources form a parent-child relationship;
root sources are added to the file system via statsfs_source_register().
Every other source is added to or removed from a parent through the
statsfs_source_add_subordinate and statsfs_source_remote_subordinate APIs.
Once a source is created and added to the tree (via add_subordinate), it
will be used to compute aggregate values in the parent source.

Values represent quantites that are gathered by the statsfs user. Examples
of values include the number of vm exits of a given kind, the amount of
memory used by some data structure, the length of the longest hash table
chain, or anything like that. Values are defined with the
statsfs_source_add_values function. Each value is defined by a struct
statsfs_value; the same statsfs_value can be added to many different
sources. A value can be considered "simple" if it fetches data from a
user-provided location, or "aggregate" if it groups all values in the
subordinates sources that include the same statsfs_value.

For more information, please consult the kerneldoc documentation in patch
2 and the sample uses in the kunit tests and in KVM.

This series of patches is based on my previous series "libfs: group and
simplify linux fs code" and the single patch sent to kvm "kvm_host: unify
VM_STAT and VCPU_STAT definitions in a single place". The former
simplifies code duplicated in debugfs and tracefs (from which statsfs is
based on), the latter groups all macros definition for statistics in kvm
in a single common file shared by all architectures.

Patch 1 adds a new refcount and kref destructor wrappers that take a
semaphore, as those are used later by statsfs. Patch 2 introduces the
statsfs API, patch 3 provides extensive tests that can also be used as
example on how to use the API and patch 4 adds the file system support.
Finally, patch 5 provides a real-life example of statsfs usage in KVM.

[1]
https://lore.kernel.org/kvm/5d6cdcb1-d8ad-7ae6-7351-3544e2fa3...@redhat.com/?fbclid=IwAR18LHJ0PBcXcDaLzILFhHsl3qpT3z2vlG60RnqgbpGYhDv7L43n0ZXJY8M

Signed-off-by: Emanuele Giuseppe Esposito

v1->v2 remove unnecessary list_foreach_safe loops, fix wrong indentation,
change statsfs in stats_fs

Emanuele Giuseppe Esposito (5):
refcount, kref: add dec-and-test wrappers for rw_semaphores
stats_fs API: create, add and remove stats_fs sources and values
kunit: tests for stats_fs API
stats_fs fs: virtual fs to show stats to the end-user
kvm_main: replace debugfs with stats_fs

MAINTAINERS |7 +
arch/arm64/kvm/Kconfi

[PATCH v2 2/5] stats_fs API: create, add and remove stats_fs sources and values

Introduction to the stats_fs API, that allows to easily create, add
and remove stats_fs sources and values. The API allows to easily building
the statistics directory tree to automatically gather them for the linux
kernel. The main functionalities are: create a source, add child
sources/values/aggregates, register it to the root source (that on
the virtual fs would be /sys/kernel/statsfs), ad perform a search for
a value/aggregate.

This allows creating any kind of source tree, making it more flexible
also to future readjustments.

The API representation is only logical and will be backed up
by a virtual file system in patch 4.
Its usage will be shared between the stats_fs file system
and the end-users like kvm, the former calling it when it needs to
display and clear statistics, the latter to add values and sources.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 MAINTAINERS  |   7 +
 fs/Kconfig   |   6 +
 fs/Makefile  |   1 +
 fs/stats_fs/Makefile |   4 +
 fs/stats_fs/internal.h   |  20 ++
 fs/stats_fs/stats_fs.c   | 610 +++
 include/linux/stats_fs.h | 289 +++
 7 files changed, 937 insertions(+)
 create mode 100644 fs/stats_fs/Makefile
 create mode 100644 fs/stats_fs/internal.h
 create mode 100644 fs/stats_fs/stats_fs.c
 create mode 100644 include/linux/stats_fs.h

diff --git a/MAINTAINERS b/MAINTAINERS
index b816a453b10e..a8403d07cee5 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5229,6 +5229,13 @@ F:   include/linux/debugfs.h
 F: include/linux/kobj*
 F: lib/kobj*
 
+STATS_FS
+M: Paolo Bonzini 
+R: Emanuele Giuseppe Esposito 
+S: Supported
+F: include/linux/stats_fs.h
+F: fs/stats_fs
+
 DRIVERS FOR ADAPTIVE VOLTAGE SCALING (AVS)
 M: Kevin Hilman 
 M: Nishanth Menon 
diff --git a/fs/Kconfig b/fs/Kconfig
index f08fbbfafd9a..1b0de0f19e96 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -328,4 +328,10 @@ source "fs/unicode/Kconfig"
 config IO_WQ
bool
 
+config STATS_FS
+   bool "Statistics Filesystem"
+   help
+ stats_fs is a virtual file system that provides counters and
+ other statistics about the running kernel.
+
 endmenu
diff --git a/fs/Makefile b/fs/Makefile
index 2ce5112b02c8..91558eca0cf7 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -125,6 +125,7 @@ obj-$(CONFIG_BEFS_FS)   += befs/
 obj-$(CONFIG_HOSTFS)   += hostfs/
 obj-$(CONFIG_CACHEFILES)   += cachefiles/
 obj-$(CONFIG_DEBUG_FS) += debugfs/
+obj-$(CONFIG_STATS_FS) += stats_fs/
 obj-$(CONFIG_TRACING)  += tracefs/
 obj-$(CONFIG_OCFS2_FS) += ocfs2/
 obj-$(CONFIG_BTRFS_FS) += btrfs/
diff --git a/fs/stats_fs/Makefile b/fs/stats_fs/Makefile
new file mode 100644
index ..94fe52d590d5
--- /dev/null
+++ b/fs/stats_fs/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+stats_fs-objs  := stats_fs.o
+
+obj-$(CONFIG_STATS_FS) += stats_fs.o
diff --git a/fs/stats_fs/internal.h b/fs/stats_fs/internal.h
new file mode 100644
index ..ddf262a60736
--- /dev/null
+++ b/fs/stats_fs/internal.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _STATS_FS_INTERNAL_H_
+#define _STATS_FS_INTERNAL_H_
+
+#include 
+#include 
+#include 
+#include 
+
+/* values, grouped by base */
+struct stats_fs_value_source {
+   void *base_addr;
+   bool files_created;
+   struct stats_fs_value *values;
+   struct list_head list_element;
+};
+
+int stats_fs_val_get_mode(struct stats_fs_value *val);
+
+#endif /* _STATS_FS_INTERNAL_H_ */
diff --git a/fs/stats_fs/stats_fs.c b/fs/stats_fs/stats_fs.c
new file mode 100644
index ..b63de12769e2
--- /dev/null
+++ b/fs/stats_fs/stats_fs.c
@@ -0,0 +1,610 @@
+// SPDX-License-Identifier: GPL-2.0
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "internal.h"
+
+struct stats_fs_aggregate_value {
+   uint64_t sum, min, max;
+   uint32_t count, count_zero;
+};
+
+static int is_val_signed(struct stats_fs_value *val)
+{
+   return val->type & STATS_FS_SIGN;
+}
+
+int stats_fs_val_get_mode(struct stats_fs_value *val)
+{
+   return val->mode ? val->mode : 0644;
+}
+
+static struct stats_fs_value *find_value(struct stats_fs_value_source *src,
+struct stats_fs_value *val)
+{
+   struct stats_fs_value *entry;
+
+   for (entry = src->values; entry->name; entry++) {
+   if (entry == val)
+   return entry;
+   }
+   return NULL;
+}
+
+static struct stats_fs_value *
+search_value_in_source(struct stats_fs_source *src, struct stats_fs_value *arg,
+  struct stats_fs_value_source **val_src)
+{
+   struct stats_fs_value *entry;
+   struct stats_fs_value_source *src_entry;
+
+   list_for_each_entry (src_entry, &src->values_head, list_element) {
+   entry

[PATCH v2 1/5] refcount, kref: add dec-and-test wrappers for rw_semaphores

Similar to the existing functions that take a mutex or spinlock if and
only if a reference count is decremented to zero, these new function
take an rwsem for writing just before the refcount reaches 0 (and
call a user-provided function in the case of kref_put_rwsem).

These will be used for stats_fs_source data structures, which are
protected by an rw_semaphore to allow concurrent sysfs reads.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 include/linux/kref.h | 11 +++
 include/linux/refcount.h |  2 ++
 lib/refcount.c   | 32 
 3 files changed, 45 insertions(+)

diff --git a/include/linux/kref.h b/include/linux/kref.h
index d32e21a2538c..2dc935445f45 100644
--- a/include/linux/kref.h
+++ b/include/linux/kref.h
@@ -79,6 +79,17 @@ static inline int kref_put_mutex(struct kref *kref,
return 0;
 }
 
+static inline int kref_put_rwsem(struct kref *kref,
+void (*release)(struct kref *kref),
+struct rw_semaphore *rwsem)
+{
+   if (refcount_dec_and_down_write(&kref->refcount, rwsem)) {
+   release(kref);
+   return 1;
+   }
+   return 0;
+}
+
 static inline int kref_put_lock(struct kref *kref,
void (*release)(struct kref *kref),
spinlock_t *lock)
diff --git a/include/linux/refcount.h b/include/linux/refcount.h
index 0e3ee25eb156..a9d5038aec9a 100644
--- a/include/linux/refcount.h
+++ b/include/linux/refcount.h
@@ -99,6 +99,7 @@
 #include 
 
 struct mutex;
+struct rw_semaphore;
 
 /**
  * struct refcount_t - variant of atomic_t specialized for reference counts
@@ -313,6 +314,7 @@ static inline void refcount_dec(refcount_t *r)
 extern __must_check bool refcount_dec_if_one(refcount_t *r);
 extern __must_check bool refcount_dec_not_one(refcount_t *r);
 extern __must_check bool refcount_dec_and_mutex_lock(refcount_t *r, struct 
mutex *lock);
+extern __must_check bool refcount_dec_and_down_write(refcount_t *r, struct 
rw_semaphore *rwsem);
 extern __must_check bool refcount_dec_and_lock(refcount_t *r, spinlock_t 
*lock);
 extern __must_check bool refcount_dec_and_lock_irqsave(refcount_t *r,
   spinlock_t *lock,
diff --git a/lib/refcount.c b/lib/refcount.c
index ebac8b7d15a7..03e113e1b43a 100644
--- a/lib/refcount.c
+++ b/lib/refcount.c
@@ -4,6 +4,7 @@
  */
 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -94,6 +95,37 @@ bool refcount_dec_not_one(refcount_t *r)
 }
 EXPORT_SYMBOL(refcount_dec_not_one);
 
+/**
+ * refcount_dec_and_down_write - return holding rwsem for writing if able to 
decrement
+ *   refcount to 0
+ * @r: the refcount
+ * @lock: the mutex to be locked
+ *
+ * Similar to atomic_dec_and_mutex_lock(), it will WARN on underflow and fail
+ * to decrement when saturated at REFCOUNT_SATURATED.
+ *
+ * Provides release memory ordering, such that prior loads and stores are done
+ * before, and provides a control dependency such that free() must come after.
+ * See the comment on top.
+ *
+ * Return: true and hold rwsem for writing if able to decrement refcount to 0, 
false
+ * otherwise
+ */
+bool refcount_dec_and_down_write(refcount_t *r, struct rw_semaphore *lock)
+{
+   if (refcount_dec_not_one(r))
+   return false;
+
+   down_write(lock);
+   if (!refcount_dec_and_test(r)) {
+   up_write(lock);
+   return false;
+   }
+
+   return true;
+}
+EXPORT_SYMBOL(refcount_dec_and_down_write);
+
 /**
  * refcount_dec_and_mutex_lock - return holding mutex if able to decrement
  *   refcount to 0
-- 
2.25.2

Re: [PATCH] powerpc/5200: update contact email

2020-05-04 Thread Michael Ellerman

Wolfram Sang  writes:
> My 'pengutronix' address is defunct for years. Merge the entries and use
> the proper contact address.

Is there any point adding the new address? It's just likely to bit-rot
one day too.

I figure the git history is a better source for more up-to-date emails.

cheers

> diff --git a/arch/powerpc/boot/dts/pcm032.dts 
> b/arch/powerpc/boot/dts/pcm032.dts
> index c259c6b3ac5a..780e13d99e7b 100644
> --- a/arch/powerpc/boot/dts/pcm032.dts
> +++ b/arch/powerpc/boot/dts/pcm032.dts
> @@ -3,9 +3,7 @@
>   * phyCORE-MPC5200B-IO (pcm032) board Device Tree Source
>   *
>   * Copyright (C) 2006-2009 Pengutronix
> - * Sascha Hauer 
> - * Juergen Beisert 
> - * Wolfram Sang 
> + * Sascha Hauer, Juergen Beisert, Wolfram Sang 
>   */
>  
>  /include/ "mpc5200b.dtsi"
> -- 
> 2.20.1

Re: [PATCH] powerpc/64s: Fix unrecoverable SLB crashes due to preemption check

2020-05-04 Thread Michael Ellerman

Hugh Dickins  writes:
> On Sun, 3 May 2020, Michael Ellerman wrote:
>
>> Hugh reported that his trusty G5 crashed after a few hours under load
>> with an "Unrecoverable exception 380".
>> 
>> The crash is in interrupt_return() where we check lazy_irq_pending(),
>> which calls get_paca() and with CONFIG_DEBUG_PREEMPT=y that goes to
>> check_preemption_disabled() via debug_smp_processor_id().
>> 
>> As Nick explained on the list:
>> 
>>   Problem is MSR[RI] is cleared here, ready to do the last few things
>>   for interrupt return where we're not allowed to take any other
>>   interrupts.
>> 
>>   SLB interrupts can happen just about anywhere aside from kernel
>>   text, global variables, and stack. When that hits, it appears to be
>>   unrecoverable due to RI=0.
>> 
>> The problematic access is in preempt_count() which is:
>> 
>>  return READ_ONCE(current_thread_info()->preempt_count);
>> 
>> Because of THREAD_INFO_IN_TASK, current_thread_info() just points to
>> current, so the access is to somewhere in kernel memory, but not on
>> the stack or in .data, which means it can cause an SLB miss. If we
>> take an SLB miss with RI=0 it is fatal.
>> 
>> The easiest solution is to add a version of lazy_irq_pending() that
>> doesn't do the preemption check and call it from the interrupt return
>> path.
>> 
>> Fixes: 68b34588e202 ("powerpc/64/sycall: Implement syscall entry/exit logic 
>> in C")
>> Reported-by: Hugh Dickins 
>> Signed-off-by: Michael Ellerman 
>
> Thank you, Michael and Nick: this has been running fine all day for me.

Thanks Hugh.

cheers

Re: [PATCH 2/7] crypto: powerpc/sha1 - remove unused temporary workspace

2020-05-04 Thread Michael Ellerman

Eric Biggers  writes:
> From: Eric Biggers 
>
> The PowerPC implementation of SHA-1 doesn't actually use the 16-word
> temporary array that's passed to the assembly code.  This was probably
> meant to correspond to the 'W' array that lib/sha1.c uses.  However, in
> sha1-powerpc-asm.S these values are actually stored in GPRs 16-31.
>
> Referencing SHA_WORKSPACE_WORDS from this code also isn't appropriate,
> since it's an implementation detail of lib/sha1.c.
>
> Therefore, just remove this unneeded array.
>
> Tested with:
>
>   export ARCH=powerpc CROSS_COMPILE=powerpc-linux-gnu-
>   make mpc85xx_defconfig
>   cat >> .config << EOF
>   # CONFIG_MODULES is not set
>   # CONFIG_CRYPTO_MANAGER_DISABLE_TESTS is not set
>   CONFIG_DEBUG_KERNEL=y
>   CONFIG_CRYPTO_MANAGER_EXTRA_TESTS=y
>   CONFIG_CRYPTO_SHA1_PPC=y
>   EOF
>   make olddefconfig
>   make -j32
>   qemu-system-ppc -M mpc8544ds -cpu e500 -nographic \
>   -kernel arch/powerpc/boot/zImage \
>   -append "cryptomgr.fuzz_iterations=1000 
> cryptomgr.panic_on_fail=1"

Thanks for testing.

I gave it a quick spin on a Power9 and it showed no issues.

Acked-by: Michael Ellerman  (powerpc)

cheers

Re: [PATCH v2 3/3] mm/page_alloc: Keep memoryless cpuless node 0 offline

2020-05-04 Thread Michal Hocko

On Thu 30-04-20 12:48:20, Srikar Dronamraju wrote:
> * Michal Hocko  [2020-04-29 14:22:11]:
> 
> > On Wed 29-04-20 07:11:45, Srikar Dronamraju wrote:
> > > > > 
> > > > > By marking, N_ONLINE as NODE_MASK_NONE, lets stop assuming that Node 
> > > > > 0 is
> > > > > always online.
> > > > > 
> > > > > ...
> > > > >
> > > > > --- a/mm/page_alloc.c
> > > > > +++ b/mm/page_alloc.c
> > > > > @@ -116,8 +116,10 @@ EXPORT_SYMBOL(latent_entropy);
> > > > >   */
> > > > >  nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
> > > > >   [N_POSSIBLE] = NODE_MASK_ALL,
> > > > > +#ifdef CONFIG_NUMA
> > > > > + [N_ONLINE] = NODE_MASK_NONE,
> > > > > +#else
> > > > >   [N_ONLINE] = { { [0] = 1UL } },
> > > > > -#ifndef CONFIG_NUMA
> > > > >   [N_NORMAL_MEMORY] = { { [0] = 1UL } },
> > > > >  #ifdef CONFIG_HIGHMEM
> > > > >   [N_HIGH_MEMORY] = { { [0] = 1UL } },
> > > > 
> > > > So on all other NUMA machines, when does node 0 get marked online?
> > > > 
> > > > This change means that for some time during boot, such machines will
> > > > now be running with node 0 marked as offline.  What are the
> > > > implications of this?  Will something break?
> > > 
> > > Till the nodes are detected, marking Node 0 as online tends to be 
> > > redundant.
> > > Because the system doesn't know if its a NUMA or a non-NUMA system.
> > > Once we detect the nodes, we online them immediately. Hence I don't see 
> > > any
> > > side-effects or negative implications of this change.
> > > 
> > > However if I am missing anything, please do let me know.
> > > 
> > > >From my part, I have tested this on
> > > 1. Non-NUMA Single node but CPUs and memory coming from zero node.
> > > 2. Non-NUMA Single node but CPUs and memory coming from non-zero node.
> > > 3. NUMA Multi node but with CPUs and memory from node 0.
> > > 4. NUMA Multi node but with no CPUs and memory from node 0.
> > 
> > Have you tested on something else than ppc? Each arch does the NUMA
> > setup separately and this is a big mess. E.g. x86 marks even memory less
> > nodes (see init_memory_less_node) as online.
> > 
> 
> while I have predominantly tested on ppc, I did test on X86 with CONFIG_NUMA
> enabled/disabled on both single node and multi node machines.
> However, I dont have a cpuless/memoryless x86 system.

This should be able to emulate inside kvm, I believe.

> > Honestly I have hard time to evaluate the effect of this patch. It makes
> > some sense to assume all nodes offline before they get online but this
> > is a land mine territory.
> > 
> > I am also not sure what kind of problem this is going to address. You
> > have mentioned numa balancing without many details.
> 
> 1. On a machine with just one node with node number not being 0,
> the current setup will end up showing 2 online nodes. And when there are
> more than one online nodes, numa_balancing gets enabled.
> 
> Without patch
> $ grep numa /proc/vmstat
> numa_hit 95179
> numa_miss 0
> numa_foreign 0
> numa_interleave 3764
> numa_local 95179
> numa_other 0
> numa_pte_updates 1206973 <--
> numa_huge_pte_updates 4654 <--
> numa_hint_faults 19560 <--
> numa_hint_faults_local 19560 <--
> numa_pages_migrated 0
> 
> 
> With patch
> $ grep numa /proc/vmstat 
> numa_hit 322338756
> numa_miss 0
> numa_foreign 0
> numa_interleave 3790
> numa_local 322338756
> numa_other 0
> numa_pte_updates 0 <--
> numa_huge_pte_updates 0 <--
> numa_hint_faults 0 <--
> numa_hint_faults_local 0 <--
> numa_pages_migrated 0
> 
> So we have a redundant page hinting numa faults which we can avoid.

interesting. Does this lead to any observable differences? Btw. it would
be really great to describe how the online state influences the numa
balancing.

> 2. Few people have complained about existence of this dummy node when
> parsing lscpu and numactl o/p. They somehow start to think that the tools
> are reporting incorrectly or the kernel is not able to recognize resources
> connected to the node.

Please be more specific.
-- 
Michal Hocko
SUSE Labs

Re: [PATCH v7 14/28] powerpc: Add a probe_kernel_read_inst() function

> @@ -524,7 +524,10 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned
> long addr) struct module *mod = rec->arch.mod;
> 
>   /* read where this goes */
> - if (probe_kernel_read(op, ip, sizeof(op)))
> + if (probe_kernel_read_inst(op, ip))
> + return -EFAULT;
> +
> + if (probe_kernel_read_inst(op + 1, ip + 4))
>   return -EFAULT;

I had to double check the above for what happens when we introduce prefix 
instructions but it looks mostly correct. There does however look to be a 
corner case that could alter the behaviour once prefix instructions are 
introduced.

With prefix instructions probe_kernel_read_inst() will read 8 bytes if the 
first 
4 bytes are a valid prefix. Therefore the above could end up trying to read 12 
bytes in total if the ip is a normal instruction and ip+4 is prefixed.

Obviously this is never going to match the expected nop sequence, and prefixed 
instructions shouldn't cross page boundaries so the extra 4 bytes should never 
be the cause of a fault either. The only difference we might see is 
ftrace_make_call() incorrectly returning -EFAULT instead of -EINVAL for an 
invalid (ie. crossing a 64 byte boundary) prefix instruction sequence.

In practice this doesn't seem like it would cause any real issues and the rest 
of the patch does not appear to change any existing behaviour.

Reviewed-by: Alistair Popple 

> 
>   if (!expected_nop_sequence(ip, op[0], op[1])) {
> @@ -587,7 +590,7 @@ __ftrace_make_call(struct dyn_ftrace *rec, unsigned long
> addr) unsigned long ip = rec->ip;
> 
>   /* read where this goes */
> - if (probe_kernel_read(&op, (void *)ip, MCOUNT_INSN_SIZE))
> + if (probe_kernel_read_inst(&op, (void *)ip))
>   return -EFAULT;
> 
>   /* It should be pointing to a nop */
> @@ -643,7 +646,7 @@ static int __ftrace_make_call_kernel(struct dyn_ftrace
> *rec, unsigned long addr) }
> 
>   /* Make sure we have a nop */
> - if (probe_kernel_read(&op, ip, sizeof(op))) {
> + if (probe_kernel_read_inst(&op, ip)) {
>   pr_err("Unable to read ftrace location %p\n", ip);
>   return -EFAULT;
>   }
> @@ -721,7 +724,7 @@ __ftrace_modify_call(struct dyn_ftrace *rec, unsigned
> long old_addr, }
> 
>   /* read where this goes */
> - if (probe_kernel_read(&op, (void *)ip, sizeof(int))) {
> + if (probe_kernel_read_inst(&op, (void *)ip)) {
>   pr_err("Fetching opcode failed.\n");
>   return -EFAULT;
>   }
> diff --git a/arch/powerpc/lib/inst.c b/arch/powerpc/lib/inst.c
> index eaf786afad2b..08dedd927268 100644
> --- a/arch/powerpc/lib/inst.c
> +++ b/arch/powerpc/lib/inst.c
> @@ -16,3 +16,14 @@ int probe_user_read_inst(struct ppc_inst *inst,
>   *inst = ppc_inst(val);
>   return err;
>  }
> +
> +int probe_kernel_read_inst(struct ppc_inst *inst,
> +struct ppc_inst *src)
> +{
> + unsigned int val;
> + int err;
> +
> + err = probe_kernel_read(&val, src, sizeof(val));
> + *inst = ppc_inst(val);
> + return err;
> +}

[PATCH v2 2/2] powerpc/64s/hash: add torture_hpt kernel boot option to increase hash faults

This option increases the number of hash misses by limiting the number of
kernel HPT entries, by accessing the address immediately after installing
the PTE, then removing it again (except in the case of CI entries that
must not be accessed, these are removed on the next hash fault).

This helps stress test difficult to hit paths in the kernel.

Signed-off-by: Nicholas Piggin 
---
v2:
- Add a bit more to the changelog.
- Add some kuap stuff for when Aneesh's kaup for hash patches are
  merged.

 .../admin-guide/kernel-parameters.txt |  9 +++
 arch/powerpc/include/asm/book3s/64/mmu-hash.h | 10 +++
 arch/powerpc/mm/book3s64/hash_4k.c|  3 +
 arch/powerpc/mm/book3s64/hash_64k.c   |  8 ++
 arch/powerpc/mm/book3s64/hash_utils.c | 76 ++-
 5 files changed, 105 insertions(+), 1 deletion(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index 5a34b7dd9ebe..1ec6a32a717a 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -876,6 +876,15 @@
them frequently to increase the rate of SLB faults
on kernel addresses.
 
+   torture_hpt [PPC]
+   Limits the number of kernel HPT entries in the hash
+   page table to increase the rate of hash page table
+   faults on kernel addresses.
+
+   This may hang when run on processors / emulators which
+   do not have a TLB, or flush it more often than
+   required, QEMU seems to have problems.
+
disable=[IPV6]
See Documentation/networking/ipv6.txt.
 
diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h 
b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index 758de1e0f676..539e3d91eac4 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -324,6 +324,16 @@ static inline bool torture_slb(void)
return static_branch_unlikely(&torture_slb_key);
 }
 
+extern bool torture_hpt_enabled;
+DECLARE_STATIC_KEY_FALSE(torture_hpt_key);
+static inline bool torture_hpt(void)
+{
+   return static_branch_unlikely(&torture_hpt_key);
+}
+
+void hpt_do_torture(unsigned long ea, unsigned long access,
+   unsigned long rflags, unsigned long hpte_group);
+
 /*
  * This computes the AVPN and B fields of the first dword of a HPTE,
  * for use when we want to match an existing PTE.  The bottom 7 bits
diff --git a/arch/powerpc/mm/book3s64/hash_4k.c 
b/arch/powerpc/mm/book3s64/hash_4k.c
index 22e787123cdf..54e4ff8c558d 100644
--- a/arch/powerpc/mm/book3s64/hash_4k.c
+++ b/arch/powerpc/mm/book3s64/hash_4k.c
@@ -118,6 +118,9 @@ int __hash_page_4K(unsigned long ea, unsigned long access, 
unsigned long vsid,
}
new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE;
new_pte |= pte_set_hidx(ptep, rpte, 0, slot, PTRS_PER_PTE);
+
+   if (torture_hpt())
+   hpt_do_torture(ea, access, rflags, hpte_group);
}
*ptep = __pte(new_pte & ~H_PAGE_BUSY);
return 0;
diff --git a/arch/powerpc/mm/book3s64/hash_64k.c 
b/arch/powerpc/mm/book3s64/hash_64k.c
index 7084ce2951e6..19ea0fc145a9 100644
--- a/arch/powerpc/mm/book3s64/hash_64k.c
+++ b/arch/powerpc/mm/book3s64/hash_64k.c
@@ -216,6 +216,9 @@ int __hash_page_4K(unsigned long ea, unsigned long access, 
unsigned long vsid,
new_pte |= pte_set_hidx(ptep, rpte, subpg_index, slot, PTRS_PER_PTE);
new_pte |= H_PAGE_HASHPTE;
 
+   if (torture_hpt())
+   hpt_do_torture(ea, access, rflags, hpte_group);
+
*ptep = __pte(new_pte & ~H_PAGE_BUSY);
return 0;
 }
@@ -327,7 +330,12 @@ int __hash_page_64K(unsigned long ea, unsigned long access,
 
new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | H_PAGE_HASHPTE;
new_pte |= pte_set_hidx(ptep, rpte, 0, slot, PTRS_PER_PTE);
+
+   if (torture_hpt())
+   hpt_do_torture(ea, access, rflags, hpte_group);
}
+
*ptep = __pte(new_pte & ~H_PAGE_BUSY);
+
return 0;
 }
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c 
b/arch/powerpc/mm/book3s64/hash_utils.c
index 9c487b5782ef..845da1e8ca4f 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -353,8 +353,12 @@ int htab_remove_mapping(unsigned long vstart, unsigned 
long vend,
return ret;
 }
 
-static bool disable_1tb_segments = false;
+static bool disable_1tb_segments __read_mostly = false;
 bool torture_slb_enabled __read_mostly = false;
+bool torture_hpt_enabled __read_mostly = false;
+
+/* per-CPU array allocated if we enable torture_hpt. */
+static unsigned long *torture_hpt_last_group;
 
 static int __init parse_disable_1tb_segments(char *p)
 {
@@ -370,6

[PATCH v2 1/2] powerpc/64s/hash: add torture_slb kernel boot option to increase SLB faults

This option increases the number of SLB misses by limiting the number of
kernel SLB entries, and increased flushing of cached lookaside information.
This helps stress test difficult to hit paths in the kernel.

Signed-off-by: Nicholas Piggin 
---
v2:
- Address some comments from Aneesh about function names and types

 .../admin-guide/kernel-parameters.txt |   5 +
 arch/powerpc/include/asm/book3s/64/mmu-hash.h |   7 +
 arch/powerpc/mm/book3s64/hash_utils.c |  13 ++
 arch/powerpc/mm/book3s64/slb.c| 152 --
 4 files changed, 132 insertions(+), 45 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index f2a93c8679e8..5a34b7dd9ebe 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -871,6 +871,11 @@
can be useful when debugging issues that require an SLB
miss to occur.
 
+   torture_slb [PPC]
+   Limits the number of kernel SLB entries, and flushes
+   them frequently to increase the rate of SLB faults
+   on kernel addresses.
+
disable=[IPV6]
See Documentation/networking/ipv6.txt.
 
diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h 
b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index 3fa1b962dc27..758de1e0f676 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -317,6 +317,13 @@ extern unsigned long tce_alloc_start, tce_alloc_end;
  */
 extern int mmu_ci_restrictions;
 
+extern bool torture_slb_enabled;
+DECLARE_STATIC_KEY_FALSE(torture_slb_key);
+static inline bool torture_slb(void)
+{
+   return static_branch_unlikely(&torture_slb_key);
+}
+
 /*
  * This computes the AVPN and B fields of the first dword of a HPTE,
  * for use when we want to match an existing PTE.  The bottom 7 bits
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c 
b/arch/powerpc/mm/book3s64/hash_utils.c
index 8ed2411c3f39..9c487b5782ef 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -354,6 +354,7 @@ int htab_remove_mapping(unsigned long vstart, unsigned long 
vend,
 }
 
 static bool disable_1tb_segments = false;
+bool torture_slb_enabled __read_mostly = false;
 
 static int __init parse_disable_1tb_segments(char *p)
 {
@@ -362,6 +363,13 @@ static int __init parse_disable_1tb_segments(char *p)
 }
 early_param("disable_1tb_segments", parse_disable_1tb_segments);
 
+static int __init parse_torture_slb(char *p)
+{
+   torture_slb_enabled = true;
+   return 0;
+}
+early_param("torture_slb", parse_torture_slb);
+
 static int __init htab_dt_scan_seg_sizes(unsigned long node,
 const char *uname, int depth,
 void *data)
@@ -854,6 +862,8 @@ static void __init hash_init_partition_table(phys_addr_t 
hash_table,
pr_info("Partition table %p\n", partition_tb);
 }
 
+DEFINE_STATIC_KEY_FALSE(torture_slb_key);
+
 static void __init htab_initialize(void)
 {
unsigned long table;
@@ -870,6 +880,9 @@ static void __init htab_initialize(void)
printk(KERN_INFO "Using 1TB segments\n");
}
 
+   if (torture_slb_enabled)
+   static_branch_enable(&torture_slb_key);
+
/*
 * Calculate the required size of the htab.  We want the number of
 * PTEGs to equal one half the number of real pages.
diff --git a/arch/powerpc/mm/book3s64/slb.c b/arch/powerpc/mm/book3s64/slb.c
index 716204aee3da..cf41c4b63d95 100644
--- a/arch/powerpc/mm/book3s64/slb.c
+++ b/arch/powerpc/mm/book3s64/slb.c
@@ -68,7 +68,7 @@ static void assert_slb_presence(bool present, unsigned long 
ea)
 * slbfee. requires bit 24 (PPC bit 39) be clear in RB. Hardware
 * ignores all other bits from 0-27, so just clear them all.
 */
-   ea &= ~((1UL << 28) - 1);
+   ea &= ~((1UL << SID_SHIFT) - 1);
asm volatile(__PPC_SLBFEE_DOT(%0, %1) : "=r"(tmp) : "r"(ea) : "cr0");
 
WARN_ON(present == (tmp == 0));
@@ -153,14 +153,42 @@ void slb_flush_all_realmode(void)
asm volatile("slbmte %0,%0; slbia" : : "r" (0));
 }
 
+static __always_inline void __slb_flush_and_restore_bolted(bool 
preserve_kernel_lookaside)
+{
+   struct slb_shadow *p = get_slb_shadow();
+   unsigned long ksp_esid_data, ksp_vsid_data;
+   u32 ih;
+
+   /*
+* SLBIA IH=1 on ISA v2.05 and newer processors may preserve lookaside
+* information created with Class=0 entries, which we use for kernel
+* SLB entries (the SLB entries themselves are still invalidated).
+*
+* Older processors will ignore this optimisation. Over-invalidation
+* is fine because we never rely on lookaside information existing.
+*/
+

Re: [PATCH v6 1/4] powerpc: Document details on H_SCM_HEALTH hcall

2020-05-04 Thread Vaibhav Jain

Thanks for looking into this patch Mpe,

Michael Ellerman  writes:

> Vaibhav Jain  writes:
>
>> Add documentation to 'papr_hcalls.rst' describing the bitmap flags
>> that are returned from H_SCM_HEALTH hcall as per the PAPR-SCM
>> specification.
>>
>> Cc: Dan Williams 
>> Cc: Michael Ellerman 
>> Cc: "Aneesh Kumar K . V" 
>> Signed-off-by: Vaibhav Jain 
>> ---
>> Changelog:
>>
>> v5..v6
>> * New patch in the series
>> ---
>>  Documentation/powerpc/papr_hcalls.rst | 43 ---
>>  1 file changed, 39 insertions(+), 4 deletions(-)
>>
>> diff --git a/Documentation/powerpc/papr_hcalls.rst 
>> b/Documentation/powerpc/papr_hcalls.rst
>> index 3493631a60f8..9a5ba5eaf323 100644
>> --- a/Documentation/powerpc/papr_hcalls.rst
>> +++ b/Documentation/powerpc/papr_hcalls.rst
>> @@ -220,13 +220,48 @@ from the LPAR memory.
>>  **H_SCM_HEALTH**
>>  
>>  | Input: drcIndex
>> -| Out: *health-bitmap, health-bit-valid-bitmap*
>> +| Out: *health-bitmap (r4), health-bit-valid-bitmap (r5)*
>>  | Return Value: *H_Success, H_Parameter, H_Hardware*
>>  
>>  Given a DRC Index return the info on predictive failure and overall health 
>> of
>> -the NVDIMM. The asserted bits in the health-bitmap indicate a single 
>> predictive
>> -failure and health-bit-valid-bitmap indicate which bits in health-bitmap are
>> -valid.
>> +the NVDIMM. The asserted bits in the health-bitmap indicate one or more 
>> states
>> +(described in table below) of the NVDIMM and health-bit-valid-bitmap 
>> indicate
>> +which bits in health-bitmap are valid.
>> +
>> +Health Bitmap Flags:
>> +
>> ++--+---+
>> +|  Bit |   Definition   
>>|
>> ++==+===+
>> +|  00  | SCM device is unable to persist memory contents.   
>>|
>> +|  | If the system is powered down, nothing will be saved.  
>>|
>> ++--+---+
>
> Are these correct bit numbers or backward IBM big endian bit numbers?
>
> ie. which bit is LSB?
These bit numbers index to a 64-bit dword laid in IBM big endian
format. So LSB would be at the located at a higher address. For example
0xC400 indicates bits 0, 1, and 5 are valid.

>
> cheers
> ___
> Linux-nvdimm mailing list -- linux-nvd...@lists.01.org
> To unsubscribe send an email to linux-nvdimm-le...@lists.01.org

Re: [PATCH v7 13/28] powerpc: Add a probe_user_read_inst() function

Looks good.

Reviewed-by: Alistair Popple 

On Friday, 1 May 2020 1:42:05 PM AEST Jordan Niethe wrote:
> Introduce a probe_user_read_inst() function to use in cases where
> probe_user_read() is used for getting an instruction. This will be more
> useful for prefixed instructions.
> 
> Signed-off-by: Jordan Niethe 
> ---
> v6: - New to series
> ---
>  arch/powerpc/include/asm/inst.h |  3 +++
>  arch/powerpc/lib/Makefile   |  2 +-
>  arch/powerpc/lib/inst.c | 18 ++
>  arch/powerpc/mm/fault.c |  2 +-
>  4 files changed, 23 insertions(+), 2 deletions(-)
>  create mode 100644 arch/powerpc/lib/inst.c
> 
> diff --git a/arch/powerpc/include/asm/inst.h
> b/arch/powerpc/include/asm/inst.h index 552e953bf04f..3e9a58420151 100644
> --- a/arch/powerpc/include/asm/inst.h
> +++ b/arch/powerpc/include/asm/inst.h
> @@ -37,4 +37,7 @@ static inline bool ppc_inst_equal(struct ppc_inst x,
> struct ppc_inst y) return ppc_inst_val(x) == ppc_inst_val(y);
>  }
> 
> +int probe_user_read_inst(struct ppc_inst *inst,
> +  struct ppc_inst *nip);
> +
>  #endif /* _ASM_INST_H */
> diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
> index b8de3be10eb4..546591848219 100644
> --- a/arch/powerpc/lib/Makefile
> +++ b/arch/powerpc/lib/Makefile
> @@ -16,7 +16,7 @@ CFLAGS_code-patching.o += -DDISABLE_BRANCH_PROFILING
>  CFLAGS_feature-fixups.o += -DDISABLE_BRANCH_PROFILING
>  endif
> 
> -obj-y += alloc.o code-patching.o feature-fixups.o pmem.o
> +obj-y += alloc.o code-patching.o feature-fixups.o pmem.o inst.o
> 
>  ifndef CONFIG_KASAN
>  obj-y+=  string.o memcmp_$(BITS).o
> diff --git a/arch/powerpc/lib/inst.c b/arch/powerpc/lib/inst.c
> new file mode 100644
> index ..eaf786afad2b
> --- /dev/null
> +++ b/arch/powerpc/lib/inst.c
> @@ -0,0 +1,18 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + *  Copyright 2020, IBM Corporation.
> + */
> +
> +#include 
> +#include 
> +
> +int probe_user_read_inst(struct ppc_inst *inst,
> +  struct ppc_inst *nip)
> +{
> + unsigned int val;
> + int err;
> +
> + err = probe_user_read(&val, nip, sizeof(val));
> + *inst = ppc_inst(val);
> + return err;
> +}
> diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
> index 4a50f125ec18..f3a943eae305 100644
> --- a/arch/powerpc/mm/fault.c
> +++ b/arch/powerpc/mm/fault.c
> @@ -281,7 +281,7 @@ static bool bad_stack_expansion(struct pt_regs *regs,
> unsigned long address, access_ok(nip, sizeof(*nip))) {
>   struct ppc_inst inst;
> 
> - if (!probe_user_read(&inst, nip, sizeof(inst)))
> + if (!probe_user_read_inst(&inst, (struct ppc_inst 
> __user *)nip))
>   return !store_updates_sp(inst);
>   *must_retry = true;
>   }

Re: [PATCH v7 12/28] powerpc: Use a function for reading instructions

> diff --git a/arch/powerpc/kernel/uprobes.c b/arch/powerpc/kernel/uprobes.c
> index 31c870287f2b..6893d40a48c5 100644
> --- a/arch/powerpc/kernel/uprobes.c
> +++ b/arch/powerpc/kernel/uprobes.c
> @@ -174,7 +174,7 @@ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe,
> struct pt_regs *regs) * emulate_step() returns 1 if the insn was
> successfully emulated. * For all other cases, we need to single-step in
> hardware.
>*/
> - ret = emulate_step(regs, auprobe->insn);
> + ret = emulate_step(regs, ppc_inst_read(&auprobe->insn));

I'm not a uprobe expert so I don't follow why we need this read here but the 
rest of the patch looked ok in that it shouldn't change behaviour (and in 
practice neither should the above) so:

Reviewed-by: Alistair Popple 

>   if (ret > 0)
>   return true;
>

Re: [PATCH 1/2] powerpc/64s/hash: add torture_slb kernel boot option to increase SLB faults

Excerpts from Aneesh Kumar K.V's message of May 4, 2020 5:27 pm:
> Nicholas Piggin  writes:
> 
>> This option increases the number of SLB misses by limiting the number of
>> kernel SLB entries, and increased flushing of cached lookaside information.
>> This helps stress test difficult to hit paths in the kernel.
>>
>> Signed-off-by: Nicholas Piggin 
> 
> 
> 
>> +{
>> +unsigned long slbie_data = get_paca()->slb_cache[index];
>> +unsigned long ksp = get_paca()->kstack;
>> +
>> +slbie_data <<= SID_SHIFT;
>> +slbie_data |= 0xc000ULL;
>> +if ((ksp & slb_esid_mask(mmu_kernel_ssize)) == slbie_data)
>> +return;
>> +slbie_data |= mmu_kernel_ssize << SLBIE_SSIZE_SHIFT;
>> +
>> +asm volatile("slbie %0" : : "r" (slbie_data));
>> +}
>> +
>> +static void slb_cache_slbie(unsigned int index)
> 
> May be slb_cache_slbie_user()? Similar to _kernel above?

Yeah that'd help.

>> +{
>> +unsigned long slbie_data = get_paca()->slb_cache[index];
>> +
>> +slbie_data <<= SID_SHIFT;
>> +slbie_data |= user_segment_size(slbie_data) << SLBIE_SSIZE_SHIFT;
>> +slbie_data |= SLBIE_C; /* user slbs have C=1 */
>> +
>> +asm volatile("slbie %0" : : "r" (slbie_data));
>> +}
>>  
>>  /* Flush all user entries from the segment table of the current processor. 
>> */
>>  void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
>> @@ -414,8 +449,14 @@ void switch_slb(struct task_struct *tsk, struct 
>> mm_struct *mm)
>>   * which would update the slb_cache/slb_cache_ptr fields in the PACA.
>>   */
>>  hard_irq_disable();
>> -asm volatile("isync" : : : "memory");
>> -if (cpu_has_feature(CPU_FTR_ARCH_300)) {
>> +isync();
>> +if (torture_slb()) {
>> +__slb_flush_and_restore_bolted(0);
> 
> s/0/SLIBA_IH_ALL or something like that? 

IH isn't so simple. 0 isn't all, it's clear all SLBE except index zero, 
and flush all EA lookaside data.

Maybe callers should use a bool parameter though to flush kernel 
lookaside data.

Thanks,
Nick

Re: [PATCH 2/2] powerpc/64s/hash: add torture_hpt kernel boot option to increase hash faults

Excerpts from Aneesh Kumar K.V's message of May 4, 2020 5:36 pm:
> Nicholas Piggin  writes:
> 
>> This option increases the number of hash misses by limiting the number of
>> kernel HPT entries. This helps stress test difficult to hit paths in the
>> kernel.
>>
> 
> It would nice if we can explain in commit message how we are limiting
> the number of HPT entries.

"limiting the number of kernel HPT entries by removing them as soon as 
possible after they are installed"?

> 
>> Signed-off-by: Nicholas Piggin 
>> ---
>>  .../admin-guide/kernel-parameters.txt |  9 +++
>>  arch/powerpc/include/asm/book3s/64/mmu-hash.h | 10 +++
>>  arch/powerpc/mm/book3s64/hash_4k.c|  3 +
>>  arch/powerpc/mm/book3s64/hash_64k.c   |  8 +++
>>  arch/powerpc/mm/book3s64/hash_utils.c | 66 ++-
>>  5 files changed, 95 insertions(+), 1 deletion(-)
> 
> 
> 
>   
>> +void hpt_do_torture(unsigned long ea, unsigned long access,
>> +unsigned long rflags, unsigned long hpte_group)
>> +{
>> +unsigned long last_group;
>> +int cpu = raw_smp_processor_id();
>> +
>> +last_group = torture_hpt_last_group[cpu];
>> +if (last_group != -1UL) {
>> +while (mmu_hash_ops.hpte_remove(last_group) != -1)
>> +;
>> +torture_hpt_last_group[cpu] = -1UL;
>> +}
>> +
>> +#define QEMU_WORKAROUND 0
>> +
>> +if (ea >= PAGE_OFFSET) {
>> +if (!QEMU_WORKAROUND && (access & (_PAGE_READ|_PAGE_WRITE)) &&
>> +!(rflags & (HPTE_R_I|HPTE_R_G))) {
>> +/* prefetch / prefetchw does not seem to set up a TLB
>> + * entry with the powerpc systemsim (mambo) emulator,
>> + * though it works with real hardware. An alternative
>> + * approach that would work more reliably on quirky
>> + * emulators like QEMU may be to remember the last
>> + * insertion and remove that, rather than removing the
>> + * current insertion. Then no prefetch is required.
>> + */
>> +if ((access & _PAGE_WRITE) && (access & _PAGE_READ))
>> +atomic_add(0, (atomic_t *)(ea & ~0x3));
>> +else if (access & _PAGE_READ)
>> +*(volatile char *)ea;
>> +
>> +mb();
>> +
>> +while (mmu_hash_ops.hpte_remove(hpte_group) != -1)
>> +;
> 
> Do we get similar hpte faults rate, if we remove everything except the
> current inserted entry?. If so that would largely simplify the code.

Well it would remove this one branch at least. It does actually help 
cause more faults and helps (in theory) irritate cases where you have 
two accesses to a vmalloc page in the kernel, where the first is okay
to take a fault but the second is buggy.

I actually like the interesting behaviour it exposes in emulators too. 
We should really fix mambo to have prefetches bring in TLBs, and fix 
qemu to bring in TLBs more like hardware, and it could have TLB vs PTE 
consistency checks to catch bugs like mambo does.

So I prefer leaving this in.

Thanks,
Nick

Re: [PATCH v7 08/28] powerpc: Use a function for getting the instruction op code