date:20200526

Re: [PATCH v4 6/7] KVM: MIPS: clean up redundant 'kvm_run' parameters

2020-05-26 Thread Tianjia Zhang





On 2020/4/27 13:40, Huacai Chen wrote:

Reviewed-by: Huacai Chen 

On Mon, Apr 27, 2020 at 12:35 PM Tianjia Zhang
 wrote:


In the current kvm version, 'kvm_run' has been included in the 'kvm_vcpu'
structure. For historical reasons, many kvm-related function parameters
retain the 'kvm_run' and 'kvm_vcpu' parameters at the same time. This
patch does a unified cleanup of these remaining redundant parameters.

Signed-off-by: Tianjia Zhang 
---
  arch/mips/include/asm/kvm_host.h |  28 +---
  arch/mips/kvm/emulate.c  |  59 ++--
  arch/mips/kvm/mips.c |  11 ++-
  arch/mips/kvm/trap_emul.c| 114 ++-
  arch/mips/kvm/vz.c   |  26 +++
  5 files changed, 87 insertions(+), 151 deletions(-)



Hi Huacai,

These two patches(6/7 and 7/7) should be merged into the tree of the 
mips architecture separately. At present, there seems to be no good way 
to merge the whole architecture patchs.


For this series of patches, some architectures have been merged, some 
need to update the patch.


Thanks and best,
Tianjia

Re: [PATCH v3 1/3] riscv: Move kernel mapping to vmalloc zone

2020-05-26 Thread Zong Li

On Wed, May 27, 2020 at 1:06 AM Alex Ghiti  wrote:
>
> Hi Zong,
>
> Le 5/26/20 à 5:43 AM, Zong Li a écrit :
> > On Sun, May 24, 2020 at 4:54 PM Alexandre Ghiti  wrote:
> >> This is a preparatory patch for relocatable kernel.
> >>
> >> The kernel used to be linked at PAGE_OFFSET address and used to be loaded
> >> physically at the beginning of the main memory. Therefore, we could use
> >> the linear mapping for the kernel mapping.
> >>
> >> But the relocated kernel base address will be different from PAGE_OFFSET
> >> and since in the linear mapping, two different virtual addresses cannot
> >> point to the same physical address, the kernel mapping needs to lie outside
> >> the linear mapping.
> >>
> >> In addition, because modules and BPF must be close to the kernel (inside
> >> +-2GB window), the kernel is placed at the end of the vmalloc zone minus
> >> 2GB, which leaves room for modules and BPF. The kernel could not be
> >> placed at the beginning of the vmalloc zone since other vmalloc
> >> allocations from the kernel could get all the +-2GB window around the
> >> kernel which would prevent new modules and BPF programs to be loaded.
> >>
> >> Signed-off-by: Alexandre Ghiti 
> >> ---
> >>   arch/riscv/boot/loader.lds.S |  3 +-
> >>   arch/riscv/include/asm/page.h| 10 +-
> >>   arch/riscv/include/asm/pgtable.h | 37 +---
> >>   arch/riscv/kernel/head.S |  3 +-
> >>   arch/riscv/kernel/module.c   |  4 +--
> >>   arch/riscv/kernel/vmlinux.lds.S  |  3 +-
> >>   arch/riscv/mm/init.c | 58 +---
> >>   arch/riscv/mm/physaddr.c |  2 +-
> >>   8 files changed, 87 insertions(+), 33 deletions(-)
> >>
> >> diff --git a/arch/riscv/boot/loader.lds.S b/arch/riscv/boot/loader.lds.S
> >> index 47a5003c2e28..62d94696a19c 100644
> >> --- a/arch/riscv/boot/loader.lds.S
> >> +++ b/arch/riscv/boot/loader.lds.S
> >> @@ -1,13 +1,14 @@
> >>   /* SPDX-License-Identifier: GPL-2.0 */
> >>
> >>   #include 
> >> +#include 
> >>
> >>   OUTPUT_ARCH(riscv)
> >>   ENTRY(_start)
> >>
> >>   SECTIONS
> >>   {
> >> -   . = PAGE_OFFSET;
> >> +   . = KERNEL_LINK_ADDR;
> >>
> >>  .payload : {
> >>  *(.payload)
> >> diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h
> >> index 2d50f76efe48..48bb09b6a9b7 100644
> >> --- a/arch/riscv/include/asm/page.h
> >> +++ b/arch/riscv/include/asm/page.h
> >> @@ -90,18 +90,26 @@ typedef struct page *pgtable_t;
> >>
> >>   #ifdef CONFIG_MMU
> >>   extern unsigned long va_pa_offset;
> >> +extern unsigned long va_kernel_pa_offset;
> >>   extern unsigned long pfn_base;
> >>   #define ARCH_PFN_OFFSET(pfn_base)
> >>   #else
> >>   #define va_pa_offset   0
> >> +#define va_kernel_pa_offset0
> >>   #define ARCH_PFN_OFFSET(PAGE_OFFSET >> PAGE_SHIFT)
> >>   #endif /* CONFIG_MMU */
> >>
> >>   extern unsigned long max_low_pfn;
> >>   extern unsigned long min_low_pfn;
> >> +extern unsigned long kernel_virt_addr;
> >>
> >>   #define __pa_to_va_nodebug(x)  ((void *)((unsigned long) (x) + 
> >> va_pa_offset))
> >> -#define __va_to_pa_nodebug(x)  ((unsigned long)(x) - va_pa_offset)
> >> +#define linear_mapping_va_to_pa(x) ((unsigned long)(x) - va_pa_offset)
> >> +#define kernel_mapping_va_to_pa(x) \
> >> +   ((unsigned long)(x) - va_kernel_pa_offset)
> >> +#define __va_to_pa_nodebug(x)  \
> >> +   (((x) >= PAGE_OFFSET) ? \
> >> +   linear_mapping_va_to_pa(x) : kernel_mapping_va_to_pa(x))
> >>
> >>   #ifdef CONFIG_DEBUG_VIRTUAL
> >>   extern phys_addr_t __virt_to_phys(unsigned long x);
> >> diff --git a/arch/riscv/include/asm/pgtable.h 
> >> b/arch/riscv/include/asm/pgtable.h
> >> index 35b60035b6b0..25213cfaf680 100644
> >> --- a/arch/riscv/include/asm/pgtable.h
> >> +++ b/arch/riscv/include/asm/pgtable.h
> >> @@ -11,23 +11,29 @@
> >>
> >>   #include 
> >>
> >> -#ifndef __ASSEMBLY__
> >> -
> >> -/* Page Upper Directory not used in RISC-V */
> >> -#include 
> >> -#include 
> >> -#include 
> >> -#include 
> >> -
> >> -#ifdef CONFIG_MMU
> >> +#ifndef CONFIG_MMU
> >> +#define KERNEL_VIRT_ADDR   PAGE_OFFSET
> >> +#define KERNEL_LINK_ADDR   PAGE_OFFSET
> >> +#else
> >> +/*
> >> + * Leave 2GB for modules and BPF that must lie within a 2GB range around
> >> + * the kernel.
> >> + */
> >> +#define KERNEL_VIRT_ADDR   (VMALLOC_END - SZ_2G + 1)
> >> +#define KERNEL_LINK_ADDR   KERNEL_VIRT_ADDR
> >>
> >>   #define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1)
> >>   #define VMALLOC_END  (PAGE_OFFSET - 1)
> >>   #define VMALLOC_START(PAGE_OFFSET - VMALLOC_SIZE)
> >>
> >>   #define BPF_JIT_REGION_SIZE(SZ_128M)
> >> -#define BPF_JIT_REGION_START   (PAGE_OFFSET - BPF_JIT_REGION_SIZE)
> >> -#define BPF_JIT_REGION_END (VMALLOC_END)
> >> +#define BPF_JIT_REGION_START   (kernel_virt_addr)
> >> +#define BPF_JIT_REGION_END (kernel_virt_addr + BPF_JIT_REGION_SIZE)
> > It seems to have a potenti

Re: [PATCH v4 3/7] KVM: PPC: Remove redundant kvm_run from vcpu_arch

2020-05-26 Thread Tianjia Zhang





On 2020/5/27 12:20, Paul Mackerras wrote:

On Mon, Apr 27, 2020 at 12:35:10PM +0800, Tianjia Zhang wrote:

The 'kvm_run' field already exists in the 'vcpu' structure, which
is the same structure as the 'kvm_run' in the 'vcpu_arch' and
should be deleted.

Signed-off-by: Tianjia Zhang 


Thanks, patches 3 and 4 of this series applied to my kvm-ppc-next branch.

Paul.



Thanks for your suggestion, for 5/7, I will submit a new version patch.

Thanks,
Tianjia

Re: [PATCH] media: omap3isp: Shuffle cacheflush.h and include mm.h

2020-05-26 Thread Christoph Hellwig

On Tue, May 26, 2020 at 09:34:27PM -0700, Nathan Chancellor wrote:
> After mm.h was removed from the asm-generic version of cacheflush.h,
> s390 allyesconfig shows several warnings of the following nature:

Hmm, I'm pretty sure I sent the same fix a few days ago in response to
a build bot report.  But if that didn't get picked up I'm fine with
your version of it as well of course:

Acked-by: Christoph Hellwig

[PATCH] media: omap3isp: Shuffle cacheflush.h and include mm.h

2020-05-26 Thread Nathan Chancellor

After mm.h was removed from the asm-generic version of cacheflush.h,
s390 allyesconfig shows several warnings of the following nature:

In file included from ./arch/s390/include/generated/asm/cacheflush.h:1,
 from drivers/media/platform/omap3isp/isp.c:42:
./include/asm-generic/cacheflush.h:16:42: warning: 'struct mm_struct'
declared inside parameter list will not be visible outside of this
definition or declaration

cacheflush.h does not include mm.h nor does it include any forward
declaration of these structures hence the warning. To avoid this,
include mm.h explicitly in this file and shuffle cacheflush.h below it.

Fixes: 19c0054597a0 ("asm-generic: don't include  in cacheflush.h")
Signed-off-by: Nathan Chancellor 
---

I am aware the fixes tag is kind of irrelevant because that SHA will
change in the next linux-next revision and this will probably get folded
into the original patch anyways but still.

The other solution would be to add forward declarations of these structs
to the top of cacheflush.h, I just chose to do what Christoph did in the
original patch. I am happy to do that instead if you all feel that is
better.

 drivers/media/platform/omap3isp/isp.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/media/platform/omap3isp/isp.c 
b/drivers/media/platform/omap3isp/isp.c
index a4ee6b86663e..54106a768e54 100644
--- a/drivers/media/platform/omap3isp/isp.c
+++ b/drivers/media/platform/omap3isp/isp.c
@@ -39,8 +39,6 @@
  * Troy Laramy 
  */
 
-#include 
-
 #include 
 #include 
 #include 
@@ -49,6 +47,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -58,6 +57,8 @@
 #include 
 #include 
 
+#include 
+
 #ifdef CONFIG_ARM_DMA_USE_IOMMU
 #include 
 #endif
-- 
2.27.0.rc0

Re: [PATCH] KVM: PPC: Book3S HV: read ibm,secure-memory nodes

2020-05-26 Thread Paul Mackerras

On Thu, Apr 16, 2020 at 06:27:15PM +0200, Laurent Dufour wrote:
> The newly introduced ibm,secure-memory nodes supersede the
> ibm,uv-firmware's property secure-memory-ranges.
> 
> Firmware will no more expose the secure-memory-ranges property so first
> read the new one and if not found rollback to the older one.
> 
> Signed-off-by: Laurent Dufour 

Thanks, applied to my kvm-ppc-next branch.

Paul.

Re: [PATCH v4 3/7] KVM: PPC: Remove redundant kvm_run from vcpu_arch

2020-05-26 Thread Paul Mackerras

On Mon, Apr 27, 2020 at 12:35:10PM +0800, Tianjia Zhang wrote:
> The 'kvm_run' field already exists in the 'vcpu' structure, which
> is the same structure as the 'kvm_run' in the 'vcpu_arch' and
> should be deleted.
> 
> Signed-off-by: Tianjia Zhang 

Thanks, patches 3 and 4 of this series applied to my kvm-ppc-next branch.

Paul.

Re: [PATCH -next] KVM: PPC: Book3S HV: remove redundant NULL check

2020-05-26 Thread Paul Mackerras

On Wed, Apr 01, 2020 at 09:09:03PM +0800, Chen Zhou wrote:
> Free function kfree() already does NULL check, so the additional
> check is unnecessary, just remove it.
> 
> Signed-off-by: Chen Zhou 

Thanks, applied to my kvm-ppc-next branch.

Paul.

Re: [PATCH v2] KVM: PPC: Book3S HV: relax check on H_SVM_INIT_ABORT

2020-05-26 Thread Paul Mackerras

On Wed, May 20, 2020 at 07:43:08PM +0200, Laurent Dufour wrote:
> The commit 8c47b6ff29e3 ("KVM: PPC: Book3S HV: Check caller of H_SVM_*
> Hcalls") added checks of secure bit of SRR1 to filter out the Hcall
> reserved to the Ultravisor.
> 
> However, the Hcall H_SVM_INIT_ABORT is made by the Ultravisor passing the
> context of the VM calling UV_ESM. This allows the Hypervisor to return to
> the guest without going through the Ultravisor. Thus the Secure bit of SRR1
> is not set in that particular case.
> 
> In the case a regular VM is calling H_SVM_INIT_ABORT, this hcall will be
> filtered out in kvmppc_h_svm_init_abort() because kvm->arch.secure_guest is
> not set in that case.
> 
> Fixes: 8c47b6ff29e3 ("KVM: PPC: Book3S HV: Check caller of H_SVM_* Hcalls")
> Signed-off-by: Laurent Dufour 

Thanks, applied to my kvm-ppc-next branch.  I expanded the comment in
the code a little.

Paul.

Re: [PATCH] powerpc/kvm/radix: ignore kmemleak false positives

2020-05-26 Thread Paul Mackerras

On Wed, May 13, 2020 at 09:39:15AM -0400, Qian Cai wrote:
> kvmppc_pmd_alloc() and kvmppc_pte_alloc() allocate some memory but then
> pud_populate() and pmd_populate() will use __pa() to reference the newly
> allocated memory.
> 
> Since kmemleak is unable to track the physical memory resulting in false
> positives, silence those by using kmemleak_ignore().
> 
> unreferenced object 0xc000201c382a1000 (size 4096):
>  comm "qemu-kvm", pid 124828, jiffies 4295733767 (age 341.250s)
>  hex dump (first 32 bytes):
>c0 00 20 09 f4 60 03 87 c0 00 20 10 72 a0 03 87  .. ..` .r...
>c0 00 20 0e 13 a0 03 87 c0 00 20 1b dc c0 03 87  .. ... .
>  backtrace:
>[<4cc2790f>] kvmppc_create_pte+0x838/0xd20 [kvm_hv]
>kvmppc_pmd_alloc at arch/powerpc/kvm/book3s_64_mmu_radix.c:366
>(inlined by) kvmppc_create_pte at 
> arch/powerpc/kvm/book3s_64_mmu_radix.c:590
>[] kvmppc_book3s_instantiate_page+0x2e0/0x8c0 [kvm_hv]
>[] kvmppc_book3s_radix_page_fault+0x1b4/0x2b0 [kvm_hv]
>[<86dddc0e>] kvmppc_book3s_hv_page_fault+0x214/0x12a0 [kvm_hv]
>[<5ae9ccc2>] kvmppc_vcpu_run_hv+0xc5c/0x15f0 [kvm_hv]
>[] kvmppc_vcpu_run+0x34/0x48 [kvm]
>[] kvm_arch_vcpu_ioctl_run+0x314/0x420 [kvm]
>[<2543dd54>] kvm_vcpu_ioctl+0x33c/0x950 [kvm]
>[<48155cd6>] ksys_ioctl+0xd8/0x130
>[<41ffeaa7>] sys_ioctl+0x28/0x40
>[<4afc4310>] system_call_exception+0x114/0x1e0
>[] system_call_common+0xf0/0x278
> unreferenced object 0xc0002001f0c03900 (size 256):
>  comm "qemu-kvm", pid 124830, jiffies 4295735235 (age 326.570s)
>  hex dump (first 32 bytes):
>c0 00 20 10 fa a0 03 87 c0 00 20 10 fa a1 03 87  .. ... .
>c0 00 20 10 fa a2 03 87 c0 00 20 10 fa a3 03 87  .. ... .
>  backtrace:
>[<23f675b8>] kvmppc_create_pte+0x854/0xd20 [kvm_hv]
>kvmppc_pte_alloc at arch/powerpc/kvm/book3s_64_mmu_radix.c:356
>(inlined by) kvmppc_create_pte at 
> arch/powerpc/kvm/book3s_64_mmu_radix.c:593
>[] kvmppc_book3s_instantiate_page+0x2e0/0x8c0 [kvm_hv]
>[] kvmppc_book3s_radix_page_fault+0x1b4/0x2b0 [kvm_hv]
>[<86dddc0e>] kvmppc_book3s_hv_page_fault+0x214/0x12a0 [kvm_hv]
>[<5ae9ccc2>] kvmppc_vcpu_run_hv+0xc5c/0x15f0 [kvm_hv]
>[] kvmppc_vcpu_run+0x34/0x48 [kvm]
>[] kvm_arch_vcpu_ioctl_run+0x314/0x420 [kvm]
>[<2543dd54>] kvm_vcpu_ioctl+0x33c/0x950 [kvm]
>[<48155cd6>] ksys_ioctl+0xd8/0x130
>[<41ffeaa7>] sys_ioctl+0x28/0x40
>[<4afc4310>] system_call_exception+0x114/0x1e0
>[] system_call_common+0xf0/0x278
> 
> Signed-off-by: Qian Cai 

Thanks, applied to my kvm-ppc-next branch.

Paul.

Re: [PATCH] powerpc/kvm/book3s64/vio: fix some RCU-list locks

2020-05-26 Thread Paul Mackerras

On Sun, May 10, 2020 at 01:18:34AM -0400, Qian Cai wrote:
> It is unsafe to traverse kvm->arch.spapr_tce_tables and
> stt->iommu_tables without the RCU read lock held. Also, add
> cond_resched_rcu() in places with the RCU read lock held that could take
> a while to finish.
> 
>  arch/powerpc/kvm/book3s_64_vio.c:76 RCU-list traversed in non-reader 
> section!!
> 
>  other info that might help us debug this:
> 
>  rcu_scheduler_active = 2, debug_locks = 1
>  no locks held by qemu-kvm/4265.
> 
>  stack backtrace:
>  CPU: 96 PID: 4265 Comm: qemu-kvm Not tainted 5.7.0-rc4-next-20200508+ #2
>  Call Trace:
>  [c000201a8690f720] [c0715948] dump_stack+0xfc/0x174 (unreliable)
>  [c000201a8690f770] [c01d9470] lockdep_rcu_suspicious+0x140/0x164
>  [c000201a8690f7f0] [c00810b9fb48] 
> kvm_spapr_tce_release_iommu_group+0x1f0/0x220 [kvm]
>  [c000201a8690f870] [c00810b8462c] 
> kvm_spapr_tce_release_vfio_group+0x54/0xb0 [kvm]
>  [c000201a8690f8a0] [c00810b84710] kvm_vfio_destroy+0x88/0x140 [kvm]
>  [c000201a8690f8f0] [c00810b7d488] kvm_put_kvm+0x370/0x600 [kvm]
>  [c000201a8690f990] [c00810b7e3c0] kvm_vm_release+0x38/0x60 [kvm]
>  [c000201a8690f9c0] [c05223f4] __fput+0x124/0x330
>  [c000201a8690fa20] [c0151cd8] task_work_run+0xb8/0x130
>  [c000201a8690fa70] [c01197e8] do_exit+0x4e8/0xfa0
>  [c000201a8690fb70] [c011a374] do_group_exit+0x64/0xd0
>  [c000201a8690fbb0] [c0132c90] get_signal+0x1f0/0x1200
>  [c000201a8690fcc0] [c0020690] do_notify_resume+0x130/0x3c0
>  [c000201a8690fda0] [c0038d64] syscall_exit_prepare+0x1a4/0x280
>  [c000201a8690fe20] [c000c8f8] system_call_common+0xf8/0x278
> 
>  
>  arch/powerpc/kvm/book3s_64_vio.c:368 RCU-list traversed in non-reader 
> section!!
> 
>  other info that might help us debug this:
> 
>  rcu_scheduler_active = 2, debug_locks = 1
>  2 locks held by qemu-kvm/4264:
>   #0: c000201ae2d000d8 (&vcpu->mutex){+.+.}-{3:3}, at: 
> kvm_vcpu_ioctl+0xdc/0x950 [kvm]
>   #1: c000200c9ed0c468 (&kvm->srcu){}-{0:0}, at: 
> kvmppc_h_put_tce+0x88/0x340 [kvm]
> 
>  
>  arch/powerpc/kvm/book3s_64_vio.c:108 RCU-list traversed in non-reader 
> section!!
> 
>  other info that might help us debug this:
> 
>  rcu_scheduler_active = 2, debug_locks = 1
>  1 lock held by qemu-kvm/4257:
>   #0: c000200b1b363a40 (&kv->lock){+.+.}-{3:3}, at: 
> kvm_vfio_set_attr+0x598/0x6c0 [kvm]
> 
>  
>  arch/powerpc/kvm/book3s_64_vio.c:146 RCU-list traversed in non-reader 
> section!!
> 
>  other info that might help us debug this:
> 
>  rcu_scheduler_active = 2, debug_locks = 1
>  1 lock held by qemu-kvm/4257:
>   #0: c000200b1b363a40 (&kv->lock){+.+.}-{3:3}, at: 
> kvm_vfio_set_attr+0x598/0x6c0 [kvm]
> 
> Signed-off-by: Qian Cai 

Thanks, applied to my kvm-ppc-next branch, with the cond_resched_rcu()
in kvmppc_tce_validate removed.

Paul.

[PATCH v8 2/5] seq_buf: Export seq_buf_printf

2020-05-26 Thread Vaibhav Jain

'seq_buf' provides a very useful abstraction for writing to a string
buffer without needing to worry about it over-flowing. However even
though the API has been stable for couple of years now its still not
exported to kernel loadable modules limiting its usage.

Hence this patch proposes update to 'seq_buf.c' to mark
seq_buf_printf() which is part of the seq_buf API to be exported to
kernel loadable GPL modules. This symbol will be used in later parts
of this patch-set to simplify content creation for a sysfs attribute.

Cc: Piotr Maziarz 
Cc: Cezary Rojewski 
Cc: Christoph Hellwig 
Cc: Steven Rostedt 
Cc: Borislav Petkov 
Signed-off-by: Vaibhav Jain 
---
Changelog:

v7..v8:
* Updated the patch title [ Christoph Hellwig ]
* Updated patch description to replace confusing term 'external kernel
  modules' to 'kernel lodable modules'.

Resend:
* Added ack from Steven Rostedt

v6..v7:
* New patch in the series
---
 lib/seq_buf.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/seq_buf.c b/lib/seq_buf.c
index 4e865d42ab03..707453f5d58e 100644
--- a/lib/seq_buf.c
+++ b/lib/seq_buf.c
@@ -91,6 +91,7 @@ int seq_buf_printf(struct seq_buf *s, const char *fmt, ...)
 
return ret;
 }
+EXPORT_SYMBOL_GPL(seq_buf_printf);
 
 #ifdef CONFIG_BINARY_PRINTF
 /**
-- 
2.26.2

[PATCH v8 5/5] powerpc/papr_scm: Implement support for PAPR_SCM_PDSM_HEALTH

2020-05-26 Thread Vaibhav Jain

This patch implements support for PDSM request 'PAPR_SCM_PDSM_HEALTH'
that returns a newly introduced 'struct nd_papr_pdsm_health' instance
containing dimm health information back to user space in response to
ND_CMD_CALL. This functionality is implemented in newly introduced
papr_scm_get_health() that queries the scm-dimm health information and
then copies this information to the package payload whose layout is
defined by 'struct nd_papr_pdsm_health'.

The patch also introduces a new member 'struct papr_scm_priv.health'
thats an instance of 'struct nd_papr_pdsm_health' to cache the health
information of a nvdimm. As a result functions drc_pmem_query_health()
and flags_show() are updated to populate and use this new struct
instead of a u64 integer that was earlier used.

Cc: "Aneesh Kumar K . V" 
Cc: Dan Williams 
Cc: Michael Ellerman 
Cc: Ira Weiny 
Signed-off-by: Vaibhav Jain 
---
Changelog:

v7..v8:
* None

Resend:
* None

v6..v7:
* Updated flags_show() to use seq_buf_printf(). [Mpe]
* Updated papr_scm_get_health() to use newly introduced
  __drc_pmem_query_health() bypassing the cache [Mpe].

v5..v6:
* Added attribute '__packed' to 'struct nd_papr_pdsm_health_v1' to
  gaurd against possibility of different compilers adding different
  paddings to the struct [ Dan Williams ]

* Updated 'struct nd_papr_pdsm_health_v1' to use __u8 instead of
  'bool' and also updated drc_pmem_query_health() to take this into
  account. [ Dan Williams ]

v4..v5:
* None

v3..v4:
* Call the DSM_PAPR_SCM_HEALTH service function from
  papr_scm_service_dsm() instead of papr_scm_ndctl(). [Aneesh]

v2..v3:
* Updated struct nd_papr_scm_dimm_health_stat_v1 to use '__xx' types
  as its exported to the userspace [Aneesh]
* Changed the constants DSM_PAPR_SCM_DIMM_XX indicating dimm health
  from enum to #defines [Aneesh]

v1..v2:
* New patch in the series
---
 arch/powerpc/include/uapi/asm/papr_scm_pdsm.h |  39 ++
 arch/powerpc/platforms/pseries/papr_scm.c | 125 +++---
 2 files changed, 147 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/include/uapi/asm/papr_scm_pdsm.h 
b/arch/powerpc/include/uapi/asm/papr_scm_pdsm.h
index c4bae3208e73..f81d714279f0 100644
--- a/arch/powerpc/include/uapi/asm/papr_scm_pdsm.h
+++ b/arch/powerpc/include/uapi/asm/papr_scm_pdsm.h
@@ -115,6 +115,7 @@ struct nd_pdsm_cmd_pkg {
  */
 enum papr_scm_pdsm {
PAPR_SCM_PDSM_MIN = 0x0,
+   PAPR_SCM_PDSM_HEALTH,
PAPR_SCM_PDSM_MAX,
 };
 
@@ -133,4 +134,42 @@ static inline void *pdsm_cmd_to_payload(struct 
nd_pdsm_cmd_pkg *pcmd)
return (void *)(pcmd->payload);
 }
 
+/* Various scm-dimm health indicators */
+#define PAPR_PDSM_DIMM_HEALTHY   0
+#define PAPR_PDSM_DIMM_UNHEALTHY 1
+#define PAPR_PDSM_DIMM_CRITICAL  2
+#define PAPR_PDSM_DIMM_FATAL 3
+
+/*
+ * Struct exchanged between kernel & ndctl in for PAPR_SCM_PDSM_HEALTH
+ * Various flags indicate the health status of the dimm.
+ *
+ * dimm_unarmed: Dimm not armed. So contents wont persist.
+ * dimm_bad_shutdown   : Previous shutdown did not persist contents.
+ * dimm_bad_restore: Contents from previous shutdown werent restored.
+ * dimm_scrubbed   : Contents of the dimm have been scrubbed.
+ * dimm_locked : Contents of the dimm cant be modified until CEC reboot
+ * dimm_encrypted  : Contents of dimm are encrypted.
+ * dimm_health : Dimm health indicator. One of PAPR_PDSM_DIMM_
+ */
+struct nd_papr_pdsm_health_v1 {
+   __u8 dimm_unarmed;
+   __u8 dimm_bad_shutdown;
+   __u8 dimm_bad_restore;
+   __u8 dimm_scrubbed;
+   __u8 dimm_locked;
+   __u8 dimm_encrypted;
+   __u16 dimm_health;
+} __packed;
+
+/*
+ * Typedef the current struct for dimm_health so that any application
+ * or kernel recompiled after introducing a new version automatically
+ * supports the new version.
+ */
+#define nd_papr_pdsm_health nd_papr_pdsm_health_v1
+
+/* Current version number for the dimm health struct */
+#define ND_PAPR_PDSM_HEALTH_VERSION 1
+
 #endif /* _UAPI_ASM_POWERPC_PAPR_SCM_PDSM_H_ */
diff --git a/arch/powerpc/platforms/pseries/papr_scm.c 
b/arch/powerpc/platforms/pseries/papr_scm.c
index fcb8afee97dc..adf1fb819c56 100644
--- a/arch/powerpc/platforms/pseries/papr_scm.c
+++ b/arch/powerpc/platforms/pseries/papr_scm.c
@@ -88,7 +88,7 @@ struct papr_scm_priv {
unsigned long lasthealth_jiffies;
 
/* Health information for the dimm */
-   u64 health_bitmap;
+   struct nd_papr_pdsm_health health;
 };
 
 static int drc_pmem_bind(struct papr_scm_priv *p)
@@ -201,6 +201,7 @@ static int drc_pmem_query_n_bind(struct papr_scm_priv *p)
 static int __drc_pmem_query_health(struct papr_scm_priv *p)
 {
unsigned long ret[PLPAR_HCALL_BUFSIZE];
+   u64 health;
long rc;
 
/* issue the hcall */
@@ -208,18 +209,46 @@ static int __drc_pmem_query_health(struct papr_scm_priv 
*p)
if (rc != H_SUCCESS) {
dev_err(&p->pdev->dev,

[PATCH v8 4/5] ndctl/papr_scm, uapi: Add support for PAPR nvdimm specific methods

2020-05-26 Thread Vaibhav Jain

Introduce support for PAPR NVDIMM Specific Methods (PDSM) in papr_scm
module and add the command family to the white list of NVDIMM command
sets. Also advertise support for ND_CMD_CALL for the nvdimm
command mask and implement necessary scaffolding in the module to
handle ND_CMD_CALL ioctl and PDSM requests that we receive.

The layout of the PDSM request as we expect from libnvdimm/libndctl is
described in newly introduced uapi header 'papr_scm_pdsm.h' which
defines a new 'struct nd_pdsm_cmd_pkg' header. This header is used
to communicate the PDSM request via member
'nd_pkg_papr_scm->nd_command' and size of payload that need to be
sent/received for servicing the PDSM.

A new function is_cmd_valid() is implemented that reads the args to
papr_scm_ndctl() and performs sanity tests on them. A new function
papr_scm_service_pdsm() is introduced and is called from
papr_scm_ndctl() in case of a PDSM request is received via ND_CMD_CALL
command from libnvdimm.

Cc: "Aneesh Kumar K . V" 
Cc: Dan Williams 
Cc: Michael Ellerman 
Cc: Ira Weiny 
Signed-off-by: Vaibhav Jain 
---
Changelog:

v7..v8:
* Removed the 'payload_offset' field from 'struct
  nd_pdsm_cmd_pkg'. Instead command payload is always assumed to start
  at 'nd_pdsm_cmd_pkg.payload'. [ Aneesh ]
* To enable introducing new fields to 'struct nd_pdsm_cmd_pkg',
  'reserved' field of 10-bytes is introduced. [ Aneesh ]
* Fixed a typo in "Backward Compatibility" section of papr_scm_pdsm.h
  [ Ira ]

Resend:
* None

v6..v7 :
* Removed the re-definitions of __packed macro from papr_scm_pdsm.h
  [Mpe].
* Removed the usage of __KERNEL__ macros in papr_scm_pdsm.h [Mpe].
* Removed macros that were unused in papr_scm.c from papr_scm_pdsm.h
  [Mpe].
* Made functions defined in papr_scm_pdsm.h as static inline. [Mpe]

v5..v6 :
* Changed the usage of the term DSM to PDSM to distinguish it from the
  ACPI term [ Dan Williams ]
* Renamed papr_scm_dsm.h to papr_scm_pdsm.h and updated various struct
  to reflect the new terminology.
* Updated the patch description and title to reflect the new terminology.
* Squashed patch to introduce new command family in 'ndctl.h' with
  this patch [ Dan Williams ]
* Updated the papr_scm_pdsm method starting index from 0x1 to 0x0
  [ Dan Williams ]
* Removed redundant license text from the papr_scm_psdm.h file.
  [ Dan Williams ]
* s/envelop/envelope/ at various places [ Dan Williams ]
* Added '__packed' attribute to command package header to gaurd
  against different compiler adding paddings between the fields.
  [ Dan Williams]
* Converted various pr_debug to dev_debug [ Dan Williams ]

v4..v5 :
* None

v3..v4 :
* None

v2..v3 :
* Updated the patch prefix to 'ndctl/uapi' [Aneesh]

v1..v2 :
* None
---
 arch/powerpc/include/uapi/asm/papr_scm_pdsm.h | 136 ++
 arch/powerpc/platforms/pseries/papr_scm.c | 101 -
 include/uapi/linux/ndctl.h|   1 +
 3 files changed, 232 insertions(+), 6 deletions(-)
 create mode 100644 arch/powerpc/include/uapi/asm/papr_scm_pdsm.h

diff --git a/arch/powerpc/include/uapi/asm/papr_scm_pdsm.h 
b/arch/powerpc/include/uapi/asm/papr_scm_pdsm.h
new file mode 100644
index ..c4bae3208e73
--- /dev/null
+++ b/arch/powerpc/include/uapi/asm/papr_scm_pdsm.h
@@ -0,0 +1,136 @@
+/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
+/*
+ * PAPR-SCM Dimm specific methods (PDSM) and structs for libndctl
+ *
+ * (C) Copyright IBM 2020
+ *
+ * Author: Vaibhav Jain 
+ */
+
+#ifndef _UAPI_ASM_POWERPC_PAPR_SCM_PDSM_H_
+#define _UAPI_ASM_POWERPC_PAPR_SCM_PDSM_H_
+
+#include 
+
+/*
+ * PDSM Envelope:
+ *
+ * The ioctl ND_CMD_CALL transfers data between user-space and kernel via
+ * envelope which consists of a header and user-defined payload sections.
+ * The header is described by 'struct nd_pdsm_cmd_pkg' which expects a
+ * payload following it and accessible via 'nd_pdsm_cmd_pkg.payload' field.
+ * There is reserved field that can used to introduce new fields to the
+ * structure in future. It also tries to ensure that 'nd_pdsm_cmd_pkg.payload'
+ * lies at a 8-byte boundary.
+ *
+ *  +-+-+---+
+ *  |   64-Bytes  |   16-Bytes  |   Max 176-Bytes   |
+ *  +-+-+---+
+ *  |   nd_pdsm_cmd_pkg |   |
+ *  |-+ |   |
+ *  |  nd_cmd_pkg | |   |
+ *  +-+-+---+
+ *  | nd_family   | |   |
+ *  | nd_size_out | cmd_status  |   |
+ *  | nd_size_in  | payload_version | payload   |
+ *  | nd_command  | reserved|   |
+ *  | nd_fw_size  | |   |
+ *  +-+-+-

[PATCH v8 3/5] powerpc/papr_scm: Fetch nvdimm health information from PHYP

2020-05-26 Thread Vaibhav Jain

Implement support for fetching nvdimm health information via
H_SCM_HEALTH hcall as documented in Ref[1]. The hcall returns a pair
of 64-bit bitmap, bitwise-and of which is then stored in
'struct papr_scm_priv' and subsequently partially exposed to
user-space via newly introduced dimm specific attribute
'papr/flags'. Since the hcall is costly, the health information is
cached and only re-queried, 60s after the previous successful hcall.

The patch also adds a  documentation text describing flags reported by
the the new sysfs attribute 'papr/flags' is also introduced at
Documentation/ABI/testing/sysfs-bus-papr-scm.

[1] commit 58b278f568f0 ("powerpc: Provide initial documentation for
PAPR hcalls")

Cc: "Aneesh Kumar K . V" 
Cc: Dan Williams 
Cc: Michael Ellerman 
Cc: Ira Weiny 
Signed-off-by: Vaibhav Jain 
---
Changelog:

v7..v8:
* Update type of variable 'rc' in __drc_pmem_query_health() and
  drc_pmem_query_health() to long and int respectively. [ Ira ]
* Updated the patch description to s/64 bit Big Endian Number/64-bit
  bitmap/ [ Ira, Aneesh ].

Resend:
* None

v6..v7 :
* Used the exported buf_seq_printf() function to generate content for
  'papr/flags'
* Moved the PAPR_SCM_DIMM_* bit-flags macro definitions to papr_scm.c
  and removed the papr_scm.h file [Mpe]
* Some minor consistency issued in sysfs-bus-papr-scm
  documentation. [Mpe]
* s/dimm_mutex/health_mutex/g [Mpe]
* Split drc_pmem_query_health() into two function one of which takes
  care of caching and locking. [Mpe]
* Fixed a local copy creation of dimm health information using
  READ_ONCE(). [Mpe]

v5..v6 :
* Change the flags sysfs attribute from 'papr_flags' to 'papr/flags'
  [Dan Williams]
* Include documentation for 'papr/flags' attr [Dan Williams]
* Change flag 'save_fail' to 'flush_fail' [Dan Williams]
* Caching of health bitmap to reduce expensive hcalls [Dan Williams]
* Removed usage of PPC_BIT from 'papr-scm.h' header [Mpe]
* Replaced two __be64 integers from papr_scm_priv to a single u64
  integer [Mpe]
* Updated patch description to reflect the changes made in this
  version.
* Removed avoidable usage of 'papr_scm_priv.dimm_mutex' from
  flags_show() [Dan Williams]

v4..v5 :
* None

v3..v4 :
* None

v2..v3 :
* Removed PAPR_SCM_DIMM_HEALTH_NON_CRITICAL as a condition for
 NVDIMM unarmed [Aneesh]

v1..v2 :
* New patch in the series.
---
 Documentation/ABI/testing/sysfs-bus-papr-scm |  27 +++
 arch/powerpc/platforms/pseries/papr_scm.c| 169 ++-
 2 files changed, 194 insertions(+), 2 deletions(-)
 create mode 100644 Documentation/ABI/testing/sysfs-bus-papr-scm

diff --git a/Documentation/ABI/testing/sysfs-bus-papr-scm 
b/Documentation/ABI/testing/sysfs-bus-papr-scm
new file mode 100644
index ..6143d06072f1
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-papr-scm
@@ -0,0 +1,27 @@
+What:  /sys/bus/nd/devices/nmemX/papr/flags
+Date:  Apr, 2020
+KernelVersion: v5.8
+Contact:   linuxppc-dev , 
linux-nvd...@lists.01.org,
+Description:
+   (RO) Report flags indicating various states of a
+   papr-scm NVDIMM device. Each flag maps to a one or
+   more bits set in the dimm-health-bitmap retrieved in
+   response to H_SCM_HEALTH hcall. The details of the bit
+   flags returned in response to this hcall is available
+   at 'Documentation/powerpc/papr_hcalls.rst' . Below are
+   the flags reported in this sysfs file:
+
+   * "not_armed"   : Indicates that NVDIMM contents will not
+ survive a power cycle.
+   * "flush_fail"  : Indicates that NVDIMM contents
+ couldn't be flushed during last
+ shut-down event.
+   * "restore_fail": Indicates that NVDIMM contents
+ couldn't be restored during NVDIMM
+ initialization.
+   * "encrypted"   : NVDIMM contents are encrypted.
+   * "smart_notify": There is health event for the NVDIMM.
+   * "scrubbed": Indicating that contents of the
+ NVDIMM have been scrubbed.
+   * "locked"  : Indicating that NVDIMM contents cant
+ be modified until next power cycle.
diff --git a/arch/powerpc/platforms/pseries/papr_scm.c 
b/arch/powerpc/platforms/pseries/papr_scm.c
index f35592423380..010cd9aae488 100644
--- a/arch/powerpc/platforms/pseries/papr_scm.c
+++ b/arch/powerpc/platforms/pseries/papr_scm.c
@@ -12,6 +12,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -22,6 +23,44 @@
 (1ul << ND_CMD_GET_CONFIG_DATA) | \
 (1ul << ND_CMD_SET_CONFIG_DATA))
 
+/* DIMM health bitmap bitmap indicators */
+/* SCM device is unable to persist memory contents */
+#define PAPR_SCM_DIMM_UNARMED   (1ULL << (63 - 0))
+/* SCM device faile

[PATCH v8 1/5] powerpc: Document details on H_SCM_HEALTH hcall

2020-05-26 Thread Vaibhav Jain

Add documentation to 'papr_hcalls.rst' describing the bitmap flags
that are returned from H_SCM_HEALTH hcall as per the PAPR-SCM
specification.

Cc: "Aneesh Kumar K . V" 
Cc: Dan Williams 
Cc: Michael Ellerman 
Cc: Ira Weiny 
Signed-off-by: Vaibhav Jain 
---
Changelog:
v7..v8:
* Added a clarification on bit-ordering of Health Bitmap

Resend:
* None

v6..v7:
* None

v5..v6:
* New patch in the series
---
 Documentation/powerpc/papr_hcalls.rst | 45 ---
 1 file changed, 41 insertions(+), 4 deletions(-)

diff --git a/Documentation/powerpc/papr_hcalls.rst 
b/Documentation/powerpc/papr_hcalls.rst
index 3493631a60f8..45063f305813 100644
--- a/Documentation/powerpc/papr_hcalls.rst
+++ b/Documentation/powerpc/papr_hcalls.rst
@@ -220,13 +220,50 @@ from the LPAR memory.
 **H_SCM_HEALTH**
 
 | Input: drcIndex
-| Out: *health-bitmap, health-bit-valid-bitmap*
+| Out: *health-bitmap (r4), health-bit-valid-bitmap (r5)*
 | Return Value: *H_Success, H_Parameter, H_Hardware*
 
 Given a DRC Index return the info on predictive failure and overall health of
-the NVDIMM. The asserted bits in the health-bitmap indicate a single predictive
-failure and health-bit-valid-bitmap indicate which bits in health-bitmap are
-valid.
+the NVDIMM. The asserted bits in the health-bitmap indicate one or more states
+(described in table below) of the NVDIMM and health-bit-valid-bitmap indicate
+which bits in health-bitmap are valid. The bits are reported in
+reverse bit ordering for example a value of 0xC400
+indicates bits 0, 1, and 5 are valid.
+
+Health Bitmap Flags:
+
++--+---+
+|  Bit |   Definition  
|
++==+===+
+|  00  | SCM device is unable to persist memory contents.  
|
+|  | If the system is powered down, nothing will be saved. 
|
++--+---+
+|  01  | SCM device failed to persist memory contents. Either contents were 
not|
+|  | saved successfully on power down or were not restored properly on 
|
+|  | power up. 
|
++--+---+
+|  02  | SCM device contents are persisted from previous IPL. The data from
|
+|  | the last boot were successfully restored. 
|
++--+---+
+|  03  | SCM device contents are not persisted from previous IPL. There was no 
|
+|  | data to restore from the last boot.   
|
++--+---+
+|  04  | SCM device memory life remaining is critically low
|
++--+---+
+|  05  | SCM device will be garded off next IPL due to failure 
|
++--+---+
+|  06  | SCM contents cannot persist due to current platform health status. A  
|
+|  | hardware failure may prevent data from being saved or restored.   
|
++--+---+
+|  07  | SCM device is unable to persist memory contents in certain conditions 
|
++--+---+
+|  08  | SCM device is encrypted   
|
++--+---+
+|  09  | SCM device has successfully completed a requested erase or secure 
|
+|  | erase procedure.  
|
++--+---+
+|10:63 | Reserved / Unused 
|
++--+---+
 
 **H_SCM_PERFORMANCE_STATS**
 
-- 
2.26.2

[PATCH v8 0/5] powerpc/papr_scm: Add support for reporting nvdimm health

2020-05-26 Thread Vaibhav Jain

Changes since v7 [1]:

* Addressed various review comments from Aneesh, Ira and Mpe.
* Removed the 'payload_offset' field from 'struct nd_pdsm_cmd_pkg' and
  replaced it with some reserved fields [ Aneesh ].
* Updated the doc and description for patch that fetches dimm health
  information from PHYP clarifying bit-ordering [ Mpe and Ira ].
* Updated the patch title & description for patch exporting
  'seq_buf_printf'. [ Christoph Hellwig ]
* Fix types of various newly introduced vars in papr_scm.c [ Ira ].
* Fixed a typo in 'papr_scm_pdsm.h' [ Ira ]

[1] 
https://lore.kernel.org/linux-nvdimm/20200519190058.257981-1-vaib...@linux.ibm.com

---

The PAPR standard[2][4] provides mechanisms to query the health and
performance stats of an NVDIMM via various hcalls as described in
Ref[3].  Until now these stats were never available nor exposed to the
user-space tools like 'ndctl'. This is partly due to PAPR platform not
having support for ACPI and NFIT. Hence 'ndctl' is unable to query and
report the dimm health status and a user had no way to determine the
current health status of a NDVIMM.

To overcome this limitation, this patch-set updates papr_scm kernel
module to query and fetch NVDIMM health stats using hcalls described
in Ref[3].  This health and performance stats are then exposed to
userspace via sysfs and PAPR-NVDIMM-Specific-Methods(PDSM) issued by
libndctl.

These changes coupled with proposed ndtcl changes located at Ref[5]
should provide a way for the user to retrieve NVDIMM health status
using ndtcl.

Below is a sample output using proposed kernel + ndctl for PAPR NVDIMM
in a emulation environment:

 # ndctl list -DH
[
  {
"dev":"nmem0",
"health":{
  "health_state":"fatal",
  "shutdown_state":"dirty"
}
  }
]

Dimm health report output on a pseries guest lpar with vPMEM or HMS
based NVDIMMs that are in perfectly healthy conditions:

 # ndctl list -d nmem0 -H
[
  {
"dev":"nmem0",
"health":{
  "health_state":"ok",
  "shutdown_state":"clean"
}
  }
]

PAPR NVDIMM-Specific-Methods(PDSM)
==

PDSM requests are issued by vendor specific code in libndctl to
execute certain operations or fetch information from NVDIMMS. PDSMs
requests can be sent to papr_scm module via libndctl(userspace) and
libnvdimm (kernel) using the ND_CMD_CALL ioctl command which can be
handled in the dimm control function papr_scm_ndctl(). Current
patchset proposes a single PDSM to retrieve NVDIMM health, defined in
the newly introduced uapi header named 'papr_scm_pdsm.h'. Support for
more PDSMs will be added in future.

Structure of the patch-set
==

The patch-set starts with a doc patch documenting details of hcall
H_SCM_HEALTH. Second patch exports kernel symbol seq_buf_printf()
thats used in subsequent patches to generate sysfs attribute content.

Third patch implements support for fetching NVDIMM health information
from PHYP and partially exposing it to user-space via a NVDIMM sysfs
flag.

Fourth patches deal with implementing support for servicing PDSM
commands in papr_scm module.

Finally the last patch implements support for servicing PDSM
'PAPR_SCM_PDSM_HEALTH' that returns the NVDIMM health information to
libndctl.

References:
[2] "Power Architecture Platform Reference"
  https://en.wikipedia.org/wiki/Power_Architecture_Platform_Reference
[3] commit 58b278f568f0
 ("powerpc: Provide initial documentation for PAPR hcalls")
[4] "Linux on Power Architecture Platform Reference"
 https://members.openpowerfoundation.org/document/dl/469
[5] https://github.com/vaibhav92/ndctl/tree/papr_scm_health_v8

---
Vaibhav Jain (5):
  powerpc: Document details on H_SCM_HEALTH hcall
  seq_buf: Export seq_buf_printf
  powerpc/papr_scm: Fetch nvdimm health information from PHYP
  ndctl/papr_scm,uapi: Add support for PAPR nvdimm specific methods
  powerpc/papr_scm: Implement support for PAPR_SCM_PDSM_HEALTH

 Documentation/ABI/testing/sysfs-bus-papr-scm  |  27 ++
 Documentation/powerpc/papr_hcalls.rst |  45 ++-
 arch/powerpc/include/uapi/asm/papr_scm_pdsm.h | 175 +
 arch/powerpc/platforms/pseries/papr_scm.c | 363 +-
 include/uapi/linux/ndctl.h|   1 +
 lib/seq_buf.c |   1 +
 6 files changed, 599 insertions(+), 13 deletions(-)
 create mode 100644 Documentation/ABI/testing/sysfs-bus-papr-scm
 create mode 100644 arch/powerpc/include/uapi/asm/papr_scm_pdsm.h

-- 
2.26.2

Re: [PATCH v3] powerpc/XIVE: SVM: share the event-queue page with the Hypervisor.

2020-05-26 Thread Michael Ellerman

Ram Pai  writes:
> XIVE interrupt controller uses an Event Queue (EQ) to enqueue event
> notifications when an exception occurs. The EQ is a single memory page
> provided by the O/S defining a circular buffer, one per server and
> priority couple.
>
> On baremetal, the EQ page is configured with an OPAL call. On pseries,
> an extra hop is necessary and the guest OS uses the hcall
> H_INT_SET_QUEUE_CONFIG to configure the XIVE interrupt controller.
>
> The XIVE controller being Hypervisor privileged, it will not be allowed
> to enqueue event notifications for a Secure VM unless the EQ pages are
> shared by the Secure VM.
>
> Hypervisor/Ultravisor still requires support for the TIMA and ESB page
> fault handlers. Until this is complete, QEMU can use the emulated XIVE
> device for Secure VMs, option "kernel_irqchip=off" on the QEMU pseries
> machine.
>
> Cc: kvm-...@vger.kernel.org
> Cc: linuxppc-dev@lists.ozlabs.org
> Cc: Michael Ellerman 
> Cc: Thiago Jung Bauermann 
> Cc: Michael Anderson 
> Cc: Sukadev Bhattiprolu 
> Cc: Alexey Kardashevskiy 
> Cc: Paul Mackerras 
> Cc: David Gibson 
> Reviewed-by: Cedric Le Goater 
> Reviewed-by: Greg Kurz 
> Signed-off-by: Ram Pai 
>
> v3: fix a minor semantics in description.
> and added reviewed-by from Cedric and Greg.
> v2: better description of the patch from Cedric.
> ---

Please put the change history after the '---' break in future please, I
had to fix this up manually.

cheers

[v3 2/2] dts: ppc: t1024rdb: remove interrupts property

2020-05-26 Thread Biwen Li

From: Biwen Li 

Since the interrupt pin for RTC DS1339 is not connected
to the CPU on T1024RDB, remove the interrupt property
from the device tree.

This also fix the following warning for hwclock.util-linux:
$ hwclock.util-linux
hwclock.util-linux: select() to /dev/rtc0
to wait for clock tick timed out

Signed-off-by: Biwen Li 
---
 arch/powerpc/boot/dts/fsl/t1024rdb.dts | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/powerpc/boot/dts/fsl/t1024rdb.dts 
b/arch/powerpc/boot/dts/fsl/t1024rdb.dts
index 645caff98ed1..605ceec66af3 100644
--- a/arch/powerpc/boot/dts/fsl/t1024rdb.dts
+++ b/arch/powerpc/boot/dts/fsl/t1024rdb.dts
@@ -161,7 +161,6 @@
rtc@68 {
compatible = "dallas,ds1339";
reg = <0x68>;
-   interrupts = <0x1 0x1 0 0>;
};
};
 
-- 
2.17.1

[v3 1/2] dts: ppc: t4240rdb: remove interrupts property

2020-05-26 Thread Biwen Li

From: Biwen Li 

Since the interrupt pin for RTC DS1374 is not connected
to the CPU on T4240RDB, remove the interrupt property
from the device tree.

This also fix the following warning for hwclock.util-linux:
$ hwclock.util-linux
hwclock.util-linux: select() to /dev/rtc0
to wait for clock tick timed out

Signed-off-by: Biwen Li 
---
 arch/powerpc/boot/dts/fsl/t4240rdb.dts | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/powerpc/boot/dts/fsl/t4240rdb.dts 
b/arch/powerpc/boot/dts/fsl/t4240rdb.dts
index a56a705d41f7..145896f2eef6 100644
--- a/arch/powerpc/boot/dts/fsl/t4240rdb.dts
+++ b/arch/powerpc/boot/dts/fsl/t4240rdb.dts
@@ -144,7 +144,6 @@
rtc@68 {
compatible = "dallas,ds1374";
reg = <0x68>;
-   interrupts = <0x1 0x1 0 0>;
};
};
 
-- 
2.17.1

[PATCH v2] selftests: powerpc: Add test for execute-disabled pkeys

2020-05-26 Thread Sandipan Das

Apart from read and write access, memory protection keys can
also be used for restricting execute permission of pages on
powerpc. This adds a test to verify if the feature works as
expected.

Signed-off-by: Sandipan Das 
---

Previous versions can be found at
v1: 
https://lore.kernel.org/linuxppc-dev/20200508162332.65316-1-sandi...@linux.ibm.com/

Changes in v2:
- Added .gitignore entry for test binary.
- Fixed builds for older distros where siginfo_t might not have si_pkey as
  a formal member based on discussion with Michael.

---
 tools/testing/selftests/powerpc/mm/.gitignore |   1 +
 tools/testing/selftests/powerpc/mm/Makefile   |   3 +-
 .../selftests/powerpc/mm/pkey_exec_prot.c | 336 ++
 3 files changed, 339 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/powerpc/mm/pkey_exec_prot.c

diff --git a/tools/testing/selftests/powerpc/mm/.gitignore 
b/tools/testing/selftests/powerpc/mm/.gitignore
index 2ca523255b1b..8f841f925baa 100644
--- a/tools/testing/selftests/powerpc/mm/.gitignore
+++ b/tools/testing/selftests/powerpc/mm/.gitignore
@@ -8,3 +8,4 @@ wild_bctr
 large_vm_fork_separation
 bad_accesses
 tlbie_test
+pkey_exec_prot
diff --git a/tools/testing/selftests/powerpc/mm/Makefile 
b/tools/testing/selftests/powerpc/mm/Makefile
index b9103c4bb414..2816229f648b 100644
--- a/tools/testing/selftests/powerpc/mm/Makefile
+++ b/tools/testing/selftests/powerpc/mm/Makefile
@@ -3,7 +3,7 @@ noarg:
$(MAKE) -C ../
 
 TEST_GEN_PROGS := hugetlb_vs_thp_test subpage_prot prot_sao segv_errors 
wild_bctr \
- large_vm_fork_separation bad_accesses
+ large_vm_fork_separation bad_accesses pkey_exec_prot
 TEST_GEN_PROGS_EXTENDED := tlbie_test
 TEST_GEN_FILES := tempfile
 
@@ -17,6 +17,7 @@ $(OUTPUT)/prot_sao: ../utils.c
 $(OUTPUT)/wild_bctr: CFLAGS += -m64
 $(OUTPUT)/large_vm_fork_separation: CFLAGS += -m64
 $(OUTPUT)/bad_accesses: CFLAGS += -m64
+$(OUTPUT)/pkey_exec_prot: CFLAGS += -m64
 
 $(OUTPUT)/tempfile:
dd if=/dev/zero of=$@ bs=64k count=1
diff --git a/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c 
b/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c
new file mode 100644
index ..147fb9ed47d5
--- /dev/null
+++ b/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c
@@ -0,0 +1,336 @@
+// SPDX-License-Identifier: GPL-2.0+
+
+/*
+ * Copyright 2020, Sandipan Das, IBM Corp.
+ *
+ * Test if applying execute protection on pages using memory
+ * protection keys works as expected.
+ */
+
+#define _GNU_SOURCE
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+
+#include "utils.h"
+
+/* Override definitions as they might be inconsistent */
+#undef PKEY_DISABLE_ACCESS
+#define PKEY_DISABLE_ACCESS0x3
+
+#undef PKEY_DISABLE_WRITE
+#define PKEY_DISABLE_WRITE 0x2
+
+#undef PKEY_DISABLE_EXECUTE
+#define PKEY_DISABLE_EXECUTE   0x4
+
+/* Older distros might not define this */
+#ifndef SEGV_PKUERR
+#define SEGV_PKUERR4
+#endif
+
+#define SI_PKEY_OFFSET 0x20
+
+#define SYS_pkey_mprotect  386
+#define SYS_pkey_alloc 384
+#define SYS_pkey_free  385
+
+#define PKEY_BITS_PER_PKEY 2
+#define NR_PKEYS   32
+
+#define PKEY_BITS_MASK ((1UL << PKEY_BITS_PER_PKEY) - 1)
+
+static unsigned long pkeyreg_get(void)
+{
+   unsigned long uamr;
+
+   asm volatile("mfspr %0, 0xd" : "=r"(uamr));
+   return uamr;
+}
+
+static void pkeyreg_set(unsigned long uamr)
+{
+   asm volatile("isync; mtspr  0xd, %0; isync;" : : "r"(uamr));
+}
+
+static void pkey_set_rights(int pkey, unsigned long rights)
+{
+   unsigned long uamr, shift;
+
+   shift = (NR_PKEYS - pkey - 1) * PKEY_BITS_PER_PKEY;
+   uamr = pkeyreg_get();
+   uamr &= ~(PKEY_BITS_MASK << shift);
+   uamr |= (rights & PKEY_BITS_MASK) << shift;
+   pkeyreg_set(uamr);
+}
+
+static int sys_pkey_mprotect(void *addr, size_t len, int prot, int pkey)
+{
+   return syscall(SYS_pkey_mprotect, addr, len, prot, pkey);
+}
+
+static int sys_pkey_alloc(unsigned long flags, unsigned long rights)
+{
+   return syscall(SYS_pkey_alloc, flags, rights);
+}
+
+static int sys_pkey_free(int pkey)
+{
+   return syscall(SYS_pkey_free, pkey);
+}
+
+static volatile int fpkey, fcode, ftype, faults;
+static unsigned long pgsize, numinsns;
+static volatile unsigned int *faddr;
+static unsigned int *insns;
+
+static void segv_handler(int signum, siginfo_t *sinfo, void *ctx)
+{
+   int pkey;
+
+#ifdef si_pkey
+   pkey = sinfo->si_pkey;
+#else
+   pkey = *((int *)(((char *) sinfo) + SI_PKEY_OFFSET));
+#endif
+
+   /* Check if this fault originated because of the expected reasons */
+   if (sinfo->si_code != SEGV_ACCERR && sinfo->si_code != SEGV_PKUERR) {
+   printf("got an unexpected fault, code = %d\n",
+  sinfo->si_code);
+   goto fail;
+   }
+
+   /* Check if this fault originated from the expected address */
+

Re: [PATCH -next] scsi: ibmvscsi: Make some functions static

2020-05-26 Thread Martin K. Petersen

On Wed, 20 May 2020 17:10:36 +0800, Chen Tao wrote:

> Fix the following warning:
> 
> drivers/scsi/ibmvscsi/ibmvscsi.c:2387:12: warning: symbol
> 'ibmvscsi_module_init' was not declared. Should it be static?
> drivers/scsi/ibmvscsi/ibmvscsi.c:2409:13: warning: symbol
> 'ibmvscsi_module_exit' was not declared. Should it be static?

Applied to 5.8/scsi-queue, thanks!

[1/1] scsi: ibmvscsi: Make some functions static
  https://git.kernel.org/mkp/scsi/c/1f93ad177d24

-- 
Martin K. Petersen  Oracle Linux Engineering

Re: [PATCH] powerpc/kvm/book3s64/vio: fix some RCU-list locks

2020-05-26 Thread Qian Cai

On Wed, May 27, 2020 at 11:13:23AM +1000, Paul Mackerras wrote:
> On Sun, May 10, 2020 at 01:18:34AM -0400, Qian Cai wrote:
> > It is unsafe to traverse kvm->arch.spapr_tce_tables and
> > stt->iommu_tables without the RCU read lock held. Also, add
> > cond_resched_rcu() in places with the RCU read lock held that could take
> > a while to finish.
> 
> This mostly looks fine.  The cond_resched_rcu() in kvmppc_tce_validate
> doesn't seem necessary (the list would rarely have more than a few
> dozen entries) and could be a performance problem given that TCE
> validation is a hot-path.
> 
> Are you OK with me modifying the patch to take out that
> cond_resched_rcu(), or is there some reason why it's essential that it
> be there?

Feel free to take out that cond_resched_rcu(). Your reasoning makes
sense.

Re: [PATCH] powerpc/kvm/book3s64/vio: fix some RCU-list locks

2020-05-26 Thread Paul Mackerras

On Sun, May 10, 2020 at 01:18:34AM -0400, Qian Cai wrote:
> It is unsafe to traverse kvm->arch.spapr_tce_tables and
> stt->iommu_tables without the RCU read lock held. Also, add
> cond_resched_rcu() in places with the RCU read lock held that could take
> a while to finish.

This mostly looks fine.  The cond_resched_rcu() in kvmppc_tce_validate
doesn't seem necessary (the list would rarely have more than a few
dozen entries) and could be a performance problem given that TCE
validation is a hot-path.

Are you OK with me modifying the patch to take out that
cond_resched_rcu(), or is there some reason why it's essential that it
be there?

Paul.

Re: [PATCH 2/3] powerpc/pci: unmap legacy INTx interrupts of passthrough IO adapters

2020-05-26 Thread Oliver O'Halloran

On Wed, Apr 29, 2020 at 5:51 PM Cédric Le Goater  wrote:
>
> When a passthrough IO adapter is removed from a pseries machine using
> hash MMU and the XIVE interrupt mode, the POWER hypervisor, pHyp,
> expects the guest OS to have cleared all page table entries related to
> the adapter. If some are still present, the RTAS call which isolates
> the PCI slot returns error 9001 "valid outstanding translations" and
> the removal of the IO adapter fails.
>
> INTx interrupt numbers need special care because Linux maps the
> interrupts automatically in the Linux interrupt number space if they
> are presented in the device tree node describing the IO adapter. These
> interrupts are not un-mapped automatically and in case of an hot-plug
> adapter, the PCI hot-plug layer needs to handle the cleanup to make
> sure that all the page table entries of the XIVE ESB pages are
> cleared.
>
> Cc: "Oliver O'Halloran" 
> Signed-off-by: Cédric Le Goater 
> ---
>  arch/powerpc/kernel/pci-hotplug.c | 2 ++
>  1 file changed, 2 insertions(+)
>
> diff --git a/arch/powerpc/kernel/pci-hotplug.c 
> b/arch/powerpc/kernel/pci-hotplug.c
> index bf83f76563a3..9e9c6befd7ea 100644
> --- a/arch/powerpc/kernel/pci-hotplug.c
> +++ b/arch/powerpc/kernel/pci-hotplug.c
> @@ -57,6 +57,8 @@ void pcibios_release_device(struct pci_dev *dev)
> struct pci_controller *phb = pci_bus_to_host(dev->bus);
> struct pci_dn *pdn = pci_get_pdn(dev);
>
> +   irq_dispose_mapping(dev->irq);

What does the original mapping? Powerpc arch code or the PCI core?
Tearing down the mapping in pcibios_release_device() seems a bit fishy
to me since the PCI core has already torn down the device state at
that point. If the release is delayed it's possible that another
pci_dev has mapped the IRQ before we get here, but maybe that's ok.

> +
> eeh_remove_device(dev);
>
> if (phb->controller_ops.release_device)
> --
> 2.25.4
>

Re: [PATCH] selftests: powerpc: Add test for execute-disabled pkeys

2020-05-26 Thread Michael Ellerman

Sandipan Das  writes:
> Hi Michael,
>
> On 26/05/20 6:05 pm, Michael Ellerman wrote:
>> [...]
>>> +
>>> +/* Override definitions as they might be inconsistent */
>>> +#undef PKEY_DISABLE_ACCESS
>>> +#define PKEY_DISABLE_ACCESS0x3
>> 
>> Why would they be inconsistent?
>> 
>
> The definition in sys/mman.h still uses the value specific to
> Intel's implementation i.e. 1, when this should have been 3
> for powerpc. I have seen this on Ubuntu 18.04 and 20.04.

Hmm OK, that's a bug but oh well nothing we can do about it.
 
>> I think a reasonable solution is to use the absence of SEGV_PKUERR to
>> basically turn the whole test into a nop at build time, eg:
...
>
> Or can I use this from the pkey tests under selftests/vm?
>
> static inline u32 *siginfo_get_pkey_ptr(siginfo_t *si)
> {
> #ifdef si_pkey
>   return &si->si_pkey;
> #else
>   return (u32 *)(((u8 *)si) + si_pkey_offset);
> #endif
> }
>
> Where si_pkey_offset is 0x20 for powerpc.

Yeah that's fine if it works. Please send a v2 with that change.

cheers

Re: [PATCH v3 0/7] Statsfs: a new ram-based file system for Linux kernel statistics

2020-05-26 Thread Jakub Kicinski

On Tue, 26 May 2020 13:03:10 +0200 Emanuele Giuseppe Esposito wrote:
> There is currently no common way for Linux kernel subsystems to expose
> statistics to userspace shared throughout the Linux kernel; subsystems have
> to take care of gathering and displaying statistics by themselves, for
> example in the form of files in debugfs. For example KVM has its own code
> section that takes care of this in virt/kvm/kvm_main.c, where it sets up
> debugfs handlers for displaying values and aggregating them from various
> subfolders to obtain information about the system state (i.e. displaying
> the total number of exits, calculated by summing all exits of all cpus of
> all running virtual machines).
> 
> Allowing each section of the kernel to do so has two disadvantages. First,
> it will introduce redundant code. Second, debugfs is anyway not the right
> place for statistics (for example it is affected by lockdown)
> 
> In this patch series I introduce statsfs, a synthetic ram-based virtual
> filesystem that takes care of gathering and displaying statistics for the
> Linux kernel subsystems.
> 
> The file system is mounted on /sys/kernel/stats and would be already used
> by kvm. Statsfs was initially introduced by Paolo Bonzini [1].

What's the direct motivation for this work? Moving KVM stats out of
debugfs?

In my experience stats belong in the API used for creating/enumerating
objects, statsfs sounds like going in the exact opposite direction -
creating a parallel structure / hierarchy for exposing stats. I know
nothing about KVM but are you sure all the info that has to be exposed
will be stats?

In case of networking we have the basic stats in sysfs, under the
netdevice's kobject. But since we're not using sysfs much any more 
for config, new stats are added in netlink APIs. Again - same APIs
used for enumeration and config.

[PATCH v2] powerpc/wii: Fix declaration made after definition

2020-05-26 Thread Nathan Chancellor

A 0day randconfig uncovered an error with clang, trimmed for brevity:

arch/powerpc/platforms/embedded6xx/wii.c:195:7: error: attribute
declaration must precede definition [-Werror,-Wignored-attributes]
if (!machine_is(wii))
 ^

The macro machine_is declares mach_##name but define_machine actually
defines mach_##name, hence the warning.

To fix this, move define_machine after the machine_is usage.

Fixes: 5a7ee3198dfa ("powerpc: wii: platform support")
Reported-by: kbuild test robot 
Link: https://github.com/ClangBuiltLinux/linux/issues/989
Reviewed-by: Nick Desaulniers 
Signed-off-by: Nathan Chancellor 
---

v1 -> v2:

* s/is_machine/machine_is/ (Nick)

* Add Nick's reviewed-by tag.

 arch/powerpc/platforms/embedded6xx/wii.c | 25 
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/platforms/embedded6xx/wii.c 
b/arch/powerpc/platforms/embedded6xx/wii.c
index 67e48b0a164e..a802ef957d63 100644
--- a/arch/powerpc/platforms/embedded6xx/wii.c
+++ b/arch/powerpc/platforms/embedded6xx/wii.c
@@ -172,19 +172,6 @@ static void wii_shutdown(void)
flipper_quiesce();
 }
 
-define_machine(wii) {
-   .name   = "wii",
-   .probe  = wii_probe,
-   .setup_arch = wii_setup_arch,
-   .restart= wii_restart,
-   .halt   = wii_halt,
-   .init_IRQ   = wii_pic_probe,
-   .get_irq= flipper_pic_get_irq,
-   .calibrate_decr = generic_calibrate_decr,
-   .progress   = udbg_progress,
-   .machine_shutdown   = wii_shutdown,
-};
-
 static const struct of_device_id wii_of_bus[] = {
{ .compatible = "nintendo,hollywood", },
{ },
@@ -200,3 +187,15 @@ static int __init wii_device_probe(void)
 }
 device_initcall(wii_device_probe);
 
+define_machine(wii) {
+   .name   = "wii",
+   .probe  = wii_probe,
+   .setup_arch = wii_setup_arch,
+   .restart= wii_restart,
+   .halt   = wii_halt,
+   .init_IRQ   = wii_pic_probe,
+   .get_irq= flipper_pic_get_irq,
+   .calibrate_decr = generic_calibrate_decr,
+   .progress   = udbg_progress,
+   .machine_shutdown   = wii_shutdown,
+};

base-commit: b0523c7b1c9d0edcd6c0fe6d2cb558a9ad5c60a8
-- 
2.27.0.rc0

Re: [PATCH] soc: fsl: qe: Replace one-element array and use struct_size() helper

2020-05-26 Thread Li Yang

On Sun, May 24, 2020 at 9:49 PM Qiang Zhao  wrote:
>
> On Wed, May 23, 2020 at 5:22 PM Li Yang 
> > -Original Message-
> > From: Li Yang 
> > Sent: 2020年5月23日 5:22
> > To: Kees Cook 
> > Cc: Gustavo A. R. Silva ; Qiang Zhao
> > ; linuxppc-dev ;
> > moderated list:ARM/FREESCALE IMX / MXC ARM ARCHITECTURE
> > ; lkml ;
> > Gustavo A. R. Silva 
> > Subject: Re: [PATCH] soc: fsl: qe: Replace one-element array and use
> > struct_size() helper
> >
> > On Wed, May 20, 2020 at 10:24 PM Kees Cook 
> > wrote:
> > >
> > > On Wed, May 20, 2020 at 06:52:21PM -0500, Li Yang wrote:
> > > > On Mon, May 18, 2020 at 5:57 PM Kees Cook 
> > wrote:
> > > > > Hm, looking at this code, I see a few other things that need to be
> > > > > fixed:
> > > > >
> > > > > 1) drivers/tty/serial/ucc_uart.c does not do a be32_to_cpu() 
> > > > > conversion
> > > > >on the length test (understandably, a little-endian system has 
> > > > > never
> > run
> > > > >this code since it's ppc specific), but it's still wrong:
> > > > >
> > > > > if (firmware->header.length != fw->size) {
> > > > >
> > > > >compare to the firmware loader:
> > > > >
> > > > > length = be32_to_cpu(hdr->length);
> > > > >
> > > > > 2) drivers/soc/fsl/qe/qe.c does not perform bounds checking on the
> > > > >per-microcode offsets, so the uploader might send data outside the
> > > > >firmware buffer. Perhaps:
> > > >
> > > > We do validate the CRC for each microcode, it is unlikely the CRC
> > > > check can pass if the offset or length is not correct.  But you are
> > > > probably right that it will be safer to check the boundary and fail
> > >
> > > Right, but a malicious firmware file could still match CRC but trick
> > > the kernel code.
> > >
> > > > quicker before we actually start the CRC check.  Will you come up
> > > > with a formal patch or you want us to deal with it?
> > >
> > > It sounds like Gustavo will be sending one, though I don't think
> > > either of us have the hardware to test it with, so if you could do
> > > that part, that would be great! :)
> >
> > That will be great.  I think Zhao Qiang can help with the testing part.
> >
>
> Now the firmware are loaded in uboot, and kernel will do nothing for it.
> So testing on it maybe need some extra codes both in driver and dts.
> In the meanwhile, I am so busy on some high priority work that maybe test work
> could not be done in time.
> Once I am free, I will do it.

Thanks.  You are right that most of the QE drivers doesn't support
requesting firmware in kernel except the ucc_uart.  So it probably can
be tested with that driver without requiring code change.

>
> Best Regards
> Qiang Zhao

Re: [PATCH] KVM: PPC: Book3S HV: read ibm,secure-memory nodes

2020-05-26 Thread Laurent Dufour


Paul, could you please take that patch?

Le 16/04/2020 à 18:27, Laurent Dufour a écrit :

The newly introduced ibm,secure-memory nodes supersede the
ibm,uv-firmware's property secure-memory-ranges.

Firmware will no more expose the secure-memory-ranges property so first
read the new one and if not found rollback to the older one.

Signed-off-by: Laurent Dufour 
---
  arch/powerpc/kvm/book3s_hv_uvmem.c | 14 ++
  1 file changed, 14 insertions(+)

diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c 
b/arch/powerpc/kvm/book3s_hv_uvmem.c
index 53b88cae3e73..ad950f8996e0 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -735,6 +735,20 @@ static u64 kvmppc_get_secmem_size(void)
const __be32 *prop;
u64 size = 0;

+   /*
+* First try the new ibm,secure-memory nodes which supersede the
+* secure-memory-ranges property.
+* If we found somes, no need to read the deprecated one.
+*/
+   for_each_compatible_node(np, NULL, "ibm,secure-memory") {
+   prop = of_get_property(np, "reg", &len);
+   if (!prop)
+   continue;
+   size += of_read_number(prop + 2, 2);
+   }
+   if (size)
+   return size;
+
np = of_find_compatible_node(NULL, NULL, "ibm,uv-firmware");
if (!np)
goto out;

Re: [PATCH v2 08/20] arm64: simplify detection of memory zone boundaries for UMA configs

2020-05-26 Thread Catalin Marinas

On Wed, Apr 29, 2020 at 03:11:14PM +0300, Mike Rapoport wrote:
> From: Mike Rapoport 
> 
> The free_area_init() function only requires the definition of maximal PFN
> for each of the supported zone rater than calculation of actual zone sizes
> and the sizes of the holes between the zones.
> 
> After removal of CONFIG_HAVE_MEMBLOCK_NODE_MAP the free_area_init() is
> available to all architectures.
> 
> Using this function instead of free_area_init_node() simplifies the zone
> detection.
> 
> Signed-off-by: Mike Rapoport 

Acked-by: Catalin Marinas 

(BTW, none of my acks so far made it to the linux-arm-kernel list
because of the large number of people on cc)

Re: [PATCH v2 05/20] mm: use free_area_init() instead of free_area_init_nodes()

2020-05-26 Thread Catalin Marinas

On Wed, Apr 29, 2020 at 03:11:11PM +0300, Mike Rapoport wrote:
> diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
> index e42727e3568e..a650adb358ee 100644
> --- a/arch/arm64/mm/init.c
> +++ b/arch/arm64/mm/init.c
> @@ -206,7 +206,7 @@ static void __init zone_sizes_init(unsigned long min, 
> unsigned long max)
>  #endif
>   max_zone_pfns[ZONE_NORMAL] = max;
>  
> - free_area_init_nodes(max_zone_pfns);
> + free_area_init(max_zone_pfns);
>  }

Acked-by: Catalin Marinas

Re: [PATCH v2 03/20] mm: remove CONFIG_HAVE_MEMBLOCK_NODE_MAP option

2020-05-26 Thread Catalin Marinas

On Wed, Apr 29, 2020 at 03:11:09PM +0300, Mike Rapoport wrote:
> From: Mike Rapoport 
> 
> The CONFIG_HAVE_MEMBLOCK_NODE_MAP is used to differentiate initialization
> of nodes and zones structures between the systems that have region to node
> mapping in memblock and those that don't.
> 
> Currently all the NUMA architectures enable this option and for the
> non-NUMA systems we can presume that all the memory belongs to node 0 and
> therefore the compile time configuration option is not required.
> 
> The remaining few architectures that use DISCONTIGMEM without NUMA are
> easily updated to use memblock_add_node() instead of memblock_add() and
> thus have proper correspondence of memblock regions to NUMA nodes.
> 
> Still, free_area_init_node() must have a backward compatible version
> because its semantics with and without CONFIG_HAVE_MEMBLOCK_NODE_MAP is
> different. Once all the architectures will use the new semantics, the
> entire compatibility layer can be dropped.
> 
> To avoid addition of extra run time memory to store node id for
> architectures that keep memblock but have only a single node, the node id
> field of the memblock_region is guarded by CONFIG_NEED_MULTIPLE_NODES and
> the corresponding accessors presume that in those cases it is always 0.
> 
> Signed-off-by: Mike Rapoport 
> ---
>  .../vm/numa-memblock/arch-support.txt |  34 --
>  arch/alpha/mm/numa.c  |   4 +-
>  arch/arm64/Kconfig|   1 -

For arm64:

Acked-by: Catalin Marinas

Re: [PATCH v3 1/3] riscv: Move kernel mapping to vmalloc zone

2020-05-26 Thread Alex Ghiti


Hi Zong,

Le 5/26/20 à 5:43 AM, Zong Li a écrit :

On Sun, May 24, 2020 at 4:54 PM Alexandre Ghiti  wrote:

This is a preparatory patch for relocatable kernel.

The kernel used to be linked at PAGE_OFFSET address and used to be loaded
physically at the beginning of the main memory. Therefore, we could use
the linear mapping for the kernel mapping.

But the relocated kernel base address will be different from PAGE_OFFSET
and since in the linear mapping, two different virtual addresses cannot
point to the same physical address, the kernel mapping needs to lie outside
the linear mapping.

In addition, because modules and BPF must be close to the kernel (inside
+-2GB window), the kernel is placed at the end of the vmalloc zone minus
2GB, which leaves room for modules and BPF. The kernel could not be
placed at the beginning of the vmalloc zone since other vmalloc
allocations from the kernel could get all the +-2GB window around the
kernel which would prevent new modules and BPF programs to be loaded.

Signed-off-by: Alexandre Ghiti 
---
  arch/riscv/boot/loader.lds.S |  3 +-
  arch/riscv/include/asm/page.h| 10 +-
  arch/riscv/include/asm/pgtable.h | 37 +---
  arch/riscv/kernel/head.S |  3 +-
  arch/riscv/kernel/module.c   |  4 +--
  arch/riscv/kernel/vmlinux.lds.S  |  3 +-
  arch/riscv/mm/init.c | 58 +---
  arch/riscv/mm/physaddr.c |  2 +-
  8 files changed, 87 insertions(+), 33 deletions(-)

diff --git a/arch/riscv/boot/loader.lds.S b/arch/riscv/boot/loader.lds.S
index 47a5003c2e28..62d94696a19c 100644
--- a/arch/riscv/boot/loader.lds.S
+++ b/arch/riscv/boot/loader.lds.S
@@ -1,13 +1,14 @@
  /* SPDX-License-Identifier: GPL-2.0 */

  #include 
+#include 

  OUTPUT_ARCH(riscv)
  ENTRY(_start)

  SECTIONS
  {
-   . = PAGE_OFFSET;
+   . = KERNEL_LINK_ADDR;

 .payload : {
 *(.payload)
diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h
index 2d50f76efe48..48bb09b6a9b7 100644
--- a/arch/riscv/include/asm/page.h
+++ b/arch/riscv/include/asm/page.h
@@ -90,18 +90,26 @@ typedef struct page *pgtable_t;

  #ifdef CONFIG_MMU
  extern unsigned long va_pa_offset;
+extern unsigned long va_kernel_pa_offset;
  extern unsigned long pfn_base;
  #define ARCH_PFN_OFFSET(pfn_base)
  #else
  #define va_pa_offset   0
+#define va_kernel_pa_offset0
  #define ARCH_PFN_OFFSET(PAGE_OFFSET >> PAGE_SHIFT)
  #endif /* CONFIG_MMU */

  extern unsigned long max_low_pfn;
  extern unsigned long min_low_pfn;
+extern unsigned long kernel_virt_addr;

  #define __pa_to_va_nodebug(x)  ((void *)((unsigned long) (x) + va_pa_offset))
-#define __va_to_pa_nodebug(x)  ((unsigned long)(x) - va_pa_offset)
+#define linear_mapping_va_to_pa(x) ((unsigned long)(x) - va_pa_offset)
+#define kernel_mapping_va_to_pa(x) \
+   ((unsigned long)(x) - va_kernel_pa_offset)
+#define __va_to_pa_nodebug(x)  \
+   (((x) >= PAGE_OFFSET) ? \
+   linear_mapping_va_to_pa(x) : kernel_mapping_va_to_pa(x))

  #ifdef CONFIG_DEBUG_VIRTUAL
  extern phys_addr_t __virt_to_phys(unsigned long x);
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 35b60035b6b0..25213cfaf680 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -11,23 +11,29 @@

  #include 

-#ifndef __ASSEMBLY__
-
-/* Page Upper Directory not used in RISC-V */
-#include 
-#include 
-#include 
-#include 
-
-#ifdef CONFIG_MMU
+#ifndef CONFIG_MMU
+#define KERNEL_VIRT_ADDR   PAGE_OFFSET
+#define KERNEL_LINK_ADDR   PAGE_OFFSET
+#else
+/*
+ * Leave 2GB for modules and BPF that must lie within a 2GB range around
+ * the kernel.
+ */
+#define KERNEL_VIRT_ADDR   (VMALLOC_END - SZ_2G + 1)
+#define KERNEL_LINK_ADDR   KERNEL_VIRT_ADDR

  #define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1)
  #define VMALLOC_END  (PAGE_OFFSET - 1)
  #define VMALLOC_START(PAGE_OFFSET - VMALLOC_SIZE)

  #define BPF_JIT_REGION_SIZE(SZ_128M)
-#define BPF_JIT_REGION_START   (PAGE_OFFSET - BPF_JIT_REGION_SIZE)
-#define BPF_JIT_REGION_END (VMALLOC_END)
+#define BPF_JIT_REGION_START   (kernel_virt_addr)
+#define BPF_JIT_REGION_END (kernel_virt_addr + BPF_JIT_REGION_SIZE)

It seems to have a potential risk here, the region of bpf is
overlapping with kernel mapping, so if kernel size is bigger than
128MB, bpf region would be occupied and run out by kernel mapping.


+
+#ifdef CONFIG_64BIT
+#define VMALLOC_MODULE_START   BPF_JIT_REGION_END
+#define VMALLOC_MODULE_END VMALLOC_END
+#endif


Although kernel_virt_addr is a fixed address now, I think it could be
changed for the purpose of relocatable or KASLR, so if
kernel_virt_addr is moved to far from VMALLOC_END than 2G, the region
of module would be too big.



Yes you're right, that's wrong to allow modules to lie outside
the 2G window, thanks for noticing.



In addition, the region of

Re: [PATCH v3 7/7] [not for merge] netstats: example use of stats_fs API

2020-05-26 Thread Emanuele Giuseppe Esposito




Hi Andrew


How do you atomically get and display a group of statistics?

If you look at how the netlink socket works, you will see code like:

 do {
 start = u64_stats_fetch_begin_irq(&cpu_stats->syncp);
 rx_packets = cpu_stats->rx_packets;
 rx_bytes = cpu_stats->rx_bytes;

 } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start));

It will ensure that rx_packets and rx_bytes are consistent with each
other. If the value of the sequence counter changes while inside the
loop, the loop so repeated until it does not change.

In general, hardware counters in NICs are the same.  You tell it to
take a snapshot of the statistics counters, and then read them all
back, to give a consistent view across all the statistics.

I've not looked at this new code in detail, but it looks like you have
one file per statistic, and assume each statistic is independent of
every other statistic. This independence can limit how you use the
values, particularly when debugging. The netlink interface we use does
not have this limitation.


You're right, statistics are treated independently so what you describe 
is currently not supported.


In KVM the utilization is more qualitative, so there isn't such problem.
But as long as the interface is based on file access, the possibility of 
snapshotting might not be useful; however, it could still be considered 
to be added later together with the binary access.


Jonathan, how is your metricfs handling this case?

Thank you,
Emanuele

powerpc/pci: [PATCH 1/1 V3] PCIE PHB reset

2020-05-26 Thread wenxiong

From: Wen Xiong 

Several device drivers hit EEH(Extended Error handling) when triggering
kdump on Pseries PowerVM. This patch implemented a reset of the PHBs
in pci general code when triggering kdump. PHB reset stop all PCI
transactions from normal kernel. We have tested the patch in several
enviroments:
- direct slot adapters
- adapters under the switch
- a VF adapter in PowerVM
- a VF adapter/adapter in KVM guest.

Signed-off-by: Wen Xiong 

---
 arch/powerpc/platforms/pseries/pci.c | 152 +++
 1 file changed, 152 insertions(+)

diff --git a/arch/powerpc/platforms/pseries/pci.c 
b/arch/powerpc/platforms/pseries/pci.c
index 911534b89c85..cb7e4276cf04 100644
--- a/arch/powerpc/platforms/pseries/pci.c
+++ b/arch/powerpc/platforms/pseries/pci.c
@@ -11,6 +11,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #include 
 #include 
@@ -354,3 +356,153 @@ int pseries_root_bridge_prepare(struct pci_host_bridge 
*bridge)
 
return 0;
 }
+
+/**
+ * pseries_get_pdn_addr - Retrieve PHB address
+ * @pe: EEH PE
+ *
+ * Retrieve the assocated PHB address. Actually, there're 2 RTAS
+ * function calls dedicated for the purpose. We need implement
+ * it through the new function and then the old one. Besides,
+ * you should make sure the config address is figured out from
+ * FDT node before calling the function.
+ *
+ */
+static int pseries_get_pdn_addr(struct pci_controller *phb)
+{
+   int ret = -1;
+   int rets[3];
+   int ibm_get_config_addr_info;
+   int ibm_get_config_addr_info2;
+   int config_addr = 0;
+   struct pci_dn *root_pdn, *pdn;
+
+   ibm_get_config_addr_info2   = rtas_token("ibm,get-config-addr-info2");
+   ibm_get_config_addr_info= rtas_token("ibm,get-config-addr-info");
+
+   root_pdn = PCI_DN(phb->dn);
+   pdn = list_first_entry(&root_pdn->child_list, struct pci_dn, list);
+   config_addr = (pdn->busno << 16) | (pdn->devfn << 8);
+
+   if (ibm_get_config_addr_info2 != RTAS_UNKNOWN_SERVICE) {
+   /*
+* First of all, we need to make sure there has one PE
+* associated with the device. If option is 1, it
+* queries if config address is supported in a PE or not.
+* If option is 0, it returns PE config address or config
+* address for the PE primary bus.
+*/
+   ret = rtas_call(ibm_get_config_addr_info2, 4, 2, rets,
+   config_addr, BUID_HI(pdn->phb->buid),
+   BUID_LO(pdn->phb->buid), 1);
+   if (ret || (rets[0] == 0)) {
+   pr_warn("%s: Failed to get address for PHB#%x-PE# 
option=%d config_addr=%x\n",
+   __func__, pdn->phb->global_number, 1, rets[0]);
+   return -1;
+   }
+
+   /* Retrieve the associated PE config address */
+   ret = rtas_call(ibm_get_config_addr_info2, 4, 2, rets,
+   config_addr, BUID_HI(pdn->phb->buid),
+   BUID_LO(pdn->phb->buid), 0);
+   if (ret) {
+   pr_warn("%s: Failed to get address for PHB#%x-PE# 
option=%d config_addr=%x\n",
+   __func__, pdn->phb->global_number, 0, rets[0]);
+   return -1;
+   }
+   return rets[0];
+   }
+
+   if (ibm_get_config_addr_info != RTAS_UNKNOWN_SERVICE) {
+   ret = rtas_call(ibm_get_config_addr_info, 4, 2, rets,
+   config_addr, BUID_HI(pdn->phb->buid),
+   BUID_LO(pdn->phb->buid), 0);
+   if (ret || rets[0]) {
+   pr_warn("%s: Failed to get address for PHB#%x-PE# 
config_addr=%x\n",
+   __func__, pdn->phb->global_number, rets[0]);
+   return -1;
+   }
+   return rets[0];
+   }
+
+   return ret;
+}
+
+static int __init pseries_phb_reset(void)
+{
+   struct pci_controller *phb;
+   int config_addr;
+   int ibm_set_slot_reset;
+   int ibm_configure_pe;
+   int ret;
+
+   if (is_kdump_kernel() || reset_devices) {
+   pr_info("Issue PHB reset ...\n");
+   ibm_set_slot_reset = rtas_token("ibm,set-slot-reset");
+   ibm_configure_pe = rtas_token("ibm,configure-pe");
+
+   if (ibm_set_slot_reset == RTAS_UNKNOWN_SERVICE ||
+   ibm_configure_pe == RTAS_UNKNOWN_SERVICE) {
+   pr_info("%s: EEH functionality not supported\n",
+   __func__);
+   }
+
+   list_for_each_entry(phb, &hose_list, list_node) {
+   config_addr = pseries_get_pdn_addr(phb);
+   if (config_addr == -1)
+   continue;
+
+   ret = rtas_call(ibm_set_slot_reset, 4, 1, N

Re: [PATCH v3 7/7] [not for merge] netstats: example use of stats_fs API

2020-05-26 Thread Andrew Lunn

On Tue, May 26, 2020 at 01:03:17PM +0200, Emanuele Giuseppe Esposito wrote:
> Apply stats_fs on the networking statistics subsystem.
> 
> Currently it only works with disabled network namespace
> (CONFIG_NET_NS=n), because multiple namespaces will have the same
> device name under the same root source that will cause a conflict in
> stats_fs.

Hi Emanuele

How do you atomically get and display a group of statistics?

If you look at how the netlink socket works, you will see code like:

do {
start = u64_stats_fetch_begin_irq(&cpu_stats->syncp);
rx_packets = cpu_stats->rx_packets;
rx_bytes = cpu_stats->rx_bytes;

} while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start));

It will ensure that rx_packets and rx_bytes are consistent with each
other. If the value of the sequence counter changes while inside the
loop, the loop so repeated until it does not change.

In general, hardware counters in NICs are the same.  You tell it to
take a snapshot of the statistics counters, and then read them all
back, to give a consistent view across all the statistics.

I've not looked at this new code in detail, but it looks like you have
one file per statistic, and assume each statistic is independent of
every other statistic. This independence can limit how you use the
values, particularly when debugging. The netlink interface we use does
not have this limitation.

 Andrew

Re: [PATCH] selftests: powerpc: Add test for execute-disabled pkeys

2020-05-26 Thread Sandipan Das

Hi Michael,

On 26/05/20 6:05 pm, Michael Ellerman wrote:
> [...]
>> +
>> +/* Override definitions as they might be inconsistent */
>> +#undef PKEY_DISABLE_ACCESS
>> +#define PKEY_DISABLE_ACCESS 0x3
> 
> Why would they be inconsistent?
> 

The definition in sys/mman.h still uses the value specific to
Intel's implementation i.e. 1, when this should have been 3
for powerpc. I have seen this on Ubuntu 18.04 and 20.04.

> 
>> +/* Older distros might not define this */
>> +#ifndef SEGV_PKUERR
>> +#define SEGV_PKUERR 4
>> +#endif
> ...
>> +
>> +/* Restore permissions in order to continue */
>> +switch (fcode) {
>> +case SEGV_ACCERR:
>> +if (mprotect(insns, pgsize, PROT_READ | PROT_WRITE)) {
>> +perror("mprotect");
>> +goto fail;
>> +}
>> +break;
>> +case SEGV_PKUERR:
>> +if (sinfo->si_pkey != fpkey)
>> +goto fail;
> 
> This doesn't compile on older distros, eg Ubuntu 16.04:
> 
>   pkey_exec_prot.c: In function 'segv_handler':
>   pkey_exec_prot.c:121:12: error: 'siginfo_t {aka struct }' has no 
> member named 'si_pkey'
>  if (sinfo->si_pkey != fpkey)
>   ^
>   pkey_exec_prot.c:151:24: error: 'siginfo_t {aka struct }' has no 
> member named 'si_pkey'
>  pkey_set_rights(sinfo->si_pkey, 0);
>   ^
>   ../../lib.mk:142: recipe for target 
> '/output/kselftest/powerpc/mm/pkey_exec_prot' failed
> 

Thanks for reporting this.

> 
> I think a reasonable solution is to use the absence of SEGV_PKUERR to
> basically turn the whole test into a nop at build time, eg:
> 
> diff --git a/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c 
> b/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c
> index b346ad205e68..218257b89fbb 100644
> --- a/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c
> +++ b/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c
> @@ -30,9 +30,7 @@
>  #define PKEY_DISABLE_EXECUTE   0x4
> 
>  /* Older distros might not define this */
> -#ifndef SEGV_PKUERR
> -#define SEGV_PKUERR4
> -#endif
> +#ifdef SEGV_PKUERR
> 
>  #define SYS_pkey_mprotect  386
>  #define SYS_pkey_alloc 384
> @@ -319,6 +317,13 @@ static int test(void)
> 
> return 0;
>  }
> +#else
> +static int test(void)
> +{
> +   printf("Test built with old libc lacking pkey support.\n");
> +   SKIP_IF(true);
> +}
> +#endif /* SEGV_PKUERR */
> 
>  int main(void)
>  {
> 
> 

Or can I use this from the pkey tests under selftests/vm?

static inline u32 *siginfo_get_pkey_ptr(siginfo_t *si)
{
#ifdef si_pkey
return &si->si_pkey;
#else
return (u32 *)(((u8 *)si) + si_pkey_offset);
#endif
}

Where si_pkey_offset is 0x20 for powerpc.


- Sandipan

Re: [PATCH v2 0/2] powerpc: Remove support for ppc405/440 Xilinx platforms

2020-05-26 Thread Michael Ellerman

Michal Simek  writes:
> Hi Michael,
>
> On 01. 04. 20 13:30, Michal Simek wrote:
>> On 01. 04. 20 12:38, Takashi Iwai wrote:
>>> On Wed, 01 Apr 2020 12:35:16 +0200,
>>> Michael Ellerman wrote:

 Michal Simek  writes:
> On 01. 04. 20 4:07, Michael Ellerman wrote:
>> Michal Simek  writes:
>>> Hi,
>>>
>>> recently we wanted to update xilinx intc driver and we found that 
>>> function
>>> which we wanted to remove is still wired by ancient Xilinx PowerPC
>>> platforms. Here is the thread about it.
>>> https://lore.kernel.org/linux-next/48d3232d-0f1d-42ea-3109-f44bbabfa...@xilinx.com/
>>>
>>> I have been talking about it internally and there is no interest in 
>>> these
>>> platforms and it is also orphan for quite a long time. None is really
>>> running/testing these platforms regularly that's why I think it makes 
>>> sense
>>> to remove them also with drivers which are specific to this platform.
>>>
>>> U-Boot support was removed in 2017 without anybody complain about it
>>> https://github.com/Xilinx/u-boot-xlnx/commit/98f705c9cefdfdba62c069821bbba10273a0a8ed
>>>
>>> Based on current ppc/next.
>>>
>>> If anyone has any objection about it, please let me know.
>>
>> Thanks for taking the time to find all this code and remove it.
>>
>> I'm not going to take this series for v5.7, it was posted too close to
>> the merge window, and doing so wouldn't give people much time to object,
>> especially given people are distracted at the moment.
>>
>> I'm happy to take it for v5.8, assuming there's no major objections.
>
> Sure. Just to let you know Christophe Leroy included this patch in his
> series about ppc405 removal. It should be the same.
>
> If you don't want to take that alsa patch I can send it separately and
> this patch can be taken from his series. I don't really mind but please
> let me know what way you prefer.

 It's better to keep it all together, so I'm happy take the alsa patch as
 well, it's already been acked.
>
> Can you please take this series? I know that there is v5 from Christophe
> which has this 1/2 as 1/13. But I need this alsa patch too and I would
> like to close this because it is around for almost 2 months and none
> raised a concern about removing just these Xilinx platforms.

Sorry I meant to reply to your last mail.

I have Christophe's series in my testing branch, planning for it to be
in v5.8.

Even if the rest of his series doesn't make it for some reason, as you
say the Xilinx removal is uncontroversial so I'll keep that in.

I forgot about the sound patch, I'll pick that up as well.

cheers

Re: [PATCH v4 07/45] powerpc/ptdump: Limit size of flags text to 1/2 chars on PPC32

2020-05-26 Thread Michael Ellerman

Christophe Leroy  writes:
> Le 25/05/2020 à 07:15, Michael Ellerman a écrit :
>> Christophe Leroy  writes:
>>> In order to have all flags fit on a 80 chars wide screen,
>>> reduce the flags to 1 char (2 where ambiguous).
>> 
>> I don't love this, the output is less readable. Is fitting on an 80 char
>> screen a real issue for you? I just make my terminal window bigger.
>
> I don't have strong opinion about that, and the terminal can be made bigger.
> I just don't like how messy it is, some flags are so big that they hide 
> other ones and getting it more ordered and more compact helped me during 
> all the verifications I did with this series, but we can leave it as is 
> if you prefer.

I think I do.

> Would you like a v5 without patches 7 and 8 ? Or I can just resend the 
> patches that will be impacted, that is 9 and 38 ?

I dropped 7 and 8 and then fixed up 9 and 38, it was easy enough.

I used "coherent" and "huge".

> With the change I get.
>
> ---[ Start of kernel VM ]---
> 0xc000-0xc0ff  0x16M   h  r   x  psh a
> 0xc100-0xc7ff  0x0100   112M   h  rw psh  d  a
> ---[ vmalloc() Area ]---
> 0xc900-0xc9003fff  0x050e400016K  rw psh  d  a
> 0xc9008000-0xc900bfff  0x050ec00016K  rw psh  d  a
> 0xc901-0xc9013fff  0xd00016K  rw p  i  g  sh  d  a
> 0xc9018000-0xc901bfff  0x050f16K  rw psh  d  a

It's definitely more compact :)

But I worry no one other than you will be able to decipher it, without
constantly referring back to the source code.

cheers

Re: [PATCH] selftests: powerpc: Add test for execute-disabled pkeys

2020-05-26 Thread Michael Ellerman

Hi Sandipan,

Sandipan Das  writes:
> diff --git a/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c 
> b/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c
> new file mode 100644
> index ..b346ad205e68
> --- /dev/null
> +++ b/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c
> @@ -0,0 +1,326 @@
> +// SPDX-License-Identifier: GPL-2.0+
> +
> +/*
> + * Copyright 2020, Sandipan Das, IBM Corp.
> + *
> + * Test if applying execute protection on pages using memory
> + * protection keys works as expected.
> + */
> +
> +#define _GNU_SOURCE
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +#include 
> +#include 
> +#include 
> +
> +#include "utils.h"
> +
> +/* Override definitions as they might be inconsistent */
> +#undef PKEY_DISABLE_ACCESS
> +#define PKEY_DISABLE_ACCESS  0x3

Why would they be inconsistent?


> +/* Older distros might not define this */
> +#ifndef SEGV_PKUERR
> +#define SEGV_PKUERR  4
> +#endif
...
> +
> + /* Restore permissions in order to continue */
> + switch (fcode) {
> + case SEGV_ACCERR:
> + if (mprotect(insns, pgsize, PROT_READ | PROT_WRITE)) {
> + perror("mprotect");
> + goto fail;
> + }
> + break;
> + case SEGV_PKUERR:
> + if (sinfo->si_pkey != fpkey)
> + goto fail;

This doesn't compile on older distros, eg Ubuntu 16.04:

  pkey_exec_prot.c: In function 'segv_handler':
  pkey_exec_prot.c:121:12: error: 'siginfo_t {aka struct }' has no 
member named 'si_pkey'
 if (sinfo->si_pkey != fpkey)
  ^
  pkey_exec_prot.c:151:24: error: 'siginfo_t {aka struct }' has no 
member named 'si_pkey'
 pkey_set_rights(sinfo->si_pkey, 0);
  ^
  ../../lib.mk:142: recipe for target 
'/output/kselftest/powerpc/mm/pkey_exec_prot' failed


I think a reasonable solution is to use the absence of SEGV_PKUERR to
basically turn the whole test into a nop at build time, eg:

diff --git a/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c 
b/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c
index b346ad205e68..218257b89fbb 100644
--- a/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c
+++ b/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c
@@ -30,9 +30,7 @@
 #define PKEY_DISABLE_EXECUTE   0x4

 /* Older distros might not define this */
-#ifndef SEGV_PKUERR
-#define SEGV_PKUERR4
-#endif
+#ifdef SEGV_PKUERR

 #define SYS_pkey_mprotect  386
 #define SYS_pkey_alloc 384
@@ -319,6 +317,13 @@ static int test(void)

return 0;
 }
+#else
+static int test(void)
+{
+   printf("Test built with old libc lacking pkey support.\n");
+   SKIP_IF(true);
+}
+#endif /* SEGV_PKUERR */

 int main(void)
 {


cheers

Re: [RESEND PATCH v7 4/5] ndctl/papr_scm, uapi: Add support for PAPR nvdimm specific methods

2020-05-26 Thread Michael Ellerman

Vaibhav Jain  writes:
> Hi Ira, Mpe and Aneesh,
>
> Vaibhav Jain  writes:
>
>> Michael Ellerman  writes:
>>
>>> Ira Weiny  writes:
 On Wed, May 20, 2020 at 12:30:57AM +0530, Vaibhav Jain wrote:
> Introduce support for Papr nvDimm Specific Methods (PDSM) in papr_scm
> modules and add the command family to the white list of NVDIMM command
> sets. Also advertise support for ND_CMD_CALL for the dimm
> command mask and implement necessary scaffolding in the module to
> handle ND_CMD_CALL ioctl and PDSM requests that we receive.
>>> ...
> + *
> + * Payload Version:
> + *
> + * A 'payload_version' field is present in PDSM header that indicates a 
> specific
> + * version of the structure present in PDSM Payload for a given PDSM 
> command.
> + * This provides backward compatibility in case the PDSM Payload 
> structure
> + * evolves and different structures are supported by 'papr_scm' and 
> 'libndctl'.
> + *
> + * When sending a PDSM Payload to 'papr_scm', 'libndctl' should send the 
> version
> + * of the payload struct it supports via 'payload_version' field. The 
> 'papr_scm'
> + * module when servicing the PDSM envelope checks the 'payload_version' 
> and then
> + * uses 'payload struct version' == MIN('payload_version field',
> + * 'max payload-struct-version supported by papr_scm') to service the 
> PDSM.
> + * After servicing the PDSM, 'papr_scm' put the negotiated version of 
> payload
> + * struct in returned 'payload_version' field.

 FWIW many people believe using a size rather than version is more 
 sustainable.
 It is expected that new payload structures are larger (more features) than 
 the
 previous payload structure.

 I can't find references at the moment through.
>>>
>>> I think clone_args is a good modern example:
>>>
>>>   
>>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/sched.h#n88
>>
>> Thank Ira and Mpe for pointing this out. I looked into how clone3 sycall
>> handles clone_args and few differences came out:
>>
>> * Unlike clone_args that are always transferred in one direction from
>>   user-space to kernel, payload contents of pdsms are transferred in both
>>   directions. Having a single version number makes it easier for
>>   user-space and kernel to determine what data will be exchanged.
>>
>> * For PDSMs, the version number is negotiated between libndctl and
>>   kernel. For example in case kernel only supports an older version of
>>   a structure, its free to send a lower version number back to
>>   libndctl. Such negotiations doesnt happen with clone3 syscall.
>
> If you are ok with the explaination above please let me know. I will
> quickly spin off a v8 addressing your review comments.

I don't have strong opinions about the user API, it's really up to the
nvdimm folks.

cheers

[PATCH v3 7/7] [not for merge] netstats: example use of stats_fs API

2020-05-26 Thread Emanuele Giuseppe Esposito

Apply stats_fs on the networking statistics subsystem.

Currently it only works with disabled network namespace
(CONFIG_NET_NS=n), because multiple namespaces will have the same
device name under the same root source that will cause a conflict in
stats_fs.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 include/linux/netdevice.h |  2 ++
 net/Kconfig   |  1 +
 net/core/dev.c| 66 +++
 3 files changed, 69 insertions(+)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 130a668049ab..408c4e7b0e21 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -48,6 +48,7 @@
 #include 
 #include 
 #include 
+#include 
 
 struct netpoll_info;
 struct device;
@@ -2117,6 +2118,7 @@ struct net_device {
unsignedwol_enabled:1;
 
struct list_headnet_notifier_list;
+   struct stats_fs_source  *stats_fs_src;
 
 #if IS_ENABLED(CONFIG_MACSEC)
/* MACsec management functions */
diff --git a/net/Kconfig b/net/Kconfig
index df8d8c9bd021..3441d5bb6107 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -8,6 +8,7 @@ menuconfig NET
select NLATTR
select GENERIC_NET_UTILS
select BPF
+   select STATS_FS_API
---help---
  Unless you really know what you are doing, you should say Y here.
  The reason is that some programs need kernel networking support even
diff --git a/net/core/dev.c b/net/core/dev.c
index 522288177bbd..3db48cd1a097 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -142,6 +142,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "net-sysfs.h"
 
@@ -150,6 +151,11 @@
 /* This should be increased if a protocol with a bigger head is added. */
 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 
+#define NETDEV_STAT(str, m, ...)   
\
+   { str, offsetof(struct rtnl_link_stats64, m),   
\
+ &stats_fs_type_netdev_u64,
\
+ STATS_FS_SUM, ## __VA_ARGS__ }
+
 static DEFINE_SPINLOCK(ptype_lock);
 static DEFINE_SPINLOCK(offload_lock);
 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
@@ -196,6 +202,53 @@ static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
 
 static seqcount_t devnet_rename_seq;
 
+static uint64_t stats_fs_get_netdev_u64(struct stats_fs_value *val,
+   void *base)
+{
+   struct net_device *netdev = (struct net_device *)base;
+   struct rtnl_link_stats64 net_stats;
+
+   dev_get_stats(netdev, &net_stats);
+
+   return stats_fs_get_u64(val, &net_stats);
+}
+
+static struct stats_fs_type stats_fs_type_netdev_u64 = {
+   .get = stats_fs_get_netdev_u64,
+   .clear = NULL,
+   .sign = false
+};
+
+static struct stats_fs_source *netdev_root;
+
+static struct stats_fs_value stats_fs_netdev_entries[] = {
+   NETDEV_STAT("rx_packets", rx_packets),
+   NETDEV_STAT("tx_packets", tx_packets),
+   NETDEV_STAT("rx_bytes", rx_bytes),
+   NETDEV_STAT("tx_bytes", tx_bytes),
+   NETDEV_STAT("rx_errors", rx_errors),
+   NETDEV_STAT("tx_errors", tx_errors),
+   NETDEV_STAT("rx_dropped", rx_dropped),
+   NETDEV_STAT("tx_dropped", tx_dropped),
+   NETDEV_STAT("multicast", multicast),
+   NETDEV_STAT("collisions", collisions),
+   NETDEV_STAT("rx_length_errors", rx_length_errors),
+   NETDEV_STAT("rx_over_errors", rx_over_errors),
+   NETDEV_STAT("rx_crc_errors", rx_crc_errors),
+   NETDEV_STAT("rx_frame_errors", rx_frame_errors),
+   NETDEV_STAT("rx_fifo_errors", rx_fifo_errors),
+   NETDEV_STAT("rx_missed_errors", rx_missed_errors),
+   NETDEV_STAT("tx_aborted_errors", tx_aborted_errors),
+   NETDEV_STAT("tx_carrier_errors", tx_carrier_errors),
+   NETDEV_STAT("tx_fifo_errors", tx_fifo_errors),
+   NETDEV_STAT("tx_heartbeat_errors", tx_heartbeat_errors),
+   NETDEV_STAT("tx_window_errors", tx_window_errors),
+   NETDEV_STAT("rx_compressed", rx_compressed),
+   NETDEV_STAT("tx_compressed", tx_compressed),
+   NETDEV_STAT("rx_nohandler", rx_nohandler),
+   { NULL }
+};
+
 static inline void dev_base_seq_inc(struct net *net)
 {
while (++net->dev_base_seq == 0)
@@ -8783,6 +8836,11 @@ static void rollback_registered_many(struct list_head 
*head)
ASSERT_RTNL();
 
list_for_each_entry_safe(dev, tmp, head, unreg_list) {
+   stats_fs_source_remove_subordinate(netdev_root,
+  dev->stats_fs_src);
+   stats_fs_source_revoke(dev->stats_fs_src);
+   stats_fs_source_put(dev->stats_fs_src);
+
/* Some devices call without registering
 * for initialization unwind. Remove those
 * devices and proceed with the remaining.
@@ -9436,6 +9494,11 @@ int register_netdevice(struct net_device *dev)
dev->rtnl_lin

[PATCH v3 6/7] [not for merge] kvm: example of stats_fs_value show function

2020-05-26 Thread Emanuele Giuseppe Esposito

Add an example of the show function using the mp_state value.

mp_state is an enum that represents the VCPU state,
so instead of displaying its integer representation,
the show function takes care of translating the integer into a
more meaningful string representation.

The VCPU status is shown in the kvm//vcpu/mp_state file

Signed-off-by: Emanuele Giuseppe Esposito 
---
 arch/x86/kvm/stats_fs.c | 54 +
 1 file changed, 54 insertions(+)

diff --git a/arch/x86/kvm/stats_fs.c b/arch/x86/kvm/stats_fs.c
index f6edebb9c559..902be18562da 100644
--- a/arch/x86/kvm/stats_fs.c
+++ b/arch/x86/kvm/stats_fs.c
@@ -39,11 +39,65 @@ struct stats_fs_value stats_fs_vcpu_arch_tsc_frac[] = {
{ NULL } /* base is &kvm_tsc_scaling_ratio_frac_bits */
 };
 
+char *stats_fs_vcpu_get_mpstate(uint64_t state)
+{
+   char *state_str;
+
+   state_str = kzalloc(20, GFP_KERNEL);
+   if (!state_str)
+   return ERR_PTR(-ENOMEM);
+
+   switch (state) {
+   case KVM_MP_STATE_RUNNABLE:
+   strcpy(state_str, "RUNNABLE");
+   break;
+   case KVM_MP_STATE_UNINITIALIZED:
+   strcpy(state_str, "UNINITIALIZED");
+   break;
+   case KVM_MP_STATE_INIT_RECEIVED:
+   strcpy(state_str, "INIT_RECEIVED");
+   break;
+   case KVM_MP_STATE_HALTED:
+   strcpy(state_str, "HALTED");
+   break;
+   case KVM_MP_STATE_SIPI_RECEIVED:
+   strcpy(state_str, "SIPI_RECEIVED");
+   break;
+   case KVM_MP_STATE_STOPPED:
+   strcpy(state_str, "STOPPED");
+   break;
+   case KVM_MP_STATE_CHECK_STOP:
+   strcpy(state_str, "CHECK_STOP");
+   break;
+   case KVM_MP_STATE_OPERATING:
+   strcpy(state_str, "OPERATING");
+   break;
+   case KVM_MP_STATE_LOAD:
+   strcpy(state_str, "LOAD");
+   break;
+   default:
+   strcpy(state_str, "UNRECOGNIZED");
+   break;
+   }
+
+   return state_str;
+}
+
+struct stats_fs_value stats_fs_vcpu_mp_state[] = {
+   VCPU_ARCH_STATS_FS("mp_state", kvm_vcpu_arch, mp_state,
+  .type = &stats_fs_type_u32,
+  .show = stats_fs_vcpu_get_mpstate),
+   { NULL }
+};
+
 void kvm_arch_create_vcpu_stats_fs(struct kvm_vcpu *vcpu)
 {
stats_fs_source_add_values(vcpu->stats_fs_src, stats_fs_vcpu_tsc_offset,
   &vcpu->arch, 0);
 
+   stats_fs_source_add_values(vcpu->stats_fs_src, stats_fs_vcpu_mp_state,
+  &vcpu->arch, 0);
+
if (lapic_in_kernel(vcpu))
stats_fs_source_add_values(vcpu->stats_fs_src,
   stats_fs_vcpu_arch_lapic_timer,
-- 
2.25.4

[PATCH v3 5/7] kvm_main: replace debugfs with stats_fs

2020-05-26 Thread Emanuele Giuseppe Esposito

Use stats_fs API instead of debugfs to create sources and add values.

This also requires to change all architecture files to replace the old
debugfs_entries with stats_fs_vcpu_entries and statsfs_vm_entries.

The files/folders name and organization is kept unchanged, and a symlink
in sys/kernel/debugfs/kvm is left for backward compatibility.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 arch/arm64/kvm/Kconfig  |   1 +
 arch/arm64/kvm/guest.c  |   2 +-
 arch/mips/kvm/Kconfig   |   1 +
 arch/mips/kvm/mips.c|   2 +-
 arch/powerpc/kvm/Kconfig|   1 +
 arch/powerpc/kvm/book3s.c   |  12 +-
 arch/powerpc/kvm/booke.c|   8 +-
 arch/s390/kvm/Kconfig   |   1 +
 arch/s390/kvm/kvm-s390.c|  16 +-
 arch/x86/include/asm/kvm_host.h |   2 +-
 arch/x86/kvm/Kconfig|   1 +
 arch/x86/kvm/Makefile   |   2 +-
 arch/x86/kvm/debugfs.c  |  64 ---
 arch/x86/kvm/stats_fs.c |  60 ++
 arch/x86/kvm/x86.c  |  11 +-
 include/linux/kvm_host.h|  45 ++---
 virt/kvm/arm/arm.c  |   2 +-
 virt/kvm/kvm_main.c | 318 +---
 18 files changed, 161 insertions(+), 388 deletions(-)
 delete mode 100644 arch/x86/kvm/debugfs.c
 create mode 100644 arch/x86/kvm/stats_fs.c

diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index 449386d76441..f95f6d1c3610 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -23,6 +23,7 @@ config KVM
depends on OF
# for TASKSTATS/TASK_DELAY_ACCT:
depends on NET && MULTIUSER
+   select STATS_FS_API
select MMU_NOTIFIER
select PREEMPT_NOTIFIERS
select HAVE_KVM_CPU_RELAX_INTERCEPT
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index 8417b200bec9..235ed44e4353 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -29,7 +29,7 @@
 
 #include "trace.h"
 
-struct kvm_stats_debugfs_item debugfs_entries[] = {
+struct stats_fs_value stats_fs_vcpu_entries[] = {
VCPU_STAT("halt_successful_poll", halt_successful_poll),
VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig
index b91d145aa2d5..b19fbc5297b4 100644
--- a/arch/mips/kvm/Kconfig
+++ b/arch/mips/kvm/Kconfig
@@ -19,6 +19,7 @@ config KVM
tristate "Kernel-based Virtual Machine (KVM) support"
depends on HAVE_KVM
depends on MIPS_FP_SUPPORT
+   select STATS_FS_API
select EXPORT_UASM
select PREEMPT_NOTIFIERS
select KVM_GENERIC_DIRTYLOG_READ_PROTECT
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index fdf1c14d9205..a47d21f35444 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -39,7 +39,7 @@
 #define VECTORSPACING 0x100/* for EI/VI mode */
 #endif
 
-struct kvm_stats_debugfs_item debugfs_entries[] = {
+struct stats_fs_value stats_fs_vcpu_entries[] = {
VCPU_STAT("wait", wait_exits),
VCPU_STAT("cache", cache_exits),
VCPU_STAT("signal", signal_exits),
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 12885eda324e..6f0675edfe7c 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -19,6 +19,7 @@ if VIRTUALIZATION
 
 config KVM
bool
+   select STATS_FS_API
select PREEMPT_NOTIFIERS
select HAVE_KVM_EVENTFD
select HAVE_KVM_VCPU_ASYNC_IOCTL
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 37508a356f28..e3346b3087d0 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -38,7 +38,7 @@
 
 /* #define EXIT_DEBUG */
 
-struct kvm_stats_debugfs_item debugfs_entries[] = {
+struct stats_fs_value stats_fs_vcpu_entries[] = {
VCPU_STAT("exits", sum_exits),
VCPU_STAT("mmio", mmio_exits),
VCPU_STAT("sig", signal_exits),
@@ -66,8 +66,14 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
VCPU_STAT("pthru_all", pthru_all),
VCPU_STAT("pthru_host", pthru_host),
VCPU_STAT("pthru_bad_aff", pthru_bad_aff),
-   VM_STAT("largepages_2M", num_2M_pages, .mode = 0444),
-   VM_STAT("largepages_1G", num_1G_pages, .mode = 0444),
+   { NULL }
+};
+
+struct stats_fs_value stats_fs_vm_entries[] = {
+   VM_STAT("largepages_2M", num_2M_pages,
+   .value_flag = STATS_FS_FLOATING_VALUE),
+   VM_STAT("largepages_1G", num_1G_pages,
+   .value_flag = STATS_FS_FLOATING_VALUE),
{ NULL }
 };
 
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index c2984cb6dfa7..b14c07786cc8 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -35,7 +35,12 @@
 
 unsigned long kvmppc_booke_handlers;
 
-struct kvm_stats_debugfs_item debugfs_entries[] = {
+struct stats_fs_value stats_fs_vm_entries[] = {
+   VM_STAT("remote_tlb_flush", remote_tlb_flush),
+   { NULL }
+};
+
+stru

[PATCH v3 4/7] stats_fs fs: virtual fs to show stats to the end-user

2020-05-26 Thread Emanuele Giuseppe Esposito

Add virtual fs that maps stats_fs sources with directories, and values
(simple or aggregates) to files.

Every time a file is read/cleared, the fs internally invokes the stats_fs
API to get/set the requested value.

Also introduce the optional show function in each value, that allows
to customize how the value is displayed inside a file. This could be
especially useful with enums.

fs/stats_fs/inode.cis pretty much similar to what is done in
fs/debugfs/inode.c, with the exception that the API is only
composed by stats_fs_create_file, stats_fs_create_dir and stats_fs_remove.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 fs/stats_fs/Makefile   |   2 +-
 fs/stats_fs/inode.c| 461 +
 fs/stats_fs/internal.h |  15 ++
 fs/stats_fs/stats_fs.c |  92 +++-
 include/linux/stats_fs.h   |  18 ++
 include/uapi/linux/magic.h |   1 +
 tools/lib/api/fs/fs.c  |  21 ++
 7 files changed, 608 insertions(+), 2 deletions(-)
 create mode 100644 fs/stats_fs/inode.c

diff --git a/fs/stats_fs/Makefile b/fs/stats_fs/Makefile
index bc59a54d5721..19b7e13f6c3d 100644
--- a/fs/stats_fs/Makefile
+++ b/fs/stats_fs/Makefile
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0-only
-stats_fs-objs  := stats_fs.o
+stats_fs-objs  := inode.o stats_fs.o
 stats_fs-tests-objs:= stats_fs-tests.o
 
 obj-$(CONFIG_STATS_FS) += stats_fs.o
diff --git a/fs/stats_fs/inode.c b/fs/stats_fs/inode.c
new file mode 100644
index ..eaa0a8bc7466
--- /dev/null
+++ b/fs/stats_fs/inode.c
@@ -0,0 +1,461 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *  inode.c - part of stats_fs, a tiny little stats_fs file system
+ *
+ *  Copyright (C) 2020 Emanuele Giuseppe Esposito 
+ *  Copyright (C) 2020 Redhat
+ */
+#define pr_fmt(fmt)"stats_fs: " fmt
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "internal.h"
+
+#define STATS_FS_DEFAULT_MODE  0700
+
+static struct simple_fs stats_fs;
+static bool stats_fs_registered;
+
+struct stats_fs_mount_opts {
+   kuid_t uid;
+   kgid_t gid;
+   umode_t mode;
+};
+
+enum {
+   Opt_uid,
+   Opt_gid,
+   Opt_mode,
+   Opt_err
+};
+
+static const match_table_t tokens = {
+   {Opt_uid, "uid=%u"},
+   {Opt_gid, "gid=%u"},
+   {Opt_mode, "mode=%o"},
+   {Opt_err, NULL}
+};
+
+struct stats_fs_fs_info {
+   struct stats_fs_mount_opts mount_opts;
+};
+
+static int stats_fs_parse_options(char *data, struct stats_fs_mount_opts *opts)
+{
+   substring_t args[MAX_OPT_ARGS];
+   int option;
+   int token;
+   kuid_t uid;
+   kgid_t gid;
+   char *p;
+
+   opts->mode = STATS_FS_DEFAULT_MODE;
+
+   while ((p = strsep(&data, ",")) != NULL) {
+   if (!*p)
+   continue;
+
+   token = match_token(p, tokens, args);
+   switch (token) {
+   case Opt_uid:
+   if (match_int(&args[0], &option))
+   return -EINVAL;
+   uid = make_kuid(current_user_ns(), option);
+   if (!uid_valid(uid))
+   return -EINVAL;
+   opts->uid = uid;
+   break;
+   case Opt_gid:
+   if (match_int(&args[0], &option))
+   return -EINVAL;
+   gid = make_kgid(current_user_ns(), option);
+   if (!gid_valid(gid))
+   return -EINVAL;
+   opts->gid = gid;
+   break;
+   case Opt_mode:
+   if (match_octal(&args[0], &option))
+   return -EINVAL;
+   opts->mode = option & S_IALLUGO;
+   break;
+   /*
+* We might like to report bad mount options here;
+* but traditionally stats_fs has ignored all mount options
+*/
+   }
+   }
+
+   return 0;
+}
+
+static int stats_fs_apply_options(struct super_block *sb)
+{
+   struct stats_fs_fs_info *fsi = sb->s_fs_info;
+   struct inode *inode = d_inode(sb->s_root);
+   struct stats_fs_mount_opts *opts = &fsi->mount_opts;
+
+   inode->i_mode &= ~S_IALLUGO;
+   inode->i_mode |= opts->mode;
+
+   inode->i_uid = opts->uid;
+   inode->i_gid = opts->gid;
+
+   return 0;
+}
+
+static int stats_fs_remount(struct super_block *sb, int *flags, char *data)
+{
+   int err;
+   struct stats_fs_fs_info *fsi = sb->s_fs_info;
+
+   sync_filesystem(sb);
+   err = stats_fs_parse_options(data, &fsi->mount_opts);
+   if (err)
+   goto fail;
+
+   stats_fs_apply_options(sb);
+
+fail:
+   return err;
+}
+
+static int stats_fs_show_options(struct seq_file *m, struct dentry *root)
+{
+   struct stats_fs_fs_info *f

[PATCH v3 3/7] kunit: tests for stats_fs API

2020-05-26 Thread Emanuele Giuseppe Esposito

Add kunit tests to extensively test the stats_fs API functionality.

In order to run them, the kernel .config must set CONFIG_KUNIT=y
and a new .kunitconfig file must be created with CONFIG_STATS_FS=y
and CONFIG_STATS_FS_TEST=y

Tests can be then started by running the following command from the root
directory of the linux kernel source tree:
./tools/testing/kunit/kunit.py run --timeout=30 --jobs=`nproc --all`

Signed-off-by: Emanuele Giuseppe Esposito 
---
 fs/Kconfig   |6 +
 fs/stats_fs/Makefile |2 +
 fs/stats_fs/stats_fs-tests.c | 1097 ++
 3 files changed, 1105 insertions(+)
 create mode 100644 fs/stats_fs/stats_fs-tests.c

diff --git a/fs/Kconfig b/fs/Kconfig
index 684ad61129ab..02bbb0e4cdf7 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -227,6 +227,12 @@ config STATS_FS
  stats_fs is a virtual file system that provides counters and
  other statistics about the running kernel.
 
+config STATS_FS_TEST
+   bool "Tests for stats_fs"
+   depends on STATS_FS && KUNIT
+   help
+ tests for the stats_fs API.
+
 config STATS_FS_API
bool
imply STATS_FS
diff --git a/fs/stats_fs/Makefile b/fs/stats_fs/Makefile
index bd988daa4c39..bc59a54d5721 100644
--- a/fs/stats_fs/Makefile
+++ b/fs/stats_fs/Makefile
@@ -1,5 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0-only
 stats_fs-objs  := stats_fs.o
+stats_fs-tests-objs:= stats_fs-tests.o
 
 obj-$(CONFIG_STATS_FS) += stats_fs.o
 obj-$(CONFIG_STATS_FS_STUB) += stub.o
+obj-$(CONFIG_STATS_FS_TEST) += stats_fs-tests.o
diff --git a/fs/stats_fs/stats_fs-tests.c b/fs/stats_fs/stats_fs-tests.c
new file mode 100644
index ..bbac133d7fe7
--- /dev/null
+++ b/fs/stats_fs/stats_fs-tests.c
@@ -0,0 +1,1097 @@
+// SPDX-License-Identifier: GPL-2.0
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include "internal.h"
+
+#define STATS_FS_STAT(el, x, ...)  
\
+   {  \
+   .name = #x, .offset = offsetof(struct container, el.x),\
+   ##__VA_ARGS__  \
+   }
+
+#define ARR_SIZE(el) ((int)(sizeof(el) / sizeof(struct stats_fs_value) - 1))
+
+struct test_values_struct {
+   uint64_t u64;
+   int32_t s32;
+   bool bo;
+   uint8_t u8;
+   int16_t s16;
+};
+
+struct container {
+   struct test_values_struct vals;
+};
+
+struct stats_fs_value test_values[6] = {
+   STATS_FS_STAT(vals, u64, .type = &stats_fs_type_u64,
+ .aggr_kind = STATS_FS_NONE,
+ .value_flag = STATS_FS_FLOATING_VALUE),
+   STATS_FS_STAT(vals, s32, .type = &stats_fs_type_s32,
+ .aggr_kind = STATS_FS_NONE),
+   STATS_FS_STAT(vals, bo, .type = &stats_fs_type_bool,
+ .aggr_kind = STATS_FS_NONE,
+ .value_flag = STATS_FS_FLOATING_VALUE),
+   STATS_FS_STAT(vals, u8, .type = &stats_fs_type_u8,
+ .aggr_kind = STATS_FS_NONE),
+   STATS_FS_STAT(vals, s16, .type = &stats_fs_type_s16,
+ .aggr_kind = STATS_FS_NONE,
+ .value_flag = STATS_FS_FLOATING_VALUE),
+   { NULL },
+};
+
+struct stats_fs_value test_aggr[4] = {
+   STATS_FS_STAT(vals, s32, .type = &stats_fs_type_s32,
+ .aggr_kind = STATS_FS_MIN,
+ .value_flag = STATS_FS_FLOATING_VALUE),
+   STATS_FS_STAT(vals, bo, .type = &stats_fs_type_bool,
+ .aggr_kind = STATS_FS_MAX,
+ .value_flag = STATS_FS_FLOATING_VALUE),
+   STATS_FS_STAT(vals, u64, .type = &stats_fs_type_u64,
+ .aggr_kind = STATS_FS_SUM,
+ .value_flag = STATS_FS_FLOATING_VALUE),
+   { NULL },
+};
+
+struct stats_fs_value test_same_name[3] = {
+   STATS_FS_STAT(vals, s32, .type = &stats_fs_type_s32,
+ .aggr_kind = STATS_FS_NONE),
+   STATS_FS_STAT(vals, s32, .type = &stats_fs_type_s32,
+ .aggr_kind = STATS_FS_MIN),
+   { NULL },
+};
+
+struct stats_fs_value test_all_aggr[6] = {
+   STATS_FS_STAT(vals, s32, .type = &stats_fs_type_s32,
+ .aggr_kind = STATS_FS_MIN),
+   STATS_FS_STAT(vals, bo, .type = &stats_fs_type_bool,
+ .aggr_kind = STATS_FS_COUNT_ZERO,
+ .value_flag = STATS_FS_FLOATING_VALUE),
+   STATS_FS_STAT(vals, u64, .type = &stats_fs_type_u64,
+ .aggr_kind = STATS_FS_SUM),
+   STATS_FS_STAT(vals, u8, .type = &stats_fs_type_u8,
+ .aggr_kind = STATS_FS_AVG,
+ .value_flag = STATS_FS_FLOATING_VALUE),
+   STATS_FS_STAT(vals, s16, .type = &stats_fs_type_s16,
+

[PATCH v3 2/7] documentation for stats_fs

2020-05-26 Thread Emanuele Giuseppe Esposito

Html docs for a complete documentation of the stats_fs API,
filesystem and usage.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 Documentation/filesystems/index.rst|   1 +
 Documentation/filesystems/stats_fs.rst | 222 +
 2 files changed, 223 insertions(+)
 create mode 100644 Documentation/filesystems/stats_fs.rst

diff --git a/Documentation/filesystems/index.rst 
b/Documentation/filesystems/index.rst
index e7b46dac7079..9a46fd851c6e 100644
--- a/Documentation/filesystems/index.rst
+++ b/Documentation/filesystems/index.rst
@@ -89,6 +89,7 @@ Documentation for filesystem implementations.
relay
romfs
squashfs
+   stats_fs
sysfs
sysv-fs
tmpfs
diff --git a/Documentation/filesystems/stats_fs.rst 
b/Documentation/filesystems/stats_fs.rst
new file mode 100644
index ..292c689ffb98
--- /dev/null
+++ b/Documentation/filesystems/stats_fs.rst
@@ -0,0 +1,222 @@
+
+Stats_FS
+
+
+Stats_fs is a synthetic ram-based virtual filesystem that takes care of
+gathering and displaying statistics for the Linux kernel subsystems.
+
+The motivation for stats_fs comes from the fact that there is no common
+way for Linux kernel subsystems to expose statistics to userspace shared
+throughout the Linux kernel; subsystems have to take care of gathering and
+displaying statistics by themselves, for example in the form of files in
+debugfs.
+
+Allowing each subsystem of the kernel to do so has two disadvantages.
+First, it will introduce redundant code. Second, debugfs is anyway not the
+right place for statistics (for example it is affected by lockdown).
+
+Stats_fs offers a generic and stable API, allowing any kind of
+directory/file organization and supporting multiple kind of aggregations
+(not only sum, but also average, max, min and count_zero) and data types
+(boolean, all unsigned/signed and custom types). The implementation takes
+care of gathering and displaying information at run time; users only need
+to specify the values to be included in each source. Optionally, users can
+also provide a display function for each value, that will take care of
+displaying the provided value in a custom format.
+
+Its main function is to display each statistics as a file in the desired
+folder hierarchy defined through the API. Stats_fs files can be read, and
+possibly cleared if their file mode allows it.
+
+Stats_fs is typically mounted with a command like::
+
+mount -t stats_fs stats_fs /sys/kernel/stats_fs
+
+(Or an equivalent /etc/fstab line).
+
+Stats_fs has two main components: the public API defined by
+include/linux/stats_fs.h, and the virtual file system in
+/sys/kernel/stats.
+
+The API has two main elements, values and sources. Kernel
+subsystems will create a source, add child
+sources/values/aggregates and register it to the root source (that on the
+virtual fs would be /sys/kernel/stats).
+
+The stats_fs API is defined in .
+
+Sources
+Sources are created via ``stats_fs_source_create()``, and each
+source becomes a directory in the file system. Sources form a
+parent-child relationship; root sources are added to the file
+system via ``stats_fs_source_register()``. Therefore each Linux
+subsystem will add its own entry to the root, filesystem similar
+to what it is done in debugfs. Every other source is added to or
+removed from a parent through the
+``stats_fs_source_add_subordinate()`` and
+``stats_fs_source_remove_subordinate()`` APIs. Once a source is
+created and added to the tree (via add_subordinate), it will be
+used to compute aggregate values in the parent source. A source
+can optionally be hidden from the filesystem but still considered
+in the aggregation operations if the corresponding flag is set
+during initialization.
+
+Values
+Values represent quantites that are gathered by the stats_fs user.
+Examples of values include the number of vm exits of a given kind,
+the amount of memory used by some data structure, the length of
+the longest hash table chain, or anything like that. Values are
+defined with the stats_fs_source_add_values function. Each value
+is defined by a ``struct stats_fs_value``; the same
+``stats_fs_value`` can be added to many different sources. A value
+can be considered "simple" if it fetches data from a user-provided
+location, or "aggregate" if it groups all values in the
+subordinate sources that include the same ``stats_fs_value``.
+Values by default are considered to be cumulative, meaning the
+value they represent never decreases, but can also be defined as
+floating if they exibith a different behavior. The main difference
+between these two is reflected into the file permission, since a
+floating value file does not allow the user to clear it. Each
+value

[PATCH v3 1/7] stats_fs API: create, add and remove stats_fs sources and values

2020-05-26 Thread Emanuele Giuseppe Esposito

Introduction to the stats_fs API, that allows to easily create, add
and remove stats_fs sources and values. The API allows to easily building
the statistics directory tree to automatically gather them for the linux
kernel. The main functionalities are: create a source, add child
sources/values/aggregates, register it to the root source (that on
the virtual fs would be /sys/kernel/stats), ad perform a search for
a value/aggregate.

Each source and value has an optional flag parameter:
in a value, it represent whether the statistic is cumulative or floating, in a
source whether it should be visible from the filesystem or not.
Defaults are respectively cumulative and visible.
Both flags fields are represented as an uint32_t to offer portability for
future flags.

Each value also takes a struct stats_fs_type pointer that defines
get and clear function for that stat, allowing custom
types handling. The API also  provides default get and clear types for
the supported standard types (stats_fs_type_*).

The API representation is only logical and will be backed up
by a virtual file system in patch 4.
Its usage will be shared between the stats_fs file system
and the end-users like kvm, the former calling it when it needs to
display and clear statistics, the latter to add values and sources.

Signed-off-by: Emanuele Giuseppe Esposito 
---
 MAINTAINERS  |   7 +
 fs/Kconfig   |  14 +
 fs/Makefile  |   1 +
 fs/stats_fs/Makefile |   5 +
 fs/stats_fs/internal.h   |  19 ++
 fs/stats_fs/stats_fs.c   | 552 +++
 fs/stats_fs/stub.c   |  13 +
 include/linux/stats_fs.h | 363 +
 8 files changed, 974 insertions(+)
 create mode 100644 fs/stats_fs/Makefile
 create mode 100644 fs/stats_fs/internal.h
 create mode 100644 fs/stats_fs/stats_fs.c
 create mode 100644 fs/stats_fs/stub.c
 create mode 100644 include/linux/stats_fs.h

diff --git a/MAINTAINERS b/MAINTAINERS
index b816a453b10e..a8403d07cee5 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5229,6 +5229,13 @@ F:   include/linux/debugfs.h
 F: include/linux/kobj*
 F: lib/kobj*
 
+STATS_FS
+M: Paolo Bonzini 
+R: Emanuele Giuseppe Esposito 
+S: Supported
+F: include/linux/stats_fs.h
+F: fs/stats_fs
+
 DRIVERS FOR ADAPTIVE VOLTAGE SCALING (AVS)
 M: Kevin Hilman 
 M: Nishanth Menon 
diff --git a/fs/Kconfig b/fs/Kconfig
index f08fbbfafd9a..684ad61129ab 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -221,6 +221,20 @@ config MEMFD_CREATE
 config ARCH_HAS_GIGANTIC_PAGE
bool
 
+config STATS_FS
+   bool "Statistics Filesystem"
+   help
+ stats_fs is a virtual file system that provides counters and
+ other statistics about the running kernel.
+
+config STATS_FS_API
+   bool
+   imply STATS_FS
+
+config STATS_FS_STUB
+   bool
+   default y if STATS_FS_API && !STATS_FS
+
 source "fs/configfs/Kconfig"
 source "fs/efivarfs/Kconfig"
 
diff --git a/fs/Makefile b/fs/Makefile
index 2ce5112b02c8..91558eca0cf7 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -125,6 +125,7 @@ obj-$(CONFIG_BEFS_FS)   += befs/
 obj-$(CONFIG_HOSTFS)   += hostfs/
 obj-$(CONFIG_CACHEFILES)   += cachefiles/
 obj-$(CONFIG_DEBUG_FS) += debugfs/
+obj-$(CONFIG_STATS_FS) += stats_fs/
 obj-$(CONFIG_TRACING)  += tracefs/
 obj-$(CONFIG_OCFS2_FS) += ocfs2/
 obj-$(CONFIG_BTRFS_FS) += btrfs/
diff --git a/fs/stats_fs/Makefile b/fs/stats_fs/Makefile
new file mode 100644
index ..bd988daa4c39
--- /dev/null
+++ b/fs/stats_fs/Makefile
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0-only
+stats_fs-objs  := stats_fs.o
+
+obj-$(CONFIG_STATS_FS) += stats_fs.o
+obj-$(CONFIG_STATS_FS_STUB) += stub.o
diff --git a/fs/stats_fs/internal.h b/fs/stats_fs/internal.h
new file mode 100644
index ..4993afbb1e45
--- /dev/null
+++ b/fs/stats_fs/internal.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _STATS_FS_INTERNAL_H_
+#define _STATS_FS_INTERNAL_H_
+
+#include 
+#include 
+#include 
+#include 
+
+/* values, grouped by base */
+struct stats_fs_value_source {
+   void *base_addr;
+   bool files_created;
+   uint32_t common_flags;
+   struct stats_fs_value *values;
+   struct list_head list_element;
+};
+
+#endif /* _STATS_FS_INTERNAL_H_ */
diff --git a/fs/stats_fs/stats_fs.c b/fs/stats_fs/stats_fs.c
new file mode 100644
index ..b76ee44f6dac
--- /dev/null
+++ b/fs/stats_fs/stats_fs.c
@@ -0,0 +1,552 @@
+// SPDX-License-Identifier: GPL-2.0
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "internal.h"
+
+struct stats_fs_aggregate_value {
+   uint64_t sum, min, max;
+   uint32_t count, count_zero;
+};
+
+#define STATS_FS_DEFINE_TYPE_STRUCT(gtype, stype, si)  
\
+   const struct stats_fs_type stats_fs_type_##gtype =

[PATCH v3 0/7] Statsfs: a new ram-based file system for Linux kernel statistics

2020-05-26 Thread Emanuele Giuseppe Esposito

There is currently no common way for Linux kernel subsystems to expose
statistics to userspace shared throughout the Linux kernel; subsystems have
to take care of gathering and displaying statistics by themselves, for
example in the form of files in debugfs. For example KVM has its own code
section that takes care of this in virt/kvm/kvm_main.c, where it sets up
debugfs handlers for displaying values and aggregating them from various
subfolders to obtain information about the system state (i.e. displaying
the total number of exits, calculated by summing all exits of all cpus of
all running virtual machines).

Allowing each section of the kernel to do so has two disadvantages. First,
it will introduce redundant code. Second, debugfs is anyway not the right
place for statistics (for example it is affected by lockdown)

In this patch series I introduce statsfs, a synthetic ram-based virtual
filesystem that takes care of gathering and displaying statistics for the
Linux kernel subsystems.

The file system is mounted on /sys/kernel/stats and would be already used
by kvm. Statsfs was initially introduced by Paolo Bonzini [1].

Statsfs offers a generic and stable API, allowing any kind of
directory/file organization and supporting multiple kind of aggregations
(not only sum, but also average, max, min and count_zero) and data types
(boolean, unsigned/signed and custom types). The implementation, which is
a generalization of KVM’s debugfs statistics code, takes care of gathering
and displaying information at run time; users only need to specify the
values to be included in each source.

Statsfs would also be a different mountpoint from debugfs, and would not
suffer from limited access due to the security lock down patches. Its main
function is to display each statistics as a file in the desired folder
hierarchy defined through the API. Statsfs files can be read, and possibly
cleared if their file mode allows it.

Statsfs has two main components: the public API defined by
include/linux/statsfs.h, and the virtual file system which should end up in
/sys/kernel/stats.

The API has two main elements, values and sources. Kernel subsystems like
KVM can use the API to create a source, add child sources/values/aggregates
and register it to the root source (that on the virtual fs would be
/sys/kernel/statsfs).

Sources are created via statsfs_source_create(), and each source becomes a
directory in the file system. Sources form a parent-child relationship;
root sources are added to the file system via statsfs_source_register().
Every other source is added to or removed from a parent through the
statsfs_source_add_subordinate and statsfs_source_remote_subordinate APIs.
Once a source is created and added to the tree (via add_subordinate), it
will be used to compute aggregate values in the parent source.
A source can optionally be hidden from the filesystem
but still considered in the aggregation operations if the corresponding
flag is set during initialization.

Values represent quantites that are gathered by the statsfs user. Examples
of values include the number of vm exits of a given kind, the amount of
memory used by some data structure, the length of the longest hash table
chain, or anything like that. Values are defined with the
statsfs_source_add_values function. Each value is defined by a struct
statsfs_value; the same statsfs_value can be added to many different
sources. A value can be considered "simple" if it fetches data from a
user-provided location, or "aggregate" if it groups all values in the
subordinates sources that include the same statsfs_value.
Each value has a stats_fs_type pointer in order to allow the user to
provide custom get and clear functions. The library, however, also
exports default stats_fs_type structs for the standard types
(all unsigned and signed types plus boolean).
A value can also provide a show function, that takes care
of displaying the value in a custom string format. This can be especially
useful when displaying enums.

For more information, please consult the kerneldoc documentation in patch 2
and the sample uses in the kunit tests, KVM and networking.

This series of patches is based on my previous series "libfs: group and
simplify linux fs code" and the single patch sent to kvm "kvm_host: unify
VM_STAT and VCPU_STAT definitions in a single place". The former simplifies
code duplicated in debugfs and tracefs (from which statsfs is based on),
the latter groups all macros definition for statistics in kvm in a single
common file shared by all architectures.

Patch 1 adds a new refcount and kref destructor wrappers that take a
semaphore, as those are used later by statsfs. Patch 2 introduces the
statsfs API, patch 3 provides extensive tests that can also be used as
example on how to use the API and patch 4 adds the file system support.
Finally, patch 5 provides a real-life example of statsfs usage in KVM,
with patch 6 providing a concrete example of the show function and
patch 7 anothe

Re: [PATCH v4 3/6] asm-generic/tlb, arch: Invert CONFIG_HAVE_RCU_TABLE_INVALIDATE

2020-05-26 Thread Greg KH

On Wed, May 20, 2020 at 02:00:22PM +0530, Santosh Sivaraj wrote:
> From: Peter Zijlstra 
> 
> commit 96bc9567cbe112e9320250f01b9c060c882e8619 upstream
> 
> Make issuing a TLB invalidate for page-table pages the normal case.
> 
> The reason is twofold:
> 
>  - too many invalidates is safer than too few,
>  - most architectures use the linux page-tables natively
>and would thus require this.
> 
> Make it an opt-out, instead of an opt-in.
> 
> No change in behavior intended.
> 
> Signed-off-by: Peter Zijlstra (Intel) 
> Cc:  # 4.19
> Signed-off-by: Santosh Sivaraj 
> [santosh: prerequisite for upcoming tlbflush backports]
> ---
>  arch/Kconfig | 2 +-
>  arch/powerpc/Kconfig | 1 +
>  arch/sparc/Kconfig   | 1 +
>  arch/x86/Kconfig | 1 -
>  mm/memory.c  | 2 +-
>  5 files changed, 4 insertions(+), 3 deletions(-)

Why did you not also change arch/arm64/Kconfig and
include/asm-generic/tlb.h like the original patch changed?

Why can those files be ignored/left out?  You need to explain that in
the backport section, all you said was "prerequisite..." and did not say
why you changed this patch.

Please fix up, and make sure you do the same for all of the other
patches in this series for when you resend it.

thanks,

greg k-h

Re: [linux-next PATCH] mm/gup.c: Convert to use get_user_{page|pages}_fast_only()

2020-05-26 Thread Souptick Joarder

On Tue, May 26, 2020 at 1:29 PM Paul Mackerras  wrote:
>
> On Mon, May 25, 2020 at 02:23:32PM +0530, Souptick Joarder wrote:
> > API __get_user_pages_fast() renamed to get_user_pages_fast_only()
> > to align with pin_user_pages_fast_only().
> >
> > As part of this we will get rid of write parameter.
> > Instead caller will pass FOLL_WRITE to get_user_pages_fast_only().
> > This will not change any existing functionality of the API.
> >
> > All the callers are changed to pass FOLL_WRITE.
> >
> > Also introduce get_user_page_fast_only(), and use it in a few
> > places that hard-code nr_pages to 1.
> >
> > Updated the documentation of the API.
> >
> > Signed-off-by: Souptick Joarder 
>
> The arch/powerpc/kvm bits look reasonable.
>
> Reviewed-by: Paul Mackerras 

Thanks Paul. This patch is merged through mm-tree.
https://lore.kernel.org/kvm/1590396812-31277-1-git-send-email-jrdr.li...@gmail.com/

Re: [PATCH v3 1/3] riscv: Move kernel mapping to vmalloc zone

2020-05-26 Thread Zong Li

On Sun, May 24, 2020 at 4:54 PM Alexandre Ghiti  wrote:
>
> This is a preparatory patch for relocatable kernel.
>
> The kernel used to be linked at PAGE_OFFSET address and used to be loaded
> physically at the beginning of the main memory. Therefore, we could use
> the linear mapping for the kernel mapping.
>
> But the relocated kernel base address will be different from PAGE_OFFSET
> and since in the linear mapping, two different virtual addresses cannot
> point to the same physical address, the kernel mapping needs to lie outside
> the linear mapping.
>
> In addition, because modules and BPF must be close to the kernel (inside
> +-2GB window), the kernel is placed at the end of the vmalloc zone minus
> 2GB, which leaves room for modules and BPF. The kernel could not be
> placed at the beginning of the vmalloc zone since other vmalloc
> allocations from the kernel could get all the +-2GB window around the
> kernel which would prevent new modules and BPF programs to be loaded.
>
> Signed-off-by: Alexandre Ghiti 
> ---
>  arch/riscv/boot/loader.lds.S |  3 +-
>  arch/riscv/include/asm/page.h| 10 +-
>  arch/riscv/include/asm/pgtable.h | 37 +---
>  arch/riscv/kernel/head.S |  3 +-
>  arch/riscv/kernel/module.c   |  4 +--
>  arch/riscv/kernel/vmlinux.lds.S  |  3 +-
>  arch/riscv/mm/init.c | 58 +---
>  arch/riscv/mm/physaddr.c |  2 +-
>  8 files changed, 87 insertions(+), 33 deletions(-)
>
> diff --git a/arch/riscv/boot/loader.lds.S b/arch/riscv/boot/loader.lds.S
> index 47a5003c2e28..62d94696a19c 100644
> --- a/arch/riscv/boot/loader.lds.S
> +++ b/arch/riscv/boot/loader.lds.S
> @@ -1,13 +1,14 @@
>  /* SPDX-License-Identifier: GPL-2.0 */
>
>  #include 
> +#include 
>
>  OUTPUT_ARCH(riscv)
>  ENTRY(_start)
>
>  SECTIONS
>  {
> -   . = PAGE_OFFSET;
> +   . = KERNEL_LINK_ADDR;
>
> .payload : {
> *(.payload)
> diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h
> index 2d50f76efe48..48bb09b6a9b7 100644
> --- a/arch/riscv/include/asm/page.h
> +++ b/arch/riscv/include/asm/page.h
> @@ -90,18 +90,26 @@ typedef struct page *pgtable_t;
>
>  #ifdef CONFIG_MMU
>  extern unsigned long va_pa_offset;
> +extern unsigned long va_kernel_pa_offset;
>  extern unsigned long pfn_base;
>  #define ARCH_PFN_OFFSET(pfn_base)
>  #else
>  #define va_pa_offset   0
> +#define va_kernel_pa_offset0
>  #define ARCH_PFN_OFFSET(PAGE_OFFSET >> PAGE_SHIFT)
>  #endif /* CONFIG_MMU */
>
>  extern unsigned long max_low_pfn;
>  extern unsigned long min_low_pfn;
> +extern unsigned long kernel_virt_addr;
>
>  #define __pa_to_va_nodebug(x)  ((void *)((unsigned long) (x) + va_pa_offset))
> -#define __va_to_pa_nodebug(x)  ((unsigned long)(x) - va_pa_offset)
> +#define linear_mapping_va_to_pa(x) ((unsigned long)(x) - va_pa_offset)
> +#define kernel_mapping_va_to_pa(x) \
> +   ((unsigned long)(x) - va_kernel_pa_offset)
> +#define __va_to_pa_nodebug(x)  \
> +   (((x) >= PAGE_OFFSET) ? \
> +   linear_mapping_va_to_pa(x) : kernel_mapping_va_to_pa(x))
>
>  #ifdef CONFIG_DEBUG_VIRTUAL
>  extern phys_addr_t __virt_to_phys(unsigned long x);
> diff --git a/arch/riscv/include/asm/pgtable.h 
> b/arch/riscv/include/asm/pgtable.h
> index 35b60035b6b0..25213cfaf680 100644
> --- a/arch/riscv/include/asm/pgtable.h
> +++ b/arch/riscv/include/asm/pgtable.h
> @@ -11,23 +11,29 @@
>
>  #include 
>
> -#ifndef __ASSEMBLY__
> -
> -/* Page Upper Directory not used in RISC-V */
> -#include 
> -#include 
> -#include 
> -#include 
> -
> -#ifdef CONFIG_MMU
> +#ifndef CONFIG_MMU
> +#define KERNEL_VIRT_ADDR   PAGE_OFFSET
> +#define KERNEL_LINK_ADDR   PAGE_OFFSET
> +#else
> +/*
> + * Leave 2GB for modules and BPF that must lie within a 2GB range around
> + * the kernel.
> + */
> +#define KERNEL_VIRT_ADDR   (VMALLOC_END - SZ_2G + 1)
> +#define KERNEL_LINK_ADDR   KERNEL_VIRT_ADDR
>
>  #define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1)
>  #define VMALLOC_END  (PAGE_OFFSET - 1)
>  #define VMALLOC_START(PAGE_OFFSET - VMALLOC_SIZE)
>
>  #define BPF_JIT_REGION_SIZE(SZ_128M)
> -#define BPF_JIT_REGION_START   (PAGE_OFFSET - BPF_JIT_REGION_SIZE)
> -#define BPF_JIT_REGION_END (VMALLOC_END)
> +#define BPF_JIT_REGION_START   (kernel_virt_addr)
> +#define BPF_JIT_REGION_END (kernel_virt_addr + BPF_JIT_REGION_SIZE)

It seems to have a potential risk here, the region of bpf is
overlapping with kernel mapping, so if kernel size is bigger than
128MB, bpf region would be occupied and run out by kernel mapping.

> +
> +#ifdef CONFIG_64BIT
> +#define VMALLOC_MODULE_START   BPF_JIT_REGION_END
> +#define VMALLOC_MODULE_END VMALLOC_END
> +#endif
>

Although kernel_virt_addr is a fixed address now, I think it could be
changed for the purpose of relocatable or KASLR, so if
kernel_virt_addr is moved to far from VMALLOC_END than 2G, the region
of module w

Re: [PATCH v3 2/3] riscv: Introduce CONFIG_RELOCATABLE

2020-05-26 Thread Zong Li

On Sun, May 24, 2020 at 4:55 PM Alexandre Ghiti  wrote:
>
> This config allows to compile the kernel as PIE and to relocate it at
> any virtual address at runtime: this paves the way to KASLR and to 4-level
> page table folding at runtime. Runtime relocation is possible since
> relocation metadata are embedded into the kernel.
>
> Note that relocating at runtime introduces an overhead even if the
> kernel is loaded at the same address it was linked at and that the compiler
> options are those used in arm64 which uses the same RELA relocation
> format.
>
> Signed-off-by: Alexandre Ghiti 
> ---
>  arch/riscv/Kconfig  | 12 +++
>  arch/riscv/Makefile |  5 ++-
>  arch/riscv/kernel/vmlinux.lds.S |  6 ++--
>  arch/riscv/mm/Makefile  |  4 +++
>  arch/riscv/mm/init.c| 63 +
>  5 files changed, 87 insertions(+), 3 deletions(-)
>
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index a31e1a41913a..93127d5913fe 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -170,6 +170,18 @@ config PGTABLE_LEVELS
> default 3 if 64BIT
> default 2
>
> +config RELOCATABLE
> +   bool
> +   depends on MMU
> +   help
> +  This builds a kernel as a Position Independent Executable (PIE),
> +  which retains all relocation metadata required to relocate the
> +  kernel binary at runtime to a different virtual address than the
> +  address it was linked at.
> +  Since RISCV uses the RELA relocation format, this requires a
> +  relocation pass at runtime even if the kernel is loaded at the
> +  same address it was linked at.
> +
>  source "arch/riscv/Kconfig.socs"
>
>  menu "Platform type"
> diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile
> index fb6e37db836d..1406416ea743 100644
> --- a/arch/riscv/Makefile
> +++ b/arch/riscv/Makefile
> @@ -9,7 +9,10 @@
>  #
>
>  OBJCOPYFLAGS:= -O binary
> -LDFLAGS_vmlinux :=
> +ifeq ($(CONFIG_RELOCATABLE),y)
> +LDFLAGS_vmlinux := -shared -Bsymbolic -z notext -z norelro
> +KBUILD_CFLAGS += -fPIE
> +endif
>  ifeq ($(CONFIG_DYNAMIC_FTRACE),y)
> LDFLAGS_vmlinux := --no-relax
>  endif
> diff --git a/arch/riscv/kernel/vmlinux.lds.S b/arch/riscv/kernel/vmlinux.lds.S
> index a9abde62909f..e8ffba8c2044 100644
> --- a/arch/riscv/kernel/vmlinux.lds.S
> +++ b/arch/riscv/kernel/vmlinux.lds.S
> @@ -85,8 +85,10 @@ SECTIONS
>
> BSS_SECTION(PAGE_SIZE, PAGE_SIZE, 0)
>
> -   .rel.dyn : {
> -   *(.rel.dyn*)
> +   .rela.dyn : ALIGN(8) {
> +   __rela_dyn_start = .;
> +   *(.rela .rela*)
> +   __rela_dyn_end = .;
> }
>
> _end = .;
> diff --git a/arch/riscv/mm/Makefile b/arch/riscv/mm/Makefile
> index 363ef01c30b1..dc5cdaa80bc1 100644
> --- a/arch/riscv/mm/Makefile
> +++ b/arch/riscv/mm/Makefile
> @@ -1,6 +1,10 @@
>  # SPDX-License-Identifier: GPL-2.0-only
>
>  CFLAGS_init.o := -mcmodel=medany
> +ifdef CONFIG_RELOCATABLE
> +CFLAGS_init.o += -fno-pie
> +endif
> +
>  ifdef CONFIG_FTRACE
>  CFLAGS_REMOVE_init.o = -pg
>  endif
> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
> index 17f108baec4f..7074522d40c6 100644
> --- a/arch/riscv/mm/init.c
> +++ b/arch/riscv/mm/init.c
> @@ -13,6 +13,9 @@
>  #include 
>  #include 
>  #include 
> +#ifdef CONFIG_RELOCATABLE
> +#include 
> +#endif
>
>  #include 
>  #include 
> @@ -379,6 +382,53 @@ static uintptr_t __init best_map_size(phys_addr_t base, 
> phys_addr_t size)
>  #error "setup_vm() is called from head.S before relocate so it should not 
> use absolute addressing."
>  #endif
>
> +#ifdef CONFIG_RELOCATABLE
> +extern unsigned long __rela_dyn_start, __rela_dyn_end;
> +
> +#ifdef CONFIG_64BIT
> +#define Elf_Rela Elf64_Rela
> +#define Elf_Addr Elf64_Addr
> +#else
> +#define Elf_Rela Elf32_Rela
> +#define Elf_Addr Elf32_Addr
> +#endif
> +
> +void __init relocate_kernel(uintptr_t load_pa)
> +{
> +   Elf_Rela *rela = (Elf_Rela *)&__rela_dyn_start;
> +   /*
> +* This holds the offset between the linked virtual address and the
> +* relocated virtual address.
> +*/
> +   uintptr_t reloc_offset = kernel_virt_addr - KERNEL_LINK_ADDR;
> +   /*
> +* This holds the offset between kernel linked virtual address and
> +* physical address.
> +*/
> +   uintptr_t va_kernel_link_pa_offset = KERNEL_LINK_ADDR - load_pa;
> +
> +   for ( ; rela < (Elf_Rela *)&__rela_dyn_end; rela++) {
> +   Elf_Addr addr = (rela->r_offset - va_kernel_link_pa_offset);
> +   Elf_Addr relocated_addr = rela->r_addend;
> +
> +   if (rela->r_info != R_RISCV_RELATIVE)
> +   continue;
> +
> +   /*
> +* Make sure to not relocate vdso symbols like rt_sigreturn
> +* which are linked from the address 0 in vmlinux since
> +* vdso symbol addresses are actually used as an of

[PATCH 6/6] powerpc/ppc-opcode: fold PPC_INST_* macros into PPC_RAW_* macros

2020-05-26 Thread Balamuruhan S

Lot of PPC_INST_* macros are used only ever in PPC_* macros, fold those
PPC_INST_* into PPC_RAW_* to avoid using PPC_INST_* accidentally.

Signed-off-by: Balamuruhan S 
Acked-by: Naveen N. Rao 
Tested-by: Naveen N. Rao 
---
 arch/powerpc/include/asm/ppc-opcode.h | 381 +-
 1 file changed, 125 insertions(+), 256 deletions(-)

diff --git a/arch/powerpc/include/asm/ppc-opcode.h 
b/arch/powerpc/include/asm/ppc-opcode.h
index c8d71a8bef46..bbb77f998f19 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -208,56 +208,27 @@
 #define OP_LQ56
 
 /* sorted alphabetically */
-#define PPC_INST_BHRBE 0x7c00025c
-#define PPC_INST_CLRBHRB   0x7c00035c
 #define PPC_INST_COPY  0x7c20060c
-#define PPC_INST_CP_ABORT  0x7c00068c
-#define PPC_INST_DARN  0x7c0005e6
 #define PPC_INST_DCBA  0x7c0005ec
 #define PPC_INST_DCBA_MASK 0xfc0007fe
-#define PPC_INST_DCBAL 0x7c2005ec
-#define PPC_INST_DCBZL 0x7c2007ec
-#define PPC_INST_ICBT  0x7c2c
-#define PPC_INST_ICSWX 0x7c00032d
-#define PPC_INST_ICSWEPX   0x7c00076d
 #define PPC_INST_ISEL  0x7c1e
 #define PPC_INST_ISEL_MASK 0xfc3e
-#define PPC_INST_LDARX 0x7ca8
-#define PPC_INST_STDCX 0x7c0001ad
-#define PPC_INST_LQARX 0x7c000228
-#define PPC_INST_STQCX 0x7c00016d
 #define PPC_INST_LSWI  0x7c0004aa
 #define PPC_INST_LSWX  0x7c00042a
-#define PPC_INST_LWARX 0x7c28
-#define PPC_INST_STWCX 0x7c00012d
 #define PPC_INST_LWSYNC0x7c2004ac
 #define PPC_INST_SYNC  0x7c0004ac
 #define PPC_INST_SYNC_MASK 0xfc0007fe
 #define PPC_INST_ISYNC 0x4c00012c
-#define PPC_INST_LXVD2X0x7c000698
 #define PPC_INST_MCRXR 0x7c000400
 #define PPC_INST_MCRXR_MASK0xfc0007fe
 #define PPC_INST_MFSPR_PVR 0x7c1f42a6
 #define PPC_INST_MFSPR_PVR_MASK0xfc1e
-#define PPC_INST_MFTMR 0x7c0002dc
-#define PPC_INST_MSGSND0x7c00019c
-#define PPC_INST_MSGCLR0x7c0001dc
-#define PPC_INST_MSGSYNC   0x7c0006ec
-#define PPC_INST_MSGSNDP   0x7c00011c
-#define PPC_INST_MSGCLRP   0x7c00015c
 #define PPC_INST_MTMSRD0x7c000164
-#define PPC_INST_MTTMR 0x7c0003dc
 #define PPC_INST_NOP   0x6000
-#define PPC_INST_PASTE 0x7c20070d
 #define PPC_INST_POPCNTB   0x7cf4
 #define PPC_INST_POPCNTB_MASK  0xfc0007fe
-#define PPC_INST_POPCNTD   0x7c0003f4
-#define PPC_INST_POPCNTW   0x7c0002f4
 #define PPC_INST_RFEBB 0x4c000124
-#define PPC_INST_RFCI  0x4c66
-#define PPC_INST_RFDI  0x4c4e
 #define PPC_INST_RFID  0x4c24
-#define PPC_INST_RFMCI 0x4c4c
 #define PPC_INST_MFSPR 0x7c0002a6
 #define PPC_INST_MFSPR_DSCR0x7c1102a6
 #define PPC_INST_MFSPR_DSCR_MASK   0xfc1e
@@ -267,131 +238,30 @@
 #define PPC_INST_MFSPR_DSCR_USER_MASK  0xfc1e
 #define PPC_INST_MTSPR_DSCR_USER   0x7c0303a6
 #define PPC_INST_MTSPR_DSCR_USER_MASK  0xfc1e
-#define PPC_INST_MFVSRD0x7c66
-#define PPC_INST_MTVSRD0x7c000166
 #define PPC_INST_SC0x4402
-#define PPC_INST_SLBFEE0x7c0007a7
-#define PPC_INST_SLBIA 0x7c0003e4
-
 #define PPC_INST_STRING0x7c00042a
 #define PPC_INST_STRING_MASK   0xfc0007fe
 #define PPC_INST_STRING_GEN_MASK   0xfc00067e
-
 #define PPC_INST_STSWI 0x7c0005aa
 #define PPC_INST_STSWX 0x7c00052a
-#define PPC_INST_STXVD2X   0x7c000798
-#define PPC_INST_TLBIE 0x7c000264
-#define PPC_INST_TLBIEL0x7c000224
-#define PPC_INST_TLBILX0x7c24
-#define PPC_INST_WAIT  0x7c7c
-#define PPC_INST_TLBIVAX   0x7c000624
-#define PPC_INST_TLBSRX_DOT0x7c0006a5
-#define PPC_INST_VPMSUMW   0x1488
-#define PPC_INST_VPMSUMD   0x14c8
-#define PPC_INST_VPERMXOR  0x102d
-#define PPC_INST_XXLOR 0xf490
-#define PPC_INST_XXSWAPD   0xf250
-#define PPC_INST_XVCPSGNDP 0xf780
 #define PPC_INST_TRECHKPT  0x7c0007dd
 #define PPC_INST_TRECLAIM  0x7c00075d
-#define PPC_INST_TABORT0x7c00071d
 #define PPC_INST_TSR   0x

[PATCH 5/6] powerpc/ppc-opcode: reuse raw instruction macros to stringify

2020-05-26 Thread Balamuruhan S

Wrap existing stringify macros to reuse raw instruction encoding macros that
are newly added.

Signed-off-by: Balamuruhan S 
Acked-by: Naveen N. Rao 
Tested-by: Naveen N. Rao 
---
 arch/powerpc/include/asm/ppc-opcode.h | 220 +-
 1 file changed, 71 insertions(+), 149 deletions(-)

diff --git a/arch/powerpc/include/asm/ppc-opcode.h 
b/arch/powerpc/include/asm/ppc-opcode.h
index 94b889d89395..c8d71a8bef46 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -781,166 +781,92 @@
___PPC_RA(a))
 
 /* Deal with instructions that older assemblers aren't aware of */
-#definePPC_CP_ABORTstringify_in_c(.long PPC_INST_CP_ABORT)
-#definePPC_COPY(a, b)  stringify_in_c(.long PPC_INST_COPY | \
-   ___PPC_RA(a) | ___PPC_RB(b))
-#define PPC_DARN(t, l) stringify_in_c(.long PPC_INST_DARN |  \
-   ___PPC_RT(t)   |  \
-   (((l) & 0x3) << 16))
-#definePPC_DCBAL(a, b) stringify_in_c(.long PPC_INST_DCBAL | \
-   __PPC_RA(a) | __PPC_RB(b))
-#definePPC_DCBZL(a, b) stringify_in_c(.long PPC_INST_DCBZL | \
-   __PPC_RA(a) | __PPC_RB(b))
-#define PPC_LQARX(t, a, b, eh) stringify_in_c(.long PPC_INST_LQARX | \
-   ___PPC_RT(t) | ___PPC_RA(a) | \
-   ___PPC_RB(b) | __PPC_EH(eh))
-#define PPC_LDARX(t, a, b, eh) stringify_in_c(.long PPC_INST_LDARX | \
-   ___PPC_RT(t) | ___PPC_RA(a) | \
-   ___PPC_RB(b) | __PPC_EH(eh))
-#define PPC_LWARX(t, a, b, eh) stringify_in_c(.long PPC_INST_LWARX | \
-   ___PPC_RT(t) | ___PPC_RA(a) | \
-   ___PPC_RB(b) | __PPC_EH(eh))
-#define PPC_STQCX(t, a, b) stringify_in_c(.long PPC_INST_STQCX | \
-   ___PPC_RT(t) | ___PPC_RA(a) | \
-   ___PPC_RB(b))
-#define PPC_MADDHD(t, a, b, c) stringify_in_c(.long PPC_INST_MADDHD | \
-   ___PPC_RT(t) | ___PPC_RA(a)  | \
-   ___PPC_RB(b) | ___PPC_RC(c))
-#define PPC_MADDHDU(t, a, b, c)stringify_in_c(.long PPC_INST_MADDHDU | 
\
-   ___PPC_RT(t) | ___PPC_RA(a)   | \
-   ___PPC_RB(b) | ___PPC_RC(c))
-#define PPC_MADDLD(t, a, b, c) stringify_in_c(.long PPC_INST_MADDLD | \
-   ___PPC_RT(t) | ___PPC_RA(a)  | \
-   ___PPC_RB(b) | ___PPC_RC(c))
-#define PPC_MSGSND(b)  stringify_in_c(.long PPC_INST_MSGSND | \
-   ___PPC_RB(b))
-#define PPC_MSGSYNCstringify_in_c(.long PPC_INST_MSGSYNC)
-#define PPC_MSGCLR(b)  stringify_in_c(.long PPC_INST_MSGCLR | \
-   ___PPC_RB(b))
-#define PPC_MSGSNDP(b) stringify_in_c(.long PPC_INST_MSGSNDP | \
-   ___PPC_RB(b))
-#define PPC_MSGCLRP(b) stringify_in_c(.long PPC_INST_MSGCLRP | \
-   ___PPC_RB(b))
-#define PPC_PASTE(a, b)stringify_in_c(.long PPC_INST_PASTE | \
-   ___PPC_RA(a) | ___PPC_RB(b))
-#define PPC_POPCNTB(a, s)  stringify_in_c(.long PPC_INST_POPCNTB | \
-   __PPC_RA(a) | __PPC_RS(s))
-#define PPC_POPCNTD(a, s)  stringify_in_c(.long PPC_INST_POPCNTD | \
-   __PPC_RA(a) | __PPC_RS(s))
-#define PPC_POPCNTW(a, s)  stringify_in_c(.long PPC_INST_POPCNTW | \
-   __PPC_RA(a) | __PPC_RS(s))
-#define PPC_RFCI   stringify_in_c(.long PPC_INST_RFCI)
-#define PPC_RFDI   stringify_in_c(.long PPC_INST_RFDI)
-#define PPC_RFMCI  stringify_in_c(.long PPC_INST_RFMCI)
-#define PPC_TLBILX(t, a, b)stringify_in_c(.long PPC_INST_TLBILX | \
-   __PPC_T_TLB(t) | __PPC_RA0(a) | 
__PPC_RB(b))
+#definePPC_CP_ABORTstringify_in_c(.long PPC_RAW_CP_ABORT)
+#definePPC_COPY(a, b)  stringify_in_c(.long PPC_RAW_COPY(a, b))
+#define PPC_DARN(t, l) stringify_in_c(.long PPC_RAW_DARN(t, l))
+#definePPC_DCBAL(a, b) stringify_in_c(.long PPC_RAW_DCBAL(a, 
b))
+#definePPC_DCBZL(a, b) stringify_in_c(.long PPC_RAW_DCBZL(a, 
b))
+#define PPC_LQARX(t, a, b, eh) stringify_in_c(.long PPC_RAW_LQARX(t, a, b, eh))
+#define PPC_LDARX(t, a, b, eh) stringify_in_c(.long PPC_RAW_LDARX(t, a, b, eh))
+#define PPC_

[PATCH 4/6] powerpc/ppc-opcode: consolidate powerpc instructions from bpf_jit.h

2020-05-26 Thread Balamuruhan S

move macro definitions of powerpc instructions from bpf_jit.h to ppc-opcode.h
and adopt the users of the macros accordingly. `PPC_MR()` is defined twice in
bpf_jit.h, remove the duplicate one.

Signed-off-by: Balamuruhan S 
Acked-by: Naveen N. Rao 
Tested-by: Naveen N. Rao 
---
 arch/powerpc/include/asm/ppc-opcode.h | 139 +
 arch/powerpc/net/bpf_jit.h| 166 ++-
 arch/powerpc/net/bpf_jit32.h  |  24 +--
 arch/powerpc/net/bpf_jit64.h  |  12 +-
 arch/powerpc/net/bpf_jit_comp.c   | 132 ++--
 arch/powerpc/net/bpf_jit_comp64.c | 278 +-
 6 files changed, 378 insertions(+), 373 deletions(-)

diff --git a/arch/powerpc/include/asm/ppc-opcode.h 
b/arch/powerpc/include/asm/ppc-opcode.h
index ca3f0351b878..94b889d89395 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -79,6 +79,16 @@
 #define IMM_L(i)   ((uintptr_t)(i) & 0x)
 #define IMM_DS(i)  ((uintptr_t)(i) & 0xfffc)
 
+/*
+ * 16-bit immediate helper macros: HA() is for use with sign-extending instrs
+ * (e.g. LD, ADDI).  If the bottom 16 bits is "-ve", add another bit into the
+ * top half to negate the effect (i.e. 0x + 1 = 0x(1)).
+ */
+#define IMM_H(i)((uintptr_t)(i)>>16)
+#define IMM_HA(i)   (((uintptr_t)(i)>>16) +   \
+   (((uintptr_t)(i) & 0x8000) >> 15))
+
+
 /* opcode and xopcode for instructions */
 #define OP_TRAP 3
 #define OP_TRAP_64 2
@@ -640,6 +650,135 @@
 #define PPC_RAW_ADDC_DOT(t, a, b)  (PPC_INST_ADDC | ___PPC_RT(t) | \
___PPC_RA(a) | ___PPC_RB(b) | \
0x1)
+#define PPC_RAW_NOP()  (PPC_INST_NOP)
+#define PPC_RAW_BLR()  (PPC_INST_BLR)
+#define PPC_RAW_BLRL() (PPC_INST_BLRL)
+#define PPC_RAW_MTLR(r)(PPC_INST_MTLR | ___PPC_RT(r))
+#define PPC_RAW_BCTR() (PPC_INST_BCTR)
+#define PPC_RAW_MTCTR(r)   (PPC_INST_MTCTR | ___PPC_RT(r))
+#define PPC_RAW_ADDI(d, a, i)  (PPC_INST_ADDI | ___PPC_RT(d) | \
+   ___PPC_RA(a) | IMM_L(i))
+#define PPC_RAW_LI(r, i)   PPC_RAW_ADDI(r, 0, i)
+#define PPC_RAW_ADDIS(d, a, i) (PPC_INST_ADDIS | \
+   ___PPC_RT(d) | ___PPC_RA(a) | \
+   IMM_L(i))
+#define PPC_RAW_LIS(r, i)  PPC_RAW_ADDIS(r, 0, i)
+#define PPC_RAW_STDX(r, base, b)   (PPC_INST_STDX | ___PPC_RS(r) | \
+   ___PPC_RA(base) | ___PPC_RB(b))
+#define PPC_RAW_STDU(r, base, i)   (PPC_INST_STDU | ___PPC_RS(r) | \
+   ___PPC_RA(base) | \
+   ((i) & 0xfffc))
+#define PPC_RAW_STW(r, base, i)(PPC_INST_STW | ___PPC_RS(r) | \
+   ___PPC_RA(base) | IMM_L(i))
+#define PPC_RAW_STWU(r, base, i)   (PPC_INST_STWU | ___PPC_RS(r) | \
+   ___PPC_RA(base) | IMM_L(i))
+#define PPC_RAW_STH(r, base, i)(PPC_INST_STH | ___PPC_RS(r) | \
+   ___PPC_RA(base) | IMM_L(i))
+#define PPC_RAW_STB(r, base, i)(PPC_INST_STB | ___PPC_RS(r) | \
+   ___PPC_RA(base) | IMM_L(i))
+#define PPC_RAW_LBZ(r, base, i)(PPC_INST_LBZ | ___PPC_RT(r) | \
+   ___PPC_RA(base) | IMM_L(i))
+#define PPC_RAW_LDX(r, base, b)(PPC_INST_LDX | ___PPC_RT(r) | \
+   ___PPC_RA(base) | ___PPC_RB(b))
+#define PPC_RAW_LHZ(r, base, i)(PPC_INST_LHZ | ___PPC_RT(r) | \
+   ___PPC_RA(base) | IMM_L(i))
+#define PPC_RAW_LHBRX(r, base, b)  (PPC_INST_LHBRX | ___PPC_RT(r) | \
+   ___PPC_RA(base) | ___PPC_RB(b))
+#define PPC_RAW_LDBRX(r, base, b)  (PPC_INST_LDBRX | ___PPC_RT(r) | \
+   ___PPC_RA(base) | ___PPC_RB(b))
+#define PPC_RAW_STWCX(s, a, b) (PPC_INST_STWCX | ___PPC_RS(s) | \
+   ___PPC_RA(a) | ___PPC_RB(b))
+#define PPC_RAW_CMPWI(a, i)(PPC_INST_CMPWI | ___PPC_RA(a) | 
IMM_L(i))
+#define PPC_RAW_CMPDI(a, i)(PPC_INST_CMPDI | ___PPC_RA(a) | 
IMM_L(i))
+#define PPC_RAW_CMPW(a, b) (PPC_INST_CMPW | ___PPC_RA(a) | \
+   ___PPC_RB(b))
+#define PPC_RAW_CMPD(a, b) (PPC_INST_CMPD | ___PPC_RA(a) | \
+   ___PPC_RB(b))
+

[PATCH 3/6] powerpc/bpf_jit: reuse instruction macros from ppc-opcode.h

2020-05-26 Thread Balamuruhan S

remove duplicate macro definitions from bpf_jit.h and reuse the macros from
ppc-opcode.h

Signed-off-by: Balamuruhan S 
Acked-by: Naveen N. Rao 
Tested-by: Naveen N. Rao 
---
 arch/powerpc/net/bpf_jit.h| 18 +-
 arch/powerpc/net/bpf_jit32.h  | 10 +-
 arch/powerpc/net/bpf_jit64.h  |  4 ++--
 arch/powerpc/net/bpf_jit_comp.c   |  2 +-
 arch/powerpc/net/bpf_jit_comp64.c | 20 ++--
 5 files changed, 19 insertions(+), 35 deletions(-)

diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h
index 55d4377ccfae..535d1de4dfee 100644
--- a/arch/powerpc/net/bpf_jit.h
+++ b/arch/powerpc/net/bpf_jit.h
@@ -11,6 +11,7 @@
 #ifndef __ASSEMBLY__
 
 #include 
+#include 
 
 #ifdef PPC64_ELF_ABI_v1
 #define FUNCTION_DESCR_SIZE24
@@ -26,7 +27,6 @@
 #define IMM_H(i)   ((uintptr_t)(i)>>16)
 #define IMM_HA(i)  (((uintptr_t)(i)>>16) +   \
(((uintptr_t)(i) & 0x8000) >> 15))
-#define IMM_L(i)   ((uintptr_t)(i) & 0x)
 
 #define PLANT_INSTR(d, idx, instr)   \
do { if (d) { (d)[idx] = instr; } idx++; } while (0)
@@ -45,8 +45,6 @@
 #define PPC_ADDIS(d, a, i) EMIT(PPC_INST_ADDIS | \
 ___PPC_RT(d) | ___PPC_RA(a) | IMM_L(i))
 #define PPC_LIS(r, i)  PPC_ADDIS(r, 0, i)
-#define PPC_STD(r, base, i)EMIT(PPC_INST_STD | ___PPC_RS(r) |\
-___PPC_RA(base) | ((i) & 0xfffc))
 #define PPC_STDX(r, base, b)   EMIT(PPC_INST_STDX | ___PPC_RS(r) |   \
 ___PPC_RA(base) | ___PPC_RB(b))
 #define PPC_STDU(r, base, i)   EMIT(PPC_INST_STDU | ___PPC_RS(r) |   \
@@ -62,12 +60,8 @@
 
 #define PPC_LBZ(r, base, i)EMIT(PPC_INST_LBZ | ___PPC_RT(r) |\
 ___PPC_RA(base) | IMM_L(i))
-#define PPC_LD(r, base, i) EMIT(PPC_INST_LD | ___PPC_RT(r) | \
-___PPC_RA(base) | ((i) & 0xfffc))
 #define PPC_LDX(r, base, b)EMIT(PPC_INST_LDX | ___PPC_RT(r) |\
 ___PPC_RA(base) | ___PPC_RB(b))
-#define PPC_LWZ(r, base, i)EMIT(PPC_INST_LWZ | ___PPC_RT(r) |\
-___PPC_RA(base) | IMM_L(i))
 #define PPC_LHZ(r, base, i)EMIT(PPC_INST_LHZ | ___PPC_RT(r) |\
 ___PPC_RA(base) | IMM_L(i))
 #define PPC_LHBRX(r, base, b)  EMIT(PPC_INST_LHBRX | ___PPC_RT(r) |  \
@@ -75,16 +69,8 @@
 #define PPC_LDBRX(r, base, b)  EMIT(PPC_INST_LDBRX | ___PPC_RT(r) |  \
 ___PPC_RA(base) | ___PPC_RB(b))
 
-#define PPC_BPF_LDARX(t, a, b, eh) EMIT(PPC_INST_LDARX | ___PPC_RT(t) |
  \
-   ___PPC_RA(a) | ___PPC_RB(b) | \
-   __PPC_EH(eh))
-#define PPC_BPF_LWARX(t, a, b, eh) EMIT(PPC_INST_LWARX | ___PPC_RT(t) |
  \
-   ___PPC_RA(a) | ___PPC_RB(b) | \
-   __PPC_EH(eh))
 #define PPC_BPF_STWCX(s, a, b) EMIT(PPC_INST_STWCX | ___PPC_RS(s) |  \
___PPC_RA(a) | ___PPC_RB(b))
-#define PPC_BPF_STDCX(s, a, b) EMIT(PPC_INST_STDCX | ___PPC_RS(s) |  \
-   ___PPC_RA(a) | ___PPC_RB(b))
 #define PPC_CMPWI(a, i)EMIT(PPC_INST_CMPWI | ___PPC_RA(a) | 
IMM_L(i))
 #define PPC_CMPDI(a, i)EMIT(PPC_INST_CMPDI | ___PPC_RA(a) | 
IMM_L(i))
 #define PPC_CMPW(a, b) EMIT(PPC_INST_CMPW | ___PPC_RA(a) |   \
@@ -100,8 +86,6 @@
 
 #define PPC_SUB(d, a, b)   EMIT(PPC_INST_SUB | ___PPC_RT(d) |\
 ___PPC_RB(a) | ___PPC_RA(b))
-#define PPC_ADD(d, a, b)   EMIT(PPC_INST_ADD | ___PPC_RT(d) |\
-___PPC_RA(a) | ___PPC_RB(b))
 #define PPC_MULD(d, a, b)  EMIT(PPC_INST_MULLD | ___PPC_RT(d) |  \
 ___PPC_RA(a) | ___PPC_RB(b))
 #define PPC_MULW(d, a, b)  EMIT(PPC_INST_MULLW | ___PPC_RT(d) |  \
diff --git a/arch/powerpc/net/bpf_jit32.h b/arch/powerpc/net/bpf_jit32.h
index 4ec2a9f14f84..753c244a7cf9 100644
--- a/arch/powerpc/net/bpf_jit32.h
+++ b/arch/powerpc/net/bpf_jit32.h
@@ -76,13 +76,13 @@ DECLARE_LOAD_FUNC(sk_load_byte_msh);
else {  PPC_ADDIS(r, base, IMM_HA(i));\
PPC_LBZ(r, r, IMM_L(i)); } } while(0)
 
-#define PPC_LD_OFFS(r, base, i) do { if ((i) < 32768) PPC_LD(r, base, i); \
+#define PPC_LD_OFFS(r, base, i) do { if ((i) < 32768) EMIT(PPC_RAW_LD(r, base, 
i)); \
else {  PPC_ADDIS(r, base, IMM_HA(i));

[PATCH 2/6] powerpc/ppc-opcode: move ppc instruction encoding from test_emulate_step

2020-05-26 Thread Balamuruhan S

Few ppc instructions are encoded in test_emulate_step.c, consolidate
them and use it from ppc-opcode.h

Signed-off-by: Balamuruhan S 
Acked-by: Naveen N. Rao 
Tested-by: Naveen N. Rao 
---
 arch/powerpc/include/asm/ppc-opcode.h |  35 ++
 arch/powerpc/lib/test_emulate_step.c  | 155 ++
 2 files changed, 91 insertions(+), 99 deletions(-)

diff --git a/arch/powerpc/include/asm/ppc-opcode.h 
b/arch/powerpc/include/asm/ppc-opcode.h
index e3540be1fc17..ca3f0351b878 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -76,6 +76,9 @@
 #define__REGA0_R30 30
 #define__REGA0_R31 31
 
+#define IMM_L(i)   ((uintptr_t)(i) & 0x)
+#define IMM_DS(i)  ((uintptr_t)(i) & 0xfffc)
+
 /* opcode and xopcode for instructions */
 #define OP_TRAP 3
 #define OP_TRAP_64 2
@@ -605,6 +608,38 @@
___PPC_RT(vrt) | \
___PPC_RA(vra) | \
___PPC_RB(vrb) | __PPC_RC21)
+#define PPC_RAW_LD(r, base, i) (PPC_INST_LD | ___PPC_RT(r) | \
+   ___PPC_RA(base) | IMM_DS(i))
+#define PPC_RAW_LWZ(r, base, i)(PPC_INST_LWZ | ___PPC_RT(r) | \
+   ___PPC_RA(base) | IMM_L(i))
+#define PPC_RAW_LWZX(t, a, b)  (PPC_INST_LWZX | ___PPC_RT(t) | \
+   ___PPC_RA(a) | ___PPC_RB(b))
+#define PPC_RAW_STD(r, base, i)(PPC_INST_STD | ___PPC_RS(r) | \
+   ___PPC_RA(base) | IMM_DS(i))
+#define PPC_RAW_STDCX(s, a, b) (PPC_INST_STDCX | ___PPC_RS(s) | \
+   ___PPC_RA(a) | ___PPC_RB(b))
+#define PPC_RAW_LFSX(t, a, b)  (PPC_INST_LFSX | ___PPC_RT(t) | \
+   ___PPC_RA(a) | ___PPC_RB(b))
+#define PPC_RAW_STFSX(s, a, b) (PPC_INST_STFSX | ___PPC_RS(s) | \
+   ___PPC_RA(a) | ___PPC_RB(b))
+#define PPC_RAW_LFDX(t, a, b)  (PPC_INST_LFDX | ___PPC_RT(t) | \
+   ___PPC_RA(a) | ___PPC_RB(b))
+#define PPC_RAW_STFDX(s, a, b) (PPC_INST_STFDX | ___PPC_RS(s) | \
+   ___PPC_RA(a) | ___PPC_RB(b))
+#define PPC_RAW_LVX(t, a, b)   (PPC_INST_LVX | ___PPC_RT(t) | \
+   ___PPC_RA(a) | ___PPC_RB(b))
+#define PPC_RAW_STVX(s, a, b)  (PPC_INST_STVX | ___PPC_RS(s) | \
+   ___PPC_RA(a) | ___PPC_RB(b))
+#define PPC_RAW_ADD(t, a, b)   (PPC_INST_ADD | ___PPC_RT(t) | \
+   ___PPC_RA(a) | ___PPC_RB(b))
+#define PPC_RAW_ADD_DOT(t, a, b)   (PPC_INST_ADD | ___PPC_RT(t) | \
+   ___PPC_RA(a) | ___PPC_RB(b) | \
+   0x1)
+#define PPC_RAW_ADDC(t, a, b)  (PPC_INST_ADDC | ___PPC_RT(t) | \
+   ___PPC_RA(a) | ___PPC_RB(b))
+#define PPC_RAW_ADDC_DOT(t, a, b)  (PPC_INST_ADDC | ___PPC_RT(t) | \
+   ___PPC_RA(a) | ___PPC_RB(b) | \
+   0x1)
 
 /* Deal with instructions that older assemblers aren't aware of */
 #definePPC_CP_ABORTstringify_in_c(.long PPC_INST_CP_ABORT)
diff --git a/arch/powerpc/lib/test_emulate_step.c 
b/arch/powerpc/lib/test_emulate_step.c
index 46af80279ebc..e508290eb15d 100644
--- a/arch/powerpc/lib/test_emulate_step.c
+++ b/arch/powerpc/lib/test_emulate_step.c
@@ -13,49 +13,6 @@
 #include 
 #include 
 
-#define IMM_L(i)   ((uintptr_t)(i) & 0x)
-#define IMM_DS(i)  ((uintptr_t)(i) & 0xfffc)
-
-/*
- * Defined with TEST_ prefix so it does not conflict with other
- * definitions.
- */
-#define TEST_LD(r, base, i)ppc_inst(PPC_INST_LD | ___PPC_RT(r) |   
\
-   ___PPC_RA(base) | IMM_DS(i))
-#define TEST_LWZ(r, base, i)   ppc_inst(PPC_INST_LWZ | ___PPC_RT(r) |  
\
-   ___PPC_RA(base) | IMM_L(i))
-#define TEST_LWZX(t, a, b) ppc_inst(PPC_INST_LWZX | ___PPC_RT(t) | 
\
-   ___PPC_RA(a) | ___PPC_RB(b))
-#define TEST_STD(r, base, i)   ppc_inst(PPC_INST_STD | ___PPC_RS(r) |  
\
-   ___PPC_RA(base) | IMM_DS(i))
-#define TEST_LDARX(t, a, b, eh)ppc_inst(PPC_INST_LDARX | ___PPC_RT(t) 
|\
-   ___PPC_RA(a) | ___PPC_RB(b) |   \
-   __PPC_EH(eh))
-#define TEST_STDCX(s, a, b)ppc_inst(PPC_INST_STDCX |

[PATCH 1/6] powerpc/ppc-opcode: introduce PPC_RAW_* macros for base instruction encoding

2020-05-26 Thread Balamuruhan S

Introduce PPC_RAW_* macros to have all the bare encoding of ppc
instructions. Move `VSX_XX*()` and `TMRN()` macros up to reuse it.

Signed-off-by: Balamuruhan S 
Acked-by: Naveen N. Rao 
Tested-by: Naveen N. Rao 
---
 arch/powerpc/include/asm/ppc-opcode.h | 183 --
 1 file changed, 175 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/include/asm/ppc-opcode.h 
b/arch/powerpc/include/asm/ppc-opcode.h
index 2a39c716c343..e3540be1fc17 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -431,6 +431,181 @@
 #define __PPC_EH(eh)   0
 #endif
 
+/* Base instruction encoding */
+#define PPC_RAW_CP_ABORT   (PPC_INST_CP_ABORT)
+#define PPC_RAW_COPY(a, b) (PPC_INST_COPY | ___PPC_RA(a) | \
+   ___PPC_RB(b))
+#define PPC_RAW_DARN(t, l) (PPC_INST_DARN | ___PPC_RT(t) | \
+   (((l) & 0x3) << 16))
+#define PPC_RAW_DCBAL(a, b)(PPC_INST_DCBAL | __PPC_RA(a) | \
+   __PPC_RB(b))
+#define PPC_RAW_DCBZL(a, b)(PPC_INST_DCBZL | __PPC_RA(a) | \
+   __PPC_RB(b))
+#define PPC_RAW_LQARX(t, a, b, eh) (PPC_INST_LQARX | ___PPC_RT(t) | \
+   ___PPC_RA(a) | \
+   ___PPC_RB(b) | __PPC_EH(eh))
+#define PPC_RAW_LDARX(t, a, b, eh) (PPC_INST_LDARX | ___PPC_RT(t) | \
+   ___PPC_RA(a) | \
+   ___PPC_RB(b) | __PPC_EH(eh))
+#define PPC_RAW_LWARX(t, a, b, eh) (PPC_INST_LWARX | \
+   ___PPC_RT(t) | ___PPC_RA(a) | \
+   ___PPC_RB(b) | __PPC_EH(eh))
+#define PPC_RAW_STQCX(t, a, b) (PPC_INST_STQCX | \
+   ___PPC_RT(t) | ___PPC_RA(a) | \
+   ___PPC_RB(b))
+#define PPC_RAW_MADDHD(t, a, b, c) (PPC_INST_MADDHD | \
+   ___PPC_RT(t) | ___PPC_RA(a) | \
+   ___PPC_RB(b) | ___PPC_RC(c))
+#define PPC_RAW_MADDHDU(t, a, b, c)(PPC_INST_MADDHDU | \
+   ___PPC_RT(t) | ___PPC_RA(a) | \
+   ___PPC_RB(b) | ___PPC_RC(c))
+#define PPC_RAW_MADDLD(t, a, b, c) (PPC_INST_MADDLD | \
+   ___PPC_RT(t) | ___PPC_RA(a) | \
+   ___PPC_RB(b) | ___PPC_RC(c))
+#define PPC_RAW_MSGSND(b)  (PPC_INST_MSGSND | ___PPC_RB(b))
+#define PPC_RAW_MSGSYNC(PPC_INST_MSGSYNC)
+#define PPC_RAW_MSGCLR(b)  (PPC_INST_MSGCLR | ___PPC_RB(b))
+#define PPC_RAW_MSGSNDP(b) (PPC_INST_MSGSNDP | ___PPC_RB(b))
+#define PPC_RAW_MSGCLRP(b) (PPC_INST_MSGCLRP | ___PPC_RB(b))
+#define PPC_RAW_PASTE(a, b)(PPC_INST_PASTE | ___PPC_RA(a) | \
+   ___PPC_RB(b))
+#define PPC_RAW_POPCNTB(a, s)  (PPC_INST_POPCNTB | __PPC_RA(a) | \
+   __PPC_RS(s))
+#define PPC_RAW_POPCNTD(a, s)  (PPC_INST_POPCNTD | __PPC_RA(a) | \
+   __PPC_RS(s))
+#define PPC_RAW_POPCNTW(a, s)  (PPC_INST_POPCNTW | __PPC_RA(a) | \
+   __PPC_RS(s))
+#define PPC_RAW_RFCI   (PPC_INST_RFCI)
+#define PPC_RAW_RFDI   (PPC_INST_RFDI)
+#define PPC_RAW_RFMCI  (PPC_INST_RFMCI)
+#define PPC_RAW_TLBILX(t, a, b)(PPC_INST_TLBILX | \
+   __PPC_T_TLB(t) | \
+   __PPC_RA0(a) | \
+   __PPC_RB(b))
+#define PPC_RAW_WAIT(w)(PPC_INST_WAIT | __PPC_WC(w))
+#define PPC_RAW_TLBIE(lp, a)   (PPC_INST_TLBIE | ___PPC_RB(a) | \
+   ___PPC_RS(lp))
+#define PPC_RAW_TLBIE_5(rb, rs, ric, prs, r) \
+   (PPC_INST_TLBIE | \
+   ___PPC_RB(rb) | \
+   ___PPC_RS(rs) | \
+   ___PPC_RIC(ric) | \
+   ___PPC_PRS(prs) | \
+   ___PPC_R(r))
+#define PPC_RAW_TLBIEL(rb, rs, ric, prs, r) \
+   (PPC_INST_TLBIEL | \
+   ___PPC_RB(rb) | \
+   ___PPC_RS(rs) |

[PATCH 0/6] consolidate PowerPC instruction encoding macros

2020-05-26 Thread Balamuruhan S

ppc-opcode.h have base instruction encoding wrapped with stringify_in_c()
for raw encoding to have compatibility. But there are redundant macros for
base instruction encodings in bpf, instruction emulation test infrastructure
and powerpc selftests.

Currently PPC_INST_* macros are used for encoding instruction opcode and PPC_*
for raw instuction encoding, this rfc patchset introduces PPC_RAW_* macros for
base instruction encoding and reuse it from elsewhere. With this change we can
avoid redundant macro definitions in multiple files and start adding new
instructions in ppc-opcode.h in future.

Changes in v1:
-
* Drop the patch that had changes in stringloops Makefile.
* Include Acked-by and Tested-by tag from Naveen.
* Rebased on next branch of linuxppc tree.

Changes in rfc v2:
-
Fix review comments/suggestions from Naveen and Michael Ellerman,

* Rename PPC_ENCODE_* to PPC_RAW_* for base instruction encoding macros.
* Split the patches that does mass renaming and make them simpler that just
  adds new macros.
* Keep the patch to update all the existing names later (patch 6).
* Lot of PPC_INST_* macros are used only in ppc-opcode.h for PPC_*  macros,
  fold PPC_INST_* encoding into PPC_RAW_* to avoid using them accidentally.
* Fixed clipped macros that was due to a typo/copy-paste
* Consolidated all the instruction encoding macros from bpf_jit.h to
  ppc-opcode.h
* squashed patch that removes the duplicate macro PPC_MR() in bpf_jit.h
* merge few changes in bpf_jit files from patch 2 into patch 3
* few fixes in powerpc selftest stringloops Makefile
* build tested for ppc64le_defconfig, ppc64e_defconfig and pmac32_defconfig
* Rebased on next branch of linuxppc tree

Testing:
---
* Tested it by compiling vmlinux and comparing objdump of it with and without
  the patchset and observed that it remains same,

  # diff vmlinux_objdump vmlinux_rfc_objdump 
  2c2
  < vmlinux: file format elf64-powerpcle
  ---
  > vmlinux_rfc: file format elf64-powerpcle

* Tested building it with this changes for Fedora30 config, booted VM
  with powerpc next and powerpc next + patchset to run powerpc selftest and
  ftrace selftest. There were couple of failures that were common and
  patchset did not introduce any new failures.

  ftrace selftest:
  ---
# # of passed:  96
# # of failed:  1
# # of unresolved:  7
# # of untested:  0
# # of unsupported:  1
# # of xfailed:  1
# # of undefined(test bug):  0
not ok 1 selftests: ftrace: ftracetest # exit=1

  powerpc selftest:
  
not ok 7 selftests: powerpc/dscr: dscr_sysfs_thread_test # exit=1
not ok 20 selftests: powerpc/pmu/ebb: lost_exception_test # TIMEOUT
not ok 2 selftests: powerpc/security: spectre_v2 # exit=1

Thanks to Naveen, Sandipan and Michael on overall suggestions/improvements.

I would request for review and suggestions to make it better.

rfc v2: https://lists.ozlabs.org/pipermail/linuxppc-dev/2020-April/209395.html
rfc v1: https://lists.ozlabs.org/pipermail/linuxppc-dev/2020-March/206494.html

Balamuruhan S (6):
  powerpc/ppc-opcode: introduce PPC_RAW_* macros for base instruction
encoding
  powerpc/ppc-opcode: move ppc instruction encoding from
test_emulate_step
  powerpc/bpf_jit: reuse instruction macros from ppc-opcode.h
  powerpc/ppc-opcode: consolidate powerpc instructions from bpf_jit.h
  powerpc/ppc-opcode: reuse raw instruction macros to stringify
  powerpc/ppc-opcode: fold PPC_INST_* macros into PPC_RAW_* macros

 arch/powerpc/include/asm/ppc-opcode.h | 706 +++---
 arch/powerpc/lib/test_emulate_step.c  | 155 ++
 arch/powerpc/net/bpf_jit.h| 184 +--
 arch/powerpc/net/bpf_jit32.h  |  34 +-
 arch/powerpc/net/bpf_jit64.h  |  16 +-
 arch/powerpc/net/bpf_jit_comp.c   | 134 ++---
 arch/powerpc/net/bpf_jit_comp64.c | 298 +--
 7 files changed, 733 insertions(+), 794 deletions(-)


base-commit: 30df74d67d48949da87e3a5b57c381763e8fd526
-- 
2.24.1

Re: [linux-next PATCH] mm/gup.c: Convert to use get_user_{page|pages}_fast_only()

2020-05-26 Thread Paul Mackerras

On Mon, May 25, 2020 at 02:23:32PM +0530, Souptick Joarder wrote:
> API __get_user_pages_fast() renamed to get_user_pages_fast_only()
> to align with pin_user_pages_fast_only().
> 
> As part of this we will get rid of write parameter.
> Instead caller will pass FOLL_WRITE to get_user_pages_fast_only().
> This will not change any existing functionality of the API.
> 
> All the callers are changed to pass FOLL_WRITE.
> 
> Also introduce get_user_page_fast_only(), and use it in a few
> places that hard-code nr_pages to 1.
> 
> Updated the documentation of the API.
> 
> Signed-off-by: Souptick Joarder 

The arch/powerpc/kvm bits look reasonable.

Reviewed-by: Paul Mackerras

Re: [RFC PATCH v2 7/7] powerpc/selftest: reuse ppc-opcode macros to avoid redundancy

2020-05-26 Thread Balamuruhan S

On Thu, 2020-04-30 at 17:27 +0530, Naveen N. Rao wrote:
> Michael Ellerman wrote:
> > "Naveen N. Rao"  writes:
> > > Michael Ellerman wrote:
> > > > Balamuruhan S  writes:
> > > > > Avoid redefining macros to encode ppc instructions instead reuse it
> > > > > from
> > > > > ppc-opcode.h, Makefile changes are necessary to compile memcmp_64.S
> > > > > with
> > > > > __ASSEMBLY__ defined from selftests.
> > > > > 
> > > > > Signed-off-by: Balamuruhan S 
> > > > > ---
> > > > >  .../selftests/powerpc/stringloops/Makefile| 34 ++---
> > > > > -
> > > > >  .../powerpc/stringloops/asm/asm-const.h   |  1 +
> > > > >  .../powerpc/stringloops/asm/ppc-opcode.h  | 36 +--
> > > > > 
> > > > >  3 files changed, 29 insertions(+), 42 deletions(-)
> > > > >  create mode 12
> > > > > tools/testing/selftests/powerpc/stringloops/asm/asm-const.h
> > > > >  mode change 100644 => 12
> > > > > tools/testing/selftests/powerpc/stringloops/asm/ppc-opcode.h
> > > > > 
> > > > > diff --git a/tools/testing/selftests/powerpc/stringloops/Makefile
> > > > > b/tools/testing/selftests/powerpc/stringloops/Makefile
> > > > > index 7fc0623d85c3..efe76c5a5b94 100644
> > > > > --- a/tools/testing/selftests/powerpc/stringloops/Makefile
> > > > > +++ b/tools/testing/selftests/powerpc/stringloops/Makefile
> > > > > @@ -1,26 +1,44 @@
> > > > >  # SPDX-License-Identifier: GPL-2.0
> > > > >  # The loops are all 64-bit code
> > > > > -CFLAGS += -I$(CURDIR)
> > > > > +GIT_VERSION = $(shell git describe --always --long --dirty || echo
> > > > > "unknown")
> > > > > +CFLAGS += -DGIT_VERSION='"$(GIT_VERSION)"' -I$(CURDIR)
> > > > > -I$(CURDIR)/../include
> > > > >  
> > > > >  EXTRA_SOURCES := ../harness.c
> > > > >  
> > > > >  build_32bit = $(shell if ($(CC) $(CFLAGS) -m32 -o /dev/null memcmp.c
> > > > > >/dev/null 2>&1) then echo "1"; fi)
> > > > >  
> > > > > +ifneq ($(build_32bit),1)
> > > > >  TEST_GEN_PROGS := memcmp_64 strlen
> > > > > +TEST_GEN_FILES := memcmp.o memcmp_64.o memcmp_64
> > > > > +MEMCMP := $(OUTPUT)/memcmp.o
> > > > > +MEMCMP_64 := $(OUTPUT)/memcmp_64.o
> > > > > +HARNESS :=  $(OUTPUT)/../harness.o
> > > > > +CFLAGS += -m64 -maltivec
> > > > >  
> > > > > -$(OUTPUT)/memcmp_64: memcmp.c
> > > > > -$(OUTPUT)/memcmp_64: CFLAGS += -m64 -maltivec
> > > > > +OVERRIDE_TARGETS := 1
> > > > > +include ../../lib.mk
> > > > >  
> > > > > -ifeq ($(build_32bit),1)
> > > > > +$(OUTPUT)/memcmp_64: $(MEMCMP_64) $(MEMCMP) $(HARNESS)
> > > > > + $(CC) $(CFLAGS) memcmp.o memcmp_64.o ../harness.o -o memcmp_64
> > > > > +
> > > > > +$(MEMCMP_64): memcmp_64.S
> > > > > + $(CC) $(CFLAGS) -D__ASSEMBLY__ -o memcmp_64.o -c memcmp_64.S
> > > > > +
> > > > > +$(MEMCMP): memcmp.c
> > > > > + $(CC) $(CFLAGS) -o memcmp.o -c memcmp.c
> > > > > +
> > > > > +$(HARNESS): $(EXTRA_SOURCES)
> > > > > + $(CC) $(CFLAGS) -DGIT_VERSION='"$(GIT_VERSION)"' -o
> > > > > ../harness.o -c $(EXTRA_SOURCES)
> > > > 
> > > > What are you actually trying to do here? Is it just that you need to
> > > > define __ASSEMBLY__ for memcmp_64.S?
> > > 
> > > Adding __ASSEMBLY__ while building memcmp_64.S would be the goal, so as 
> > > to reuse ppc-opcode.h. However, asm/ppc-opcode.h under stringloops test 
> > > is tiny and doesn't seem to justify the change.

Okay, I will drop the last patch that have changes for stringloops Makefile.

make and make clean is not working from inside stringloops directory which is
fixed with this change.


> > 
> > I don't see ppc-opcode.h testing __ASSEMBLY__ though, so I don't think
> > we even need to define it?
> 
> Right -- it's rather 'stringify_in_c' which tests it. 'asm/ppc-opcode.h' 
> under stringloops/ unconditionally defines 'stringify_in_c' this way:
>   #  define stringify_in_c(...)   __VA_ARGS__ 
> 

It is expecting __ASSEMBLY__ through ppc-opcode.h -> asm-const.h to raw encode
the instruction in assembly file instead to stringify it for c file. we observe
this Assembler messages without defining __ASSEMBLY__,

memcmp_64.S: Assembler messages:
memcmp_64.S:473: Error: unknown pseudo-op: `.long (0x10c7 | (((0) & 0x1f)
<< 21) | (((0) & 0x1f) << 16) | (((1) & 0x1f) << 11) | (0x1 << 10))'
memcmp_64.S:477: Error: unknown pseudo-op: `.long (0x10c7 | (((0) & 0x1f)
<< 21) | (((0) & 0x1f) << 16) | (((1) & 0x1f) << 11) | (0x1 << 10))'
memcmp_64.S:586: Error: unknown pseudo-op: `.long (0x1006 | (((7) & 0x1f)
<< 21) | (((9) & 0x1f) << 16) | (((10) & 0x1f) << 11) | (0x1 << 10))'
memcmp_64.S:607: Error: unknown pseudo-op: `.long (0x1006 | (((7) & 0x1f)
<< 21) | (((9) & 0x1f) << 16) | (((10) & 0x1f) << 11) | (0x1 << 10))'
memcmp_64.S:616: Error: unknown pseudo-op: `.long (0x1006 | (((7) & 0x1f)
<< 21) | (((9) & 0x1f) << 16) | (((10) & 0x1f) << 11) | (0x1 << 10))'
make[1]: *** [../../lib.mk:148:
/home/bala/linux/tools/testing/selftests/powerpc/stringloops/memcmp_64] Error 1

-- Bala
> 
> - Naveen
>

Re: [PATCH V3 2/2] tools/perf: Add perf tools support for extended register capability in powerpc

2020-05-26 Thread Madhavan Srinivasan





On 5/20/20 3:15 PM, Athira Rajeev wrote:

From: Anju T Sudhakar 

Add extended regs to sample_reg_mask in the tool side to use
with `-I?` option. Perf tools side uses extended mask to display
the platform supported register names (with -I? option) to the user
and also send this mask to the kernel to capture the extended registers
in each sample. Hence decide the mask value based on the processor
version.

Signed-off-by: Anju T Sudhakar 
[Decide extended mask at run time based on platform]
Signed-off-by: Athira Rajeev 


Reviewed-by: Madhavan Srinivasan 


---
  tools/arch/powerpc/include/uapi/asm/perf_regs.h | 14 ++-
  tools/perf/arch/powerpc/include/perf_regs.h |  5 ++-
  tools/perf/arch/powerpc/util/perf_regs.c| 55 +
  3 files changed, 72 insertions(+), 2 deletions(-)

diff --git a/tools/arch/powerpc/include/uapi/asm/perf_regs.h 
b/tools/arch/powerpc/include/uapi/asm/perf_regs.h
index f599064..485b1d5 100644
--- a/tools/arch/powerpc/include/uapi/asm/perf_regs.h
+++ b/tools/arch/powerpc/include/uapi/asm/perf_regs.h
@@ -48,6 +48,18 @@ enum perf_event_powerpc_regs {
PERF_REG_POWERPC_DSISR,
PERF_REG_POWERPC_SIER,
PERF_REG_POWERPC_MMCRA,
-   PERF_REG_POWERPC_MAX,
+   /* Extended registers */
+   PERF_REG_POWERPC_MMCR0,
+   PERF_REG_POWERPC_MMCR1,
+   PERF_REG_POWERPC_MMCR2,
+   /* Max regs without the extended regs */
+   PERF_REG_POWERPC_MAX = PERF_REG_POWERPC_MMCRA + 1,
  };
+
+#define PERF_REG_PMU_MASK  ((1ULL << PERF_REG_POWERPC_MAX) - 1)
+
+/* PERF_REG_EXTENDED_MASK value for CPU_FTR_ARCH_300 */
+#define PERF_REG_PMU_MASK_300   (((1ULL << (PERF_REG_POWERPC_MMCR2 + 1)) - 1) \
+   - PERF_REG_PMU_MASK)
+
  #endif /* _UAPI_ASM_POWERPC_PERF_REGS_H */
diff --git a/tools/perf/arch/powerpc/include/perf_regs.h 
b/tools/perf/arch/powerpc/include/perf_regs.h
index e18a355..46ed00d 100644
--- a/tools/perf/arch/powerpc/include/perf_regs.h
+++ b/tools/perf/arch/powerpc/include/perf_regs.h
@@ -64,7 +64,10 @@
[PERF_REG_POWERPC_DAR] = "dar",
[PERF_REG_POWERPC_DSISR] = "dsisr",
[PERF_REG_POWERPC_SIER] = "sier",
-   [PERF_REG_POWERPC_MMCRA] = "mmcra"
+   [PERF_REG_POWERPC_MMCRA] = "mmcra",
+   [PERF_REG_POWERPC_MMCR0] = "mmcr0",
+   [PERF_REG_POWERPC_MMCR1] = "mmcr1",
+   [PERF_REG_POWERPC_MMCR2] = "mmcr2",
  };

  static inline const char *perf_reg_name(int id)
diff --git a/tools/perf/arch/powerpc/util/perf_regs.c 
b/tools/perf/arch/powerpc/util/perf_regs.c
index 0a52429..9179230 100644
--- a/tools/perf/arch/powerpc/util/perf_regs.c
+++ b/tools/perf/arch/powerpc/util/perf_regs.c
@@ -6,9 +6,14 @@

  #include "../../../util/perf_regs.h"
  #include "../../../util/debug.h"
+#include "../../../util/event.h"
+#include "../../../util/header.h"
+#include "../../../perf-sys.h"

  #include 

+#define PVR_POWER9 0x004E
+
  const struct sample_reg sample_reg_masks[] = {
SMPL_REG(r0, PERF_REG_POWERPC_R0),
SMPL_REG(r1, PERF_REG_POWERPC_R1),
@@ -55,6 +60,9 @@
SMPL_REG(dsisr, PERF_REG_POWERPC_DSISR),
SMPL_REG(sier, PERF_REG_POWERPC_SIER),
SMPL_REG(mmcra, PERF_REG_POWERPC_MMCRA),
+   SMPL_REG(mmcr0, PERF_REG_POWERPC_MMCR0),
+   SMPL_REG(mmcr1, PERF_REG_POWERPC_MMCR1),
+   SMPL_REG(mmcr2, PERF_REG_POWERPC_MMCR2),
SMPL_REG_END
  };

@@ -163,3 +171,50 @@ int arch_sdt_arg_parse_op(char *old_op, char **new_op)

return SDT_ARG_VALID;
  }
+
+uint64_t arch__intr_reg_mask(void)
+{
+   struct perf_event_attr attr = {
+   .type   = PERF_TYPE_HARDWARE,
+   .config = PERF_COUNT_HW_CPU_CYCLES,
+   .sample_type= PERF_SAMPLE_REGS_INTR,
+   .precise_ip = 1,
+   .disabled   = 1,
+   .exclude_kernel = 1,
+   };
+   int fd, ret;
+   char buffer[64];
+   u32 version;
+   u64 extended_mask = 0;
+
+   /* Get the PVR value to set the extended
+* mask specific to platform
+*/
+   get_cpuid(buffer, sizeof(buffer));
+   ret = sscanf(buffer, "%u,", &version);
+
+   if (ret != 1) {
+   pr_debug("Failed to get the processor version, unable to output 
extended registers\n");
+   return PERF_REGS_MASK;
+   }
+
+   if (version == PVR_POWER9)
+   extended_mask = PERF_REG_PMU_MASK_300;
+   else
+   return PERF_REGS_MASK;
+
+   attr.sample_regs_intr = extended_mask;
+   attr.sample_period = 1;
+   event_attr_init(&attr);
+
+   /*
+* check if the pmu supports perf extended regs, before
+* returning the register mask to sample.
+*/
+   fd = sys_perf_event_open(&attr, 0, -1, -1, 0);
+   if (fd != -1) {
+   close(fd);
+   return (extended_mask | PERF_REGS_MASK);
+   }
+   return PERF_REGS_MASK

Re: [PATCH V3 1/2] powerpc/perf: Add support for outputting extended regs in perf intr_regs

2020-05-26 Thread Madhavan Srinivasan





On 5/20/20 3:15 PM, Athira Rajeev wrote:

From: Anju T Sudhakar 

Add support for perf extended register capability in powerpc.
The capability flag PERF_PMU_CAP_EXTENDED_REGS, is used to indicate the
PMU which support extended registers. The generic code define the mask
of extended registers as 0 for non supported architectures.

Patch adds extended regs support for power9 platform by
exposing MMCR0, MMCR1 and MMCR2 registers.

REG_RESERVED mask needs update to include extended regs.
`PERF_REG_EXTENDED_MASK`, contains mask value of the supported registers,
is defined at runtime in the kernel based on platform since the supported
registers may differ from one processor version to another and hence the
MASK value.

with patch
--

available registers: r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11
r12 r13 r14 r15 r16 r17 r18 r19 r20 r21 r22 r23 r24 r25 r26
r27 r28 r29 r30 r31 nip msr orig_r3 ctr link xer ccr softe
trap dar dsisr sier mmcra mmcr0 mmcr1 mmcr2

PERF_RECORD_SAMPLE(IP, 0x1): 4784/4784: 0 period: 1 addr: 0
... intr regs: mask 0x ABI 64-bit
 r00xc012b77c
 r10xc03fe5e03930
 r20xc1b0e000
 r30xc03fdcddf800
 r40xc03fc788
 r50x9c422724be
 r60xc03fe5e03908
 r70xff63bddc8706
 r80x9e4
 r90x0
 r10   0x1
 r11   0x0
 r12   0xc01299c0
 r13   0xc03c4800
 r14   0x0
 r15   0x7fffdd8b8b00
 r16   0x0
 r17   0x7fffdd8be6b8
 r18   0x7e7076607730
 r19   0x2f
 r20   0xc0001fc26c68
 r21   0xc0002041e4227e00
 r22   0xc0002018fb60
 r23   0x1
 r24   0xc03ffec4d900
 r25   0x8000
 r26   0x0
 r27   0x1
 r28   0x1
 r29   0xc1be1260
 r30   0x6008010
 r31   0xc03ffebb7218
 nip   0xc012b910
 msr   0x90009033
 orig_r3 0xc012b86c
 ctr   0xc01299c0
 link  0xc012b77c
 xer   0x0
 ccr   0x2800
 softe 0x1
 trap  0xf00
 dar   0x0
 dsisr 0x800
 sier  0x0
 mmcra 0x800
 mmcr0 0x82008090
 mmcr1 0x1e00
 mmcr2 0x0
  ... thread: perf:4784

Signed-off-by: Anju T Sudhakar 
[Defined PERF_REG_EXTENDED_MASK at run time to add support for different 
platforms ]
Signed-off-by: Athira Rajeev 


Patch looks fine except for couple minor nits (extra tabs and newline 
issue).


Reviewed-by: Madhavan Srinivasan 

---
  arch/powerpc/include/asm/perf_event_server.h |  8 +++
  arch/powerpc/include/uapi/asm/perf_regs.h| 14 +++-
  arch/powerpc/perf/core-book3s.c  |  1 +
  arch/powerpc/perf/perf_regs.c| 34 +---
  arch/powerpc/perf/power9-pmu.c   |  6 +
  5 files changed, 59 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/perf_event_server.h 
b/arch/powerpc/include/asm/perf_event_server.h
index 3e9703f..1458e1a 100644
--- a/arch/powerpc/include/asm/perf_event_server.h
+++ b/arch/powerpc/include/asm/perf_event_server.h
@@ -15,6 +15,9 @@
  #define MAX_EVENT_ALTERNATIVES8
  #define MAX_LIMITED_HWCOUNTERS2

+extern u64 mask_var;
+#define PERF_REG_EXTENDED_MASK  mask_var
+
  struct perf_event;

  /*
@@ -55,6 +58,11 @@ struct power_pmu {
int *blacklist_ev;
/* BHRB entries in the PMU */
int bhrb_nr;
+   /*
+* set this flag with `PERF_PMU_CAP_EXTENDED_REGS` if
+* the pmu supports extended perf regs capability
+*/
+   int capabilities;
  };

  /*
diff --git a/arch/powerpc/include/uapi/asm/perf_regs.h 
b/arch/powerpc/include/uapi/asm/perf_regs.h
index f599064..485b1d5 100644
--- a/arch/powerpc/include/uapi/asm/perf_regs.h
+++ b/arch/powerpc/include/uapi/asm/perf_regs.h
@@ -48,6 +48,18 @@ enum perf_event_powerpc_regs {
PERF_REG_POWERPC_DSISR,
PERF_REG_POWERPC_SIER,
PERF_REG_POWERPC_MMCRA,
-   PERF_REG_POWERPC_MAX,
+   /* Extended registers */
+   PERF_REG_POWERPC_MMCR0,
+   PERF_REG_POWERPC_MMCR1,
+   PERF_REG_POWERPC_MMCR2,
+   /* Max regs without the extended regs */
+   PERF_REG_POWERPC_MAX = PERF_REG_POWERPC_MMCRA + 1,
  };
+
+#define PERF_REG_PMU_MASK  ((1ULL << PERF_REG_POWERPC_MAX) - 1)
+
+/* PERF_REG_EXTENDED_MASK value for CPU_FTR_ARCH_300 */
+#define PERF_REG_PMU_MASK_300   (((1ULL << (PERF_REG_POWERPC_MMCR2 + 1)) - 1) \
+   - PERF_REG_PMU_MASK)
+
  #endif /* _UAPI_ASM_POWERPC_PERF_REGS_H */
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 3dcfecf..f56b778 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -2276,6 +2276,7 @@ int register_power_pmu(struct power_pmu *pmu)

power_pmu.attr_groups = ppmu->attr_groups;

+   power_pmu.capabilities |= (ppmu->capabilities & 
PERF_PMU_CAP_EXTENDED_REGS);

Co

[PATCH v2] powerpc: Add ppc_inst_as_u64()

2020-05-26 Thread Michael Ellerman

The code patching code wants to get the value of a struct ppc_inst as
a u64 when the instruction is prefixed, so we can pass the u64 down to
__put_user_asm() and write it with a single store.

The optprobes code wants to load a struct ppc_inst as an immediate
into a register so it is useful to have it as a u64 to use the
existing helper function.

Currently this is a bit awkward because the value differs based on the
CPU endianness, so add a helper to do the conversion.

This fixes the usage in arch_prepare_optimized_kprobe() which was
previously incorrect on big endian.

Fixes: 650b55b707fd ("powerpc: Add prefixed instructions to instruction data 
type")
Signed-off-by: Michael Ellerman 
Tested-by: Jordan Niethe 
Link: https://lore.kernel.org/r/20200525055004.2182328-1-...@ellerman.id.au
---
 arch/powerpc/include/asm/inst.h  | 9 +
 arch/powerpc/kernel/optprobes.c  | 3 +--
 arch/powerpc/lib/code-patching.c | 8 +---
 3 files changed, 11 insertions(+), 9 deletions(-)

v2: Update the commit message as noted by Jordan.
Add a Fixes tag.

diff --git a/arch/powerpc/include/asm/inst.h b/arch/powerpc/include/asm/inst.h
index 5b756ba77ed2..45f3ec868258 100644
--- a/arch/powerpc/include/asm/inst.h
+++ b/arch/powerpc/include/asm/inst.h
@@ -113,6 +113,15 @@ static inline struct ppc_inst *ppc_inst_next(void 
*location, struct ppc_inst *va
return location + ppc_inst_len(tmp);
 }
 
+static inline u64 ppc_inst_as_u64(struct ppc_inst x)
+{
+#ifdef CONFIG_CPU_LITTLE_ENDIAN
+   return (u64)ppc_inst_suffix(x) << 32 | ppc_inst_val(x);
+#else
+   return (u64)ppc_inst_val(x) << 32 | ppc_inst_suffix(x);
+#endif
+}
+
 int probe_user_read_inst(struct ppc_inst *inst,
 struct ppc_inst __user *nip);
 
diff --git a/arch/powerpc/kernel/optprobes.c b/arch/powerpc/kernel/optprobes.c
index 3ac105e7faae..69bfe96884e2 100644
--- a/arch/powerpc/kernel/optprobes.c
+++ b/arch/powerpc/kernel/optprobes.c
@@ -283,8 +283,7 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe 
*op, struct kprobe *p)
 * 3. load instruction to be emulated into relevant register, and
 */
temp = ppc_inst_read((struct ppc_inst *)p->ainsn.insn);
-   patch_imm64_load_insns(ppc_inst_val(temp) | ((u64)ppc_inst_suffix(temp) 
<< 32),
-  4, buff + TMPL_INSN_IDX);
+   patch_imm64_load_insns(ppc_inst_as_u64(temp), 4, buff + TMPL_INSN_IDX);
 
/*
 * 4. branch back from trampoline
diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index 64cf621e5b00..5ecf0d635a8d 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -27,13 +27,7 @@ static int __patch_instruction(struct ppc_inst *exec_addr, 
struct ppc_inst instr
if (!ppc_inst_prefixed(instr)) {
__put_user_asm(ppc_inst_val(instr), patch_addr, err, "stw");
} else {
-#ifdef CONFIG_CPU_LITTLE_ENDIAN
-   __put_user_asm((u64)ppc_inst_suffix(instr) << 32 |
-  ppc_inst_val(instr), patch_addr, err, "std");
-#else
-   __put_user_asm((u64)ppc_inst_val(instr) << 32 |
-  ppc_inst_suffix(instr), patch_addr, err, "std");
-#endif
+   __put_user_asm(ppc_inst_as_u64(instr), patch_addr, err, "std");
}
 
if (err)
-- 
2.25.1

66 matches

Mail list logo