Re: [PATCH v4 6/7] KVM: MIPS: clean up redundant 'kvm_run' parameters
On 2020/4/27 13:40, Huacai Chen wrote: Reviewed-by: Huacai Chen On Mon, Apr 27, 2020 at 12:35 PM Tianjia Zhang wrote: In the current kvm version, 'kvm_run' has been included in the 'kvm_vcpu' structure. For historical reasons, many kvm-related function parameters retain the 'kvm_run' and 'kvm_vcpu' parameters at the same time. This patch does a unified cleanup of these remaining redundant parameters. Signed-off-by: Tianjia Zhang --- arch/mips/include/asm/kvm_host.h | 28 +--- arch/mips/kvm/emulate.c | 59 ++-- arch/mips/kvm/mips.c | 11 ++- arch/mips/kvm/trap_emul.c| 114 ++- arch/mips/kvm/vz.c | 26 +++ 5 files changed, 87 insertions(+), 151 deletions(-) Hi Huacai, These two patches (6/7 and 7/7) should be merged into the tree of the mips architecture separately. At present, there seems to be no good way to merge the patches for all architectures together. For this series of patches, some architectures have been merged, and some need an updated patch. Thanks and best, Tianjia
Re: [PATCH v3 1/3] riscv: Move kernel mapping to vmalloc zone
On Wed, May 27, 2020 at 1:06 AM Alex Ghiti wrote: > > Hi Zong, > > Le 5/26/20 à 5:43 AM, Zong Li a écrit : > > On Sun, May 24, 2020 at 4:54 PM Alexandre Ghiti wrote: > >> This is a preparatory patch for relocatable kernel. > >> > >> The kernel used to be linked at PAGE_OFFSET address and used to be loaded > >> physically at the beginning of the main memory. Therefore, we could use > >> the linear mapping for the kernel mapping. > >> > >> But the relocated kernel base address will be different from PAGE_OFFSET > >> and since in the linear mapping, two different virtual addresses cannot > >> point to the same physical address, the kernel mapping needs to lie outside > >> the linear mapping. > >> > >> In addition, because modules and BPF must be close to the kernel (inside > >> +-2GB window), the kernel is placed at the end of the vmalloc zone minus > >> 2GB, which leaves room for modules and BPF. The kernel could not be > >> placed at the beginning of the vmalloc zone since other vmalloc > >> allocations from the kernel could get all the +-2GB window around the > >> kernel which would prevent new modules and BPF programs to be loaded. > >> > >> Signed-off-by: Alexandre Ghiti > >> --- > >> arch/riscv/boot/loader.lds.S | 3 +- > >> arch/riscv/include/asm/page.h| 10 +- > >> arch/riscv/include/asm/pgtable.h | 37 +--- > >> arch/riscv/kernel/head.S | 3 +- > >> arch/riscv/kernel/module.c | 4 +-- > >> arch/riscv/kernel/vmlinux.lds.S | 3 +- > >> arch/riscv/mm/init.c | 58 +--- > >> arch/riscv/mm/physaddr.c | 2 +- > >> 8 files changed, 87 insertions(+), 33 deletions(-) > >> > >> diff --git a/arch/riscv/boot/loader.lds.S b/arch/riscv/boot/loader.lds.S > >> index 47a5003c2e28..62d94696a19c 100644 > >> --- a/arch/riscv/boot/loader.lds.S > >> +++ b/arch/riscv/boot/loader.lds.S > >> @@ -1,13 +1,14 @@ > >> /* SPDX-License-Identifier: GPL-2.0 */ > >> > >> #include > >> +#include > >> > >> OUTPUT_ARCH(riscv) > >> ENTRY(_start) > >> > >> SECTIONS > >> { > >> - . = PAGE_OFFSET; > >> + . 
= KERNEL_LINK_ADDR; > >> > >> .payload : { > >> *(.payload) > >> diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h > >> index 2d50f76efe48..48bb09b6a9b7 100644 > >> --- a/arch/riscv/include/asm/page.h > >> +++ b/arch/riscv/include/asm/page.h > >> @@ -90,18 +90,26 @@ typedef struct page *pgtable_t; > >> > >> #ifdef CONFIG_MMU > >> extern unsigned long va_pa_offset; > >> +extern unsigned long va_kernel_pa_offset; > >> extern unsigned long pfn_base; > >> #define ARCH_PFN_OFFSET(pfn_base) > >> #else > >> #define va_pa_offset 0 > >> +#define va_kernel_pa_offset0 > >> #define ARCH_PFN_OFFSET(PAGE_OFFSET >> PAGE_SHIFT) > >> #endif /* CONFIG_MMU */ > >> > >> extern unsigned long max_low_pfn; > >> extern unsigned long min_low_pfn; > >> +extern unsigned long kernel_virt_addr; > >> > >> #define __pa_to_va_nodebug(x) ((void *)((unsigned long) (x) + > >> va_pa_offset)) > >> -#define __va_to_pa_nodebug(x) ((unsigned long)(x) - va_pa_offset) > >> +#define linear_mapping_va_to_pa(x) ((unsigned long)(x) - va_pa_offset) > >> +#define kernel_mapping_va_to_pa(x) \ > >> + ((unsigned long)(x) - va_kernel_pa_offset) > >> +#define __va_to_pa_nodebug(x) \ > >> + (((x) >= PAGE_OFFSET) ? 
\ > >> + linear_mapping_va_to_pa(x) : kernel_mapping_va_to_pa(x)) > >> > >> #ifdef CONFIG_DEBUG_VIRTUAL > >> extern phys_addr_t __virt_to_phys(unsigned long x); > >> diff --git a/arch/riscv/include/asm/pgtable.h > >> b/arch/riscv/include/asm/pgtable.h > >> index 35b60035b6b0..25213cfaf680 100644 > >> --- a/arch/riscv/include/asm/pgtable.h > >> +++ b/arch/riscv/include/asm/pgtable.h > >> @@ -11,23 +11,29 @@ > >> > >> #include > >> > >> -#ifndef __ASSEMBLY__ > >> - > >> -/* Page Upper Directory not used in RISC-V */ > >> -#include > >> -#include > >> -#include > >> -#include > >> - > >> -#ifdef CONFIG_MMU > >> +#ifndef CONFIG_MMU > >> +#define KERNEL_VIRT_ADDR PAGE_OFFSET > >> +#define KERNEL_LINK_ADDR PAGE_OFFSET > >> +#else > >> +/* > >> + * Leave 2GB for modules and BPF that must lie within a 2GB range around > >> + * the kernel. > >> + */ > >> +#define KERNEL_VIRT_ADDR (VMALLOC_END - SZ_2G + 1) > >> +#define KERNEL_LINK_ADDR KERNEL_VIRT_ADDR > >> > >> #define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1) > >> #define VMALLOC_END (PAGE_OFFSET - 1) > >> #define VMALLOC_START(PAGE_OFFSET - VMALLOC_SIZE) > >> > >> #define BPF_JIT_REGION_SIZE(SZ_128M) > >> -#define BPF_JIT_REGION_START (PAGE_OFFSET - BPF_JIT_REGION_SIZE) > >> -#define BPF_JIT_REGION_END (VMALLOC_END) > >> +#define BPF_JIT_REGION_START (kernel_virt_addr) > >> +#define BPF_JIT_REGION_END (kernel_virt_addr + BPF_JIT_REGION_SIZE) > > It seems to have a potenti
Re: [PATCH v4 3/7] KVM: PPC: Remove redundant kvm_run from vcpu_arch
On 2020/5/27 12:20, Paul Mackerras wrote: On Mon, Apr 27, 2020 at 12:35:10PM +0800, Tianjia Zhang wrote: The 'kvm_run' field already exists in the 'vcpu' structure, which is the same structure as the 'kvm_run' in the 'vcpu_arch' and should be deleted. Signed-off-by: Tianjia Zhang Thanks, patches 3 and 4 of this series applied to my kvm-ppc-next branch. Paul. Thanks for your suggestion, for 5/7, I will submit a new version patch. Thanks, Tianjia
Re: [PATCH] media: omap3isp: Shuffle cacheflush.h and include mm.h
On Tue, May 26, 2020 at 09:34:27PM -0700, Nathan Chancellor wrote: > After mm.h was removed from the asm-generic version of cacheflush.h, > s390 allyesconfig shows several warnings of the following nature: Hmm, I'm pretty sure I sent the same fix a few days ago in response to a build bot report. But if that didn't get picked up I'm fine with your version of it as well of course: Acked-by: Christoph Hellwig
[PATCH] media: omap3isp: Shuffle cacheflush.h and include mm.h
After mm.h was removed from the asm-generic version of cacheflush.h, s390 allyesconfig shows several warnings of the following nature: In file included from ./arch/s390/include/generated/asm/cacheflush.h:1, from drivers/media/platform/omap3isp/isp.c:42: ./include/asm-generic/cacheflush.h:16:42: warning: 'struct mm_struct' declared inside parameter list will not be visible outside of this definition or declaration cacheflush.h does not include mm.h nor does it include any forward declaration of these structures hence the warning. To avoid this, include mm.h explicitly in this file and shuffle cacheflush.h below it. Fixes: 19c0054597a0 ("asm-generic: don't include in cacheflush.h") Signed-off-by: Nathan Chancellor --- I am aware the fixes tag is kind of irrelevant because that SHA will change in the next linux-next revision and this will probably get folded into the original patch anyways but still. The other solution would be to add forward declarations of these structs to the top of cacheflush.h, I just chose to do what Christoph did in the original patch. I am happy to do that instead if you all feel that is better. drivers/media/platform/omap3isp/isp.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/media/platform/omap3isp/isp.c b/drivers/media/platform/omap3isp/isp.c index a4ee6b86663e..54106a768e54 100644 --- a/drivers/media/platform/omap3isp/isp.c +++ b/drivers/media/platform/omap3isp/isp.c @@ -39,8 +39,6 @@ * Troy Laramy */ -#include - #include #include #include @@ -49,6 +47,7 @@ #include #include #include +#include #include #include #include @@ -58,6 +57,8 @@ #include #include +#include + #ifdef CONFIG_ARM_DMA_USE_IOMMU #include #endif -- 2.27.0.rc0
Re: [PATCH] KVM: PPC: Book3S HV: read ibm,secure-memory nodes
On Thu, Apr 16, 2020 at 06:27:15PM +0200, Laurent Dufour wrote: > The newly introduced ibm,secure-memory nodes supersede the > ibm,uv-firmware's property secure-memory-ranges. > > Firmware will no more expose the secure-memory-ranges property so first > read the new one and if not found rollback to the older one. > > Signed-off-by: Laurent Dufour Thanks, applied to my kvm-ppc-next branch. Paul.
Re: [PATCH v4 3/7] KVM: PPC: Remove redundant kvm_run from vcpu_arch
On Mon, Apr 27, 2020 at 12:35:10PM +0800, Tianjia Zhang wrote: > The 'kvm_run' field already exists in the 'vcpu' structure, which > is the same structure as the 'kvm_run' in the 'vcpu_arch' and > should be deleted. > > Signed-off-by: Tianjia Zhang Thanks, patches 3 and 4 of this series applied to my kvm-ppc-next branch. Paul.
Re: [PATCH -next] KVM: PPC: Book3S HV: remove redundant NULL check
On Wed, Apr 01, 2020 at 09:09:03PM +0800, Chen Zhou wrote: > Free function kfree() already does NULL check, so the additional > check is unnecessary, just remove it. > > Signed-off-by: Chen Zhou Thanks, applied to my kvm-ppc-next branch. Paul.
Re: [PATCH v2] KVM: PPC: Book3S HV: relax check on H_SVM_INIT_ABORT
On Wed, May 20, 2020 at 07:43:08PM +0200, Laurent Dufour wrote: > The commit 8c47b6ff29e3 ("KVM: PPC: Book3S HV: Check caller of H_SVM_* > Hcalls") added checks of secure bit of SRR1 to filter out the Hcall > reserved to the Ultravisor. > > However, the Hcall H_SVM_INIT_ABORT is made by the Ultravisor passing the > context of the VM calling UV_ESM. This allows the Hypervisor to return to > the guest without going through the Ultravisor. Thus the Secure bit of SRR1 > is not set in that particular case. > > In the case a regular VM is calling H_SVM_INIT_ABORT, this hcall will be > filtered out in kvmppc_h_svm_init_abort() because kvm->arch.secure_guest is > not set in that case. > > Fixes: 8c47b6ff29e3 ("KVM: PPC: Book3S HV: Check caller of H_SVM_* Hcalls") > Signed-off-by: Laurent Dufour Thanks, applied to my kvm-ppc-next branch. I expanded the comment in the code a little. Paul.
Re: [PATCH] powerpc/kvm/radix: ignore kmemleak false positives
On Wed, May 13, 2020 at 09:39:15AM -0400, Qian Cai wrote: > kvmppc_pmd_alloc() and kvmppc_pte_alloc() allocate some memory but then > pud_populate() and pmd_populate() will use __pa() to reference the newly > allocated memory. > > Since kmemleak is unable to track the physical memory resulting in false > positives, silence those by using kmemleak_ignore(). > > unreferenced object 0xc000201c382a1000 (size 4096): > comm "qemu-kvm", pid 124828, jiffies 4295733767 (age 341.250s) > hex dump (first 32 bytes): >c0 00 20 09 f4 60 03 87 c0 00 20 10 72 a0 03 87 .. ..` .r... >c0 00 20 0e 13 a0 03 87 c0 00 20 1b dc c0 03 87 .. ... . > backtrace: >[<4cc2790f>] kvmppc_create_pte+0x838/0xd20 [kvm_hv] >kvmppc_pmd_alloc at arch/powerpc/kvm/book3s_64_mmu_radix.c:366 >(inlined by) kvmppc_create_pte at > arch/powerpc/kvm/book3s_64_mmu_radix.c:590 >[] kvmppc_book3s_instantiate_page+0x2e0/0x8c0 [kvm_hv] >[ ] kvmppc_book3s_radix_page_fault+0x1b4/0x2b0 [kvm_hv] >[<86dddc0e>] kvmppc_book3s_hv_page_fault+0x214/0x12a0 [kvm_hv] >[<5ae9ccc2>] kvmppc_vcpu_run_hv+0xc5c/0x15f0 [kvm_hv] >[ ] kvmppc_vcpu_run+0x34/0x48 [kvm] >[ ] kvm_arch_vcpu_ioctl_run+0x314/0x420 [kvm] >[<2543dd54>] kvm_vcpu_ioctl+0x33c/0x950 [kvm] >[<48155cd6>] ksys_ioctl+0xd8/0x130 >[<41ffeaa7>] sys_ioctl+0x28/0x40 >[<4afc4310>] system_call_exception+0x114/0x1e0 >[ ] system_call_common+0xf0/0x278 > unreferenced object 0xc0002001f0c03900 (size 256): > comm "qemu-kvm", pid 124830, jiffies 4295735235 (age 326.570s) > hex dump (first 32 bytes): >c0 00 20 10 fa a0 03 87 c0 00 20 10 fa a1 03 87 .. ... . >c0 00 20 10 fa a2 03 87 c0 00 20 10 fa a3 03 87 .. ... . 
> backtrace: >[<23f675b8>] kvmppc_create_pte+0x854/0xd20 [kvm_hv] >kvmppc_pte_alloc at arch/powerpc/kvm/book3s_64_mmu_radix.c:356 >(inlined by) kvmppc_create_pte at > arch/powerpc/kvm/book3s_64_mmu_radix.c:593 >[ ] kvmppc_book3s_instantiate_page+0x2e0/0x8c0 [kvm_hv] >[ ] kvmppc_book3s_radix_page_fault+0x1b4/0x2b0 [kvm_hv] >[<86dddc0e>] kvmppc_book3s_hv_page_fault+0x214/0x12a0 [kvm_hv] >[<5ae9ccc2>] kvmppc_vcpu_run_hv+0xc5c/0x15f0 [kvm_hv] >[ ] kvmppc_vcpu_run+0x34/0x48 [kvm] >[ ] kvm_arch_vcpu_ioctl_run+0x314/0x420 [kvm] >[<2543dd54>] kvm_vcpu_ioctl+0x33c/0x950 [kvm] >[<48155cd6>] ksys_ioctl+0xd8/0x130 >[<41ffeaa7>] sys_ioctl+0x28/0x40 >[<4afc4310>] system_call_exception+0x114/0x1e0 >[ ] system_call_common+0xf0/0x278 > > Signed-off-by: Qian Cai Thanks, applied to my kvm-ppc-next branch. Paul.
Re: [PATCH] powerpc/kvm/book3s64/vio: fix some RCU-list locks
On Sun, May 10, 2020 at 01:18:34AM -0400, Qian Cai wrote: > It is unsafe to traverse kvm->arch.spapr_tce_tables and > stt->iommu_tables without the RCU read lock held. Also, add > cond_resched_rcu() in places with the RCU read lock held that could take > a while to finish. > > arch/powerpc/kvm/book3s_64_vio.c:76 RCU-list traversed in non-reader > section!! > > other info that might help us debug this: > > rcu_scheduler_active = 2, debug_locks = 1 > no locks held by qemu-kvm/4265. > > stack backtrace: > CPU: 96 PID: 4265 Comm: qemu-kvm Not tainted 5.7.0-rc4-next-20200508+ #2 > Call Trace: > [c000201a8690f720] [c0715948] dump_stack+0xfc/0x174 (unreliable) > [c000201a8690f770] [c01d9470] lockdep_rcu_suspicious+0x140/0x164 > [c000201a8690f7f0] [c00810b9fb48] > kvm_spapr_tce_release_iommu_group+0x1f0/0x220 [kvm] > [c000201a8690f870] [c00810b8462c] > kvm_spapr_tce_release_vfio_group+0x54/0xb0 [kvm] > [c000201a8690f8a0] [c00810b84710] kvm_vfio_destroy+0x88/0x140 [kvm] > [c000201a8690f8f0] [c00810b7d488] kvm_put_kvm+0x370/0x600 [kvm] > [c000201a8690f990] [c00810b7e3c0] kvm_vm_release+0x38/0x60 [kvm] > [c000201a8690f9c0] [c05223f4] __fput+0x124/0x330 > [c000201a8690fa20] [c0151cd8] task_work_run+0xb8/0x130 > [c000201a8690fa70] [c01197e8] do_exit+0x4e8/0xfa0 > [c000201a8690fb70] [c011a374] do_group_exit+0x64/0xd0 > [c000201a8690fbb0] [c0132c90] get_signal+0x1f0/0x1200 > [c000201a8690fcc0] [c0020690] do_notify_resume+0x130/0x3c0 > [c000201a8690fda0] [c0038d64] syscall_exit_prepare+0x1a4/0x280 > [c000201a8690fe20] [c000c8f8] system_call_common+0xf8/0x278 > > > arch/powerpc/kvm/book3s_64_vio.c:368 RCU-list traversed in non-reader > section!! 
> > other info that might help us debug this: > > rcu_scheduler_active = 2, debug_locks = 1 > 2 locks held by qemu-kvm/4264: > #0: c000201ae2d000d8 (&vcpu->mutex){+.+.}-{3:3}, at: > kvm_vcpu_ioctl+0xdc/0x950 [kvm] > #1: c000200c9ed0c468 (&kvm->srcu){}-{0:0}, at: > kvmppc_h_put_tce+0x88/0x340 [kvm] > > > arch/powerpc/kvm/book3s_64_vio.c:108 RCU-list traversed in non-reader > section!! > > other info that might help us debug this: > > rcu_scheduler_active = 2, debug_locks = 1 > 1 lock held by qemu-kvm/4257: > #0: c000200b1b363a40 (&kv->lock){+.+.}-{3:3}, at: > kvm_vfio_set_attr+0x598/0x6c0 [kvm] > > > arch/powerpc/kvm/book3s_64_vio.c:146 RCU-list traversed in non-reader > section!! > > other info that might help us debug this: > > rcu_scheduler_active = 2, debug_locks = 1 > 1 lock held by qemu-kvm/4257: > #0: c000200b1b363a40 (&kv->lock){+.+.}-{3:3}, at: > kvm_vfio_set_attr+0x598/0x6c0 [kvm] > > Signed-off-by: Qian Cai Thanks, applied to my kvm-ppc-next branch, with the cond_resched_rcu() in kvmppc_tce_validate removed. Paul.
[PATCH v8 2/5] seq_buf: Export seq_buf_printf
'seq_buf' provides a very useful abstraction for writing to a string buffer without needing to worry about it over-flowing. However, even though the API has been stable for a couple of years now, it's still not exported to kernel loadable modules, limiting its usage. Hence this patch proposes an update to 'seq_buf.c' to mark seq_buf_printf(), which is part of the seq_buf API, to be exported to kernel loadable GPL modules. This symbol will be used in later parts of this patch-set to simplify content creation for a sysfs attribute. Cc: Piotr Maziarz Cc: Cezary Rojewski Cc: Christoph Hellwig Cc: Steven Rostedt Cc: Borislav Petkov Signed-off-by: Vaibhav Jain --- Changelog: v7..v8: * Updated the patch title [ Christoph Hellwig ] * Updated patch description to replace confusing term 'external kernel modules' with 'kernel loadable modules'. Resend: * Added ack from Steven Rostedt v6..v7: * New patch in the series --- lib/seq_buf.c | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/seq_buf.c b/lib/seq_buf.c index 4e865d42ab03..707453f5d58e 100644 --- a/lib/seq_buf.c +++ b/lib/seq_buf.c @@ -91,6 +91,7 @@ int seq_buf_printf(struct seq_buf *s, const char *fmt, ...) return ret; } +EXPORT_SYMBOL_GPL(seq_buf_printf); #ifdef CONFIG_BINARY_PRINTF /** -- 2.26.2
[PATCH v8 5/5] powerpc/papr_scm: Implement support for PAPR_SCM_PDSM_HEALTH
This patch implements support for PDSM request 'PAPR_SCM_PDSM_HEALTH' that returns a newly introduced 'struct nd_papr_pdsm_health' instance containing dimm health information back to user space in response to ND_CMD_CALL. This functionality is implemented in the newly introduced papr_scm_get_health() that queries the scm-dimm health information and then copies this information to the package payload whose layout is defined by 'struct nd_papr_pdsm_health'. The patch also introduces a new member 'struct papr_scm_priv.health' that is an instance of 'struct nd_papr_pdsm_health' to cache the health information of an nvdimm. As a result, functions drc_pmem_query_health() and flags_show() are updated to populate and use this new struct instead of the u64 integer that was used earlier. Cc: "Aneesh Kumar K . V" Cc: Dan Williams Cc: Michael Ellerman Cc: Ira Weiny Signed-off-by: Vaibhav Jain --- Changelog: v7..v8: * None Resend: * None v6..v7: * Updated flags_show() to use seq_buf_printf(). [Mpe] * Updated papr_scm_get_health() to use the newly introduced __drc_pmem_query_health() bypassing the cache [Mpe]. v5..v6: * Added attribute '__packed' to 'struct nd_papr_pdsm_health_v1' to guard against the possibility of different compilers adding different paddings to the struct [ Dan Williams ] * Updated 'struct nd_papr_pdsm_health_v1' to use __u8 instead of 'bool' and also updated drc_pmem_query_health() to take this into account. [ Dan Williams ] v4..v5: * None v3..v4: * Call the DSM_PAPR_SCM_HEALTH service function from papr_scm_service_dsm() instead of papr_scm_ndctl(). 
[Aneesh] v2..v3: * Updated struct nd_papr_scm_dimm_health_stat_v1 to use '__xx' types as its exported to the userspace [Aneesh] * Changed the constants DSM_PAPR_SCM_DIMM_XX indicating dimm health from enum to #defines [Aneesh] v1..v2: * New patch in the series --- arch/powerpc/include/uapi/asm/papr_scm_pdsm.h | 39 ++ arch/powerpc/platforms/pseries/papr_scm.c | 125 +++--- 2 files changed, 147 insertions(+), 17 deletions(-) diff --git a/arch/powerpc/include/uapi/asm/papr_scm_pdsm.h b/arch/powerpc/include/uapi/asm/papr_scm_pdsm.h index c4bae3208e73..f81d714279f0 100644 --- a/arch/powerpc/include/uapi/asm/papr_scm_pdsm.h +++ b/arch/powerpc/include/uapi/asm/papr_scm_pdsm.h @@ -115,6 +115,7 @@ struct nd_pdsm_cmd_pkg { */ enum papr_scm_pdsm { PAPR_SCM_PDSM_MIN = 0x0, + PAPR_SCM_PDSM_HEALTH, PAPR_SCM_PDSM_MAX, }; @@ -133,4 +134,42 @@ static inline void *pdsm_cmd_to_payload(struct nd_pdsm_cmd_pkg *pcmd) return (void *)(pcmd->payload); } +/* Various scm-dimm health indicators */ +#define PAPR_PDSM_DIMM_HEALTHY 0 +#define PAPR_PDSM_DIMM_UNHEALTHY 1 +#define PAPR_PDSM_DIMM_CRITICAL 2 +#define PAPR_PDSM_DIMM_FATAL 3 + +/* + * Struct exchanged between kernel & ndctl in for PAPR_SCM_PDSM_HEALTH + * Various flags indicate the health status of the dimm. + * + * dimm_unarmed: Dimm not armed. So contents wont persist. + * dimm_bad_shutdown : Previous shutdown did not persist contents. + * dimm_bad_restore: Contents from previous shutdown werent restored. + * dimm_scrubbed : Contents of the dimm have been scrubbed. + * dimm_locked : Contents of the dimm cant be modified until CEC reboot + * dimm_encrypted : Contents of dimm are encrypted. + * dimm_health : Dimm health indicator. 
One of PAPR_PDSM_DIMM_ + */ +struct nd_papr_pdsm_health_v1 { + __u8 dimm_unarmed; + __u8 dimm_bad_shutdown; + __u8 dimm_bad_restore; + __u8 dimm_scrubbed; + __u8 dimm_locked; + __u8 dimm_encrypted; + __u16 dimm_health; +} __packed; + +/* + * Typedef the current struct for dimm_health so that any application + * or kernel recompiled after introducing a new version automatically + * supports the new version. + */ +#define nd_papr_pdsm_health nd_papr_pdsm_health_v1 + +/* Current version number for the dimm health struct */ +#define ND_PAPR_PDSM_HEALTH_VERSION 1 + #endif /* _UAPI_ASM_POWERPC_PAPR_SCM_PDSM_H_ */ diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c index fcb8afee97dc..adf1fb819c56 100644 --- a/arch/powerpc/platforms/pseries/papr_scm.c +++ b/arch/powerpc/platforms/pseries/papr_scm.c @@ -88,7 +88,7 @@ struct papr_scm_priv { unsigned long lasthealth_jiffies; /* Health information for the dimm */ - u64 health_bitmap; + struct nd_papr_pdsm_health health; }; static int drc_pmem_bind(struct papr_scm_priv *p) @@ -201,6 +201,7 @@ static int drc_pmem_query_n_bind(struct papr_scm_priv *p) static int __drc_pmem_query_health(struct papr_scm_priv *p) { unsigned long ret[PLPAR_HCALL_BUFSIZE]; + u64 health; long rc; /* issue the hcall */ @@ -208,18 +209,46 @@ static int __drc_pmem_query_health(struct papr_scm_priv *p) if (rc != H_SUCCESS) { dev_err(&p->pdev->dev,
[PATCH v8 4/5] ndctl/papr_scm, uapi: Add support for PAPR nvdimm specific methods
Introduce support for PAPR NVDIMM Specific Methods (PDSM) in papr_scm module and add the command family to the white list of NVDIMM command sets. Also advertise support for ND_CMD_CALL for the nvdimm command mask and implement necessary scaffolding in the module to handle ND_CMD_CALL ioctl and PDSM requests that we receive. The layout of the PDSM request as we expect from libnvdimm/libndctl is described in newly introduced uapi header 'papr_scm_pdsm.h' which defines a new 'struct nd_pdsm_cmd_pkg' header. This header is used to communicate the PDSM request via member 'nd_pkg_papr_scm->nd_command' and size of payload that need to be sent/received for servicing the PDSM. A new function is_cmd_valid() is implemented that reads the args to papr_scm_ndctl() and performs sanity tests on them. A new function papr_scm_service_pdsm() is introduced and is called from papr_scm_ndctl() in case of a PDSM request is received via ND_CMD_CALL command from libnvdimm. Cc: "Aneesh Kumar K . V" Cc: Dan Williams Cc: Michael Ellerman Cc: Ira Weiny Signed-off-by: Vaibhav Jain --- Changelog: v7..v8: * Removed the 'payload_offset' field from 'struct nd_pdsm_cmd_pkg'. Instead command payload is always assumed to start at 'nd_pdsm_cmd_pkg.payload'. [ Aneesh ] * To enable introducing new fields to 'struct nd_pdsm_cmd_pkg', 'reserved' field of 10-bytes is introduced. [ Aneesh ] * Fixed a typo in "Backward Compatibility" section of papr_scm_pdsm.h [ Ira ] Resend: * None v6..v7 : * Removed the re-definitions of __packed macro from papr_scm_pdsm.h [Mpe]. * Removed the usage of __KERNEL__ macros in papr_scm_pdsm.h [Mpe]. * Removed macros that were unused in papr_scm.c from papr_scm_pdsm.h [Mpe]. * Made functions defined in papr_scm_pdsm.h as static inline. [Mpe] v5..v6 : * Changed the usage of the term DSM to PDSM to distinguish it from the ACPI term [ Dan Williams ] * Renamed papr_scm_dsm.h to papr_scm_pdsm.h and updated various struct to reflect the new terminology. 
* Updated the patch description and title to reflect the new terminology. * Squashed patch to introduce new command family in 'ndctl.h' with this patch [ Dan Williams ] * Updated the papr_scm_pdsm method starting index from 0x1 to 0x0 [ Dan Williams ] * Removed redundant license text from the papr_scm_psdm.h file. [ Dan Williams ] * s/envelop/envelope/ at various places [ Dan Williams ] * Added '__packed' attribute to command package header to gaurd against different compiler adding paddings between the fields. [ Dan Williams] * Converted various pr_debug to dev_debug [ Dan Williams ] v4..v5 : * None v3..v4 : * None v2..v3 : * Updated the patch prefix to 'ndctl/uapi' [Aneesh] v1..v2 : * None --- arch/powerpc/include/uapi/asm/papr_scm_pdsm.h | 136 ++ arch/powerpc/platforms/pseries/papr_scm.c | 101 - include/uapi/linux/ndctl.h| 1 + 3 files changed, 232 insertions(+), 6 deletions(-) create mode 100644 arch/powerpc/include/uapi/asm/papr_scm_pdsm.h diff --git a/arch/powerpc/include/uapi/asm/papr_scm_pdsm.h b/arch/powerpc/include/uapi/asm/papr_scm_pdsm.h new file mode 100644 index ..c4bae3208e73 --- /dev/null +++ b/arch/powerpc/include/uapi/asm/papr_scm_pdsm.h @@ -0,0 +1,136 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +/* + * PAPR-SCM Dimm specific methods (PDSM) and structs for libndctl + * + * (C) Copyright IBM 2020 + * + * Author: Vaibhav Jain + */ + +#ifndef _UAPI_ASM_POWERPC_PAPR_SCM_PDSM_H_ +#define _UAPI_ASM_POWERPC_PAPR_SCM_PDSM_H_ + +#include + +/* + * PDSM Envelope: + * + * The ioctl ND_CMD_CALL transfers data between user-space and kernel via + * envelope which consists of a header and user-defined payload sections. + * The header is described by 'struct nd_pdsm_cmd_pkg' which expects a + * payload following it and accessible via 'nd_pdsm_cmd_pkg.payload' field. + * There is reserved field that can used to introduce new fields to the + * structure in future. 
It also tries to ensure that 'nd_pdsm_cmd_pkg.payload' + * lies at a 8-byte boundary. + * + * +-+-+---+ + * | 64-Bytes | 16-Bytes | Max 176-Bytes | + * +-+-+---+ + * | nd_pdsm_cmd_pkg | | + * |-+ | | + * | nd_cmd_pkg | | | + * +-+-+---+ + * | nd_family | | | + * | nd_size_out | cmd_status | | + * | nd_size_in | payload_version | payload | + * | nd_command | reserved| | + * | nd_fw_size | | | + * +-+-+-
[PATCH v8 3/5] powerpc/papr_scm: Fetch nvdimm health information from PHYP
Implement support for fetching nvdimm health information via H_SCM_HEALTH hcall as documented in Ref[1]. The hcall returns a pair of 64-bit bitmaps, the bitwise-and of which is then stored in 'struct papr_scm_priv' and subsequently partially exposed to user-space via the newly introduced dimm specific attribute 'papr/flags'. Since the hcall is costly, the health information is cached and only re-queried 60s after the previous successful hcall. The patch also adds documentation text describing the flags reported by the new sysfs attribute 'papr/flags' at Documentation/ABI/testing/sysfs-bus-papr-scm. [1] commit 58b278f568f0 ("powerpc: Provide initial documentation for PAPR hcalls") Cc: "Aneesh Kumar K . V" Cc: Dan Williams Cc: Michael Ellerman Cc: Ira Weiny Signed-off-by: Vaibhav Jain --- Changelog: v7..v8: * Update type of variable 'rc' in __drc_pmem_query_health() and drc_pmem_query_health() to long and int respectively. [ Ira ] * Updated the patch description to s/64 bit Big Endian Number/64-bit bitmap/ [ Ira, Aneesh ]. Resend: * None v6..v7 : * Used the exported seq_buf_printf() function to generate content for 'papr/flags' * Moved the PAPR_SCM_DIMM_* bit-flags macro definitions to papr_scm.c and removed the papr_scm.h file [Mpe] * Fixed some minor consistency issues in sysfs-bus-papr-scm documentation. [Mpe] * s/dimm_mutex/health_mutex/g [Mpe] * Split drc_pmem_query_health() into two functions, one of which takes care of caching and locking. [Mpe] * Fixed a local copy creation of dimm health information using READ_ONCE(). 
[Mpe] v5..v6 : * Change the flags sysfs attribute from 'papr_flags' to 'papr/flags' [Dan Williams] * Include documentation for 'papr/flags' attr [Dan Williams] * Change flag 'save_fail' to 'flush_fail' [Dan Williams] * Caching of health bitmap to reduce expensive hcalls [Dan Williams] * Removed usage of PPC_BIT from 'papr-scm.h' header [Mpe] * Replaced two __be64 integers from papr_scm_priv to a single u64 integer [Mpe] * Updated patch description to reflect the changes made in this version. * Removed avoidable usage of 'papr_scm_priv.dimm_mutex' from flags_show() [Dan Williams] v4..v5 : * None v3..v4 : * None v2..v3 : * Removed PAPR_SCM_DIMM_HEALTH_NON_CRITICAL as a condition for NVDIMM unarmed [Aneesh] v1..v2 : * New patch in the series. --- Documentation/ABI/testing/sysfs-bus-papr-scm | 27 +++ arch/powerpc/platforms/pseries/papr_scm.c| 169 ++- 2 files changed, 194 insertions(+), 2 deletions(-) create mode 100644 Documentation/ABI/testing/sysfs-bus-papr-scm diff --git a/Documentation/ABI/testing/sysfs-bus-papr-scm b/Documentation/ABI/testing/sysfs-bus-papr-scm new file mode 100644 index ..6143d06072f1 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-bus-papr-scm @@ -0,0 +1,27 @@ +What: /sys/bus/nd/devices/nmemX/papr/flags +Date: Apr, 2020 +KernelVersion: v5.8 +Contact: linuxppc-dev , linux-nvd...@lists.01.org, +Description: + (RO) Report flags indicating various states of a + papr-scm NVDIMM device. Each flag maps to a one or + more bits set in the dimm-health-bitmap retrieved in + response to H_SCM_HEALTH hcall. The details of the bit + flags returned in response to this hcall is available + at 'Documentation/powerpc/papr_hcalls.rst' . Below are + the flags reported in this sysfs file: + + * "not_armed" : Indicates that NVDIMM contents will not + survive a power cycle. + * "flush_fail" : Indicates that NVDIMM contents + couldn't be flushed during last + shut-down event. 
+ * "restore_fail": Indicates that NVDIMM contents + couldn't be restored during NVDIMM + initialization. + * "encrypted" : NVDIMM contents are encrypted. + * "smart_notify": There is health event for the NVDIMM. + * "scrubbed": Indicating that contents of the + NVDIMM have been scrubbed. + * "locked" : Indicating that NVDIMM contents cant + be modified until next power cycle. diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c index f35592423380..010cd9aae488 100644 --- a/arch/powerpc/platforms/pseries/papr_scm.c +++ b/arch/powerpc/platforms/pseries/papr_scm.c @@ -12,6 +12,7 @@ #include #include #include +#include #include @@ -22,6 +23,44 @@ (1ul << ND_CMD_GET_CONFIG_DATA) | \ (1ul << ND_CMD_SET_CONFIG_DATA)) +/* DIMM health bitmap bitmap indicators */ +/* SCM device is unable to persist memory contents */ +#define PAPR_SCM_DIMM_UNARMED (1ULL << (63 - 0)) +/* SCM device faile
[PATCH v8 1/5] powerpc: Document details on H_SCM_HEALTH hcall
Add documentation to 'papr_hcalls.rst' describing the bitmap flags that are returned from H_SCM_HEALTH hcall as per the PAPR-SCM specification. Cc: "Aneesh Kumar K . V" Cc: Dan Williams Cc: Michael Ellerman Cc: Ira Weiny Signed-off-by: Vaibhav Jain --- Changelog: v7..v8: * Added a clarification on bit-ordering of Health Bitmap Resend: * None v6..v7: * None v5..v6: * New patch in the series --- Documentation/powerpc/papr_hcalls.rst | 45 --- 1 file changed, 41 insertions(+), 4 deletions(-) diff --git a/Documentation/powerpc/papr_hcalls.rst b/Documentation/powerpc/papr_hcalls.rst index 3493631a60f8..45063f305813 100644 --- a/Documentation/powerpc/papr_hcalls.rst +++ b/Documentation/powerpc/papr_hcalls.rst @@ -220,13 +220,50 @@ from the LPAR memory. **H_SCM_HEALTH** | Input: drcIndex -| Out: *health-bitmap, health-bit-valid-bitmap* +| Out: *health-bitmap (r4), health-bit-valid-bitmap (r5)* | Return Value: *H_Success, H_Parameter, H_Hardware* Given a DRC Index return the info on predictive failure and overall health of -the NVDIMM. The asserted bits in the health-bitmap indicate a single predictive -failure and health-bit-valid-bitmap indicate which bits in health-bitmap are -valid. +the NVDIMM. The asserted bits in the health-bitmap indicate one or more states +(described in table below) of the NVDIMM and health-bit-valid-bitmap indicate +which bits in health-bitmap are valid. The bits are reported in +reverse bit ordering for example a value of 0xC400 +indicates bits 0, 1, and 5 are valid. + +Health Bitmap Flags: + ++--+---+ +| Bit | Definition | ++==+===+ +| 00 | SCM device is unable to persist memory contents. | +| | If the system is powered down, nothing will be saved. | ++--+---+ +| 01 | SCM device failed to persist memory contents. Either contents were not| +| | saved successfully on power down or were not restored properly on | +| | power up. | ++--+---+ +| 02 | SCM device contents are persisted from previous IPL. 
The data from | +| | the last boot were successfully restored. | ++--+---+ +| 03 | SCM device contents are not persisted from previous IPL. There was no | +| | data to restore from the last boot. | ++--+---+ +| 04 | SCM device memory life remaining is critically low | ++--+---+ +| 05 | SCM device will be garded off next IPL due to failure | ++--+---+ +| 06 | SCM contents cannot persist due to current platform health status. A | +| | hardware failure may prevent data from being saved or restored. | ++--+---+ +| 07 | SCM device is unable to persist memory contents in certain conditions | ++--+---+ +| 08 | SCM device is encrypted | ++--+---+ +| 09 | SCM device has successfully completed a requested erase or secure | +| | erase procedure. | ++--+---+ +|10:63 | Reserved / Unused | ++--+---+ **H_SCM_PERFORMANCE_STATS** -- 2.26.2
[PATCH v8 0/5] powerpc/papr_scm: Add support for reporting nvdimm health
Changes since v7 [1]: * Addressed various review comments from Aneesh, Ira and Mpe. * Removed the 'payload_offset' field from 'struct nd_pdsm_cmd_pkg' and replaced it with some reserved fields [ Aneesh ]. * Updated the doc and description for patch that fetches dimm health information from PHYP clarifying bit-ordering [ Mpe and Ira ]. * Updated the patch title & description for patch exporting 'seq_buf_printf'. [ Christoph Hellwig ] * Fix types of various newly introduced vars in papr_scm.c [ Ira ]. * Fixed a typo in 'papr_scm_pdsm.h' [ Ira ] [1] https://lore.kernel.org/linux-nvdimm/20200519190058.257981-1-vaib...@linux.ibm.com --- The PAPR standard[2][4] provides mechanisms to query the health and performance stats of an NVDIMM via various hcalls as described in Ref[3]. Until now these stats were never available nor exposed to the user-space tools like 'ndctl'. This is partly due to PAPR platform not having support for ACPI and NFIT. Hence 'ndctl' is unable to query and report the dimm health status and a user had no way to determine the current health status of an NVDIMM. To overcome this limitation, this patch-set updates papr_scm kernel module to query and fetch NVDIMM health stats using hcalls described in Ref[3]. These health and performance stats are then exposed to userspace via sysfs and PAPR-NVDIMM-Specific-Methods(PDSM) issued by libndctl. These changes coupled with proposed ndctl changes located at Ref[5] should provide a way for the user to retrieve NVDIMM health status using ndctl. 
Below is a sample output using proposed kernel + ndctl for PAPR NVDIMM in an emulation environment: # ndctl list -DH [ { "dev":"nmem0", "health":{ "health_state":"fatal", "shutdown_state":"dirty" } } ] Dimm health report output on a pseries guest lpar with vPMEM or HMS based NVDIMMs that are in perfectly healthy conditions: # ndctl list -d nmem0 -H [ { "dev":"nmem0", "health":{ "health_state":"ok", "shutdown_state":"clean" } } ] PAPR NVDIMM-Specific-Methods(PDSM) == PDSM requests are issued by vendor specific code in libndctl to execute certain operations or fetch information from NVDIMMS. PDSM requests can be sent to papr_scm module via libndctl(userspace) and libnvdimm (kernel) using the ND_CMD_CALL ioctl command which can be handled in the dimm control function papr_scm_ndctl(). Current patchset proposes a single PDSM to retrieve NVDIMM health, defined in the newly introduced uapi header named 'papr_scm_pdsm.h'. Support for more PDSMs will be added in future. Structure of the patch-set == The patch-set starts with a doc patch documenting details of hcall H_SCM_HEALTH. Second patch exports kernel symbol seq_buf_printf() that's used in subsequent patches to generate sysfs attribute content. Third patch implements support for fetching NVDIMM health information from PHYP and partially exposing it to user-space via a NVDIMM sysfs flag. Fourth patch deals with implementing support for servicing PDSM commands in papr_scm module. Finally the last patch implements support for servicing PDSM 'PAPR_SCM_PDSM_HEALTH' that returns the NVDIMM health information to libndctl. 
References: [2] "Power Architecture Platform Reference" https://en.wikipedia.org/wiki/Power_Architecture_Platform_Reference [3] commit 58b278f568f0 ("powerpc: Provide initial documentation for PAPR hcalls") [4] "Linux on Power Architecture Platform Reference" https://members.openpowerfoundation.org/document/dl/469 [5] https://github.com/vaibhav92/ndctl/tree/papr_scm_health_v8 --- Vaibhav Jain (5): powerpc: Document details on H_SCM_HEALTH hcall seq_buf: Export seq_buf_printf powerpc/papr_scm: Fetch nvdimm health information from PHYP ndctl/papr_scm,uapi: Add support for PAPR nvdimm specific methods powerpc/papr_scm: Implement support for PAPR_SCM_PDSM_HEALTH Documentation/ABI/testing/sysfs-bus-papr-scm | 27 ++ Documentation/powerpc/papr_hcalls.rst | 45 ++- arch/powerpc/include/uapi/asm/papr_scm_pdsm.h | 175 + arch/powerpc/platforms/pseries/papr_scm.c | 363 +- include/uapi/linux/ndctl.h| 1 + lib/seq_buf.c | 1 + 6 files changed, 599 insertions(+), 13 deletions(-) create mode 100644 Documentation/ABI/testing/sysfs-bus-papr-scm create mode 100644 arch/powerpc/include/uapi/asm/papr_scm_pdsm.h -- 2.26.2
Re: [PATCH v3] powerpc/XIVE: SVM: share the event-queue page with the Hypervisor.
Ram Pai writes: > XIVE interrupt controller uses an Event Queue (EQ) to enqueue event > notifications when an exception occurs. The EQ is a single memory page > provided by the O/S defining a circular buffer, one per server and > priority couple. > > On baremetal, the EQ page is configured with an OPAL call. On pseries, > an extra hop is necessary and the guest OS uses the hcall > H_INT_SET_QUEUE_CONFIG to configure the XIVE interrupt controller. > > The XIVE controller being Hypervisor privileged, it will not be allowed > to enqueue event notifications for a Secure VM unless the EQ pages are > shared by the Secure VM. > > Hypervisor/Ultravisor still requires support for the TIMA and ESB page > fault handlers. Until this is complete, QEMU can use the emulated XIVE > device for Secure VMs, option "kernel_irqchip=off" on the QEMU pseries > machine. > > Cc: kvm-...@vger.kernel.org > Cc: linuxppc-dev@lists.ozlabs.org > Cc: Michael Ellerman > Cc: Thiago Jung Bauermann > Cc: Michael Anderson > Cc: Sukadev Bhattiprolu > Cc: Alexey Kardashevskiy > Cc: Paul Mackerras > Cc: David Gibson > Reviewed-by: Cedric Le Goater > Reviewed-by: Greg Kurz > Signed-off-by: Ram Pai > > v3: fix a minor semantics in description. > and added reviewed-by from Cedric and Greg. > v2: better description of the patch from Cedric. > --- Please put the change history after the '---' break in future please, I had to fix this up manually. cheers
[v3 2/2] dts: ppc: t1024rdb: remove interrupts property
From: Biwen Li Since the interrupt pin for RTC DS1339 is not connected to the CPU on T1024RDB, remove the interrupt property from the device tree. This also fixes the following warning for hwclock.util-linux: $ hwclock.util-linux hwclock.util-linux: select() to /dev/rtc0 to wait for clock tick timed out Signed-off-by: Biwen Li --- arch/powerpc/boot/dts/fsl/t1024rdb.dts | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/powerpc/boot/dts/fsl/t1024rdb.dts b/arch/powerpc/boot/dts/fsl/t1024rdb.dts index 645caff98ed1..605ceec66af3 100644 --- a/arch/powerpc/boot/dts/fsl/t1024rdb.dts +++ b/arch/powerpc/boot/dts/fsl/t1024rdb.dts @@ -161,7 +161,6 @@ rtc@68 { compatible = "dallas,ds1339"; reg = <0x68>; - interrupts = <0x1 0x1 0 0>; }; }; -- 2.17.1
[v3 1/2] dts: ppc: t4240rdb: remove interrupts property
From: Biwen Li Since the interrupt pin for RTC DS1374 is not connected to the CPU on T4240RDB, remove the interrupt property from the device tree. This also fixes the following warning for hwclock.util-linux: $ hwclock.util-linux hwclock.util-linux: select() to /dev/rtc0 to wait for clock tick timed out Signed-off-by: Biwen Li --- arch/powerpc/boot/dts/fsl/t4240rdb.dts | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/powerpc/boot/dts/fsl/t4240rdb.dts b/arch/powerpc/boot/dts/fsl/t4240rdb.dts index a56a705d41f7..145896f2eef6 100644 --- a/arch/powerpc/boot/dts/fsl/t4240rdb.dts +++ b/arch/powerpc/boot/dts/fsl/t4240rdb.dts @@ -144,7 +144,6 @@ rtc@68 { compatible = "dallas,ds1374"; reg = <0x68>; - interrupts = <0x1 0x1 0 0>; }; }; -- 2.17.1
[PATCH v2] selftests: powerpc: Add test for execute-disabled pkeys
Apart from read and write access, memory protection keys can also be used for restricting execute permission of pages on powerpc. This adds a test to verify if the feature works as expected. Signed-off-by: Sandipan Das --- Previous versions can be found at v1: https://lore.kernel.org/linuxppc-dev/20200508162332.65316-1-sandi...@linux.ibm.com/ Changes in v2: - Added .gitignore entry for test binary. - Fixed builds for older distros where siginfo_t might not have si_pkey as a formal member based on discussion with Michael. --- tools/testing/selftests/powerpc/mm/.gitignore | 1 + tools/testing/selftests/powerpc/mm/Makefile | 3 +- .../selftests/powerpc/mm/pkey_exec_prot.c | 336 ++ 3 files changed, 339 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/powerpc/mm/pkey_exec_prot.c diff --git a/tools/testing/selftests/powerpc/mm/.gitignore b/tools/testing/selftests/powerpc/mm/.gitignore index 2ca523255b1b..8f841f925baa 100644 --- a/tools/testing/selftests/powerpc/mm/.gitignore +++ b/tools/testing/selftests/powerpc/mm/.gitignore @@ -8,3 +8,4 @@ wild_bctr large_vm_fork_separation bad_accesses tlbie_test +pkey_exec_prot diff --git a/tools/testing/selftests/powerpc/mm/Makefile b/tools/testing/selftests/powerpc/mm/Makefile index b9103c4bb414..2816229f648b 100644 --- a/tools/testing/selftests/powerpc/mm/Makefile +++ b/tools/testing/selftests/powerpc/mm/Makefile @@ -3,7 +3,7 @@ noarg: $(MAKE) -C ../ TEST_GEN_PROGS := hugetlb_vs_thp_test subpage_prot prot_sao segv_errors wild_bctr \ - large_vm_fork_separation bad_accesses + large_vm_fork_separation bad_accesses pkey_exec_prot TEST_GEN_PROGS_EXTENDED := tlbie_test TEST_GEN_FILES := tempfile @@ -17,6 +17,7 @@ $(OUTPUT)/prot_sao: ../utils.c $(OUTPUT)/wild_bctr: CFLAGS += -m64 $(OUTPUT)/large_vm_fork_separation: CFLAGS += -m64 $(OUTPUT)/bad_accesses: CFLAGS += -m64 +$(OUTPUT)/pkey_exec_prot: CFLAGS += -m64 $(OUTPUT)/tempfile: dd if=/dev/zero of=$@ bs=64k count=1 diff --git 
a/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c b/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c new file mode 100644 index ..147fb9ed47d5 --- /dev/null +++ b/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c @@ -0,0 +1,336 @@ +// SPDX-License-Identifier: GPL-2.0+ + +/* + * Copyright 2020, Sandipan Das, IBM Corp. + * + * Test if applying execute protection on pages using memory + * protection keys works as expected. + */ + +#define _GNU_SOURCE +#include +#include +#include +#include + +#include +#include +#include + +#include "utils.h" + +/* Override definitions as they might be inconsistent */ +#undef PKEY_DISABLE_ACCESS +#define PKEY_DISABLE_ACCESS0x3 + +#undef PKEY_DISABLE_WRITE +#define PKEY_DISABLE_WRITE 0x2 + +#undef PKEY_DISABLE_EXECUTE +#define PKEY_DISABLE_EXECUTE 0x4 + +/* Older distros might not define this */ +#ifndef SEGV_PKUERR +#define SEGV_PKUERR4 +#endif + +#define SI_PKEY_OFFSET 0x20 + +#define SYS_pkey_mprotect 386 +#define SYS_pkey_alloc 384 +#define SYS_pkey_free 385 + +#define PKEY_BITS_PER_PKEY 2 +#define NR_PKEYS 32 + +#define PKEY_BITS_MASK ((1UL << PKEY_BITS_PER_PKEY) - 1) + +static unsigned long pkeyreg_get(void) +{ + unsigned long uamr; + + asm volatile("mfspr %0, 0xd" : "=r"(uamr)); + return uamr; +} + +static void pkeyreg_set(unsigned long uamr) +{ + asm volatile("isync; mtspr 0xd, %0; isync;" : : "r"(uamr)); +} + +static void pkey_set_rights(int pkey, unsigned long rights) +{ + unsigned long uamr, shift; + + shift = (NR_PKEYS - pkey - 1) * PKEY_BITS_PER_PKEY; + uamr = pkeyreg_get(); + uamr &= ~(PKEY_BITS_MASK << shift); + uamr |= (rights & PKEY_BITS_MASK) << shift; + pkeyreg_set(uamr); +} + +static int sys_pkey_mprotect(void *addr, size_t len, int prot, int pkey) +{ + return syscall(SYS_pkey_mprotect, addr, len, prot, pkey); +} + +static int sys_pkey_alloc(unsigned long flags, unsigned long rights) +{ + return syscall(SYS_pkey_alloc, flags, rights); +} + +static int sys_pkey_free(int pkey) +{ + return 
syscall(SYS_pkey_free, pkey); +} + +static volatile int fpkey, fcode, ftype, faults; +static unsigned long pgsize, numinsns; +static volatile unsigned int *faddr; +static unsigned int *insns; + +static void segv_handler(int signum, siginfo_t *sinfo, void *ctx) +{ + int pkey; + +#ifdef si_pkey + pkey = sinfo->si_pkey; +#else + pkey = *((int *)(((char *) sinfo) + SI_PKEY_OFFSET)); +#endif + + /* Check if this fault originated because of the expected reasons */ + if (sinfo->si_code != SEGV_ACCERR && sinfo->si_code != SEGV_PKUERR) { + printf("got an unexpected fault, code = %d\n", + sinfo->si_code); + goto fail; + } + + /* Check if this fault originated from the expected address */ +
Re: [PATCH -next] scsi: ibmvscsi: Make some functions static
On Wed, 20 May 2020 17:10:36 +0800, Chen Tao wrote: > Fix the following warning: > > drivers/scsi/ibmvscsi/ibmvscsi.c:2387:12: warning: symbol > 'ibmvscsi_module_init' was not declared. Should it be static? > drivers/scsi/ibmvscsi/ibmvscsi.c:2409:13: warning: symbol > 'ibmvscsi_module_exit' was not declared. Should it be static? Applied to 5.8/scsi-queue, thanks! [1/1] scsi: ibmvscsi: Make some functions static https://git.kernel.org/mkp/scsi/c/1f93ad177d24 -- Martin K. Petersen Oracle Linux Engineering
Re: [PATCH] powerpc/kvm/book3s64/vio: fix some RCU-list locks
On Wed, May 27, 2020 at 11:13:23AM +1000, Paul Mackerras wrote: > On Sun, May 10, 2020 at 01:18:34AM -0400, Qian Cai wrote: > > It is unsafe to traverse kvm->arch.spapr_tce_tables and > > stt->iommu_tables without the RCU read lock held. Also, add > > cond_resched_rcu() in places with the RCU read lock held that could take > > a while to finish. > > This mostly looks fine. The cond_resched_rcu() in kvmppc_tce_validate > doesn't seem necessary (the list would rarely have more than a few > dozen entries) and could be a performance problem given that TCE > validation is a hot-path. > > Are you OK with me modifying the patch to take out that > cond_resched_rcu(), or is there some reason why it's essential that it > be there? Feel free to take out that cond_resched_rcu(). Your reasoning makes sense.
Re: [PATCH] powerpc/kvm/book3s64/vio: fix some RCU-list locks
On Sun, May 10, 2020 at 01:18:34AM -0400, Qian Cai wrote: > It is unsafe to traverse kvm->arch.spapr_tce_tables and > stt->iommu_tables without the RCU read lock held. Also, add > cond_resched_rcu() in places with the RCU read lock held that could take > a while to finish. This mostly looks fine. The cond_resched_rcu() in kvmppc_tce_validate doesn't seem necessary (the list would rarely have more than a few dozen entries) and could be a performance problem given that TCE validation is a hot-path. Are you OK with me modifying the patch to take out that cond_resched_rcu(), or is there some reason why it's essential that it be there? Paul.
Re: [PATCH 2/3] powerpc/pci: unmap legacy INTx interrupts of passthrough IO adapters
On Wed, Apr 29, 2020 at 5:51 PM Cédric Le Goater wrote: > > When a passthrough IO adapter is removed from a pseries machine using > hash MMU and the XIVE interrupt mode, the POWER hypervisor, pHyp, > expects the guest OS to have cleared all page table entries related to > the adapter. If some are still present, the RTAS call which isolates > the PCI slot returns error 9001 "valid outstanding translations" and > the removal of the IO adapter fails. > > INTx interrupt numbers need special care because Linux maps the > interrupts automatically in the Linux interrupt number space if they > are presented in the device tree node describing the IO adapter. These > interrupts are not un-mapped automatically and in case of an hot-plug > adapter, the PCI hot-plug layer needs to handle the cleanup to make > sure that all the page table entries of the XIVE ESB pages are > cleared. > > Cc: "Oliver O'Halloran" > Signed-off-by: Cédric Le Goater > --- > arch/powerpc/kernel/pci-hotplug.c | 2 ++ > 1 file changed, 2 insertions(+) > > diff --git a/arch/powerpc/kernel/pci-hotplug.c > b/arch/powerpc/kernel/pci-hotplug.c > index bf83f76563a3..9e9c6befd7ea 100644 > --- a/arch/powerpc/kernel/pci-hotplug.c > +++ b/arch/powerpc/kernel/pci-hotplug.c > @@ -57,6 +57,8 @@ void pcibios_release_device(struct pci_dev *dev) > struct pci_controller *phb = pci_bus_to_host(dev->bus); > struct pci_dn *pdn = pci_get_pdn(dev); > > + irq_dispose_mapping(dev->irq); What does the original mapping? Powerpc arch code or the PCI core? Tearing down the mapping in pcibios_release_device() seems a bit fishy to me since the PCI core has already torn down the device state at that point. If the release is delayed it's possible that another pci_dev has mapped the IRQ before we get here, but maybe that's ok. > + > eeh_remove_device(dev); > > if (phb->controller_ops.release_device) > -- > 2.25.4 >
Re: [PATCH] selftests: powerpc: Add test for execute-disabled pkeys
Sandipan Das writes: > Hi Michael, > > On 26/05/20 6:05 pm, Michael Ellerman wrote: >> [...] >>> + >>> +/* Override definitions as they might be inconsistent */ >>> +#undef PKEY_DISABLE_ACCESS >>> +#define PKEY_DISABLE_ACCESS0x3 >> >> Why would they be inconsistent? >> > > The definition in sys/mman.h still uses the value specific to > Intel's implementation i.e. 1, when this should have been 3 > for powerpc. I have seen this on Ubuntu 18.04 and 20.04. Hmm OK, that's a bug but oh well nothing we can do about it. >> I think a reasonable solution is to use the absence of SEGV_PKUERR to >> basically turn the whole test into a nop at build time, eg: ... > > Or can I use this from the pkey tests under selftests/vm? > > static inline u32 *siginfo_get_pkey_ptr(siginfo_t *si) > { > #ifdef si_pkey > return &si->si_pkey; > #else > return (u32 *)(((u8 *)si) + si_pkey_offset); > #endif > } > > Where si_pkey_offset is 0x20 for powerpc. Yeah that's fine if it works. Please send a v2 with that change. cheers
Re: [PATCH v3 0/7] Statsfs: a new ram-based file system for Linux kernel statistics
On Tue, 26 May 2020 13:03:10 +0200 Emanuele Giuseppe Esposito wrote: > There is currently no common way for Linux kernel subsystems to expose > statistics to userspace shared throughout the Linux kernel; subsystems have > to take care of gathering and displaying statistics by themselves, for > example in the form of files in debugfs. For example KVM has its own code > section that takes care of this in virt/kvm/kvm_main.c, where it sets up > debugfs handlers for displaying values and aggregating them from various > subfolders to obtain information about the system state (i.e. displaying > the total number of exits, calculated by summing all exits of all cpus of > all running virtual machines). > > Allowing each section of the kernel to do so has two disadvantages. First, > it will introduce redundant code. Second, debugfs is anyway not the right > place for statistics (for example it is affected by lockdown) > > In this patch series I introduce statsfs, a synthetic ram-based virtual > filesystem that takes care of gathering and displaying statistics for the > Linux kernel subsystems. > > The file system is mounted on /sys/kernel/stats and would be already used > by kvm. Statsfs was initially introduced by Paolo Bonzini [1]. What's the direct motivation for this work? Moving KVM stats out of debugfs? In my experience stats belong in the API used for creating/enumerating objects, statsfs sounds like going in the exact opposite direction - creating a parallel structure / hierarchy for exposing stats. I know nothing about KVM but are you sure all the info that has to be exposed will be stats? In case of networking we have the basic stats in sysfs, under the netdevice's kobject. But since we're not using sysfs much any more for config, new stats are added in netlink APIs. Again - same APIs used for enumeration and config.
[PATCH v2] powerpc/wii: Fix declaration made after definition
A 0day randconfig uncovered an error with clang, trimmed for brevity: arch/powerpc/platforms/embedded6xx/wii.c:195:7: error: attribute declaration must precede definition [-Werror,-Wignored-attributes] if (!machine_is(wii)) ^ The macro machine_is declares mach_##name but define_machine actually defines mach_##name, hence the warning. To fix this, move define_machine after the machine_is usage. Fixes: 5a7ee3198dfa ("powerpc: wii: platform support") Reported-by: kbuild test robot Link: https://github.com/ClangBuiltLinux/linux/issues/989 Reviewed-by: Nick Desaulniers Signed-off-by: Nathan Chancellor --- v1 -> v2: * s/is_machine/machine_is/ (Nick) * Add Nick's reviewed-by tag. arch/powerpc/platforms/embedded6xx/wii.c | 25 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/platforms/embedded6xx/wii.c b/arch/powerpc/platforms/embedded6xx/wii.c index 67e48b0a164e..a802ef957d63 100644 --- a/arch/powerpc/platforms/embedded6xx/wii.c +++ b/arch/powerpc/platforms/embedded6xx/wii.c @@ -172,19 +172,6 @@ static void wii_shutdown(void) flipper_quiesce(); } -define_machine(wii) { - .name = "wii", - .probe = wii_probe, - .setup_arch = wii_setup_arch, - .restart= wii_restart, - .halt = wii_halt, - .init_IRQ = wii_pic_probe, - .get_irq= flipper_pic_get_irq, - .calibrate_decr = generic_calibrate_decr, - .progress = udbg_progress, - .machine_shutdown = wii_shutdown, -}; - static const struct of_device_id wii_of_bus[] = { { .compatible = "nintendo,hollywood", }, { }, @@ -200,3 +187,15 @@ static int __init wii_device_probe(void) } device_initcall(wii_device_probe); +define_machine(wii) { + .name = "wii", + .probe = wii_probe, + .setup_arch = wii_setup_arch, + .restart= wii_restart, + .halt = wii_halt, + .init_IRQ = wii_pic_probe, + .get_irq= flipper_pic_get_irq, + .calibrate_decr = generic_calibrate_decr, + .progress = udbg_progress, + .machine_shutdown = wii_shutdown, +}; base-commit: b0523c7b1c9d0edcd6c0fe6d2cb558a9ad5c60a8 -- 2.27.0.rc0
Re: [PATCH] soc: fsl: qe: Replace one-element array and use struct_size() helper
On Sun, May 24, 2020 at 9:49 PM Qiang Zhao wrote: > > On Wed, May 23, 2020 at 5:22 PM Li Yang > > -Original Message- > > From: Li Yang > > Sent: 2020年5月23日 5:22 > > To: Kees Cook > > Cc: Gustavo A. R. Silva ; Qiang Zhao > > ; linuxppc-dev ; > > moderated list:ARM/FREESCALE IMX / MXC ARM ARCHITECTURE > > ; lkml ; > > Gustavo A. R. Silva > > Subject: Re: [PATCH] soc: fsl: qe: Replace one-element array and use > > struct_size() helper > > > > On Wed, May 20, 2020 at 10:24 PM Kees Cook > > wrote: > > > > > > On Wed, May 20, 2020 at 06:52:21PM -0500, Li Yang wrote: > > > > On Mon, May 18, 2020 at 5:57 PM Kees Cook > > wrote: > > > > > Hm, looking at this code, I see a few other things that need to be > > > > > fixed: > > > > > > > > > > 1) drivers/tty/serial/ucc_uart.c does not do a be32_to_cpu() > > > > > conversion > > > > >on the length test (understandably, a little-endian system has > > > > > never > > run > > > > >this code since it's ppc specific), but it's still wrong: > > > > > > > > > > if (firmware->header.length != fw->size) { > > > > > > > > > >compare to the firmware loader: > > > > > > > > > > length = be32_to_cpu(hdr->length); > > > > > > > > > > 2) drivers/soc/fsl/qe/qe.c does not perform bounds checking on the > > > > >per-microcode offsets, so the uploader might send data outside the > > > > >firmware buffer. Perhaps: > > > > > > > > We do validate the CRC for each microcode, it is unlikely the CRC > > > > check can pass if the offset or length is not correct. But you are > > > > probably right that it will be safer to check the boundary and fail > > > > > > Right, but a malicious firmware file could still match CRC but trick > > > the kernel code. > > > > > > > quicker before we actually start the CRC check. Will you come up > > > > with a formal patch or you want us to deal with it? 
> > > > > > It sounds like Gustavo will be sending one, though I don't think > > > either of us have the hardware to test it with, so if you could do > > > that part, that would be great! :) > > > > That will be great. I think Zhao Qiang can help with the testing part. > > > > Now the firmware are loaded in uboot, and kernel will do nothing for it. > So testing on it maybe need some extra codes both in driver and dts. > In the meanwhile, I am so busy on some high priority work that maybe test work > could not be done in time. > Once I am free, I will do it. Thanks. You are right that most of the QE drivers doesn't support requesting firmware in kernel except the ucc_uart. So it probably can be tested with that driver without requiring code change. > > Best Regards > Qiang Zhao
Re: [PATCH] KVM: PPC: Book3S HV: read ibm,secure-memory nodes
Paul, could you please take that patch? Le 16/04/2020 à 18:27, Laurent Dufour a écrit : The newly introduced ibm,secure-memory nodes supersede the ibm,uv-firmware's property secure-memory-ranges. Firmware will no more expose the secure-memory-ranges property so first read the new one and if not found rollback to the older one. Signed-off-by: Laurent Dufour --- arch/powerpc/kvm/book3s_hv_uvmem.c | 14 ++ 1 file changed, 14 insertions(+) diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index 53b88cae3e73..ad950f8996e0 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -735,6 +735,20 @@ static u64 kvmppc_get_secmem_size(void) const __be32 *prop; u64 size = 0; + /* +* First try the new ibm,secure-memory nodes which supersede the +* secure-memory-ranges property. +* If we found somes, no need to read the deprecated one. +*/ + for_each_compatible_node(np, NULL, "ibm,secure-memory") { + prop = of_get_property(np, "reg", &len); + if (!prop) + continue; + size += of_read_number(prop + 2, 2); + } + if (size) + return size; + np = of_find_compatible_node(NULL, NULL, "ibm,uv-firmware"); if (!np) goto out;
Re: [PATCH v2 08/20] arm64: simplify detection of memory zone boundaries for UMA configs
On Wed, Apr 29, 2020 at 03:11:14PM +0300, Mike Rapoport wrote: > From: Mike Rapoport > > The free_area_init() function only requires the definition of maximal PFN > for each of the supported zones rather than calculation of actual zone sizes > and the sizes of the holes between the zones. > > After removal of CONFIG_HAVE_MEMBLOCK_NODE_MAP the free_area_init() is > available to all architectures. > > Using this function instead of free_area_init_node() simplifies the zone > detection. > > Signed-off-by: Mike Rapoport Acked-by: Catalin Marinas (BTW, none of my acks so far made it to the linux-arm-kernel list because of the large number of people on cc)
Re: [PATCH v2 05/20] mm: use free_area_init() instead of free_area_init_nodes()
On Wed, Apr 29, 2020 at 03:11:11PM +0300, Mike Rapoport wrote: > diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c > index e42727e3568e..a650adb358ee 100644 > --- a/arch/arm64/mm/init.c > +++ b/arch/arm64/mm/init.c > @@ -206,7 +206,7 @@ static void __init zone_sizes_init(unsigned long min, > unsigned long max) > #endif > max_zone_pfns[ZONE_NORMAL] = max; > > - free_area_init_nodes(max_zone_pfns); > + free_area_init(max_zone_pfns); > } Acked-by: Catalin Marinas
Re: [PATCH v2 03/20] mm: remove CONFIG_HAVE_MEMBLOCK_NODE_MAP option
On Wed, Apr 29, 2020 at 03:11:09PM +0300, Mike Rapoport wrote: > From: Mike Rapoport > > The CONFIG_HAVE_MEMBLOCK_NODE_MAP is used to differentiate initialization > of nodes and zones structures between the systems that have region to node > mapping in memblock and those that don't. > > Currently all the NUMA architectures enable this option and for the > non-NUMA systems we can presume that all the memory belongs to node 0 and > therefore the compile time configuration option is not required. > > The remaining few architectures that use DISCONTIGMEM without NUMA are > easily updated to use memblock_add_node() instead of memblock_add() and > thus have proper correspondence of memblock regions to NUMA nodes. > > Still, free_area_init_node() must have a backward compatible version > because its semantics with and without CONFIG_HAVE_MEMBLOCK_NODE_MAP is > different. Once all the architectures will use the new semantics, the > entire compatibility layer can be dropped. > > To avoid addition of extra run time memory to store node id for > architectures that keep memblock but have only a single node, the node id > field of the memblock_region is guarded by CONFIG_NEED_MULTIPLE_NODES and > the corresponding accessors presume that in those cases it is always 0. > > Signed-off-by: Mike Rapoport > --- > .../vm/numa-memblock/arch-support.txt | 34 -- > arch/alpha/mm/numa.c | 4 +- > arch/arm64/Kconfig| 1 - For arm64: Acked-by: Catalin Marinas
Re: [PATCH v3 1/3] riscv: Move kernel mapping to vmalloc zone
Hi Zong, Le 5/26/20 à 5:43 AM, Zong Li a écrit : On Sun, May 24, 2020 at 4:54 PM Alexandre Ghiti wrote: This is a preparatory patch for relocatable kernel. The kernel used to be linked at PAGE_OFFSET address and used to be loaded physically at the beginning of the main memory. Therefore, we could use the linear mapping for the kernel mapping. But the relocated kernel base address will be different from PAGE_OFFSET and since in the linear mapping, two different virtual addresses cannot point to the same physical address, the kernel mapping needs to lie outside the linear mapping. In addition, because modules and BPF must be close to the kernel (inside +-2GB window), the kernel is placed at the end of the vmalloc zone minus 2GB, which leaves room for modules and BPF. The kernel could not be placed at the beginning of the vmalloc zone since other vmalloc allocations from the kernel could get all the +-2GB window around the kernel which would prevent new modules and BPF programs to be loaded. Signed-off-by: Alexandre Ghiti --- arch/riscv/boot/loader.lds.S | 3 +- arch/riscv/include/asm/page.h| 10 +- arch/riscv/include/asm/pgtable.h | 37 +--- arch/riscv/kernel/head.S | 3 +- arch/riscv/kernel/module.c | 4 +-- arch/riscv/kernel/vmlinux.lds.S | 3 +- arch/riscv/mm/init.c | 58 +--- arch/riscv/mm/physaddr.c | 2 +- 8 files changed, 87 insertions(+), 33 deletions(-) diff --git a/arch/riscv/boot/loader.lds.S b/arch/riscv/boot/loader.lds.S index 47a5003c2e28..62d94696a19c 100644 --- a/arch/riscv/boot/loader.lds.S +++ b/arch/riscv/boot/loader.lds.S @@ -1,13 +1,14 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include +#include OUTPUT_ARCH(riscv) ENTRY(_start) SECTIONS { - . = PAGE_OFFSET; + . 
= KERNEL_LINK_ADDR; .payload : { *(.payload) diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h index 2d50f76efe48..48bb09b6a9b7 100644 --- a/arch/riscv/include/asm/page.h +++ b/arch/riscv/include/asm/page.h @@ -90,18 +90,26 @@ typedef struct page *pgtable_t; #ifdef CONFIG_MMU extern unsigned long va_pa_offset; +extern unsigned long va_kernel_pa_offset; extern unsigned long pfn_base; #define ARCH_PFN_OFFSET(pfn_base) #else #define va_pa_offset 0 +#define va_kernel_pa_offset0 #define ARCH_PFN_OFFSET(PAGE_OFFSET >> PAGE_SHIFT) #endif /* CONFIG_MMU */ extern unsigned long max_low_pfn; extern unsigned long min_low_pfn; +extern unsigned long kernel_virt_addr; #define __pa_to_va_nodebug(x) ((void *)((unsigned long) (x) + va_pa_offset)) -#define __va_to_pa_nodebug(x) ((unsigned long)(x) - va_pa_offset) +#define linear_mapping_va_to_pa(x) ((unsigned long)(x) - va_pa_offset) +#define kernel_mapping_va_to_pa(x) \ + ((unsigned long)(x) - va_kernel_pa_offset) +#define __va_to_pa_nodebug(x) \ + (((x) >= PAGE_OFFSET) ? \ + linear_mapping_va_to_pa(x) : kernel_mapping_va_to_pa(x)) #ifdef CONFIG_DEBUG_VIRTUAL extern phys_addr_t __virt_to_phys(unsigned long x); diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 35b60035b6b0..25213cfaf680 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -11,23 +11,29 @@ #include -#ifndef __ASSEMBLY__ - -/* Page Upper Directory not used in RISC-V */ -#include -#include -#include -#include - -#ifdef CONFIG_MMU +#ifndef CONFIG_MMU +#define KERNEL_VIRT_ADDR PAGE_OFFSET +#define KERNEL_LINK_ADDR PAGE_OFFSET +#else +/* + * Leave 2GB for modules and BPF that must lie within a 2GB range around + * the kernel. 
+ */ +#define KERNEL_VIRT_ADDR (VMALLOC_END - SZ_2G + 1) +#define KERNEL_LINK_ADDR KERNEL_VIRT_ADDR #define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1) #define VMALLOC_END (PAGE_OFFSET - 1) #define VMALLOC_START(PAGE_OFFSET - VMALLOC_SIZE) #define BPF_JIT_REGION_SIZE(SZ_128M) -#define BPF_JIT_REGION_START (PAGE_OFFSET - BPF_JIT_REGION_SIZE) -#define BPF_JIT_REGION_END (VMALLOC_END) +#define BPF_JIT_REGION_START (kernel_virt_addr) +#define BPF_JIT_REGION_END (kernel_virt_addr + BPF_JIT_REGION_SIZE) It seems to have a potential risk here, the region of bpf is overlapping with kernel mapping, so if kernel size is bigger than 128MB, bpf region would be occupied and run out by kernel mapping. + +#ifdef CONFIG_64BIT +#define VMALLOC_MODULE_START BPF_JIT_REGION_END +#define VMALLOC_MODULE_END VMALLOC_END +#endif Although kernel_virt_addr is a fixed address now, I think it could be changed for the purpose of relocatable or KASLR, so if kernel_virt_addr is moved to far from VMALLOC_END than 2G, the region of module would be too big. Yes you're right, that's wrong to allow modules to lie outside the 2G window, thanks for noticing. In addition, the region of
Re: [PATCH v3 7/7] [not for merge] netstats: example use of stats_fs API
Hi Andrew How do you atomically get and display a group of statistics? If you look at how the netlink socket works, you will see code like: do { start = u64_stats_fetch_begin_irq(&cpu_stats->syncp); rx_packets = cpu_stats->rx_packets; rx_bytes = cpu_stats->rx_bytes; } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start)); It will ensure that rx_packets and rx_bytes are consistent with each other. If the value of the sequence counter changes while inside the loop, the loop is repeated until it does not change. In general, hardware counters in NICs are the same. You tell it to take a snapshot of the statistics counters, and then read them all back, to give a consistent view across all the statistics. I've not looked at this new code in detail, but it looks like you have one file per statistic, and assume each statistic is independent of every other statistic. This independence can limit how you use the values, particularly when debugging. The netlink interface we use does not have this limitation. You're right, statistics are treated independently so what you describe is currently not supported. In KVM the utilization is more qualitative, so there isn't such a problem. But as long as the interface is based on file access, the possibility of snapshotting might not be useful; however, it could still be considered to be added later together with the binary access. Jonathan, how is your metricfs handling this case? Thank you, Emanuele
powerpc/pci: [PATCH 1/1 V3] PCIE PHB reset
From: Wen Xiong Several device drivers hit EEH(Extended Error handling) when triggering kdump on Pseries PowerVM. This patch implemented a reset of the PHBs in pci general code when triggering kdump. PHB reset stop all PCI transactions from normal kernel. We have tested the patch in several enviroments: - direct slot adapters - adapters under the switch - a VF adapter in PowerVM - a VF adapter/adapter in KVM guest. Signed-off-by: Wen Xiong --- arch/powerpc/platforms/pseries/pci.c | 152 +++ 1 file changed, 152 insertions(+) diff --git a/arch/powerpc/platforms/pseries/pci.c b/arch/powerpc/platforms/pseries/pci.c index 911534b89c85..cb7e4276cf04 100644 --- a/arch/powerpc/platforms/pseries/pci.c +++ b/arch/powerpc/platforms/pseries/pci.c @@ -11,6 +11,8 @@ #include #include #include +#include +#include #include #include @@ -354,3 +356,153 @@ int pseries_root_bridge_prepare(struct pci_host_bridge *bridge) return 0; } + +/** + * pseries_get_pdn_addr - Retrieve PHB address + * @pe: EEH PE + * + * Retrieve the assocated PHB address. Actually, there're 2 RTAS + * function calls dedicated for the purpose. We need implement + * it through the new function and then the old one. Besides, + * you should make sure the config address is figured out from + * FDT node before calling the function. + * + */ +static int pseries_get_pdn_addr(struct pci_controller *phb) +{ + int ret = -1; + int rets[3]; + int ibm_get_config_addr_info; + int ibm_get_config_addr_info2; + int config_addr = 0; + struct pci_dn *root_pdn, *pdn; + + ibm_get_config_addr_info2 = rtas_token("ibm,get-config-addr-info2"); + ibm_get_config_addr_info= rtas_token("ibm,get-config-addr-info"); + + root_pdn = PCI_DN(phb->dn); + pdn = list_first_entry(&root_pdn->child_list, struct pci_dn, list); + config_addr = (pdn->busno << 16) | (pdn->devfn << 8); + + if (ibm_get_config_addr_info2 != RTAS_UNKNOWN_SERVICE) { + /* +* First of all, we need to make sure there has one PE +* associated with the device. 
If option is 1, it +* queries if config address is supported in a PE or not. +* If option is 0, it returns PE config address or config +* address for the PE primary bus. +*/ + ret = rtas_call(ibm_get_config_addr_info2, 4, 2, rets, + config_addr, BUID_HI(pdn->phb->buid), + BUID_LO(pdn->phb->buid), 1); + if (ret || (rets[0] == 0)) { + pr_warn("%s: Failed to get address for PHB#%x-PE# option=%d config_addr=%x\n", + __func__, pdn->phb->global_number, 1, rets[0]); + return -1; + } + + /* Retrieve the associated PE config address */ + ret = rtas_call(ibm_get_config_addr_info2, 4, 2, rets, + config_addr, BUID_HI(pdn->phb->buid), + BUID_LO(pdn->phb->buid), 0); + if (ret) { + pr_warn("%s: Failed to get address for PHB#%x-PE# option=%d config_addr=%x\n", + __func__, pdn->phb->global_number, 0, rets[0]); + return -1; + } + return rets[0]; + } + + if (ibm_get_config_addr_info != RTAS_UNKNOWN_SERVICE) { + ret = rtas_call(ibm_get_config_addr_info, 4, 2, rets, + config_addr, BUID_HI(pdn->phb->buid), + BUID_LO(pdn->phb->buid), 0); + if (ret || rets[0]) { + pr_warn("%s: Failed to get address for PHB#%x-PE# config_addr=%x\n", + __func__, pdn->phb->global_number, rets[0]); + return -1; + } + return rets[0]; + } + + return ret; +} + +static int __init pseries_phb_reset(void) +{ + struct pci_controller *phb; + int config_addr; + int ibm_set_slot_reset; + int ibm_configure_pe; + int ret; + + if (is_kdump_kernel() || reset_devices) { + pr_info("Issue PHB reset ...\n"); + ibm_set_slot_reset = rtas_token("ibm,set-slot-reset"); + ibm_configure_pe = rtas_token("ibm,configure-pe"); + + if (ibm_set_slot_reset == RTAS_UNKNOWN_SERVICE || + ibm_configure_pe == RTAS_UNKNOWN_SERVICE) { + pr_info("%s: EEH functionality not supported\n", + __func__); + } + + list_for_each_entry(phb, &hose_list, list_node) { + config_addr = pseries_get_pdn_addr(phb); + if (config_addr == -1) + continue; + + ret = rtas_call(ibm_set_slot_reset, 4, 1, N
Re: [PATCH v3 7/7] [not for merge] netstats: example use of stats_fs API
On Tue, May 26, 2020 at 01:03:17PM +0200, Emanuele Giuseppe Esposito wrote: > Apply stats_fs on the networking statistics subsystem. > > Currently it only works with disabled network namespace > (CONFIG_NET_NS=n), because multiple namespaces will have the same > device name under the same root source that will cause a conflict in > stats_fs. Hi Emanuele How do you atomically get and display a group of statistics? If you look at how the netlink socket works, you will see code like: do { start = u64_stats_fetch_begin_irq(&cpu_stats->syncp); rx_packets = cpu_stats->rx_packets; rx_bytes = cpu_stats->rx_bytes; } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start)); It will ensure that rx_packets and rx_bytes are consistent with each other. If the value of the sequence counter changes while inside the loop, the loop is repeated until it does not change. In general, hardware counters in NICs are the same. You tell it to take a snapshot of the statistics counters, and then read them all back, to give a consistent view across all the statistics. I've not looked at this new code in detail, but it looks like you have one file per statistic, and assume each statistic is independent of every other statistic. This independence can limit how you use the values, particularly when debugging. The netlink interface we use does not have this limitation. Andrew
Re: [PATCH] selftests: powerpc: Add test for execute-disabled pkeys
Hi Michael, On 26/05/20 6:05 pm, Michael Ellerman wrote: > [...] >> + >> +/* Override definitions as they might be inconsistent */ >> +#undef PKEY_DISABLE_ACCESS >> +#define PKEY_DISABLE_ACCESS 0x3 > > Why would they be inconsistent? > The definition in sys/mman.h still uses the value specific to Intel's implementation i.e. 1, when this should have been 3 for powerpc. I have seen this on Ubuntu 18.04 and 20.04. > >> +/* Older distros might not define this */ >> +#ifndef SEGV_PKUERR >> +#define SEGV_PKUERR 4 >> +#endif > ... >> + >> +/* Restore permissions in order to continue */ >> +switch (fcode) { >> +case SEGV_ACCERR: >> +if (mprotect(insns, pgsize, PROT_READ | PROT_WRITE)) { >> +perror("mprotect"); >> +goto fail; >> +} >> +break; >> +case SEGV_PKUERR: >> +if (sinfo->si_pkey != fpkey) >> +goto fail; > > This doesn't compile on older distros, eg Ubuntu 16.04: > > pkey_exec_prot.c: In function 'segv_handler': > pkey_exec_prot.c:121:12: error: 'siginfo_t {aka struct }' has no > member named 'si_pkey' > if (sinfo->si_pkey != fpkey) > ^ > pkey_exec_prot.c:151:24: error: 'siginfo_t {aka struct }' has no > member named 'si_pkey' > pkey_set_rights(sinfo->si_pkey, 0); > ^ > ../../lib.mk:142: recipe for target > '/output/kselftest/powerpc/mm/pkey_exec_prot' failed > Thanks for reporting this. 
> > I think a reasonable solution is to use the absence of SEGV_PKUERR to > basically turn the whole test into a nop at build time, eg: > > diff --git a/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c > b/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c > index b346ad205e68..218257b89fbb 100644 > --- a/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c > +++ b/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c > @@ -30,9 +30,7 @@ > #define PKEY_DISABLE_EXECUTE 0x4 > > /* Older distros might not define this */ > -#ifndef SEGV_PKUERR > -#define SEGV_PKUERR4 > -#endif > +#ifdef SEGV_PKUERR > > #define SYS_pkey_mprotect 386 > #define SYS_pkey_alloc 384 > @@ -319,6 +317,13 @@ static int test(void) > > return 0; > } > +#else > +static int test(void) > +{ > + printf("Test built with old libc lacking pkey support.\n"); > + SKIP_IF(true); > +} > +#endif /* SEGV_PKUERR */ > > int main(void) > { > > Or can I use this from the pkey tests under selftests/vm? static inline u32 *siginfo_get_pkey_ptr(siginfo_t *si) { #ifdef si_pkey return &si->si_pkey; #else return (u32 *)(((u8 *)si) + si_pkey_offset); #endif } Where si_pkey_offset is 0x20 for powerpc. - Sandipan
Re: [PATCH v2 0/2] powerpc: Remove support for ppc405/440 Xilinx platforms
Michal Simek writes: > Hi Michael, > > On 01. 04. 20 13:30, Michal Simek wrote: >> On 01. 04. 20 12:38, Takashi Iwai wrote: >>> On Wed, 01 Apr 2020 12:35:16 +0200, >>> Michael Ellerman wrote: Michal Simek writes: > On 01. 04. 20 4:07, Michael Ellerman wrote: >> Michal Simek writes: >>> Hi, >>> >>> recently we wanted to update xilinx intc driver and we found that >>> function >>> which we wanted to remove is still wired by ancient Xilinx PowerPC >>> platforms. Here is the thread about it. >>> https://lore.kernel.org/linux-next/48d3232d-0f1d-42ea-3109-f44bbabfa...@xilinx.com/ >>> >>> I have been talking about it internally and there is no interest in >>> these >>> platforms and it is also orphan for quite a long time. None is really >>> running/testing these platforms regularly that's why I think it makes >>> sense >>> to remove them also with drivers which are specific to this platform. >>> >>> U-Boot support was removed in 2017 without anybody complain about it >>> https://github.com/Xilinx/u-boot-xlnx/commit/98f705c9cefdfdba62c069821bbba10273a0a8ed >>> >>> Based on current ppc/next. >>> >>> If anyone has any objection about it, please let me know. >> >> Thanks for taking the time to find all this code and remove it. >> >> I'm not going to take this series for v5.7, it was posted too close to >> the merge window, and doing so wouldn't give people much time to object, >> especially given people are distracted at the moment. >> >> I'm happy to take it for v5.8, assuming there's no major objections. > > Sure. Just to let you know Christophe Leroy included this patch in his > series about ppc405 removal. It should be the same. > > If you don't want to take that alsa patch I can send it separately and > this patch can be taken from his series. I don't really mind but please > let me know what way you prefer. It's better to keep it all together, so I'm happy take the alsa patch as well, it's already been acked. > > Can you please take this series? 
I know that there is v5 from Christophe > which has this 1/2 as 1/13. But I need this alsa patch too and I would > like to close this because it is around for almost 2 months and none > raised a concern about removing just these Xilinx platforms. Sorry I meant to reply to your last mail. I have Christophe's series in my testing branch, planning for it to be in v5.8. Even if the rest of his series doesn't make it for some reason, as you say the Xilinx removal is uncontroversial so I'll keep that in. I forgot about the sound patch, I'll pick that up as well. cheers
Re: [PATCH v4 07/45] powerpc/ptdump: Limit size of flags text to 1/2 chars on PPC32
Christophe Leroy writes: > Le 25/05/2020 à 07:15, Michael Ellerman a écrit : >> Christophe Leroy writes: >>> In order to have all flags fit on a 80 chars wide screen, >>> reduce the flags to 1 char (2 where ambiguous). >> >> I don't love this, the output is less readable. Is fitting on an 80 char >> screen a real issue for you? I just make my terminal window bigger. > > I don't have strong opinion about that, and the terminal can be made bigger. > I just don't like how messy it is, some flags are so big that they hide > other ones and getting it more ordered and more compact helped me during > all the verifications I did with this series, but we can leave it as is > if you prefer. I think I do. > Would you like a v5 without patches 7 and 8 ? Or I can just resend the > patches that will be impacted, that is 9 and 38 ? I dropped 7 and 8 and then fixed up 9 and 38, it was easy enough. I used "coherent" and "huge". > With the change I get. > > ---[ Start of kernel VM ]--- > 0xc000-0xc0ff 0x16M h r x psh a > 0xc100-0xc7ff 0x0100 112M h rw psh d a > ---[ vmalloc() Area ]--- > 0xc900-0xc9003fff 0x050e400016K rw psh d a > 0xc9008000-0xc900bfff 0x050ec00016K rw psh d a > 0xc901-0xc9013fff 0xd00016K rw p i g sh d a > 0xc9018000-0xc901bfff 0x050f16K rw psh d a It's definitely more compact :) But I worry no one other than you will be able to decipher it, without constantly referring back to the source code. cheers
Re: [PATCH] selftests: powerpc: Add test for execute-disabled pkeys
Hi Sandipan, Sandipan Das writes: > diff --git a/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c > b/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c > new file mode 100644 > index ..b346ad205e68 > --- /dev/null > +++ b/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c > @@ -0,0 +1,326 @@ > +// SPDX-License-Identifier: GPL-2.0+ > + > +/* > + * Copyright 2020, Sandipan Das, IBM Corp. > + * > + * Test if applying execute protection on pages using memory > + * protection keys works as expected. > + */ > + > +#define _GNU_SOURCE > +#include > +#include > +#include > +#include > + > +#include > +#include > +#include > + > +#include "utils.h" > + > +/* Override definitions as they might be inconsistent */ > +#undef PKEY_DISABLE_ACCESS > +#define PKEY_DISABLE_ACCESS 0x3 Why would they be inconsistent? > +/* Older distros might not define this */ > +#ifndef SEGV_PKUERR > +#define SEGV_PKUERR 4 > +#endif ... > + > + /* Restore permissions in order to continue */ > + switch (fcode) { > + case SEGV_ACCERR: > + if (mprotect(insns, pgsize, PROT_READ | PROT_WRITE)) { > + perror("mprotect"); > + goto fail; > + } > + break; > + case SEGV_PKUERR: > + if (sinfo->si_pkey != fpkey) > + goto fail; This doesn't compile on older distros, eg Ubuntu 16.04: pkey_exec_prot.c: In function 'segv_handler': pkey_exec_prot.c:121:12: error: 'siginfo_t {aka struct }' has no member named 'si_pkey' if (sinfo->si_pkey != fpkey) ^ pkey_exec_prot.c:151:24: error: 'siginfo_t {aka struct }' has no member named 'si_pkey' pkey_set_rights(sinfo->si_pkey, 0); ^ ../../lib.mk:142: recipe for target '/output/kselftest/powerpc/mm/pkey_exec_prot' failed I think a reasonable solution is to use the absence of SEGV_PKUERR to basically turn the whole test into a nop at build time, eg: diff --git a/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c b/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c index b346ad205e68..218257b89fbb 100644 --- a/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c +++ 
b/tools/testing/selftests/powerpc/mm/pkey_exec_prot.c @@ -30,9 +30,7 @@ #define PKEY_DISABLE_EXECUTE 0x4 /* Older distros might not define this */ -#ifndef SEGV_PKUERR -#define SEGV_PKUERR4 -#endif +#ifdef SEGV_PKUERR #define SYS_pkey_mprotect 386 #define SYS_pkey_alloc 384 @@ -319,6 +317,13 @@ static int test(void) return 0; } +#else +static int test(void) +{ + printf("Test built with old libc lacking pkey support.\n"); + SKIP_IF(true); +} +#endif /* SEGV_PKUERR */ int main(void) { cheers
Re: [RESEND PATCH v7 4/5] ndctl/papr_scm, uapi: Add support for PAPR nvdimm specific methods
Vaibhav Jain writes: > Hi Ira, Mpe and Aneesh, > > Vaibhav Jain writes: > >> Michael Ellerman writes: >> >>> Ira Weiny writes: On Wed, May 20, 2020 at 12:30:57AM +0530, Vaibhav Jain wrote: > Introduce support for Papr nvDimm Specific Methods (PDSM) in papr_scm > modules and add the command family to the white list of NVDIMM command > sets. Also advertise support for ND_CMD_CALL for the dimm > command mask and implement necessary scaffolding in the module to > handle ND_CMD_CALL ioctl and PDSM requests that we receive. >>> ... > + * > + * Payload Version: > + * > + * A 'payload_version' field is present in PDSM header that indicates a > specific > + * version of the structure present in PDSM Payload for a given PDSM > command. > + * This provides backward compatibility in case the PDSM Payload > structure > + * evolves and different structures are supported by 'papr_scm' and > 'libndctl'. > + * > + * When sending a PDSM Payload to 'papr_scm', 'libndctl' should send the > version > + * of the payload struct it supports via 'payload_version' field. The > 'papr_scm' > + * module when servicing the PDSM envelope checks the 'payload_version' > and then > + * uses 'payload struct version' == MIN('payload_version field', > + * 'max payload-struct-version supported by papr_scm') to service the > PDSM. > + * After servicing the PDSM, 'papr_scm' put the negotiated version of > payload > + * struct in returned 'payload_version' field. FWIW many people believe using a size rather than version is more sustainable. It is expected that new payload structures are larger (more features) than the previous payload structure. I can't find references at the moment through. >>> >>> I think clone_args is a good modern example: >>> >>> >>> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/uapi/linux/sched.h#n88 >> >> Thank Ira and Mpe for pointing this out. 
I looked into how clone3 syscall >> handles clone_args and few differences came out: >> >> * Unlike clone_args that are always transferred in one direction from >> user-space to kernel, payload contents of pdsms are transferred in both >> directions. Having a single version number makes it easier for >> user-space and kernel to determine what data will be exchanged. >> >> * For PDSMs, the version number is negotiated between libndctl and >> kernel. For example in case kernel only supports an older version of >> a structure, it's free to send a lower version number back to >> libndctl. Such negotiations doesn't happen with clone3 syscall. > > If you are ok with the explanation above please let me know. I will > quickly spin off a v8 addressing your review comments. I don't have strong opinions about the user API, it's really up to the nvdimm folks. cheers
[PATCH v3 7/7] [not for merge] netstats: example use of stats_fs API
Apply stats_fs on the networking statistics subsystem. Currently it only works with disabled network namespace (CONFIG_NET_NS=n), because multiple namespaces will have the same device name under the same root source that will cause a conflict in stats_fs. Signed-off-by: Emanuele Giuseppe Esposito --- include/linux/netdevice.h | 2 ++ net/Kconfig | 1 + net/core/dev.c| 66 +++ 3 files changed, 69 insertions(+) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 130a668049ab..408c4e7b0e21 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -48,6 +48,7 @@ #include #include #include +#include struct netpoll_info; struct device; @@ -2117,6 +2118,7 @@ struct net_device { unsignedwol_enabled:1; struct list_headnet_notifier_list; + struct stats_fs_source *stats_fs_src; #if IS_ENABLED(CONFIG_MACSEC) /* MACsec management functions */ diff --git a/net/Kconfig b/net/Kconfig index df8d8c9bd021..3441d5bb6107 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -8,6 +8,7 @@ menuconfig NET select NLATTR select GENERIC_NET_UTILS select BPF + select STATS_FS_API ---help--- Unless you really know what you are doing, you should say Y here. The reason is that some programs need kernel networking support even diff --git a/net/core/dev.c b/net/core/dev.c index 522288177bbd..3db48cd1a097 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -142,6 +142,7 @@ #include #include #include +#include #include "net-sysfs.h" @@ -150,6 +151,11 @@ /* This should be increased if a protocol with a bigger head is added. */ #define GRO_MAX_HEAD (MAX_HEADER + 128) +#define NETDEV_STAT(str, m, ...) 
\ + { str, offsetof(struct rtnl_link_stats64, m), \ + &stats_fs_type_netdev_u64, \ + STATS_FS_SUM, ## __VA_ARGS__ } + static DEFINE_SPINLOCK(ptype_lock); static DEFINE_SPINLOCK(offload_lock); struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; @@ -196,6 +202,53 @@ static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8); static seqcount_t devnet_rename_seq; +static uint64_t stats_fs_get_netdev_u64(struct stats_fs_value *val, + void *base) +{ + struct net_device *netdev = (struct net_device *)base; + struct rtnl_link_stats64 net_stats; + + dev_get_stats(netdev, &net_stats); + + return stats_fs_get_u64(val, &net_stats); +} + +static struct stats_fs_type stats_fs_type_netdev_u64 = { + .get = stats_fs_get_netdev_u64, + .clear = NULL, + .sign = false +}; + +static struct stats_fs_source *netdev_root; + +static struct stats_fs_value stats_fs_netdev_entries[] = { + NETDEV_STAT("rx_packets", rx_packets), + NETDEV_STAT("tx_packets", tx_packets), + NETDEV_STAT("rx_bytes", rx_bytes), + NETDEV_STAT("tx_bytes", tx_bytes), + NETDEV_STAT("rx_errors", rx_errors), + NETDEV_STAT("tx_errors", tx_errors), + NETDEV_STAT("rx_dropped", rx_dropped), + NETDEV_STAT("tx_dropped", tx_dropped), + NETDEV_STAT("multicast", multicast), + NETDEV_STAT("collisions", collisions), + NETDEV_STAT("rx_length_errors", rx_length_errors), + NETDEV_STAT("rx_over_errors", rx_over_errors), + NETDEV_STAT("rx_crc_errors", rx_crc_errors), + NETDEV_STAT("rx_frame_errors", rx_frame_errors), + NETDEV_STAT("rx_fifo_errors", rx_fifo_errors), + NETDEV_STAT("rx_missed_errors", rx_missed_errors), + NETDEV_STAT("tx_aborted_errors", tx_aborted_errors), + NETDEV_STAT("tx_carrier_errors", tx_carrier_errors), + NETDEV_STAT("tx_fifo_errors", tx_fifo_errors), + NETDEV_STAT("tx_heartbeat_errors", tx_heartbeat_errors), + NETDEV_STAT("tx_window_errors", tx_window_errors), + NETDEV_STAT("rx_compressed", rx_compressed), + NETDEV_STAT("tx_compressed", tx_compressed), + NETDEV_STAT("rx_nohandler", rx_nohandler), + { NULL } +}; + 
static inline void dev_base_seq_inc(struct net *net) { while (++net->dev_base_seq == 0) @@ -8783,6 +8836,11 @@ static void rollback_registered_many(struct list_head *head) ASSERT_RTNL(); list_for_each_entry_safe(dev, tmp, head, unreg_list) { + stats_fs_source_remove_subordinate(netdev_root, + dev->stats_fs_src); + stats_fs_source_revoke(dev->stats_fs_src); + stats_fs_source_put(dev->stats_fs_src); + /* Some devices call without registering * for initialization unwind. Remove those * devices and proceed with the remaining. @@ -9436,6 +9494,11 @@ int register_netdevice(struct net_device *dev) dev->rtnl_lin
[PATCH v3 6/7] [not for merge] kvm: example of stats_fs_value show function
Add an example of the show function using the mp_state value. mp_state is an enum that represents the VCPU state, so instead of displaying its integer representation, the show function takes care of translating the integer into a more meaningful string representation. The VCPU status is shown in the kvm//vcpu/mp_state file Signed-off-by: Emanuele Giuseppe Esposito --- arch/x86/kvm/stats_fs.c | 54 + 1 file changed, 54 insertions(+) diff --git a/arch/x86/kvm/stats_fs.c b/arch/x86/kvm/stats_fs.c index f6edebb9c559..902be18562da 100644 --- a/arch/x86/kvm/stats_fs.c +++ b/arch/x86/kvm/stats_fs.c @@ -39,11 +39,65 @@ struct stats_fs_value stats_fs_vcpu_arch_tsc_frac[] = { { NULL } /* base is &kvm_tsc_scaling_ratio_frac_bits */ }; +char *stats_fs_vcpu_get_mpstate(uint64_t state) +{ + char *state_str; + + state_str = kzalloc(20, GFP_KERNEL); + if (!state_str) + return ERR_PTR(-ENOMEM); + + switch (state) { + case KVM_MP_STATE_RUNNABLE: + strcpy(state_str, "RUNNABLE"); + break; + case KVM_MP_STATE_UNINITIALIZED: + strcpy(state_str, "UNINITIALIZED"); + break; + case KVM_MP_STATE_INIT_RECEIVED: + strcpy(state_str, "INIT_RECEIVED"); + break; + case KVM_MP_STATE_HALTED: + strcpy(state_str, "HALTED"); + break; + case KVM_MP_STATE_SIPI_RECEIVED: + strcpy(state_str, "SIPI_RECEIVED"); + break; + case KVM_MP_STATE_STOPPED: + strcpy(state_str, "STOPPED"); + break; + case KVM_MP_STATE_CHECK_STOP: + strcpy(state_str, "CHECK_STOP"); + break; + case KVM_MP_STATE_OPERATING: + strcpy(state_str, "OPERATING"); + break; + case KVM_MP_STATE_LOAD: + strcpy(state_str, "LOAD"); + break; + default: + strcpy(state_str, "UNRECOGNIZED"); + break; + } + + return state_str; +} + +struct stats_fs_value stats_fs_vcpu_mp_state[] = { + VCPU_ARCH_STATS_FS("mp_state", kvm_vcpu_arch, mp_state, + .type = &stats_fs_type_u32, + .show = stats_fs_vcpu_get_mpstate), + { NULL } +}; + void kvm_arch_create_vcpu_stats_fs(struct kvm_vcpu *vcpu) { stats_fs_source_add_values(vcpu->stats_fs_src, stats_fs_vcpu_tsc_offset, 
&vcpu->arch, 0); + stats_fs_source_add_values(vcpu->stats_fs_src, stats_fs_vcpu_mp_state, + &vcpu->arch, 0); + if (lapic_in_kernel(vcpu)) stats_fs_source_add_values(vcpu->stats_fs_src, stats_fs_vcpu_arch_lapic_timer, -- 2.25.4
[PATCH v3 5/7] kvm_main: replace debugfs with stats_fs
Use stats_fs API instead of debugfs to create sources and add values. This also requires to change all architecture files to replace the old debugfs_entries with stats_fs_vcpu_entries and statsfs_vm_entries. The files/folders name and organization is kept unchanged, and a symlink in sys/kernel/debugfs/kvm is left for backward compatibility. Signed-off-by: Emanuele Giuseppe Esposito --- arch/arm64/kvm/Kconfig | 1 + arch/arm64/kvm/guest.c | 2 +- arch/mips/kvm/Kconfig | 1 + arch/mips/kvm/mips.c| 2 +- arch/powerpc/kvm/Kconfig| 1 + arch/powerpc/kvm/book3s.c | 12 +- arch/powerpc/kvm/booke.c| 8 +- arch/s390/kvm/Kconfig | 1 + arch/s390/kvm/kvm-s390.c| 16 +- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/Kconfig| 1 + arch/x86/kvm/Makefile | 2 +- arch/x86/kvm/debugfs.c | 64 --- arch/x86/kvm/stats_fs.c | 60 ++ arch/x86/kvm/x86.c | 11 +- include/linux/kvm_host.h| 45 ++--- virt/kvm/arm/arm.c | 2 +- virt/kvm/kvm_main.c | 318 +--- 18 files changed, 161 insertions(+), 388 deletions(-) delete mode 100644 arch/x86/kvm/debugfs.c create mode 100644 arch/x86/kvm/stats_fs.c diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 449386d76441..f95f6d1c3610 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -23,6 +23,7 @@ config KVM depends on OF # for TASKSTATS/TASK_DELAY_ACCT: depends on NET && MULTIUSER + select STATS_FS_API select MMU_NOTIFIER select PREEMPT_NOTIFIERS select HAVE_KVM_CPU_RELAX_INTERCEPT diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 8417b200bec9..235ed44e4353 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -29,7 +29,7 @@ #include "trace.h" -struct kvm_stats_debugfs_item debugfs_entries[] = { +struct stats_fs_value stats_fs_vcpu_entries[] = { VCPU_STAT("halt_successful_poll", halt_successful_poll), VCPU_STAT("halt_attempted_poll", halt_attempted_poll), VCPU_STAT("halt_poll_invalid", halt_poll_invalid), diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig index b91d145aa2d5..b19fbc5297b4 
100644 --- a/arch/mips/kvm/Kconfig +++ b/arch/mips/kvm/Kconfig @@ -19,6 +19,7 @@ config KVM tristate "Kernel-based Virtual Machine (KVM) support" depends on HAVE_KVM depends on MIPS_FP_SUPPORT + select STATS_FS_API select EXPORT_UASM select PREEMPT_NOTIFIERS select KVM_GENERIC_DIRTYLOG_READ_PROTECT diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index fdf1c14d9205..a47d21f35444 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -39,7 +39,7 @@ #define VECTORSPACING 0x100/* for EI/VI mode */ #endif -struct kvm_stats_debugfs_item debugfs_entries[] = { +struct stats_fs_value stats_fs_vcpu_entries[] = { VCPU_STAT("wait", wait_exits), VCPU_STAT("cache", cache_exits), VCPU_STAT("signal", signal_exits), diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 12885eda324e..6f0675edfe7c 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -19,6 +19,7 @@ if VIRTUALIZATION config KVM bool + select STATS_FS_API select PREEMPT_NOTIFIERS select HAVE_KVM_EVENTFD select HAVE_KVM_VCPU_ASYNC_IOCTL diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 37508a356f28..e3346b3087d0 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -38,7 +38,7 @@ /* #define EXIT_DEBUG */ -struct kvm_stats_debugfs_item debugfs_entries[] = { +struct stats_fs_value stats_fs_vcpu_entries[] = { VCPU_STAT("exits", sum_exits), VCPU_STAT("mmio", mmio_exits), VCPU_STAT("sig", signal_exits), @@ -66,8 +66,14 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { VCPU_STAT("pthru_all", pthru_all), VCPU_STAT("pthru_host", pthru_host), VCPU_STAT("pthru_bad_aff", pthru_bad_aff), - VM_STAT("largepages_2M", num_2M_pages, .mode = 0444), - VM_STAT("largepages_1G", num_1G_pages, .mode = 0444), + { NULL } +}; + +struct stats_fs_value stats_fs_vm_entries[] = { + VM_STAT("largepages_2M", num_2M_pages, + .value_flag = STATS_FS_FLOATING_VALUE), + VM_STAT("largepages_1G", num_1G_pages, + .value_flag = STATS_FS_FLOATING_VALUE), { NULL 
} }; diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index c2984cb6dfa7..b14c07786cc8 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -35,7 +35,12 @@ unsigned long kvmppc_booke_handlers; -struct kvm_stats_debugfs_item debugfs_entries[] = { +struct stats_fs_value stats_fs_vm_entries[] = { + VM_STAT("remote_tlb_flush", remote_tlb_flush), + { NULL } +}; + +stru
[PATCH v3 4/7] stats_fs fs: virtual fs to show stats to the end-user
Add virtual fs that maps stats_fs sources with directories, and values (simple or aggregates) to files. Every time a file is read/cleared, the fs internally invokes the stats_fs API to get/set the requested value. Also introduce the optional show function in each value, that allows to customize how the value is displayed inside a file. This could be especially useful with enums. fs/stats_fs/inode.cis pretty much similar to what is done in fs/debugfs/inode.c, with the exception that the API is only composed by stats_fs_create_file, stats_fs_create_dir and stats_fs_remove. Signed-off-by: Emanuele Giuseppe Esposito --- fs/stats_fs/Makefile | 2 +- fs/stats_fs/inode.c| 461 + fs/stats_fs/internal.h | 15 ++ fs/stats_fs/stats_fs.c | 92 +++- include/linux/stats_fs.h | 18 ++ include/uapi/linux/magic.h | 1 + tools/lib/api/fs/fs.c | 21 ++ 7 files changed, 608 insertions(+), 2 deletions(-) create mode 100644 fs/stats_fs/inode.c diff --git a/fs/stats_fs/Makefile b/fs/stats_fs/Makefile index bc59a54d5721..19b7e13f6c3d 100644 --- a/fs/stats_fs/Makefile +++ b/fs/stats_fs/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only -stats_fs-objs := stats_fs.o +stats_fs-objs := inode.o stats_fs.o stats_fs-tests-objs:= stats_fs-tests.o obj-$(CONFIG_STATS_FS) += stats_fs.o diff --git a/fs/stats_fs/inode.c b/fs/stats_fs/inode.c new file mode 100644 index ..eaa0a8bc7466 --- /dev/null +++ b/fs/stats_fs/inode.c @@ -0,0 +1,461 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * inode.c - part of stats_fs, a tiny little stats_fs file system + * + * Copyright (C) 2020 Emanuele Giuseppe Esposito + * Copyright (C) 2020 Redhat + */ +#define pr_fmt(fmt)"stats_fs: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +#define STATS_FS_DEFAULT_MODE 0700 + +static struct simple_fs stats_fs; +static bool stats_fs_registered; + +struct stats_fs_mount_opts { + kuid_t uid; + kgid_t gid; + umode_t mode; +}; + +enum { + 
Opt_uid, + Opt_gid, + Opt_mode, + Opt_err +}; + +static const match_table_t tokens = { + {Opt_uid, "uid=%u"}, + {Opt_gid, "gid=%u"}, + {Opt_mode, "mode=%o"}, + {Opt_err, NULL} +}; + +struct stats_fs_fs_info { + struct stats_fs_mount_opts mount_opts; +}; + +static int stats_fs_parse_options(char *data, struct stats_fs_mount_opts *opts) +{ + substring_t args[MAX_OPT_ARGS]; + int option; + int token; + kuid_t uid; + kgid_t gid; + char *p; + + opts->mode = STATS_FS_DEFAULT_MODE; + + while ((p = strsep(&data, ",")) != NULL) { + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_uid: + if (match_int(&args[0], &option)) + return -EINVAL; + uid = make_kuid(current_user_ns(), option); + if (!uid_valid(uid)) + return -EINVAL; + opts->uid = uid; + break; + case Opt_gid: + if (match_int(&args[0], &option)) + return -EINVAL; + gid = make_kgid(current_user_ns(), option); + if (!gid_valid(gid)) + return -EINVAL; + opts->gid = gid; + break; + case Opt_mode: + if (match_octal(&args[0], &option)) + return -EINVAL; + opts->mode = option & S_IALLUGO; + break; + /* +* We might like to report bad mount options here; +* but traditionally stats_fs has ignored all mount options +*/ + } + } + + return 0; +} + +static int stats_fs_apply_options(struct super_block *sb) +{ + struct stats_fs_fs_info *fsi = sb->s_fs_info; + struct inode *inode = d_inode(sb->s_root); + struct stats_fs_mount_opts *opts = &fsi->mount_opts; + + inode->i_mode &= ~S_IALLUGO; + inode->i_mode |= opts->mode; + + inode->i_uid = opts->uid; + inode->i_gid = opts->gid; + + return 0; +} + +static int stats_fs_remount(struct super_block *sb, int *flags, char *data) +{ + int err; + struct stats_fs_fs_info *fsi = sb->s_fs_info; + + sync_filesystem(sb); + err = stats_fs_parse_options(data, &fsi->mount_opts); + if (err) + goto fail; + + stats_fs_apply_options(sb); + +fail: + return err; +} + +static int stats_fs_show_options(struct seq_file *m, struct dentry *root) +{ + struct 
stats_fs_fs_info *f
[PATCH v3 3/7] kunit: tests for stats_fs API
Add kunit tests to extensively test the stats_fs API functionality. In order to run them, the kernel .config must set CONFIG_KUNIT=y and a new .kunitconfig file must be created with CONFIG_STATS_FS=y and CONFIG_STATS_FS_TEST=y Tests can be then started by running the following command from the root directory of the linux kernel source tree: ./tools/testing/kunit/kunit.py run --timeout=30 --jobs=`nproc --all` Signed-off-by: Emanuele Giuseppe Esposito --- fs/Kconfig |6 + fs/stats_fs/Makefile |2 + fs/stats_fs/stats_fs-tests.c | 1097 ++ 3 files changed, 1105 insertions(+) create mode 100644 fs/stats_fs/stats_fs-tests.c diff --git a/fs/Kconfig b/fs/Kconfig index 684ad61129ab..02bbb0e4cdf7 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -227,6 +227,12 @@ config STATS_FS stats_fs is a virtual file system that provides counters and other statistics about the running kernel. +config STATS_FS_TEST + bool "Tests for stats_fs" + depends on STATS_FS && KUNIT + help + tests for the stats_fs API. + config STATS_FS_API bool imply STATS_FS diff --git a/fs/stats_fs/Makefile b/fs/stats_fs/Makefile index bd988daa4c39..bc59a54d5721 100644 --- a/fs/stats_fs/Makefile +++ b/fs/stats_fs/Makefile @@ -1,5 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only stats_fs-objs := stats_fs.o +stats_fs-tests-objs:= stats_fs-tests.o obj-$(CONFIG_STATS_FS) += stats_fs.o obj-$(CONFIG_STATS_FS_STUB) += stub.o +obj-$(CONFIG_STATS_FS_TEST) += stats_fs-tests.o diff --git a/fs/stats_fs/stats_fs-tests.c b/fs/stats_fs/stats_fs-tests.c new file mode 100644 index ..bbac133d7fe7 --- /dev/null +++ b/fs/stats_fs/stats_fs-tests.c @@ -0,0 +1,1097 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include "internal.h" + +#define STATS_FS_STAT(el, x, ...) 
\ + { \ + .name = #x, .offset = offsetof(struct container, el.x),\ + ##__VA_ARGS__ \ + } + +#define ARR_SIZE(el) ((int)(sizeof(el) / sizeof(struct stats_fs_value) - 1)) + +struct test_values_struct { + uint64_t u64; + int32_t s32; + bool bo; + uint8_t u8; + int16_t s16; +}; + +struct container { + struct test_values_struct vals; +}; + +struct stats_fs_value test_values[6] = { + STATS_FS_STAT(vals, u64, .type = &stats_fs_type_u64, + .aggr_kind = STATS_FS_NONE, + .value_flag = STATS_FS_FLOATING_VALUE), + STATS_FS_STAT(vals, s32, .type = &stats_fs_type_s32, + .aggr_kind = STATS_FS_NONE), + STATS_FS_STAT(vals, bo, .type = &stats_fs_type_bool, + .aggr_kind = STATS_FS_NONE, + .value_flag = STATS_FS_FLOATING_VALUE), + STATS_FS_STAT(vals, u8, .type = &stats_fs_type_u8, + .aggr_kind = STATS_FS_NONE), + STATS_FS_STAT(vals, s16, .type = &stats_fs_type_s16, + .aggr_kind = STATS_FS_NONE, + .value_flag = STATS_FS_FLOATING_VALUE), + { NULL }, +}; + +struct stats_fs_value test_aggr[4] = { + STATS_FS_STAT(vals, s32, .type = &stats_fs_type_s32, + .aggr_kind = STATS_FS_MIN, + .value_flag = STATS_FS_FLOATING_VALUE), + STATS_FS_STAT(vals, bo, .type = &stats_fs_type_bool, + .aggr_kind = STATS_FS_MAX, + .value_flag = STATS_FS_FLOATING_VALUE), + STATS_FS_STAT(vals, u64, .type = &stats_fs_type_u64, + .aggr_kind = STATS_FS_SUM, + .value_flag = STATS_FS_FLOATING_VALUE), + { NULL }, +}; + +struct stats_fs_value test_same_name[3] = { + STATS_FS_STAT(vals, s32, .type = &stats_fs_type_s32, + .aggr_kind = STATS_FS_NONE), + STATS_FS_STAT(vals, s32, .type = &stats_fs_type_s32, + .aggr_kind = STATS_FS_MIN), + { NULL }, +}; + +struct stats_fs_value test_all_aggr[6] = { + STATS_FS_STAT(vals, s32, .type = &stats_fs_type_s32, + .aggr_kind = STATS_FS_MIN), + STATS_FS_STAT(vals, bo, .type = &stats_fs_type_bool, + .aggr_kind = STATS_FS_COUNT_ZERO, + .value_flag = STATS_FS_FLOATING_VALUE), + STATS_FS_STAT(vals, u64, .type = &stats_fs_type_u64, + .aggr_kind = STATS_FS_SUM), + STATS_FS_STAT(vals, u8, .type = 
&stats_fs_type_u8, + .aggr_kind = STATS_FS_AVG, + .value_flag = STATS_FS_FLOATING_VALUE), + STATS_FS_STAT(vals, s16, .type = &stats_fs_type_s16, +
[PATCH v3 2/7] documentation for stats_fs
Html docs for a complete documentation of the stats_fs API, filesystem and usage. Signed-off-by: Emanuele Giuseppe Esposito --- Documentation/filesystems/index.rst| 1 + Documentation/filesystems/stats_fs.rst | 222 + 2 files changed, 223 insertions(+) create mode 100644 Documentation/filesystems/stats_fs.rst diff --git a/Documentation/filesystems/index.rst b/Documentation/filesystems/index.rst index e7b46dac7079..9a46fd851c6e 100644 --- a/Documentation/filesystems/index.rst +++ b/Documentation/filesystems/index.rst @@ -89,6 +89,7 @@ Documentation for filesystem implementations. relay romfs squashfs + stats_fs sysfs sysv-fs tmpfs diff --git a/Documentation/filesystems/stats_fs.rst b/Documentation/filesystems/stats_fs.rst new file mode 100644 index ..292c689ffb98 --- /dev/null +++ b/Documentation/filesystems/stats_fs.rst @@ -0,0 +1,222 @@ + +Stats_FS + + +Stats_fs is a synthetic ram-based virtual filesystem that takes care of +gathering and displaying statistics for the Linux kernel subsystems. + +The motivation for stats_fs comes from the fact that there is no common +way for Linux kernel subsystems to expose statistics to userspace shared +throughout the Linux kernel; subsystems have to take care of gathering and +displaying statistics by themselves, for example in the form of files in +debugfs. + +Allowing each subsystem of the kernel to do so has two disadvantages. +First, it will introduce redundant code. Second, debugfs is anyway not the +right place for statistics (for example it is affected by lockdown). + +Stats_fs offers a generic and stable API, allowing any kind of +directory/file organization and supporting multiple kind of aggregations +(not only sum, but also average, max, min and count_zero) and data types +(boolean, all unsigned/signed and custom types). The implementation takes +care of gathering and displaying information at run time; users only need +to specify the values to be included in each source. 
Optionally, users can +also provide a display function for each value, that will take care of +displaying the provided value in a custom format. + +Its main function is to display each statistic as a file in the desired +folder hierarchy defined through the API. Stats_fs files can be read, and +possibly cleared if their file mode allows it. + +Stats_fs is typically mounted with a command like:: + +mount -t stats_fs stats_fs /sys/kernel/stats_fs + +(Or an equivalent /etc/fstab line). + +Stats_fs has two main components: the public API defined by +include/linux/stats_fs.h, and the virtual file system in +/sys/kernel/stats. + +The API has two main elements, values and sources. Kernel +subsystems will create a source, add child +sources/values/aggregates and register it to the root source (that on the +virtual fs would be /sys/kernel/stats). + +The stats_fs API is defined in . + +Sources +Sources are created via ``stats_fs_source_create()``, and each +source becomes a directory in the file system. Sources form a +parent-child relationship; root sources are added to the file +system via ``stats_fs_source_register()``. Therefore each Linux +subsystem will add its own entry to the root, filesystem similar +to what it is done in debugfs. Every other source is added to or +removed from a parent through the +``stats_fs_source_add_subordinate()`` and +``stats_fs_source_remove_subordinate()`` APIs. Once a source is +created and added to the tree (via add_subordinate), it will be +used to compute aggregate values in the parent source. A source +can optionally be hidden from the filesystem but still considered +in the aggregation operations if the corresponding flag is set +during initialization. + +Values +Values represent quantities that are gathered by the stats_fs user. +Examples of values include the number of vm exits of a given kind, +the amount of memory used by some data structure, the length of +the longest hash table chain, or anything like that. 
Values are +defined with the stats_fs_source_add_values function. Each value +is defined by a ``struct stats_fs_value``; the same +``stats_fs_value`` can be added to many different sources. A value +can be considered "simple" if it fetches data from a user-provided +location, or "aggregate" if it groups all values in the +subordinate sources that include the same ``stats_fs_value``. +Values by default are considered to be cumulative, meaning the +value they represent never decreases, but can also be defined as +floating if they exhibit a different behavior. The main difference +between these two is reflected in the file permission, since a +floating value file does not allow the user to clear it. Each +value
[PATCH v3 1/7] stats_fs API: create, add and remove stats_fs sources and values
Introduction to the stats_fs API, that allows to easily create, add and remove stats_fs sources and values. The API allows easily building the statistics directory tree to automatically gather them for the linux kernel. The main functionalities are: create a source, add child sources/values/aggregates, register it to the root source (that on the virtual fs would be /sys/kernel/stats), and perform a search for a value/aggregate. Each source and value has an optional flag parameter: in a value, it represents whether the statistic is cumulative or floating, in a source whether it should be visible from the filesystem or not. Defaults are respectively cumulative and visible. Both flags fields are represented as an uint32_t to offer portability for future flags. Each value also takes a struct stats_fs_type pointer that defines get and clear functions for that stat, allowing custom types handling. The API also provides default get and clear types for the supported standard types (stats_fs_type_*). The API representation is only logical and will be backed up by a virtual file system in patch 4. Its usage will be shared between the stats_fs file system and the end-users like kvm, the former calling it when it needs to display and clear statistics, the latter to add values and sources. 
Signed-off-by: Emanuele Giuseppe Esposito --- MAINTAINERS | 7 + fs/Kconfig | 14 + fs/Makefile | 1 + fs/stats_fs/Makefile | 5 + fs/stats_fs/internal.h | 19 ++ fs/stats_fs/stats_fs.c | 552 +++ fs/stats_fs/stub.c | 13 + include/linux/stats_fs.h | 363 + 8 files changed, 974 insertions(+) create mode 100644 fs/stats_fs/Makefile create mode 100644 fs/stats_fs/internal.h create mode 100644 fs/stats_fs/stats_fs.c create mode 100644 fs/stats_fs/stub.c create mode 100644 include/linux/stats_fs.h diff --git a/MAINTAINERS b/MAINTAINERS index b816a453b10e..a8403d07cee5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5229,6 +5229,13 @@ F: include/linux/debugfs.h F: include/linux/kobj* F: lib/kobj* +STATS_FS +M: Paolo Bonzini +R: Emanuele Giuseppe Esposito +S: Supported +F: include/linux/stats_fs.h +F: fs/stats_fs + DRIVERS FOR ADAPTIVE VOLTAGE SCALING (AVS) M: Kevin Hilman M: Nishanth Menon diff --git a/fs/Kconfig b/fs/Kconfig index f08fbbfafd9a..684ad61129ab 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -221,6 +221,20 @@ config MEMFD_CREATE config ARCH_HAS_GIGANTIC_PAGE bool +config STATS_FS + bool "Statistics Filesystem" + help + stats_fs is a virtual file system that provides counters and + other statistics about the running kernel. 
+ +config STATS_FS_API + bool + imply STATS_FS + +config STATS_FS_STUB + bool + default y if STATS_FS_API && !STATS_FS + source "fs/configfs/Kconfig" source "fs/efivarfs/Kconfig" diff --git a/fs/Makefile b/fs/Makefile index 2ce5112b02c8..91558eca0cf7 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -125,6 +125,7 @@ obj-$(CONFIG_BEFS_FS) += befs/ obj-$(CONFIG_HOSTFS) += hostfs/ obj-$(CONFIG_CACHEFILES) += cachefiles/ obj-$(CONFIG_DEBUG_FS) += debugfs/ +obj-$(CONFIG_STATS_FS) += stats_fs/ obj-$(CONFIG_TRACING) += tracefs/ obj-$(CONFIG_OCFS2_FS) += ocfs2/ obj-$(CONFIG_BTRFS_FS) += btrfs/ diff --git a/fs/stats_fs/Makefile b/fs/stats_fs/Makefile new file mode 100644 index ..bd988daa4c39 --- /dev/null +++ b/fs/stats_fs/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0-only +stats_fs-objs := stats_fs.o + +obj-$(CONFIG_STATS_FS) += stats_fs.o +obj-$(CONFIG_STATS_FS_STUB) += stub.o diff --git a/fs/stats_fs/internal.h b/fs/stats_fs/internal.h new file mode 100644 index ..4993afbb1e45 --- /dev/null +++ b/fs/stats_fs/internal.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _STATS_FS_INTERNAL_H_ +#define _STATS_FS_INTERNAL_H_ + +#include +#include +#include +#include + +/* values, grouped by base */ +struct stats_fs_value_source { + void *base_addr; + bool files_created; + uint32_t common_flags; + struct stats_fs_value *values; + struct list_head list_element; +}; + +#endif /* _STATS_FS_INTERNAL_H_ */ diff --git a/fs/stats_fs/stats_fs.c b/fs/stats_fs/stats_fs.c new file mode 100644 index ..b76ee44f6dac --- /dev/null +++ b/fs/stats_fs/stats_fs.c @@ -0,0 +1,552 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +struct stats_fs_aggregate_value { + uint64_t sum, min, max; + uint32_t count, count_zero; +}; + +#define STATS_FS_DEFINE_TYPE_STRUCT(gtype, stype, si) \ + const struct stats_fs_type stats_fs_type_##gtype =
[PATCH v3 0/7] Statsfs: a new ram-based file system for Linux kernel statistics
There is currently no common way for Linux kernel subsystems to expose statistics to userspace shared throughout the Linux kernel; subsystems have to take care of gathering and displaying statistics by themselves, for example in the form of files in debugfs. For example KVM has its own code section that takes care of this in virt/kvm/kvm_main.c, where it sets up debugfs handlers for displaying values and aggregating them from various subfolders to obtain information about the system state (i.e. displaying the total number of exits, calculated by summing all exits of all cpus of all running virtual machines). Allowing each section of the kernel to do so has two disadvantages. First, it will introduce redundant code. Second, debugfs is anyway not the right place for statistics (for example it is affected by lockdown). In this patch series I introduce statsfs, a synthetic ram-based virtual filesystem that takes care of gathering and displaying statistics for the Linux kernel subsystems. The file system is mounted on /sys/kernel/stats and would be already used by kvm. Statsfs was initially introduced by Paolo Bonzini [1]. Statsfs offers a generic and stable API, allowing any kind of directory/file organization and supporting multiple kinds of aggregation (not only sum, but also average, max, min and count_zero) and data types (boolean, unsigned/signed and custom types). The implementation, which is a generalization of KVM’s debugfs statistics code, takes care of gathering and displaying information at run time; users only need to specify the values to be included in each source. Statsfs would also be a different mountpoint from debugfs, and would not suffer from limited access due to the security lock down patches. Its main function is to display each statistics as a file in the desired folder hierarchy defined through the API. Statsfs files can be read, and possibly cleared if their file mode allows it. 
Statsfs has two main components: the public API defined by include/linux/statsfs.h, and the virtual file system which should end up in /sys/kernel/stats. The API has two main elements, values and sources. Kernel subsystems like KVM can use the API to create a source, add child sources/values/aggregates and register it to the root source (that on the virtual fs would be /sys/kernel/statsfs). Sources are created via statsfs_source_create(), and each source becomes a directory in the file system. Sources form a parent-child relationship; root sources are added to the file system via statsfs_source_register(). Every other source is added to or removed from a parent through the statsfs_source_add_subordinate and statsfs_source_remove_subordinate APIs. Once a source is created and added to the tree (via add_subordinate), it will be used to compute aggregate values in the parent source. A source can optionally be hidden from the filesystem but still considered in the aggregation operations if the corresponding flag is set during initialization. Values represent quantities that are gathered by the statsfs user. Examples of values include the number of vm exits of a given kind, the amount of memory used by some data structure, the length of the longest hash table chain, or anything like that. Values are defined with the statsfs_source_add_values function. Each value is defined by a struct statsfs_value; the same statsfs_value can be added to many different sources. A value can be considered "simple" if it fetches data from a user-provided location, or "aggregate" if it groups all values in the subordinate sources that include the same statsfs_value. Each value has a stats_fs_type pointer in order to allow the user to provide custom get and clear functions. The library, however, also exports default stats_fs_type structs for the standard types (all unsigned and signed types plus boolean). 
A value can also provide a show function, that takes care of displaying the value in a custom string format. This can be especially useful when displaying enums. For more information, please consult the kerneldoc documentation in patch 2 and the sample uses in the kunit tests, KVM and networking. This series of patches is based on my previous series "libfs: group and simplify linux fs code" and the single patch sent to kvm "kvm_host: unify VM_STAT and VCPU_STAT definitions in a single place". The former simplifies code duplicated in debugfs and tracefs (on which statsfs is based), the latter groups all macros definition for statistics in kvm in a single common file shared by all architectures. Patch 1 adds a new refcount and kref destructor wrappers that take a semaphore, as those are used later by statsfs. Patch 2 introduces the statsfs API, patch 3 provides extensive tests that can also be used as an example of how to use the API and patch 4 adds the file system support. Finally, patch 5 provides a real-life example of statsfs usage in KVM, with patch 6 providing a concrete example of the show function and patch 7 anothe
Re: [PATCH v4 3/6] asm-generic/tlb, arch: Invert CONFIG_HAVE_RCU_TABLE_INVALIDATE
On Wed, May 20, 2020 at 02:00:22PM +0530, Santosh Sivaraj wrote: > From: Peter Zijlstra > > commit 96bc9567cbe112e9320250f01b9c060c882e8619 upstream > > Make issuing a TLB invalidate for page-table pages the normal case. > > The reason is twofold: > > - too many invalidates is safer than too few, > - most architectures use the linux page-tables natively >and would thus require this. > > Make it an opt-out, instead of an opt-in. > > No change in behavior intended. > > Signed-off-by: Peter Zijlstra (Intel) > Cc: # 4.19 > Signed-off-by: Santosh Sivaraj > [santosh: prerequisite for upcoming tlbflush backports] > --- > arch/Kconfig | 2 +- > arch/powerpc/Kconfig | 1 + > arch/sparc/Kconfig | 1 + > arch/x86/Kconfig | 1 - > mm/memory.c | 2 +- > 5 files changed, 4 insertions(+), 3 deletions(-) Why did you not also change arch/arm64/Kconfig and include/asm-generic/tlb.h like the original patch changed? Why can those files be ignored/left out? You need to explain that in the backport section, all you said was "prerequisite..." and did not say why you changed this patch. Please fix up, and make sure you do the same for all of the other patches in this series for when you resend it. thanks, greg k-h
Re: [linux-next PATCH] mm/gup.c: Convert to use get_user_{page|pages}_fast_only()
On Tue, May 26, 2020 at 1:29 PM Paul Mackerras wrote: > > On Mon, May 25, 2020 at 02:23:32PM +0530, Souptick Joarder wrote: > > API __get_user_pages_fast() renamed to get_user_pages_fast_only() > > to align with pin_user_pages_fast_only(). > > > > As part of this we will get rid of write parameter. > > Instead caller will pass FOLL_WRITE to get_user_pages_fast_only(). > > This will not change any existing functionality of the API. > > > > All the callers are changed to pass FOLL_WRITE. > > > > Also introduce get_user_page_fast_only(), and use it in a few > > places that hard-code nr_pages to 1. > > > > Updated the documentation of the API. > > > > Signed-off-by: Souptick Joarder > > The arch/powerpc/kvm bits look reasonable. > > Reviewed-by: Paul Mackerras Thanks Paul. This patch is merged through mm-tree. https://lore.kernel.org/kvm/1590396812-31277-1-git-send-email-jrdr.li...@gmail.com/
Re: [PATCH v3 1/3] riscv: Move kernel mapping to vmalloc zone
On Sun, May 24, 2020 at 4:54 PM Alexandre Ghiti wrote: > > This is a preparatory patch for relocatable kernel. > > The kernel used to be linked at PAGE_OFFSET address and used to be loaded > physically at the beginning of the main memory. Therefore, we could use > the linear mapping for the kernel mapping. > > But the relocated kernel base address will be different from PAGE_OFFSET > and since in the linear mapping, two different virtual addresses cannot > point to the same physical address, the kernel mapping needs to lie outside > the linear mapping. > > In addition, because modules and BPF must be close to the kernel (inside > +-2GB window), the kernel is placed at the end of the vmalloc zone minus > 2GB, which leaves room for modules and BPF. The kernel could not be > placed at the beginning of the vmalloc zone since other vmalloc > allocations from the kernel could get all the +-2GB window around the > kernel which would prevent new modules and BPF programs to be loaded. > > Signed-off-by: Alexandre Ghiti > --- > arch/riscv/boot/loader.lds.S | 3 +- > arch/riscv/include/asm/page.h| 10 +- > arch/riscv/include/asm/pgtable.h | 37 +--- > arch/riscv/kernel/head.S | 3 +- > arch/riscv/kernel/module.c | 4 +-- > arch/riscv/kernel/vmlinux.lds.S | 3 +- > arch/riscv/mm/init.c | 58 +--- > arch/riscv/mm/physaddr.c | 2 +- > 8 files changed, 87 insertions(+), 33 deletions(-) > > diff --git a/arch/riscv/boot/loader.lds.S b/arch/riscv/boot/loader.lds.S > index 47a5003c2e28..62d94696a19c 100644 > --- a/arch/riscv/boot/loader.lds.S > +++ b/arch/riscv/boot/loader.lds.S > @@ -1,13 +1,14 @@ > /* SPDX-License-Identifier: GPL-2.0 */ > > #include > +#include > > OUTPUT_ARCH(riscv) > ENTRY(_start) > > SECTIONS > { > - . = PAGE_OFFSET; > + . 
= KERNEL_LINK_ADDR; > > .payload : { > *(.payload) > diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h > index 2d50f76efe48..48bb09b6a9b7 100644 > --- a/arch/riscv/include/asm/page.h > +++ b/arch/riscv/include/asm/page.h > @@ -90,18 +90,26 @@ typedef struct page *pgtable_t; > > #ifdef CONFIG_MMU > extern unsigned long va_pa_offset; > +extern unsigned long va_kernel_pa_offset; > extern unsigned long pfn_base; > #define ARCH_PFN_OFFSET(pfn_base) > #else > #define va_pa_offset 0 > +#define va_kernel_pa_offset0 > #define ARCH_PFN_OFFSET(PAGE_OFFSET >> PAGE_SHIFT) > #endif /* CONFIG_MMU */ > > extern unsigned long max_low_pfn; > extern unsigned long min_low_pfn; > +extern unsigned long kernel_virt_addr; > > #define __pa_to_va_nodebug(x) ((void *)((unsigned long) (x) + va_pa_offset)) > -#define __va_to_pa_nodebug(x) ((unsigned long)(x) - va_pa_offset) > +#define linear_mapping_va_to_pa(x) ((unsigned long)(x) - va_pa_offset) > +#define kernel_mapping_va_to_pa(x) \ > + ((unsigned long)(x) - va_kernel_pa_offset) > +#define __va_to_pa_nodebug(x) \ > + (((x) >= PAGE_OFFSET) ? \ > + linear_mapping_va_to_pa(x) : kernel_mapping_va_to_pa(x)) > > #ifdef CONFIG_DEBUG_VIRTUAL > extern phys_addr_t __virt_to_phys(unsigned long x); > diff --git a/arch/riscv/include/asm/pgtable.h > b/arch/riscv/include/asm/pgtable.h > index 35b60035b6b0..25213cfaf680 100644 > --- a/arch/riscv/include/asm/pgtable.h > +++ b/arch/riscv/include/asm/pgtable.h > @@ -11,23 +11,29 @@ > > #include > > -#ifndef __ASSEMBLY__ > - > -/* Page Upper Directory not used in RISC-V */ > -#include > -#include > -#include > -#include > - > -#ifdef CONFIG_MMU > +#ifndef CONFIG_MMU > +#define KERNEL_VIRT_ADDR PAGE_OFFSET > +#define KERNEL_LINK_ADDR PAGE_OFFSET > +#else > +/* > + * Leave 2GB for modules and BPF that must lie within a 2GB range around > + * the kernel. 
> + */ > +#define KERNEL_VIRT_ADDR (VMALLOC_END - SZ_2G + 1) > +#define KERNEL_LINK_ADDR KERNEL_VIRT_ADDR > > #define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1) > #define VMALLOC_END (PAGE_OFFSET - 1) > #define VMALLOC_START(PAGE_OFFSET - VMALLOC_SIZE) > > #define BPF_JIT_REGION_SIZE(SZ_128M) > -#define BPF_JIT_REGION_START (PAGE_OFFSET - BPF_JIT_REGION_SIZE) > -#define BPF_JIT_REGION_END (VMALLOC_END) > +#define BPF_JIT_REGION_START (kernel_virt_addr) > +#define BPF_JIT_REGION_END (kernel_virt_addr + BPF_JIT_REGION_SIZE) It seems to have a potential risk here, the region of bpf is overlapping with kernel mapping, so if kernel size is bigger than 128MB, bpf region would be occupied and run out by kernel mapping. > + > +#ifdef CONFIG_64BIT > +#define VMALLOC_MODULE_START BPF_JIT_REGION_END > +#define VMALLOC_MODULE_END VMALLOC_END > +#endif > Although kernel_virt_addr is a fixed address now, I think it could be changed for the purpose of relocatable or KASLR, so if kernel_virt_addr is moved to far from VMALLOC_END than 2G, the region of module w
Re: [PATCH v3 2/3] riscv: Introduce CONFIG_RELOCATABLE
On Sun, May 24, 2020 at 4:55 PM Alexandre Ghiti wrote: > > This config allows to compile the kernel as PIE and to relocate it at > any virtual address at runtime: this paves the way to KASLR and to 4-level > page table folding at runtime. Runtime relocation is possible since > relocation metadata are embedded into the kernel. > > Note that relocating at runtime introduces an overhead even if the > kernel is loaded at the same address it was linked at and that the compiler > options are those used in arm64 which uses the same RELA relocation > format. > > Signed-off-by: Alexandre Ghiti > --- > arch/riscv/Kconfig | 12 +++ > arch/riscv/Makefile | 5 ++- > arch/riscv/kernel/vmlinux.lds.S | 6 ++-- > arch/riscv/mm/Makefile | 4 +++ > arch/riscv/mm/init.c| 63 + > 5 files changed, 87 insertions(+), 3 deletions(-) > > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig > index a31e1a41913a..93127d5913fe 100644 > --- a/arch/riscv/Kconfig > +++ b/arch/riscv/Kconfig > @@ -170,6 +170,18 @@ config PGTABLE_LEVELS > default 3 if 64BIT > default 2 > > +config RELOCATABLE > + bool > + depends on MMU > + help > + This builds a kernel as a Position Independent Executable (PIE), > + which retains all relocation metadata required to relocate the > + kernel binary at runtime to a different virtual address than the > + address it was linked at. > + Since RISCV uses the RELA relocation format, this requires a > + relocation pass at runtime even if the kernel is loaded at the > + same address it was linked at. 
> + > source "arch/riscv/Kconfig.socs" > > menu "Platform type" > diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile > index fb6e37db836d..1406416ea743 100644 > --- a/arch/riscv/Makefile > +++ b/arch/riscv/Makefile > @@ -9,7 +9,10 @@ > # > > OBJCOPYFLAGS:= -O binary > -LDFLAGS_vmlinux := > +ifeq ($(CONFIG_RELOCATABLE),y) > +LDFLAGS_vmlinux := -shared -Bsymbolic -z notext -z norelro > +KBUILD_CFLAGS += -fPIE > +endif > ifeq ($(CONFIG_DYNAMIC_FTRACE),y) > LDFLAGS_vmlinux := --no-relax > endif > diff --git a/arch/riscv/kernel/vmlinux.lds.S b/arch/riscv/kernel/vmlinux.lds.S > index a9abde62909f..e8ffba8c2044 100644 > --- a/arch/riscv/kernel/vmlinux.lds.S > +++ b/arch/riscv/kernel/vmlinux.lds.S > @@ -85,8 +85,10 @@ SECTIONS > > BSS_SECTION(PAGE_SIZE, PAGE_SIZE, 0) > > - .rel.dyn : { > - *(.rel.dyn*) > + .rela.dyn : ALIGN(8) { > + __rela_dyn_start = .; > + *(.rela .rela*) > + __rela_dyn_end = .; > } > > _end = .; > diff --git a/arch/riscv/mm/Makefile b/arch/riscv/mm/Makefile > index 363ef01c30b1..dc5cdaa80bc1 100644 > --- a/arch/riscv/mm/Makefile > +++ b/arch/riscv/mm/Makefile > @@ -1,6 +1,10 @@ > # SPDX-License-Identifier: GPL-2.0-only > > CFLAGS_init.o := -mcmodel=medany > +ifdef CONFIG_RELOCATABLE > +CFLAGS_init.o += -fno-pie > +endif > + > ifdef CONFIG_FTRACE > CFLAGS_REMOVE_init.o = -pg > endif > diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c > index 17f108baec4f..7074522d40c6 100644 > --- a/arch/riscv/mm/init.c > +++ b/arch/riscv/mm/init.c > @@ -13,6 +13,9 @@ > #include > #include > #include > +#ifdef CONFIG_RELOCATABLE > +#include > +#endif > > #include > #include > @@ -379,6 +382,53 @@ static uintptr_t __init best_map_size(phys_addr_t base, > phys_addr_t size) > #error "setup_vm() is called from head.S before relocate so it should not > use absolute addressing." 
> #endif > > +#ifdef CONFIG_RELOCATABLE > +extern unsigned long __rela_dyn_start, __rela_dyn_end; > + > +#ifdef CONFIG_64BIT > +#define Elf_Rela Elf64_Rela > +#define Elf_Addr Elf64_Addr > +#else > +#define Elf_Rela Elf32_Rela > +#define Elf_Addr Elf32_Addr > +#endif > + > +void __init relocate_kernel(uintptr_t load_pa) > +{ > + Elf_Rela *rela = (Elf_Rela *)&__rela_dyn_start; > + /* > +* This holds the offset between the linked virtual address and the > +* relocated virtual address. > +*/ > + uintptr_t reloc_offset = kernel_virt_addr - KERNEL_LINK_ADDR; > + /* > +* This holds the offset between kernel linked virtual address and > +* physical address. > +*/ > + uintptr_t va_kernel_link_pa_offset = KERNEL_LINK_ADDR - load_pa; > + > + for ( ; rela < (Elf_Rela *)&__rela_dyn_end; rela++) { > + Elf_Addr addr = (rela->r_offset - va_kernel_link_pa_offset); > + Elf_Addr relocated_addr = rela->r_addend; > + > + if (rela->r_info != R_RISCV_RELATIVE) > + continue; > + > + /* > +* Make sure to not relocate vdso symbols like rt_sigreturn > +* which are linked from the address 0 in vmlinux since > +* vdso symbol addresses are actually used as an of
[PATCH 6/6] powerpc/ppc-opcode: fold PPC_INST_* macros into PPC_RAW_* macros
Lot of PPC_INST_* macros are used only ever in PPC_* macros, fold those PPC_INST_* into PPC_RAW_* to avoid using PPC_INST_* accidentally. Signed-off-by: Balamuruhan S Acked-by: Naveen N. Rao Tested-by: Naveen N. Rao --- arch/powerpc/include/asm/ppc-opcode.h | 381 +- 1 file changed, 125 insertions(+), 256 deletions(-) diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index c8d71a8bef46..bbb77f998f19 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -208,56 +208,27 @@ #define OP_LQ56 /* sorted alphabetically */ -#define PPC_INST_BHRBE 0x7c00025c -#define PPC_INST_CLRBHRB 0x7c00035c #define PPC_INST_COPY 0x7c20060c -#define PPC_INST_CP_ABORT 0x7c00068c -#define PPC_INST_DARN 0x7c0005e6 #define PPC_INST_DCBA 0x7c0005ec #define PPC_INST_DCBA_MASK 0xfc0007fe -#define PPC_INST_DCBAL 0x7c2005ec -#define PPC_INST_DCBZL 0x7c2007ec -#define PPC_INST_ICBT 0x7c2c -#define PPC_INST_ICSWX 0x7c00032d -#define PPC_INST_ICSWEPX 0x7c00076d #define PPC_INST_ISEL 0x7c1e #define PPC_INST_ISEL_MASK 0xfc3e -#define PPC_INST_LDARX 0x7ca8 -#define PPC_INST_STDCX 0x7c0001ad -#define PPC_INST_LQARX 0x7c000228 -#define PPC_INST_STQCX 0x7c00016d #define PPC_INST_LSWI 0x7c0004aa #define PPC_INST_LSWX 0x7c00042a -#define PPC_INST_LWARX 0x7c28 -#define PPC_INST_STWCX 0x7c00012d #define PPC_INST_LWSYNC0x7c2004ac #define PPC_INST_SYNC 0x7c0004ac #define PPC_INST_SYNC_MASK 0xfc0007fe #define PPC_INST_ISYNC 0x4c00012c -#define PPC_INST_LXVD2X0x7c000698 #define PPC_INST_MCRXR 0x7c000400 #define PPC_INST_MCRXR_MASK0xfc0007fe #define PPC_INST_MFSPR_PVR 0x7c1f42a6 #define PPC_INST_MFSPR_PVR_MASK0xfc1e -#define PPC_INST_MFTMR 0x7c0002dc -#define PPC_INST_MSGSND0x7c00019c -#define PPC_INST_MSGCLR0x7c0001dc -#define PPC_INST_MSGSYNC 0x7c0006ec -#define PPC_INST_MSGSNDP 0x7c00011c -#define PPC_INST_MSGCLRP 0x7c00015c #define PPC_INST_MTMSRD0x7c000164 -#define PPC_INST_MTTMR 0x7c0003dc #define PPC_INST_NOP 0x6000 -#define 
PPC_INST_PASTE 0x7c20070d #define PPC_INST_POPCNTB 0x7cf4 #define PPC_INST_POPCNTB_MASK 0xfc0007fe -#define PPC_INST_POPCNTD 0x7c0003f4 -#define PPC_INST_POPCNTW 0x7c0002f4 #define PPC_INST_RFEBB 0x4c000124 -#define PPC_INST_RFCI 0x4c66 -#define PPC_INST_RFDI 0x4c4e #define PPC_INST_RFID 0x4c24 -#define PPC_INST_RFMCI 0x4c4c #define PPC_INST_MFSPR 0x7c0002a6 #define PPC_INST_MFSPR_DSCR0x7c1102a6 #define PPC_INST_MFSPR_DSCR_MASK 0xfc1e @@ -267,131 +238,30 @@ #define PPC_INST_MFSPR_DSCR_USER_MASK 0xfc1e #define PPC_INST_MTSPR_DSCR_USER 0x7c0303a6 #define PPC_INST_MTSPR_DSCR_USER_MASK 0xfc1e -#define PPC_INST_MFVSRD0x7c66 -#define PPC_INST_MTVSRD0x7c000166 #define PPC_INST_SC0x4402 -#define PPC_INST_SLBFEE0x7c0007a7 -#define PPC_INST_SLBIA 0x7c0003e4 - #define PPC_INST_STRING0x7c00042a #define PPC_INST_STRING_MASK 0xfc0007fe #define PPC_INST_STRING_GEN_MASK 0xfc00067e - #define PPC_INST_STSWI 0x7c0005aa #define PPC_INST_STSWX 0x7c00052a -#define PPC_INST_STXVD2X 0x7c000798 -#define PPC_INST_TLBIE 0x7c000264 -#define PPC_INST_TLBIEL0x7c000224 -#define PPC_INST_TLBILX0x7c24 -#define PPC_INST_WAIT 0x7c7c -#define PPC_INST_TLBIVAX 0x7c000624 -#define PPC_INST_TLBSRX_DOT0x7c0006a5 -#define PPC_INST_VPMSUMW 0x1488 -#define PPC_INST_VPMSUMD 0x14c8 -#define PPC_INST_VPERMXOR 0x102d -#define PPC_INST_XXLOR 0xf490 -#define PPC_INST_XXSWAPD 0xf250 -#define PPC_INST_XVCPSGNDP 0xf780 #define PPC_INST_TRECHKPT 0x7c0007dd #define PPC_INST_TRECLAIM 0x7c00075d -#define PPC_INST_TABORT0x7c00071d #define PPC_INST_TSR 0x
[PATCH 5/6] powerpc/ppc-opcode: reuse raw instruction macros to stringify
Wrap existing stringify macros to reuse raw instruction encoding macros that are newly added. Signed-off-by: Balamuruhan S Acked-by: Naveen N. Rao Tested-by: Naveen N. Rao --- arch/powerpc/include/asm/ppc-opcode.h | 220 +- 1 file changed, 71 insertions(+), 149 deletions(-) diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index 94b889d89395..c8d71a8bef46 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -781,166 +781,92 @@ ___PPC_RA(a)) /* Deal with instructions that older assemblers aren't aware of */ -#definePPC_CP_ABORTstringify_in_c(.long PPC_INST_CP_ABORT) -#definePPC_COPY(a, b) stringify_in_c(.long PPC_INST_COPY | \ - ___PPC_RA(a) | ___PPC_RB(b)) -#define PPC_DARN(t, l) stringify_in_c(.long PPC_INST_DARN | \ - ___PPC_RT(t) | \ - (((l) & 0x3) << 16)) -#definePPC_DCBAL(a, b) stringify_in_c(.long PPC_INST_DCBAL | \ - __PPC_RA(a) | __PPC_RB(b)) -#definePPC_DCBZL(a, b) stringify_in_c(.long PPC_INST_DCBZL | \ - __PPC_RA(a) | __PPC_RB(b)) -#define PPC_LQARX(t, a, b, eh) stringify_in_c(.long PPC_INST_LQARX | \ - ___PPC_RT(t) | ___PPC_RA(a) | \ - ___PPC_RB(b) | __PPC_EH(eh)) -#define PPC_LDARX(t, a, b, eh) stringify_in_c(.long PPC_INST_LDARX | \ - ___PPC_RT(t) | ___PPC_RA(a) | \ - ___PPC_RB(b) | __PPC_EH(eh)) -#define PPC_LWARX(t, a, b, eh) stringify_in_c(.long PPC_INST_LWARX | \ - ___PPC_RT(t) | ___PPC_RA(a) | \ - ___PPC_RB(b) | __PPC_EH(eh)) -#define PPC_STQCX(t, a, b) stringify_in_c(.long PPC_INST_STQCX | \ - ___PPC_RT(t) | ___PPC_RA(a) | \ - ___PPC_RB(b)) -#define PPC_MADDHD(t, a, b, c) stringify_in_c(.long PPC_INST_MADDHD | \ - ___PPC_RT(t) | ___PPC_RA(a) | \ - ___PPC_RB(b) | ___PPC_RC(c)) -#define PPC_MADDHDU(t, a, b, c)stringify_in_c(.long PPC_INST_MADDHDU | \ - ___PPC_RT(t) | ___PPC_RA(a) | \ - ___PPC_RB(b) | ___PPC_RC(c)) -#define PPC_MADDLD(t, a, b, c) stringify_in_c(.long PPC_INST_MADDLD | \ - ___PPC_RT(t) | ___PPC_RA(a) | \ - ___PPC_RB(b) | ___PPC_RC(c)) -#define 
PPC_MSGSND(b) stringify_in_c(.long PPC_INST_MSGSND | \ - ___PPC_RB(b)) -#define PPC_MSGSYNCstringify_in_c(.long PPC_INST_MSGSYNC) -#define PPC_MSGCLR(b) stringify_in_c(.long PPC_INST_MSGCLR | \ - ___PPC_RB(b)) -#define PPC_MSGSNDP(b) stringify_in_c(.long PPC_INST_MSGSNDP | \ - ___PPC_RB(b)) -#define PPC_MSGCLRP(b) stringify_in_c(.long PPC_INST_MSGCLRP | \ - ___PPC_RB(b)) -#define PPC_PASTE(a, b)stringify_in_c(.long PPC_INST_PASTE | \ - ___PPC_RA(a) | ___PPC_RB(b)) -#define PPC_POPCNTB(a, s) stringify_in_c(.long PPC_INST_POPCNTB | \ - __PPC_RA(a) | __PPC_RS(s)) -#define PPC_POPCNTD(a, s) stringify_in_c(.long PPC_INST_POPCNTD | \ - __PPC_RA(a) | __PPC_RS(s)) -#define PPC_POPCNTW(a, s) stringify_in_c(.long PPC_INST_POPCNTW | \ - __PPC_RA(a) | __PPC_RS(s)) -#define PPC_RFCI stringify_in_c(.long PPC_INST_RFCI) -#define PPC_RFDI stringify_in_c(.long PPC_INST_RFDI) -#define PPC_RFMCI stringify_in_c(.long PPC_INST_RFMCI) -#define PPC_TLBILX(t, a, b)stringify_in_c(.long PPC_INST_TLBILX | \ - __PPC_T_TLB(t) | __PPC_RA0(a) | __PPC_RB(b)) +#definePPC_CP_ABORTstringify_in_c(.long PPC_RAW_CP_ABORT) +#definePPC_COPY(a, b) stringify_in_c(.long PPC_RAW_COPY(a, b)) +#define PPC_DARN(t, l) stringify_in_c(.long PPC_RAW_DARN(t, l)) +#definePPC_DCBAL(a, b) stringify_in_c(.long PPC_RAW_DCBAL(a, b)) +#definePPC_DCBZL(a, b) stringify_in_c(.long PPC_RAW_DCBZL(a, b)) +#define PPC_LQARX(t, a, b, eh) stringify_in_c(.long PPC_RAW_LQARX(t, a, b, eh)) +#define PPC_LDARX(t, a, b, eh) stringify_in_c(.long PPC_RAW_LDARX(t, a, b, eh)) +#define PPC_
[PATCH 4/6] powerpc/ppc-opcode: consolidate powerpc instructions from bpf_jit.h
move macro definitions of powerpc instructions from bpf_jit.h to ppc-opcode.h and adopt the users of the macros accordingly. `PPC_MR()` is defined twice in bpf_jit.h, remove the duplicate one. Signed-off-by: Balamuruhan S Acked-by: Naveen N. Rao Tested-by: Naveen N. Rao --- arch/powerpc/include/asm/ppc-opcode.h | 139 + arch/powerpc/net/bpf_jit.h| 166 ++- arch/powerpc/net/bpf_jit32.h | 24 +-- arch/powerpc/net/bpf_jit64.h | 12 +- arch/powerpc/net/bpf_jit_comp.c | 132 ++-- arch/powerpc/net/bpf_jit_comp64.c | 278 +- 6 files changed, 378 insertions(+), 373 deletions(-) diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index ca3f0351b878..94b889d89395 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -79,6 +79,16 @@ #define IMM_L(i) ((uintptr_t)(i) & 0x) #define IMM_DS(i) ((uintptr_t)(i) & 0xfffc) +/* + * 16-bit immediate helper macros: HA() is for use with sign-extending instrs + * (e.g. LD, ADDI). If the bottom 16 bits is "-ve", add another bit into the + * top half to negate the effect (i.e. 0x + 1 = 0x(1)). 
+ */ +#define IMM_H(i)((uintptr_t)(i)>>16) +#define IMM_HA(i) (((uintptr_t)(i)>>16) + \ + (((uintptr_t)(i) & 0x8000) >> 15)) + + /* opcode and xopcode for instructions */ #define OP_TRAP 3 #define OP_TRAP_64 2 @@ -640,6 +650,135 @@ #define PPC_RAW_ADDC_DOT(t, a, b) (PPC_INST_ADDC | ___PPC_RT(t) | \ ___PPC_RA(a) | ___PPC_RB(b) | \ 0x1) +#define PPC_RAW_NOP() (PPC_INST_NOP) +#define PPC_RAW_BLR() (PPC_INST_BLR) +#define PPC_RAW_BLRL() (PPC_INST_BLRL) +#define PPC_RAW_MTLR(r)(PPC_INST_MTLR | ___PPC_RT(r)) +#define PPC_RAW_BCTR() (PPC_INST_BCTR) +#define PPC_RAW_MTCTR(r) (PPC_INST_MTCTR | ___PPC_RT(r)) +#define PPC_RAW_ADDI(d, a, i) (PPC_INST_ADDI | ___PPC_RT(d) | \ + ___PPC_RA(a) | IMM_L(i)) +#define PPC_RAW_LI(r, i) PPC_RAW_ADDI(r, 0, i) +#define PPC_RAW_ADDIS(d, a, i) (PPC_INST_ADDIS | \ + ___PPC_RT(d) | ___PPC_RA(a) | \ + IMM_L(i)) +#define PPC_RAW_LIS(r, i) PPC_RAW_ADDIS(r, 0, i) +#define PPC_RAW_STDX(r, base, b) (PPC_INST_STDX | ___PPC_RS(r) | \ + ___PPC_RA(base) | ___PPC_RB(b)) +#define PPC_RAW_STDU(r, base, i) (PPC_INST_STDU | ___PPC_RS(r) | \ + ___PPC_RA(base) | \ + ((i) & 0xfffc)) +#define PPC_RAW_STW(r, base, i)(PPC_INST_STW | ___PPC_RS(r) | \ + ___PPC_RA(base) | IMM_L(i)) +#define PPC_RAW_STWU(r, base, i) (PPC_INST_STWU | ___PPC_RS(r) | \ + ___PPC_RA(base) | IMM_L(i)) +#define PPC_RAW_STH(r, base, i)(PPC_INST_STH | ___PPC_RS(r) | \ + ___PPC_RA(base) | IMM_L(i)) +#define PPC_RAW_STB(r, base, i)(PPC_INST_STB | ___PPC_RS(r) | \ + ___PPC_RA(base) | IMM_L(i)) +#define PPC_RAW_LBZ(r, base, i)(PPC_INST_LBZ | ___PPC_RT(r) | \ + ___PPC_RA(base) | IMM_L(i)) +#define PPC_RAW_LDX(r, base, b)(PPC_INST_LDX | ___PPC_RT(r) | \ + ___PPC_RA(base) | ___PPC_RB(b)) +#define PPC_RAW_LHZ(r, base, i)(PPC_INST_LHZ | ___PPC_RT(r) | \ + ___PPC_RA(base) | IMM_L(i)) +#define PPC_RAW_LHBRX(r, base, b) (PPC_INST_LHBRX | ___PPC_RT(r) | \ + ___PPC_RA(base) | ___PPC_RB(b)) +#define PPC_RAW_LDBRX(r, base, b) (PPC_INST_LDBRX | ___PPC_RT(r) | \ + ___PPC_RA(base) | ___PPC_RB(b)) +#define 
PPC_RAW_STWCX(s, a, b) (PPC_INST_STWCX | ___PPC_RS(s) | \ + ___PPC_RA(a) | ___PPC_RB(b)) +#define PPC_RAW_CMPWI(a, i)(PPC_INST_CMPWI | ___PPC_RA(a) | IMM_L(i)) +#define PPC_RAW_CMPDI(a, i)(PPC_INST_CMPDI | ___PPC_RA(a) | IMM_L(i)) +#define PPC_RAW_CMPW(a, b) (PPC_INST_CMPW | ___PPC_RA(a) | \ + ___PPC_RB(b)) +#define PPC_RAW_CMPD(a, b) (PPC_INST_CMPD | ___PPC_RA(a) | \ + ___PPC_RB(b)) +
[PATCH 3/6] powerpc/bpf_jit: reuse instruction macros from ppc-opcode.h
remove duplicate macro definitions from bpf_jit.h and reuse the macros from ppc-opcode.h Signed-off-by: Balamuruhan S Acked-by: Naveen N. Rao Tested-by: Naveen N. Rao --- arch/powerpc/net/bpf_jit.h| 18 +- arch/powerpc/net/bpf_jit32.h | 10 +- arch/powerpc/net/bpf_jit64.h | 4 ++-- arch/powerpc/net/bpf_jit_comp.c | 2 +- arch/powerpc/net/bpf_jit_comp64.c | 20 ++-- 5 files changed, 19 insertions(+), 35 deletions(-) diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h index 55d4377ccfae..535d1de4dfee 100644 --- a/arch/powerpc/net/bpf_jit.h +++ b/arch/powerpc/net/bpf_jit.h @@ -11,6 +11,7 @@ #ifndef __ASSEMBLY__ #include +#include #ifdef PPC64_ELF_ABI_v1 #define FUNCTION_DESCR_SIZE24 @@ -26,7 +27,6 @@ #define IMM_H(i) ((uintptr_t)(i)>>16) #define IMM_HA(i) (((uintptr_t)(i)>>16) + \ (((uintptr_t)(i) & 0x8000) >> 15)) -#define IMM_L(i) ((uintptr_t)(i) & 0x) #define PLANT_INSTR(d, idx, instr) \ do { if (d) { (d)[idx] = instr; } idx++; } while (0) @@ -45,8 +45,6 @@ #define PPC_ADDIS(d, a, i) EMIT(PPC_INST_ADDIS | \ ___PPC_RT(d) | ___PPC_RA(a) | IMM_L(i)) #define PPC_LIS(r, i) PPC_ADDIS(r, 0, i) -#define PPC_STD(r, base, i)EMIT(PPC_INST_STD | ___PPC_RS(r) |\ -___PPC_RA(base) | ((i) & 0xfffc)) #define PPC_STDX(r, base, b) EMIT(PPC_INST_STDX | ___PPC_RS(r) | \ ___PPC_RA(base) | ___PPC_RB(b)) #define PPC_STDU(r, base, i) EMIT(PPC_INST_STDU | ___PPC_RS(r) | \ @@ -62,12 +60,8 @@ #define PPC_LBZ(r, base, i)EMIT(PPC_INST_LBZ | ___PPC_RT(r) |\ ___PPC_RA(base) | IMM_L(i)) -#define PPC_LD(r, base, i) EMIT(PPC_INST_LD | ___PPC_RT(r) | \ -___PPC_RA(base) | ((i) & 0xfffc)) #define PPC_LDX(r, base, b)EMIT(PPC_INST_LDX | ___PPC_RT(r) |\ ___PPC_RA(base) | ___PPC_RB(b)) -#define PPC_LWZ(r, base, i)EMIT(PPC_INST_LWZ | ___PPC_RT(r) |\ -___PPC_RA(base) | IMM_L(i)) #define PPC_LHZ(r, base, i)EMIT(PPC_INST_LHZ | ___PPC_RT(r) |\ ___PPC_RA(base) | IMM_L(i)) #define PPC_LHBRX(r, base, b) EMIT(PPC_INST_LHBRX | ___PPC_RT(r) | \ @@ -75,16 +69,8 @@ #define PPC_LDBRX(r, base, b) 
EMIT(PPC_INST_LDBRX | ___PPC_RT(r) | \ ___PPC_RA(base) | ___PPC_RB(b)) -#define PPC_BPF_LDARX(t, a, b, eh) EMIT(PPC_INST_LDARX | ___PPC_RT(t) | \ - ___PPC_RA(a) | ___PPC_RB(b) | \ - __PPC_EH(eh)) -#define PPC_BPF_LWARX(t, a, b, eh) EMIT(PPC_INST_LWARX | ___PPC_RT(t) | \ - ___PPC_RA(a) | ___PPC_RB(b) | \ - __PPC_EH(eh)) #define PPC_BPF_STWCX(s, a, b) EMIT(PPC_INST_STWCX | ___PPC_RS(s) | \ ___PPC_RA(a) | ___PPC_RB(b)) -#define PPC_BPF_STDCX(s, a, b) EMIT(PPC_INST_STDCX | ___PPC_RS(s) | \ - ___PPC_RA(a) | ___PPC_RB(b)) #define PPC_CMPWI(a, i)EMIT(PPC_INST_CMPWI | ___PPC_RA(a) | IMM_L(i)) #define PPC_CMPDI(a, i)EMIT(PPC_INST_CMPDI | ___PPC_RA(a) | IMM_L(i)) #define PPC_CMPW(a, b) EMIT(PPC_INST_CMPW | ___PPC_RA(a) | \ @@ -100,8 +86,6 @@ #define PPC_SUB(d, a, b) EMIT(PPC_INST_SUB | ___PPC_RT(d) |\ ___PPC_RB(a) | ___PPC_RA(b)) -#define PPC_ADD(d, a, b) EMIT(PPC_INST_ADD | ___PPC_RT(d) |\ -___PPC_RA(a) | ___PPC_RB(b)) #define PPC_MULD(d, a, b) EMIT(PPC_INST_MULLD | ___PPC_RT(d) | \ ___PPC_RA(a) | ___PPC_RB(b)) #define PPC_MULW(d, a, b) EMIT(PPC_INST_MULLW | ___PPC_RT(d) | \ diff --git a/arch/powerpc/net/bpf_jit32.h b/arch/powerpc/net/bpf_jit32.h index 4ec2a9f14f84..753c244a7cf9 100644 --- a/arch/powerpc/net/bpf_jit32.h +++ b/arch/powerpc/net/bpf_jit32.h @@ -76,13 +76,13 @@ DECLARE_LOAD_FUNC(sk_load_byte_msh); else { PPC_ADDIS(r, base, IMM_HA(i));\ PPC_LBZ(r, r, IMM_L(i)); } } while(0) -#define PPC_LD_OFFS(r, base, i) do { if ((i) < 32768) PPC_LD(r, base, i); \ +#define PPC_LD_OFFS(r, base, i) do { if ((i) < 32768) EMIT(PPC_RAW_LD(r, base, i)); \ else { PPC_ADDIS(r, base, IMM_HA(i));
[PATCH 2/6] powerpc/ppc-opcode: move ppc instruction encoding from test_emulate_step
Few ppc instructions are encoded in test_emulate_step.c, consolidate them and use it from ppc-opcode.h Signed-off-by: Balamuruhan S Acked-by: Naveen N. Rao Tested-by: Naveen N. Rao --- arch/powerpc/include/asm/ppc-opcode.h | 35 ++ arch/powerpc/lib/test_emulate_step.c | 155 ++ 2 files changed, 91 insertions(+), 99 deletions(-) diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index e3540be1fc17..ca3f0351b878 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -76,6 +76,9 @@ #define__REGA0_R30 30 #define__REGA0_R31 31 +#define IMM_L(i) ((uintptr_t)(i) & 0x) +#define IMM_DS(i) ((uintptr_t)(i) & 0xfffc) + /* opcode and xopcode for instructions */ #define OP_TRAP 3 #define OP_TRAP_64 2 @@ -605,6 +608,38 @@ ___PPC_RT(vrt) | \ ___PPC_RA(vra) | \ ___PPC_RB(vrb) | __PPC_RC21) +#define PPC_RAW_LD(r, base, i) (PPC_INST_LD | ___PPC_RT(r) | \ + ___PPC_RA(base) | IMM_DS(i)) +#define PPC_RAW_LWZ(r, base, i)(PPC_INST_LWZ | ___PPC_RT(r) | \ + ___PPC_RA(base) | IMM_L(i)) +#define PPC_RAW_LWZX(t, a, b) (PPC_INST_LWZX | ___PPC_RT(t) | \ + ___PPC_RA(a) | ___PPC_RB(b)) +#define PPC_RAW_STD(r, base, i)(PPC_INST_STD | ___PPC_RS(r) | \ + ___PPC_RA(base) | IMM_DS(i)) +#define PPC_RAW_STDCX(s, a, b) (PPC_INST_STDCX | ___PPC_RS(s) | \ + ___PPC_RA(a) | ___PPC_RB(b)) +#define PPC_RAW_LFSX(t, a, b) (PPC_INST_LFSX | ___PPC_RT(t) | \ + ___PPC_RA(a) | ___PPC_RB(b)) +#define PPC_RAW_STFSX(s, a, b) (PPC_INST_STFSX | ___PPC_RS(s) | \ + ___PPC_RA(a) | ___PPC_RB(b)) +#define PPC_RAW_LFDX(t, a, b) (PPC_INST_LFDX | ___PPC_RT(t) | \ + ___PPC_RA(a) | ___PPC_RB(b)) +#define PPC_RAW_STFDX(s, a, b) (PPC_INST_STFDX | ___PPC_RS(s) | \ + ___PPC_RA(a) | ___PPC_RB(b)) +#define PPC_RAW_LVX(t, a, b) (PPC_INST_LVX | ___PPC_RT(t) | \ + ___PPC_RA(a) | ___PPC_RB(b)) +#define PPC_RAW_STVX(s, a, b) (PPC_INST_STVX | ___PPC_RS(s) | \ + ___PPC_RA(a) | ___PPC_RB(b)) +#define PPC_RAW_ADD(t, a, b) (PPC_INST_ADD | ___PPC_RT(t) | \ + 
___PPC_RA(a) | ___PPC_RB(b)) +#define PPC_RAW_ADD_DOT(t, a, b) (PPC_INST_ADD | ___PPC_RT(t) | \ + ___PPC_RA(a) | ___PPC_RB(b) | \ + 0x1) +#define PPC_RAW_ADDC(t, a, b) (PPC_INST_ADDC | ___PPC_RT(t) | \ + ___PPC_RA(a) | ___PPC_RB(b)) +#define PPC_RAW_ADDC_DOT(t, a, b) (PPC_INST_ADDC | ___PPC_RT(t) | \ + ___PPC_RA(a) | ___PPC_RB(b) | \ + 0x1) /* Deal with instructions that older assemblers aren't aware of */ #definePPC_CP_ABORTstringify_in_c(.long PPC_INST_CP_ABORT) diff --git a/arch/powerpc/lib/test_emulate_step.c b/arch/powerpc/lib/test_emulate_step.c index 46af80279ebc..e508290eb15d 100644 --- a/arch/powerpc/lib/test_emulate_step.c +++ b/arch/powerpc/lib/test_emulate_step.c @@ -13,49 +13,6 @@ #include #include -#define IMM_L(i) ((uintptr_t)(i) & 0x) -#define IMM_DS(i) ((uintptr_t)(i) & 0xfffc) - -/* - * Defined with TEST_ prefix so it does not conflict with other - * definitions. - */ -#define TEST_LD(r, base, i)ppc_inst(PPC_INST_LD | ___PPC_RT(r) | \ - ___PPC_RA(base) | IMM_DS(i)) -#define TEST_LWZ(r, base, i) ppc_inst(PPC_INST_LWZ | ___PPC_RT(r) | \ - ___PPC_RA(base) | IMM_L(i)) -#define TEST_LWZX(t, a, b) ppc_inst(PPC_INST_LWZX | ___PPC_RT(t) | \ - ___PPC_RA(a) | ___PPC_RB(b)) -#define TEST_STD(r, base, i) ppc_inst(PPC_INST_STD | ___PPC_RS(r) | \ - ___PPC_RA(base) | IMM_DS(i)) -#define TEST_LDARX(t, a, b, eh)ppc_inst(PPC_INST_LDARX | ___PPC_RT(t) |\ - ___PPC_RA(a) | ___PPC_RB(b) | \ - __PPC_EH(eh)) -#define TEST_STDCX(s, a, b)ppc_inst(PPC_INST_STDCX |
[PATCH 1/6] powerpc/ppc-opcode: introduce PPC_RAW_* macros for base instruction encoding
Introduce PPC_RAW_* macros to have all the bare encoding of ppc instructions. Move `VSX_XX*()` and `TMRN()` macros up to reuse it. Signed-off-by: Balamuruhan S Acked-by: Naveen N. Rao Tested-by: Naveen N. Rao --- arch/powerpc/include/asm/ppc-opcode.h | 183 -- 1 file changed, 175 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index 2a39c716c343..e3540be1fc17 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -431,6 +431,181 @@ #define __PPC_EH(eh) 0 #endif +/* Base instruction encoding */ +#define PPC_RAW_CP_ABORT (PPC_INST_CP_ABORT) +#define PPC_RAW_COPY(a, b) (PPC_INST_COPY | ___PPC_RA(a) | \ + ___PPC_RB(b)) +#define PPC_RAW_DARN(t, l) (PPC_INST_DARN | ___PPC_RT(t) | \ + (((l) & 0x3) << 16)) +#define PPC_RAW_DCBAL(a, b)(PPC_INST_DCBAL | __PPC_RA(a) | \ + __PPC_RB(b)) +#define PPC_RAW_DCBZL(a, b)(PPC_INST_DCBZL | __PPC_RA(a) | \ + __PPC_RB(b)) +#define PPC_RAW_LQARX(t, a, b, eh) (PPC_INST_LQARX | ___PPC_RT(t) | \ + ___PPC_RA(a) | \ + ___PPC_RB(b) | __PPC_EH(eh)) +#define PPC_RAW_LDARX(t, a, b, eh) (PPC_INST_LDARX | ___PPC_RT(t) | \ + ___PPC_RA(a) | \ + ___PPC_RB(b) | __PPC_EH(eh)) +#define PPC_RAW_LWARX(t, a, b, eh) (PPC_INST_LWARX | \ + ___PPC_RT(t) | ___PPC_RA(a) | \ + ___PPC_RB(b) | __PPC_EH(eh)) +#define PPC_RAW_STQCX(t, a, b) (PPC_INST_STQCX | \ + ___PPC_RT(t) | ___PPC_RA(a) | \ + ___PPC_RB(b)) +#define PPC_RAW_MADDHD(t, a, b, c) (PPC_INST_MADDHD | \ + ___PPC_RT(t) | ___PPC_RA(a) | \ + ___PPC_RB(b) | ___PPC_RC(c)) +#define PPC_RAW_MADDHDU(t, a, b, c)(PPC_INST_MADDHDU | \ + ___PPC_RT(t) | ___PPC_RA(a) | \ + ___PPC_RB(b) | ___PPC_RC(c)) +#define PPC_RAW_MADDLD(t, a, b, c) (PPC_INST_MADDLD | \ + ___PPC_RT(t) | ___PPC_RA(a) | \ + ___PPC_RB(b) | ___PPC_RC(c)) +#define PPC_RAW_MSGSND(b) (PPC_INST_MSGSND | ___PPC_RB(b)) +#define PPC_RAW_MSGSYNC(PPC_INST_MSGSYNC) +#define PPC_RAW_MSGCLR(b) (PPC_INST_MSGCLR | ___PPC_RB(b)) +#define PPC_RAW_MSGSNDP(b) 
(PPC_INST_MSGSNDP | ___PPC_RB(b)) +#define PPC_RAW_MSGCLRP(b) (PPC_INST_MSGCLRP | ___PPC_RB(b)) +#define PPC_RAW_PASTE(a, b)(PPC_INST_PASTE | ___PPC_RA(a) | \ + ___PPC_RB(b)) +#define PPC_RAW_POPCNTB(a, s) (PPC_INST_POPCNTB | __PPC_RA(a) | \ + __PPC_RS(s)) +#define PPC_RAW_POPCNTD(a, s) (PPC_INST_POPCNTD | __PPC_RA(a) | \ + __PPC_RS(s)) +#define PPC_RAW_POPCNTW(a, s) (PPC_INST_POPCNTW | __PPC_RA(a) | \ + __PPC_RS(s)) +#define PPC_RAW_RFCI (PPC_INST_RFCI) +#define PPC_RAW_RFDI (PPC_INST_RFDI) +#define PPC_RAW_RFMCI (PPC_INST_RFMCI) +#define PPC_RAW_TLBILX(t, a, b)(PPC_INST_TLBILX | \ + __PPC_T_TLB(t) | \ + __PPC_RA0(a) | \ + __PPC_RB(b)) +#define PPC_RAW_WAIT(w)(PPC_INST_WAIT | __PPC_WC(w)) +#define PPC_RAW_TLBIE(lp, a) (PPC_INST_TLBIE | ___PPC_RB(a) | \ + ___PPC_RS(lp)) +#define PPC_RAW_TLBIE_5(rb, rs, ric, prs, r) \ + (PPC_INST_TLBIE | \ + ___PPC_RB(rb) | \ + ___PPC_RS(rs) | \ + ___PPC_RIC(ric) | \ + ___PPC_PRS(prs) | \ + ___PPC_R(r)) +#define PPC_RAW_TLBIEL(rb, rs, ric, prs, r) \ + (PPC_INST_TLBIEL | \ + ___PPC_RB(rb) | \ + ___PPC_RS(rs) |
[PATCH 0/6] consolidate PowerPC instruction encoding macros
ppc-opcode.h has base instruction encoding wrapped with stringify_in_c() for raw encoding to have compatibility. But there are redundant macros for base instruction encodings in bpf, instruction emulation test infrastructure and powerpc selftests. Currently PPC_INST_* macros are used for encoding instruction opcode and PPC_* for raw instruction encoding; this rfc patchset introduces PPC_RAW_* macros for base instruction encoding and reuses them elsewhere. With this change we can avoid redundant macro definitions in multiple files and start adding new instructions in ppc-opcode.h in future. Changes in v1: - * Drop the patch that had changes in stringloops Makefile. * Include Acked-by and Tested-by tag from Naveen. * Rebased on next branch of linuxppc tree. Changes in rfc v2: - Fix review comments/suggestions from Naveen and Michael Ellerman, * Rename PPC_ENCODE_* to PPC_RAW_* for base instruction encoding macros. * Split the patches that do mass renaming and make them simpler so that they just add new macros. * Keep the patch to update all the existing names later (patch 6). * A lot of PPC_INST_* macros are used only in ppc-opcode.h for PPC_* macros, fold PPC_INST_* encoding into PPC_RAW_* to avoid using them accidentally. 
* Fixed clipped macros that was due to a typo/copy-paste * Consolidated all the instruction encoding macros from bpf_jit.h to ppc-opcode.h * squashed patch that removes the duplicate macro PPC_MR() in bpf_jit.h * merge few changes in bpf_jit files from patch 2 into patch 3 * few fixes in powerpc selftest stringloops Makefile * build tested for ppc64le_defconfig, ppc64e_defconfig and pmac32_defconfig * Rebased on next branch of linuxppc tree Testing: --- * Tested it by compiling vmlinux and comparing objdump of it with and without the patchset and observed that it remains same, # diff vmlinux_objdump vmlinux_rfc_objdump 2c2 < vmlinux: file format elf64-powerpcle --- > vmlinux_rfc: file format elf64-powerpcle * Tested building it with this changes for Fedora30 config, booted VM with powerpc next and powerpc next + patchset to run powerpc selftest and ftrace selftest. There were couple of failures that were common and patchset did not introduce any new failures. ftrace selftest: --- # # of passed: 96 # # of failed: 1 # # of unresolved: 7 # # of untested: 0 # # of unsupported: 1 # # of xfailed: 1 # # of undefined(test bug): 0 not ok 1 selftests: ftrace: ftracetest # exit=1 powerpc selftest: not ok 7 selftests: powerpc/dscr: dscr_sysfs_thread_test # exit=1 not ok 20 selftests: powerpc/pmu/ebb: lost_exception_test # TIMEOUT not ok 2 selftests: powerpc/security: spectre_v2 # exit=1 Thanks to Naveen, Sandipan and Michael on overall suggestions/improvements. I would request for review and suggestions to make it better. 
rfc v2: https://lists.ozlabs.org/pipermail/linuxppc-dev/2020-April/209395.html rfc v1: https://lists.ozlabs.org/pipermail/linuxppc-dev/2020-March/206494.html Balamuruhan S (6): powerpc/ppc-opcode: introduce PPC_RAW_* macros for base instruction encoding powerpc/ppc-opcode: move ppc instruction encoding from test_emulate_step powerpc/bpf_jit: reuse instruction macros from ppc-opcode.h powerpc/ppc-opcode: consolidate powerpc instructions from bpf_jit.h powerpc/ppc-opcode: reuse raw instruction macros to stringify powerpc/ppc-opcode: fold PPC_INST_* macros into PPC_RAW_* macros arch/powerpc/include/asm/ppc-opcode.h | 706 +++--- arch/powerpc/lib/test_emulate_step.c | 155 ++ arch/powerpc/net/bpf_jit.h| 184 +-- arch/powerpc/net/bpf_jit32.h | 34 +- arch/powerpc/net/bpf_jit64.h | 16 +- arch/powerpc/net/bpf_jit_comp.c | 134 ++--- arch/powerpc/net/bpf_jit_comp64.c | 298 +-- 7 files changed, 733 insertions(+), 794 deletions(-) base-commit: 30df74d67d48949da87e3a5b57c381763e8fd526 -- 2.24.1
Re: [linux-next PATCH] mm/gup.c: Convert to use get_user_{page|pages}_fast_only()
On Mon, May 25, 2020 at 02:23:32PM +0530, Souptick Joarder wrote: > API __get_user_pages_fast() renamed to get_user_pages_fast_only() > to align with pin_user_pages_fast_only(). > > As part of this we will get rid of write parameter. > Instead caller will pass FOLL_WRITE to get_user_pages_fast_only(). > This will not change any existing functionality of the API. > > All the callers are changed to pass FOLL_WRITE. > > Also introduce get_user_page_fast_only(), and use it in a few > places that hard-code nr_pages to 1. > > Updated the documentation of the API. > > Signed-off-by: Souptick Joarder The arch/powerpc/kvm bits look reasonable. Reviewed-by: Paul Mackerras
Re: [RFC PATCH v2 7/7] powerpc/selftest: reuse ppc-opcode macros to avoid redundancy
On Thu, 2020-04-30 at 17:27 +0530, Naveen N. Rao wrote: > Michael Ellerman wrote: > > "Naveen N. Rao" writes: > > > Michael Ellerman wrote: > > > > Balamuruhan S writes: > > > > > Avoid redefining macros to encode ppc instructions instead reuse it > > > > > from > > > > > ppc-opcode.h, Makefile changes are necessary to compile memcmp_64.S > > > > > with > > > > > __ASSEMBLY__ defined from selftests. > > > > > > > > > > Signed-off-by: Balamuruhan S > > > > > --- > > > > > .../selftests/powerpc/stringloops/Makefile| 34 ++--- > > > > > - > > > > > .../powerpc/stringloops/asm/asm-const.h | 1 + > > > > > .../powerpc/stringloops/asm/ppc-opcode.h | 36 +-- > > > > > > > > > > 3 files changed, 29 insertions(+), 42 deletions(-) > > > > > create mode 12 > > > > > tools/testing/selftests/powerpc/stringloops/asm/asm-const.h > > > > > mode change 100644 => 12 > > > > > tools/testing/selftests/powerpc/stringloops/asm/ppc-opcode.h > > > > > > > > > > diff --git a/tools/testing/selftests/powerpc/stringloops/Makefile > > > > > b/tools/testing/selftests/powerpc/stringloops/Makefile > > > > > index 7fc0623d85c3..efe76c5a5b94 100644 > > > > > --- a/tools/testing/selftests/powerpc/stringloops/Makefile > > > > > +++ b/tools/testing/selftests/powerpc/stringloops/Makefile > > > > > @@ -1,26 +1,44 @@ > > > > > # SPDX-License-Identifier: GPL-2.0 > > > > > # The loops are all 64-bit code > > > > > -CFLAGS += -I$(CURDIR) > > > > > +GIT_VERSION = $(shell git describe --always --long --dirty || echo > > > > > "unknown") > > > > > +CFLAGS += -DGIT_VERSION='"$(GIT_VERSION)"' -I$(CURDIR) > > > > > -I$(CURDIR)/../include > > > > > > > > > > EXTRA_SOURCES := ../harness.c > > > > > > > > > > build_32bit = $(shell if ($(CC) $(CFLAGS) -m32 -o /dev/null memcmp.c > > > > > >/dev/null 2>&1) then echo "1"; fi) > > > > > > > > > > +ifneq ($(build_32bit),1) > > > > > TEST_GEN_PROGS := memcmp_64 strlen > > > > > +TEST_GEN_FILES := memcmp.o memcmp_64.o memcmp_64 > > > > > +MEMCMP := $(OUTPUT)/memcmp.o > > > > > 
+MEMCMP_64 := $(OUTPUT)/memcmp_64.o > > > > > +HARNESS := $(OUTPUT)/../harness.o > > > > > +CFLAGS += -m64 -maltivec > > > > > > > > > > -$(OUTPUT)/memcmp_64: memcmp.c > > > > > -$(OUTPUT)/memcmp_64: CFLAGS += -m64 -maltivec > > > > > +OVERRIDE_TARGETS := 1 > > > > > +include ../../lib.mk > > > > > > > > > > -ifeq ($(build_32bit),1) > > > > > +$(OUTPUT)/memcmp_64: $(MEMCMP_64) $(MEMCMP) $(HARNESS) > > > > > + $(CC) $(CFLAGS) memcmp.o memcmp_64.o ../harness.o -o memcmp_64 > > > > > + > > > > > +$(MEMCMP_64): memcmp_64.S > > > > > + $(CC) $(CFLAGS) -D__ASSEMBLY__ -o memcmp_64.o -c memcmp_64.S > > > > > + > > > > > +$(MEMCMP): memcmp.c > > > > > + $(CC) $(CFLAGS) -o memcmp.o -c memcmp.c > > > > > + > > > > > +$(HARNESS): $(EXTRA_SOURCES) > > > > > + $(CC) $(CFLAGS) -DGIT_VERSION='"$(GIT_VERSION)"' -o > > > > > ../harness.o -c $(EXTRA_SOURCES) > > > > > > > > What are you actually trying to do here? Is it just that you need to > > > > define __ASSEMBLY__ for memcmp_64.S? > > > > > > Adding __ASSEMBLY__ while building memcmp_64.S would be the goal, so as > > > to reuse ppc-opcode.h. However, asm/ppc-opcode.h under stringloops test > > > is tiny and doesn't seem to justify the change. Okay, I will drop the last patch that have changes for stringloops Makefile. make and make clean is not working from inside stringloops directory which is fixed with this change. > > > > I don't see ppc-opcode.h testing __ASSEMBLY__ though, so I don't think > > we even need to define it? > > Right -- it's rather 'stringify_in_c' which tests it. 'asm/ppc-opcode.h' > under stringloops/ unconditionally defines 'stringify_in_c' this way: > # define stringify_in_c(...) __VA_ARGS__ > It is expecting __ASSEMBLY__ through ppc-opcode.h -> asm-const.h to raw encode the instruction in assembly file instead to stringify it for c file. 
We observe these assembler messages without defining __ASSEMBLY__: memcmp_64.S: Assembler messages: memcmp_64.S:473: Error: unknown pseudo-op: `.long (0x10c7 | (((0) & 0x1f) << 21) | (((0) & 0x1f) << 16) | (((1) & 0x1f) << 11) | (0x1 << 10))' memcmp_64.S:477: Error: unknown pseudo-op: `.long (0x10c7 | (((0) & 0x1f) << 21) | (((0) & 0x1f) << 16) | (((1) & 0x1f) << 11) | (0x1 << 10))' memcmp_64.S:586: Error: unknown pseudo-op: `.long (0x1006 | (((7) & 0x1f) << 21) | (((9) & 0x1f) << 16) | (((10) & 0x1f) << 11) | (0x1 << 10))' memcmp_64.S:607: Error: unknown pseudo-op: `.long (0x1006 | (((7) & 0x1f) << 21) | (((9) & 0x1f) << 16) | (((10) & 0x1f) << 11) | (0x1 << 10))' memcmp_64.S:616: Error: unknown pseudo-op: `.long (0x1006 | (((7) & 0x1f) << 21) | (((9) & 0x1f) << 16) | (((10) & 0x1f) << 11) | (0x1 << 10))' make[1]: *** [../../lib.mk:148: /home/bala/linux/tools/testing/selftests/powerpc/stringloops/memcmp_64] Error 1 -- Bala > > - Naveen >
Re: [PATCH V3 2/2] tools/perf: Add perf tools support for extended register capability in powerpc
On 5/20/20 3:15 PM, Athira Rajeev wrote: From: Anju T Sudhakar Add extended regs to sample_reg_mask in the tool side to use with `-I?` option. Perf tools side uses extended mask to display the platform supported register names (with -I? option) to the user and also send this mask to the kernel to capture the extended registers in each sample. Hence decide the mask value based on the processor version. Signed-off-by: Anju T Sudhakar [Decide extended mask at run time based on platform] Signed-off-by: Athira Rajeev Reviewed-by: Madhavan Srinivasan --- tools/arch/powerpc/include/uapi/asm/perf_regs.h | 14 ++- tools/perf/arch/powerpc/include/perf_regs.h | 5 ++- tools/perf/arch/powerpc/util/perf_regs.c| 55 + 3 files changed, 72 insertions(+), 2 deletions(-) diff --git a/tools/arch/powerpc/include/uapi/asm/perf_regs.h b/tools/arch/powerpc/include/uapi/asm/perf_regs.h index f599064..485b1d5 100644 --- a/tools/arch/powerpc/include/uapi/asm/perf_regs.h +++ b/tools/arch/powerpc/include/uapi/asm/perf_regs.h @@ -48,6 +48,18 @@ enum perf_event_powerpc_regs { PERF_REG_POWERPC_DSISR, PERF_REG_POWERPC_SIER, PERF_REG_POWERPC_MMCRA, - PERF_REG_POWERPC_MAX, + /* Extended registers */ + PERF_REG_POWERPC_MMCR0, + PERF_REG_POWERPC_MMCR1, + PERF_REG_POWERPC_MMCR2, + /* Max regs without the extended regs */ + PERF_REG_POWERPC_MAX = PERF_REG_POWERPC_MMCRA + 1, }; + +#define PERF_REG_PMU_MASK ((1ULL << PERF_REG_POWERPC_MAX) - 1) + +/* PERF_REG_EXTENDED_MASK value for CPU_FTR_ARCH_300 */ +#define PERF_REG_PMU_MASK_300 (((1ULL << (PERF_REG_POWERPC_MMCR2 + 1)) - 1) \ + - PERF_REG_PMU_MASK) + #endif /* _UAPI_ASM_POWERPC_PERF_REGS_H */ diff --git a/tools/perf/arch/powerpc/include/perf_regs.h b/tools/perf/arch/powerpc/include/perf_regs.h index e18a355..46ed00d 100644 --- a/tools/perf/arch/powerpc/include/perf_regs.h +++ b/tools/perf/arch/powerpc/include/perf_regs.h @@ -64,7 +64,10 @@ [PERF_REG_POWERPC_DAR] = "dar", [PERF_REG_POWERPC_DSISR] = "dsisr", [PERF_REG_POWERPC_SIER] = "sier", - 
[PERF_REG_POWERPC_MMCRA] = "mmcra" + [PERF_REG_POWERPC_MMCRA] = "mmcra", + [PERF_REG_POWERPC_MMCR0] = "mmcr0", + [PERF_REG_POWERPC_MMCR1] = "mmcr1", + [PERF_REG_POWERPC_MMCR2] = "mmcr2", }; static inline const char *perf_reg_name(int id) diff --git a/tools/perf/arch/powerpc/util/perf_regs.c b/tools/perf/arch/powerpc/util/perf_regs.c index 0a52429..9179230 100644 --- a/tools/perf/arch/powerpc/util/perf_regs.c +++ b/tools/perf/arch/powerpc/util/perf_regs.c @@ -6,9 +6,14 @@ #include "../../../util/perf_regs.h" #include "../../../util/debug.h" +#include "../../../util/event.h" +#include "../../../util/header.h" +#include "../../../perf-sys.h" #include +#define PVR_POWER9 0x004E + const struct sample_reg sample_reg_masks[] = { SMPL_REG(r0, PERF_REG_POWERPC_R0), SMPL_REG(r1, PERF_REG_POWERPC_R1), @@ -55,6 +60,9 @@ SMPL_REG(dsisr, PERF_REG_POWERPC_DSISR), SMPL_REG(sier, PERF_REG_POWERPC_SIER), SMPL_REG(mmcra, PERF_REG_POWERPC_MMCRA), + SMPL_REG(mmcr0, PERF_REG_POWERPC_MMCR0), + SMPL_REG(mmcr1, PERF_REG_POWERPC_MMCR1), + SMPL_REG(mmcr2, PERF_REG_POWERPC_MMCR2), SMPL_REG_END }; @@ -163,3 +171,50 @@ int arch_sdt_arg_parse_op(char *old_op, char **new_op) return SDT_ARG_VALID; } + +uint64_t arch__intr_reg_mask(void) +{ + struct perf_event_attr attr = { + .type = PERF_TYPE_HARDWARE, + .config = PERF_COUNT_HW_CPU_CYCLES, + .sample_type= PERF_SAMPLE_REGS_INTR, + .precise_ip = 1, + .disabled = 1, + .exclude_kernel = 1, + }; + int fd, ret; + char buffer[64]; + u32 version; + u64 extended_mask = 0; + + /* Get the PVR value to set the extended +* mask specific to platform +*/ + get_cpuid(buffer, sizeof(buffer)); + ret = sscanf(buffer, "%u,", &version); + + if (ret != 1) { + pr_debug("Failed to get the processor version, unable to output extended registers\n"); + return PERF_REGS_MASK; + } + + if (version == PVR_POWER9) + extended_mask = PERF_REG_PMU_MASK_300; + else + return PERF_REGS_MASK; + + attr.sample_regs_intr = extended_mask; + attr.sample_period = 1; + event_attr_init(&attr); 
+ + /* +* check if the pmu supports perf extended regs, before +* returning the register mask to sample. +*/ + fd = sys_perf_event_open(&attr, 0, -1, -1, 0); + if (fd != -1) { + close(fd); + return (extended_mask | PERF_REGS_MASK); + } + return PERF_REGS_MASK
Re: [PATCH V3 1/2] powerpc/perf: Add support for outputting extended regs in perf intr_regs
On 5/20/20 3:15 PM, Athira Rajeev wrote: From: Anju T Sudhakar Add support for perf extended register capability in powerpc. The capability flag PERF_PMU_CAP_EXTENDED_REGS is used to indicate that the PMU supports extended registers. The generic code defines the mask of extended registers as 0 for unsupported architectures. Patch adds extended regs support for power9 platform by exposing MMCR0, MMCR1 and MMCR2 registers. REG_RESERVED mask needs update to include extended regs. `PERF_REG_EXTENDED_MASK`, which contains the mask value of the supported registers, is defined at runtime in the kernel based on the platform, since the supported registers (and hence the mask value) may differ from one processor version to another. with patch -- available registers: r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 r13 r14 r15 r16 r17 r18 r19 r20 r21 r22 r23 r24 r25 r26 r27 r28 r29 r30 r31 nip msr orig_r3 ctr link xer ccr softe trap dar dsisr sier mmcra mmcr0 mmcr1 mmcr2 PERF_RECORD_SAMPLE(IP, 0x1): 4784/4784: 0 period: 1 addr: 0 ... intr regs: mask 0x ABI 64-bit r00xc012b77c r10xc03fe5e03930 r20xc1b0e000 r30xc03fdcddf800 r40xc03fc788 r50x9c422724be r60xc03fe5e03908 r70xff63bddc8706 r80x9e4 r90x0 r10 0x1 r11 0x0 r12 0xc01299c0 r13 0xc03c4800 r14 0x0 r15 0x7fffdd8b8b00 r16 0x0 r17 0x7fffdd8be6b8 r18 0x7e7076607730 r19 0x2f r20 0xc0001fc26c68 r21 0xc0002041e4227e00 r22 0xc0002018fb60 r23 0x1 r24 0xc03ffec4d900 r25 0x8000 r26 0x0 r27 0x1 r28 0x1 r29 0xc1be1260 r30 0x6008010 r31 0xc03ffebb7218 nip 0xc012b910 msr 0x90009033 orig_r3 0xc012b86c ctr 0xc01299c0 link 0xc012b77c xer 0x0 ccr 0x2800 softe 0x1 trap 0xf00 dar 0x0 dsisr 0x800 sier 0x0 mmcra 0x800 mmcr0 0x82008090 mmcr1 0x1e00 mmcr2 0x0 ... thread: perf:4784 Signed-off-by: Anju T Sudhakar [Defined PERF_REG_EXTENDED_MASK at run time to add support for different platforms ] Signed-off-by: Athira Rajeev Patch looks fine except for a couple of minor nits (extra tabs and newline issue). 
Reviewed-by: Madhavan Srinivasan --- arch/powerpc/include/asm/perf_event_server.h | 8 +++ arch/powerpc/include/uapi/asm/perf_regs.h| 14 +++- arch/powerpc/perf/core-book3s.c | 1 + arch/powerpc/perf/perf_regs.c| 34 +--- arch/powerpc/perf/power9-pmu.c | 6 + 5 files changed, 59 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/include/asm/perf_event_server.h b/arch/powerpc/include/asm/perf_event_server.h index 3e9703f..1458e1a 100644 --- a/arch/powerpc/include/asm/perf_event_server.h +++ b/arch/powerpc/include/asm/perf_event_server.h @@ -15,6 +15,9 @@ #define MAX_EVENT_ALTERNATIVES8 #define MAX_LIMITED_HWCOUNTERS2 +extern u64 mask_var; +#define PERF_REG_EXTENDED_MASK mask_var + struct perf_event; /* @@ -55,6 +58,11 @@ struct power_pmu { int *blacklist_ev; /* BHRB entries in the PMU */ int bhrb_nr; + /* +* set this flag with `PERF_PMU_CAP_EXTENDED_REGS` if +* the pmu supports extended perf regs capability +*/ + int capabilities; }; /* diff --git a/arch/powerpc/include/uapi/asm/perf_regs.h b/arch/powerpc/include/uapi/asm/perf_regs.h index f599064..485b1d5 100644 --- a/arch/powerpc/include/uapi/asm/perf_regs.h +++ b/arch/powerpc/include/uapi/asm/perf_regs.h @@ -48,6 +48,18 @@ enum perf_event_powerpc_regs { PERF_REG_POWERPC_DSISR, PERF_REG_POWERPC_SIER, PERF_REG_POWERPC_MMCRA, - PERF_REG_POWERPC_MAX, + /* Extended registers */ + PERF_REG_POWERPC_MMCR0, + PERF_REG_POWERPC_MMCR1, + PERF_REG_POWERPC_MMCR2, + /* Max regs without the extended regs */ + PERF_REG_POWERPC_MAX = PERF_REG_POWERPC_MMCRA + 1, }; + +#define PERF_REG_PMU_MASK ((1ULL << PERF_REG_POWERPC_MAX) - 1) + +/* PERF_REG_EXTENDED_MASK value for CPU_FTR_ARCH_300 */ +#define PERF_REG_PMU_MASK_300 (((1ULL << (PERF_REG_POWERPC_MMCR2 + 1)) - 1) \ + - PERF_REG_PMU_MASK) + #endif /* _UAPI_ASM_POWERPC_PERF_REGS_H */ diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 3dcfecf..f56b778 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -2276,6 +2276,7 
@@ int register_power_pmu(struct power_pmu *pmu) power_pmu.attr_groups = ppmu->attr_groups; + power_pmu.capabilities |= (ppmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS); Co
[PATCH v2] powerpc: Add ppc_inst_as_u64()
The code patching code wants to get the value of a struct ppc_inst as a u64 when the instruction is prefixed, so we can pass the u64 down to __put_user_asm() and write it with a single store. The optprobes code wants to load a struct ppc_inst as an immediate into a register so it is useful to have it as a u64 to use the existing helper function. Currently this is a bit awkward because the value differs based on the CPU endianness, so add a helper to do the conversion. This fixes the usage in arch_prepare_optimized_kprobe() which was previously incorrect on big endian. Fixes: 650b55b707fd ("powerpc: Add prefixed instructions to instruction data type") Signed-off-by: Michael Ellerman Tested-by: Jordan Niethe Link: https://lore.kernel.org/r/20200525055004.2182328-1-...@ellerman.id.au --- arch/powerpc/include/asm/inst.h | 9 + arch/powerpc/kernel/optprobes.c | 3 +-- arch/powerpc/lib/code-patching.c | 8 +--- 3 files changed, 11 insertions(+), 9 deletions(-) v2: Update the commit message as noted by Jordan. Add a Fixes tag. diff --git a/arch/powerpc/include/asm/inst.h b/arch/powerpc/include/asm/inst.h index 5b756ba77ed2..45f3ec868258 100644 --- a/arch/powerpc/include/asm/inst.h +++ b/arch/powerpc/include/asm/inst.h @@ -113,6 +113,15 @@ static inline struct ppc_inst *ppc_inst_next(void *location, struct ppc_inst *va return location + ppc_inst_len(tmp); } +static inline u64 ppc_inst_as_u64(struct ppc_inst x) +{ +#ifdef CONFIG_CPU_LITTLE_ENDIAN + return (u64)ppc_inst_suffix(x) << 32 | ppc_inst_val(x); +#else + return (u64)ppc_inst_val(x) << 32 | ppc_inst_suffix(x); +#endif +} + int probe_user_read_inst(struct ppc_inst *inst, struct ppc_inst __user *nip); diff --git a/arch/powerpc/kernel/optprobes.c b/arch/powerpc/kernel/optprobes.c index 3ac105e7faae..69bfe96884e2 100644 --- a/arch/powerpc/kernel/optprobes.c +++ b/arch/powerpc/kernel/optprobes.c @@ -283,8 +283,7 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p) * 3. 
load instruction to be emulated into relevant register, and */ temp = ppc_inst_read((struct ppc_inst *)p->ainsn.insn); - patch_imm64_load_insns(ppc_inst_val(temp) | ((u64)ppc_inst_suffix(temp) << 32), - 4, buff + TMPL_INSN_IDX); + patch_imm64_load_insns(ppc_inst_as_u64(temp), 4, buff + TMPL_INSN_IDX); /* * 4. branch back from trampoline diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index 64cf621e5b00..5ecf0d635a8d 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -27,13 +27,7 @@ static int __patch_instruction(struct ppc_inst *exec_addr, struct ppc_inst instr if (!ppc_inst_prefixed(instr)) { __put_user_asm(ppc_inst_val(instr), patch_addr, err, "stw"); } else { -#ifdef CONFIG_CPU_LITTLE_ENDIAN - __put_user_asm((u64)ppc_inst_suffix(instr) << 32 | - ppc_inst_val(instr), patch_addr, err, "std"); -#else - __put_user_asm((u64)ppc_inst_val(instr) << 32 | - ppc_inst_suffix(instr), patch_addr, err, "std"); -#endif + __put_user_asm(ppc_inst_as_u64(instr), patch_addr, err, "std"); } if (err) -- 2.25.1