[Bug 203647] Locking API testsuite fails "mixed read-lock/lock-write ABBA" rlock on kernels >=4.14.x
https://bugzilla.kernel.org/show_bug.cgi?id=203647 Michael Ellerman (mich...@ellerman.id.au) changed: What|Removed |Added Status|NEW |RESOLVED CC||mich...@ellerman.id.au Resolution|--- |DOCUMENTED --- Comment #6 from Michael Ellerman (mich...@ellerman.id.au) --- This appears to be working as expected, which I admit is a little confusing. The key thing is that at the end you see, eg: [0.179788] Good, all 261 testcases passed! | See the commit that added the test: https://git.kernel.org/torvalds/c/e91498589746 locking/lockdep/selftests: Add mixed read-write ABBA tests Currently lockdep has limited support for recursive readers, add a few mixed read-write ABBA selftests to show the extend of these limitations. And in the code: print_testname("mixed read-lock/lock-write ABBA"); pr_cont(" |"); dotest(rlock_ABBA1, FAILURE, LOCKTYPE_RWLOCK); #ifdef CONFIG_PROVE_LOCKING /* * Lockdep does indeed fail here, but there's nothing we can do about * that now. Don't kill lockdep for it. */ unexpected_testcase_failures--; #endif -- You are receiving this mail because: You are watching the assignee of the bug.
Re: [RFC PATCH v2 07/10] KVM: PPC: Ultravisor: Restrict LDBAR access
On 18/05/19 7:55 PM, Claudio Carvalho wrote: From: Ram Pai When the ultravisor firmware is available, it takes control over the LDBAR register. In this case, thread-imc updates and save/restore operations on the LDBAR register are handled by ultravisor. we should remove the core and thread imc nodes in the skiboot if the ultravisor is enabled. We dont need imc-pmu.c file changes, imc-pmu.c inits only if we have the corresponding nodes. I will post a skiboot patch. Maddy Signed-off-by: Ram Pai [Restrict LDBAR access in assembly code and some in C, update the commit message] Signed-off-by: Claudio Carvalho --- arch/powerpc/kvm/book3s_hv.c | 4 +- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 2 + arch/powerpc/perf/imc-pmu.c | 64 arch/powerpc/platforms/powernv/idle.c| 6 +- arch/powerpc/platforms/powernv/subcore-asm.S | 4 ++ 5 files changed, 52 insertions(+), 28 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 0fab0a201027..81f35f955d16 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -75,6 +75,7 @@ #include #include #include +#include #include "book3s.h" @@ -3117,7 +3118,8 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) subcore_size = MAX_SMT_THREADS / split; split_info.rpr = mfspr(SPRN_RPR); split_info.pmmar = mfspr(SPRN_PMMAR); - split_info.ldbar = mfspr(SPRN_LDBAR); + if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR)) + split_info.ldbar = mfspr(SPRN_LDBAR); split_info.subcore_size = subcore_size; } else { split_info.subcore_size = 1; diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index dd014308f065..938cfa5dceed 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -375,8 +375,10 @@ BEGIN_FTR_SECTION mtspr SPRN_RPR, r0 ld r0, KVM_SPLIT_PMMAR(r6) mtspr SPRN_PMMAR, r0 +BEGIN_FW_FTR_SECTION_NESTED(70) ld r0, KVM_SPLIT_LDBAR(r6) mtspr SPRN_LDBAR, r0 +END_FW_FTR_SECTION_NESTED(FW_FEATURE_ULTRAVISOR, 0, 70) isync FTR_SECTION_ELSE /* On P9 we use the split_info for coordinating LPCR changes */ diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 31fa753e2eb2..39c84de74da9 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -17,6 +17,7 @@ #include #include #include +#include /* Nest IMC data structures and variables */ @@ -816,6 +817,17 @@ static int core_imc_event_init(struct perf_event *event) return 0; } +static void thread_imc_ldbar_disable(void *dummy) +{ + /* +* By Zeroing LDBAR, we disable thread-imc updates. When the ultravisor +* firmware is available, it is responsible for handling thread-imc +* updates, though +*/ + if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR)) + mtspr(SPRN_LDBAR, 0); +} + /* * Allocates a page of memory for each of the online cpus, and load * LDBAR with 0. @@ -856,7 +868,7 @@ static int thread_imc_mem_alloc(int cpu_id, int size) per_cpu(thread_imc_mem, cpu_id) = local_mem; } - mtspr(SPRN_LDBAR, 0); + thread_imc_ldbar_disable(NULL); return 0; } @@ -867,7 +879,7 @@ static int ppc_thread_imc_cpu_online(unsigned int cpu) static int ppc_thread_imc_cpu_offline(unsigned int cpu) { - mtspr(SPRN_LDBAR, 0); + thread_imc_ldbar_disable(NULL); return 0; } @@ -1010,7 +1022,6 @@ static int thread_imc_event_add(struct perf_event *event, int flags) { int core_id; struct imc_pmu_ref *ref; - u64 ldbar_value, *local_mem = per_cpu(thread_imc_mem, smp_processor_id()); if (flags & PERF_EF_START) imc_event_start(event, flags); @@ -1019,8 +1030,14 @@ static int thread_imc_event_add(struct perf_event *event, int flags) return -EINVAL; core_id = smp_processor_id() / threads_per_core; - ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | THREAD_IMC_ENABLE; - mtspr(SPRN_LDBAR, ldbar_value); + if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR)) { + u64 ldbar_value, *local_mem; + + local_mem = per_cpu(thread_imc_mem, smp_processor_id()); + ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | + THREAD_IMC_ENABLE; + mtspr(SPRN_LDBAR, ldbar_value); + } /* * imc pmus are enabled only when it is used. @@ -1053,7 +1070,7 @@ static void thread_imc_event_del(struct perf_event *event, int flags) int core_id;
Re: [PATCH 11/12] powerpc/pseries/svm: Force SWIOTLB for secure guests
> diff --git a/arch/powerpc/include/asm/mem_encrypt.h > b/arch/powerpc/include/asm/mem_encrypt.h > new file mode 100644 > index ..45d5e4d0e6e0 > --- /dev/null > +++ b/arch/powerpc/include/asm/mem_encrypt.h > @@ -0,0 +1,19 @@ > +/* SPDX-License-Identifier: GPL-2.0+ */ > +/* > + * SVM helper functions > + * > + * Copyright 2019 IBM Corporation > + */ > + > +#ifndef _ASM_POWERPC_MEM_ENCRYPT_H > +#define _ASM_POWERPC_MEM_ENCRYPT_H > + > +#define sme_me_mask 0ULL > + > +static inline bool sme_active(void) { return false; } > +static inline bool sev_active(void) { return false; } > + > +int set_memory_encrypted(unsigned long addr, int numpages); > +int set_memory_decrypted(unsigned long addr, int numpages); > + > +#endif /* _ASM_POWERPC_MEM_ENCRYPT_H */ S/390 seems to be adding a stub header just like this. Can you please clean up the Kconfig and generic headers bits for memory encryption so that we don't need all this boilerplate code? > config PPC_SVM > bool "Secure virtual machine (SVM) support for POWER" > depends on PPC_PSERIES > + select SWIOTLB > + select ARCH_HAS_MEM_ENCRYPT > default n n is the default default, no need to explictly specify it.
Re: [PATCH] powerpc/perf: Use cpumask_last() to determine the
On 20/05/19 2:35 PM, Anju T Sudhakar wrote: Nest and core imc(In-memory Collection counters) assigns a particular cpu as the designated target for counter data collection. During system boot, the first online cpu in a chip gets assigned as the designated cpu for that chip(for nest-imc) and the first online cpu in a core gets assigned as the designated cpu for that core(for core-imc). If the designated cpu goes offline, the next online cpu from the same chip(for nest-imc)/core(for core-imc) is assigned as the next target, and the event context is migrated to the target cpu. Currently, cpumask_any_but() function is used to find the target cpu. Though this function is expected to return a `random` cpu, this always returns the next online cpu. If all cpus in a chip/core is offlined in a sequential manner, starting from the first cpu, the event migration has to happen for all the cpus which goes offline. Since the migration process involves a grace period, the total time taken to offline all the cpus will be significantly high. Example: In a system which has 2 sockets, with NUMA node0 CPU(s): 0-87 NUMA node8 CPU(s): 88-175 Time taken to offline cpu 88-175: real2m56.099s user0m0.191s sys 0m0.000s Use cpumask_last() to choose the target cpu, when the designated cpu goes online, so the migration will happen only when the last_cpu in the mask goes offline. This way the time taken to offline all cpus in a chip/core can be reduced. With the patch, Time taken to offline cpu 88-175: real0m12.207s user0m0.171s sys 0m0.000s cpumask_last() is a better way to find the target cpu, since in most of the cases cpuhotplug is performed in an increasing order(even in ppc64_cpu). cpumask_any_but() can still be used to check the possibility of other online cpus from the same chip/core if the last cpu in the mask goes offline. Reviewed-by: Madhavan Srinivasan Also add "Fixes:" tag and this should go to stable right? Signed-off-by: Anju T Sudhakar --- arch/powerpc/perf/imc-pmu.c | 14 -- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 31fa753..fbfd6e7 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -366,7 +366,14 @@ static int ppc_nest_imc_cpu_offline(unsigned int cpu) */ nid = cpu_to_node(cpu); l_cpumask = cpumask_of_node(nid); - target = cpumask_any_but(l_cpumask, cpu); + target = cpumask_last(l_cpumask); + + /* +* If this(target) is the last cpu in the cpumask for this chip, +* check for any possible online cpu in the chip. +*/ + if (unlikely(target == cpu)) + target = cpumask_any_but(l_cpumask, cpu); /* * Update the cpumask with the target cpu and @@ -671,7 +678,10 @@ static int ppc_core_imc_cpu_offline(unsigned int cpu) return 0; /* Find any online cpu in that core except the current "cpu" */ - ncpu = cpumask_any_but(cpu_sibling_mask(cpu), cpu); + ncpu = cpumask_last(cpu_sibling_mask(cpu)); + + if (unlikely(ncpu == cpu)) + ncpu = cpumask_any_but(cpu_sibling_mask(cpu), cpu); if (ncpu >= 0 && ncpu < nr_cpu_ids) { cpumask_set_cpu(ncpu, _imc_cpumask);
Re: [RFC PATCH 02/12] powerpc: Add support for adding an ESM blob to the zImage wrapper
On Tue, May 21, 2019 at 01:49:02AM -0300, Thiago Jung Bauermann wrote: > From: Benjamin Herrenschmidt > > For secure VMs, the signing tool will create a ticket called the "ESM blob" > for the Enter Secure Mode ultravisor call with the signatures of the kernel > and initrd among other things. > > This adds support to the wrapper script for adding that blob via the "-e" > option to the zImage.pseries. > > It also adds code to the zImage wrapper itself to retrieve and if necessary > relocate the blob, and pass its address to Linux via the device-tree, to be > later consumed by prom_init. Where does the "BLOB" come from? How is it licensed and how can we satisfy the GPL with it?
[PATCH 12/12] powerpc/configs: Enable secure guest support in pseries and ppc64 defconfigs
From: Ryan Grimm Enables running as a secure guest in platforms with an Ultravisor. Signed-off-by: Ryan Grimm Signed-off-by: Ram Pai Signed-off-by: Thiago Jung Bauermann --- arch/powerpc/configs/ppc64_defconfig | 1 + arch/powerpc/configs/pseries_defconfig | 1 + 2 files changed, 2 insertions(+) diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig index d7c381009636..725297438320 100644 --- a/arch/powerpc/configs/ppc64_defconfig +++ b/arch/powerpc/configs/ppc64_defconfig @@ -31,6 +31,7 @@ CONFIG_DTL=y CONFIG_SCANLOG=m CONFIG_PPC_SMLPAR=y CONFIG_IBMEBUS=y +CONFIG_PPC_SVM=y CONFIG_PPC_MAPLE=y CONFIG_PPC_PASEMI=y CONFIG_PPC_PASEMI_IOMMU=y diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig index 62e12f61a3b2..724a574fe4b2 100644 --- a/arch/powerpc/configs/pseries_defconfig +++ b/arch/powerpc/configs/pseries_defconfig @@ -42,6 +42,7 @@ CONFIG_DTL=y CONFIG_SCANLOG=m CONFIG_PPC_SMLPAR=y CONFIG_IBMEBUS=y +CONFIG_PPC_SVM=y # CONFIG_PPC_PMAC is not set CONFIG_RTAS_FLASH=m CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
Re: [PATCH] powerpc/powernv: Return for invalid IMC domain
On 20/05/19 2:27 PM, Anju T Sudhakar wrote: Currently init_imc_pmu() can be failed either because an IMC unit with invalid domain(i.e an IMC node not supported by the kernel) is attempted a pmu-registration or something went wrong while registering a valid IMC unit. In both the cases kernel provides a 'Registration failed' error message. Example: Log message, when trace-imc node is not supported by the kernel, and the skiboot supports trace-imc node. So for kernel, trace-imc node is now an unknown domain. [1.731870] nest_phb5_imc performance monitor hardware support registered [1.731944] nest_powerbus0_imc performance monitor hardware support registered [1.734458] thread_imc performance monitor hardware support registered [1.734460] IMC Unknown Device type [1.734462] IMC PMU (null) Register failed [1.734558] nest_xlink0_imc performance monitor hardware support registered [1.734614] nest_xlink1_imc performance monitor hardware support registered [1.734670] nest_xlink2_imc performance monitor hardware support registered [1.747043] Initialise system trusted keyrings [1.747054] Key type blacklist registered To avoid ambiguity on the error message, return for invalid domain before attempting a pmu registration. Reviewed-by: Madhavan Srinivasan Fixes: 8f95faaac56c1 (`powerpc/powernv: Detect and create IMC device`) Reported-by: Pavaman Subramaniyam Signed-off-by: Anju T Sudhakar --- arch/powerpc/platforms/powernv/opal-imc.c | 4 1 file changed, 4 insertions(+) diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c index 58a0794..4e8b0e1 100644 --- a/arch/powerpc/platforms/powernv/opal-imc.c +++ b/arch/powerpc/platforms/powernv/opal-imc.c @@ -161,6 +161,10 @@ static int imc_pmu_create(struct device_node *parent, int pmu_index, int domain) struct imc_pmu *pmu_ptr; u32 offset; + /* Return for unknown domain */ + if (domain < 0) + return -EINVAL; + /* memory for pmu */ pmu_ptr = kzalloc(sizeof(*pmu_ptr), GFP_KERNEL); if (!pmu_ptr)
[PATCH 10/12] powerpc/pseries/iommu: Don't use dma_iommu_ops on secure guests
Secure guest memory is inacessible to devices so regular DMA isn't possible. In that case set devices' dma_map_ops to NULL so that the generic DMA code path will use SWIOTLB and DMA to bounce buffers. Signed-off-by: Thiago Jung Bauermann --- arch/powerpc/platforms/pseries/iommu.c | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 03bbb299320e..7d9550edb700 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -50,6 +50,7 @@ #include #include #include +#include #include "pseries.h" @@ -1332,7 +1333,10 @@ void iommu_init_early_pSeries(void) of_reconfig_notifier_register(_reconfig_nb); register_memory_notifier(_mem_nb); - set_pci_dma_ops(_iommu_ops); + if (is_secure_guest()) + set_pci_dma_ops(NULL); + else + set_pci_dma_ops(_iommu_ops); } static int __init disable_multitce(char *str)
Re: [PATCH] crypto: vmx - convert to skcipher API
Eric Biggers writes: > From: Eric Biggers > > Convert the VMX implementations of AES-CBC, AES-CTR, and AES-XTS from > the deprecated "blkcipher" API to the "skcipher" API. > > As part of this, I moved the skcipher_request for the fallback algorithm > off the stack and into the request context of the parent algorithm. > > I tested this in a PowerPC VM with CONFIG_CRYPTO_MANAGER_EXTRA_TESTS=y. I booted it a few times on a Power9 bare metal machine with panic_on_fail=1 and fuzz_iterations=400, no issues. Tested-by: Michael Ellerman cheers > Signed-off-by: Eric Biggers > --- > drivers/crypto/vmx/aes_cbc.c | 183 - > drivers/crypto/vmx/aes_ctr.c | 165 + > drivers/crypto/vmx/aes_xts.c | 175 ++- > drivers/crypto/vmx/aesp8-ppc.h | 2 - > drivers/crypto/vmx/vmx.c | 72 +++-- > 5 files changed, 252 insertions(+), 345 deletions(-) > > diff --git a/drivers/crypto/vmx/aes_cbc.c b/drivers/crypto/vmx/aes_cbc.c > index dae8af3c46dce..92e75a05d6a9e 100644 > --- a/drivers/crypto/vmx/aes_cbc.c > +++ b/drivers/crypto/vmx/aes_cbc.c > @@ -7,64 +7,52 @@ > * Author: Marcelo Henrique Cerri > */ > > -#include > -#include > -#include > -#include > #include > #include > #include > #include > -#include > -#include > +#include > > #include "aesp8-ppc.h" > > struct p8_aes_cbc_ctx { > - struct crypto_sync_skcipher *fallback; > + struct crypto_skcipher *fallback; > struct aes_key enc_key; > struct aes_key dec_key; > }; > > -static int p8_aes_cbc_init(struct crypto_tfm *tfm) > +static int p8_aes_cbc_init(struct crypto_skcipher *tfm) > { > - const char *alg = crypto_tfm_alg_name(tfm); > - struct crypto_sync_skcipher *fallback; > - struct p8_aes_cbc_ctx *ctx = crypto_tfm_ctx(tfm); > - > - fallback = crypto_alloc_sync_skcipher(alg, 0, > - CRYPTO_ALG_NEED_FALLBACK); > + struct p8_aes_cbc_ctx *ctx = crypto_skcipher_ctx(tfm); > + struct crypto_skcipher *fallback; > > + fallback = crypto_alloc_skcipher("cbc(aes)", 0, > + CRYPTO_ALG_NEED_FALLBACK | > + CRYPTO_ALG_ASYNC); > if (IS_ERR(fallback)) { > - printk(KERN_ERR > -"Failed to allocate transformation for '%s': %ld\n", > -alg, PTR_ERR(fallback)); > + pr_err("Failed to allocate cbc(aes) fallback: %ld\n", > +PTR_ERR(fallback)); > return PTR_ERR(fallback); > } > > - crypto_sync_skcipher_set_flags( > - fallback, > - crypto_skcipher_get_flags((struct crypto_skcipher *)tfm)); > + crypto_skcipher_set_reqsize(tfm, sizeof(struct skcipher_request) + > + crypto_skcipher_reqsize(fallback)); > ctx->fallback = fallback; > - > return 0; > } > > -static void p8_aes_cbc_exit(struct crypto_tfm *tfm) > +static void p8_aes_cbc_exit(struct crypto_skcipher *tfm) > { > - struct p8_aes_cbc_ctx *ctx = crypto_tfm_ctx(tfm); > + struct p8_aes_cbc_ctx *ctx = crypto_skcipher_ctx(tfm); > > - if (ctx->fallback) { > - crypto_free_sync_skcipher(ctx->fallback); > - ctx->fallback = NULL; > - } > + crypto_free_skcipher(ctx->fallback); > } > > -static int p8_aes_cbc_setkey(struct crypto_tfm *tfm, const u8 *key, > +static int p8_aes_cbc_setkey(struct crypto_skcipher *tfm, const u8 *key, >unsigned int keylen) > { > + struct p8_aes_cbc_ctx *ctx = crypto_skcipher_ctx(tfm); > int ret; > - struct p8_aes_cbc_ctx *ctx = crypto_tfm_ctx(tfm); > > preempt_disable(); > pagefault_disable(); > @@ -75,108 +63,71 @@ static int p8_aes_cbc_setkey(struct crypto_tfm *tfm, > const u8 *key, > pagefault_enable(); > preempt_enable(); > > - ret |= crypto_sync_skcipher_setkey(ctx->fallback, key, keylen); > + ret |= crypto_skcipher_setkey(ctx->fallback, key, keylen); > > return ret ? -EINVAL : 0; > } > > -static int p8_aes_cbc_encrypt(struct blkcipher_desc *desc, > - struct scatterlist *dst, > - struct scatterlist *src, unsigned int nbytes) > +static int p8_aes_cbc_crypt(struct skcipher_request *req, int enc) > { > + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); > + const struct p8_aes_cbc_ctx *ctx = crypto_skcipher_ctx(tfm); > + struct skcipher_walk walk; > + unsigned int nbytes; > int ret; > - struct blkcipher_walk walk; > - struct p8_aes_cbc_ctx *ctx = > - crypto_tfm_ctx(crypto_blkcipher_tfm(desc->tfm)); > > if (!crypto_simd_usable()) { > - SYNC_SKCIPHER_REQUEST_ON_STACK(req, ctx->fallback); > - skcipher_request_set_sync_tfm(req, ctx->fallback); > -
[PATCH 07/12] powerpc/pseries/svm: Use shared memory for Debug Trace Log (DTL)
From: Anshuman Khandual Secure guests need to share the DTL buffers with the hypervisor. To that end, use a kmem_cache constructor which converts the underlying buddy allocated SLUB cache pages into shared memory. Signed-off-by: Anshuman Khandual Signed-off-by: Thiago Jung Bauermann --- arch/powerpc/include/asm/svm.h | 5 arch/powerpc/platforms/pseries/Makefile | 1 + arch/powerpc/platforms/pseries/setup.c | 5 +++- arch/powerpc/platforms/pseries/svm.c| 40 + 4 files changed, 50 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/svm.h b/arch/powerpc/include/asm/svm.h index fef3740f46a6..f253116c31fc 100644 --- a/arch/powerpc/include/asm/svm.h +++ b/arch/powerpc/include/asm/svm.h @@ -15,6 +15,9 @@ static inline bool is_secure_guest(void) return mfmsr() & MSR_S; } +void dtl_cache_ctor(void *addr); +#define get_dtl_cache_ctor() (is_secure_guest() ? dtl_cache_ctor : NULL) + #else /* CONFIG_PPC_SVM */ static inline bool is_secure_guest(void) @@ -22,5 +25,7 @@ static inline bool is_secure_guest(void) return false; } +#define get_dtl_cache_ctor() NULL + #endif /* CONFIG_PPC_SVM */ #endif /* _ASM_POWERPC_SVM_H */ diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile index a43ec843c8e2..b7b6e6f52bd0 100644 --- a/arch/powerpc/platforms/pseries/Makefile +++ b/arch/powerpc/platforms/pseries/Makefile @@ -25,6 +25,7 @@ obj-$(CONFIG_LPARCFG) += lparcfg.o obj-$(CONFIG_IBMVIO) += vio.o obj-$(CONFIG_IBMEBUS) += ibmebus.o obj-$(CONFIG_PAPR_SCM) += papr_scm.o +obj-$(CONFIG_PPC_SVM) += svm.o ifdef CONFIG_PPC_PSERIES obj-$(CONFIG_SUSPEND) += suspend.o diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index e4f0dfd4ae33..c928e6e8a279 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -71,6 +71,7 @@ #include #include #include +#include #include "pseries.h" #include "../../../../drivers/pci/pci.h" @@ -329,8 +330,10 @@ static inline int alloc_dispatch_logs(void) static int alloc_dispatch_log_kmem_cache(void) { + void (*ctor)(void *) = get_dtl_cache_ctor(); + dtl_cache = kmem_cache_create("dtl", DISPATCH_LOG_BYTES, - DISPATCH_LOG_BYTES, 0, NULL); + DISPATCH_LOG_BYTES, 0, ctor); if (!dtl_cache) { pr_warn("Failed to create dispatch trace log buffer cache\n"); pr_warn("Stolen time statistics will be unreliable\n"); diff --git a/arch/powerpc/platforms/pseries/svm.c b/arch/powerpc/platforms/pseries/svm.c new file mode 100644 index ..c508196f7c83 --- /dev/null +++ b/arch/powerpc/platforms/pseries/svm.c @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Secure VM platform + * + * Copyright 2019 IBM Corporation + * Author: Anshuman Khandual + */ + +#include +#include + +/* There's one dispatch log per CPU. */ +#define NR_DTL_PAGE (DISPATCH_LOG_BYTES * CONFIG_NR_CPUS / PAGE_SIZE) + +static struct page *dtl_page_store[NR_DTL_PAGE]; +static long dtl_nr_pages; + +static bool is_dtl_page_shared(struct page *page) +{ + long i; + + for (i = 0; i < dtl_nr_pages; i++) + if (dtl_page_store[i] == page) + return true; + + return false; +} + +void dtl_cache_ctor(void *addr) +{ + unsigned long pfn = PHYS_PFN(__pa(addr)); + struct page *page = pfn_to_page(pfn); + + if (!is_dtl_page_shared(page)) { + dtl_page_store[dtl_nr_pages] = page; + dtl_nr_pages++; + WARN_ON(dtl_nr_pages >= NR_DTL_PAGE); + uv_share_page(pfn, 1); + } +}
[PATCH 11/12] powerpc/pseries/svm: Force SWIOTLB for secure guests
From: Anshuman Khandual SWIOTLB checks range of incoming CPU addresses to be bounced and sees if the device can access it through its DMA window without requiring bouncing. In such cases it just chooses to skip bouncing. But for cases like secure guests on powerpc platform all addresses need to be bounced into the shared pool of memory because the host cannot access it otherwise. Hence the need to do the bouncing is not related to device's DMA window and use of bounce buffers is forced by setting swiotlb_force. Also, connect the shared memory conversion functions into the ARCH_HAS_MEM_ENCRYPT hooks and call swiotlb_update_mem_attributes() to convert SWIOTLB's memory pool to shared memory. Signed-off-by: Anshuman Khandual [ Use ARCH_HAS_MEM_ENCRYPT hooks to share swiotlb memory pool. ] Signed-off-by: Thiago Jung Bauermann --- arch/powerpc/include/asm/mem_encrypt.h | 19 +++ arch/powerpc/platforms/pseries/Kconfig | 5 +++ arch/powerpc/platforms/pseries/svm.c | 45 ++ 3 files changed, 69 insertions(+) diff --git a/arch/powerpc/include/asm/mem_encrypt.h b/arch/powerpc/include/asm/mem_encrypt.h new file mode 100644 index ..45d5e4d0e6e0 --- /dev/null +++ b/arch/powerpc/include/asm/mem_encrypt.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * SVM helper functions + * + * Copyright 2019 IBM Corporation + */ + +#ifndef _ASM_POWERPC_MEM_ENCRYPT_H +#define _ASM_POWERPC_MEM_ENCRYPT_H + +#define sme_me_mask0ULL + +static inline bool sme_active(void) { return false; } +static inline bool sev_active(void) { return false; } + +int set_memory_encrypted(unsigned long addr, int numpages); +int set_memory_decrypted(unsigned long addr, int numpages); + +#endif /* _ASM_POWERPC_MEM_ENCRYPT_H */ diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig index 82c16aa4f1ce..41b10f3bc729 100644 --- a/arch/powerpc/platforms/pseries/Kconfig +++ b/arch/powerpc/platforms/pseries/Kconfig @@ -145,9 +145,14 @@ config PAPR_SCM help Enable access to hypervisor provided storage class memory. +config ARCH_HAS_MEM_ENCRYPT + def_bool n + config PPC_SVM bool "Secure virtual machine (SVM) support for POWER" depends on PPC_PSERIES + select SWIOTLB + select ARCH_HAS_MEM_ENCRYPT default n help Support secure guests on POWER. There are certain POWER platforms which diff --git a/arch/powerpc/platforms/pseries/svm.c b/arch/powerpc/platforms/pseries/svm.c index c508196f7c83..618622d636d5 100644 --- a/arch/powerpc/platforms/pseries/svm.c +++ b/arch/powerpc/platforms/pseries/svm.c @@ -7,8 +7,53 @@ */ #include +#include +#include +#include #include +static int __init init_svm(void) +{ + if (!is_secure_guest()) + return 0; + + /* Don't release the SWIOTLB buffer. */ + ppc_swiotlb_enable = 1; + + /* +* Since the guest memory is inaccessible to the host, devices always +* need to use the SWIOTLB buffer for DMA even if dma_capable() says +* otherwise. +*/ + swiotlb_force = SWIOTLB_FORCE; + + /* Share the SWIOTLB buffer with the host. */ + swiotlb_update_mem_attributes(); + + return 0; +} +machine_early_initcall(pseries, init_svm); + +int set_memory_encrypted(unsigned long addr, int numpages) +{ + if (!PAGE_ALIGNED(addr)) + return -EINVAL; + + uv_unshare_page(PHYS_PFN(__pa(addr)), numpages); + + return 0; +} + +int set_memory_decrypted(unsigned long addr, int numpages) +{ + if (!PAGE_ALIGNED(addr)) + return -EINVAL; + + uv_share_page(PHYS_PFN(__pa(addr)), numpages); + + return 0; +} + /* There's one dispatch log per CPU. */ #define NR_DTL_PAGE (DISPATCH_LOG_BYTES * CONFIG_NR_CPUS / PAGE_SIZE)
[PATCH 09/12] powerpc/pseries/svm: Disable doorbells in SVM guests
From: Sukadev Bhattiprolu Normally, the HV emulates some instructions like MSGSNDP, MSGCLRP from a KVM guest. To emulate the instructions, it must first read the instruction from the guest's memory and decode its parameters. However for a secure guest (aka SVM), the page containing the instruction is in secure memory and the HV cannot access directly. It would need the Ultravisor (UV) to facilitate accessing the instruction and parameters but the UV currently does not have the support for such accesses. Until the UV has such support, disable doorbells in SVMs. This might incur a performance hit but that is yet to be quantified. With this patch applied (needed only in SVMs not needed for HV) we are able to launch SVM guests with multi-core support. Eg: qemu -smp sockets=2,cores=2,threads=2. Fix suggested by Benjamin Herrenschmidt. Thanks to input from Paul Mackerras, Ram Pai and Michael Anderson. Signed-off-by: Sukadev Bhattiprolu Signed-off-by: Thiago Jung Bauermann --- arch/powerpc/platforms/pseries/smp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c index 3df46123cce3..95a5c24a1544 100644 --- a/arch/powerpc/platforms/pseries/smp.c +++ b/arch/powerpc/platforms/pseries/smp.c @@ -45,6 +45,7 @@ #include #include #include +#include #include "pseries.h" #include "offline_states.h" @@ -225,7 +226,7 @@ static __init void pSeries_smp_probe_xics(void) { xics_smp_probe(); - if (cpu_has_feature(CPU_FTR_DBELL)) + if (cpu_has_feature(CPU_FTR_DBELL) && !is_secure_guest()) smp_ops->cause_ipi = smp_pseries_cause_ipi; else smp_ops->cause_ipi = icp_ops->cause_ipi;
[PATCH 08/12] powerpc/pseries/svm: Export guest SVM status to user space via sysfs
From: Ryan Grimm User space might want to know it's running in a secure VM. It can't do a mfmsr because mfmsr is a privileged instruction. The solution here is to create a cpu attribute: /sys/devices/system/cpu/svm which will read 0 or 1 based on the S bit of the guest's CPU 0. Signed-off-by: Ryan Grimm Reviewed-by: Ram Pai Signed-off-by: Thiago Jung Bauermann --- arch/powerpc/kernel/sysfs.c | 29 + 1 file changed, 29 insertions(+) diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c index e8e93c2c7d03..8fdab134e9ae 100644 --- a/arch/powerpc/kernel/sysfs.c +++ b/arch/powerpc/kernel/sysfs.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "cacheinfo.h" #include "setup.h" @@ -714,6 +715,32 @@ static struct device_attribute pa6t_attrs[] = { #endif /* HAS_PPC_PMC_PA6T */ #endif /* HAS_PPC_PMC_CLASSIC */ +#ifdef CONFIG_PPC_SVM +static void get_svm(void *val) +{ + u32 *value = val; + + *value = is_secure_guest(); +} + +static ssize_t show_svm(struct device *dev, struct device_attribute *attr, char *buf) +{ + u32 val; + smp_call_function_single(0, get_svm, , 1); + return sprintf(buf, "%u\n", val); +} +static DEVICE_ATTR(svm, 0444, show_svm, NULL); + +static void create_svm_file(void) +{ + device_create_file(cpu_subsys.dev_root, _attr_svm); +} +#else +static void create_svm_file(void) +{ +} +#endif /* CONFIG_PPC_SVM */ + static int register_cpu_online(unsigned int cpu) { struct cpu *c = _cpu(cpu_devices, cpu); @@ -1057,6 +1084,8 @@ static int __init topology_init(void) sysfs_create_dscr_default(); #endif /* CONFIG_PPC64 */ + create_svm_file(); + return 0; } subsys_initcall(topology_init);
[PATCH 06/12] powerpc/pseries/svm: Use shared memory for LPPACA structures
From: Anshuman Khandual LPPACA structures need to be shared with the host. Hence they need to be in shared memory. Instead of allocating individual chunks of memory for a given structure from memblock, a contiguous chunk of memory is allocated and then converted into shared memory. Subsequent allocation requests will come from the contiguous chunk which will be always shared memory for all structures. While we are able to use a kmem_cache constructor for the Debug Trace Log, LPPACAs are allocated very early in the boot process (before SLUB is available) so we need to use a simpler scheme here. Introduce helper is_svm_platform() which uses the S bit of the MSR to tell whether we're running as a secure guest. Signed-off-by: Anshuman Khandual Signed-off-by: Thiago Jung Bauermann --- arch/powerpc/include/asm/svm.h | 26 arch/powerpc/kernel/paca.c | 43 +- 2 files changed, 68 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/svm.h b/arch/powerpc/include/asm/svm.h new file mode 100644 index ..fef3740f46a6 --- /dev/null +++ b/arch/powerpc/include/asm/svm.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * SVM helper functions + * + * Copyright 2019 Anshuman Khandual, IBM Corporation. + */ + +#ifndef _ASM_POWERPC_SVM_H +#define _ASM_POWERPC_SVM_H + +#ifdef CONFIG_PPC_SVM + +static inline bool is_secure_guest(void) +{ + return mfmsr() & MSR_S; +} + +#else /* CONFIG_PPC_SVM */ + +static inline bool is_secure_guest(void) +{ + return false; +} + +#endif /* CONFIG_PPC_SVM */ +#endif /* _ASM_POWERPC_SVM_H */ diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c index 854105db5cff..a9622f4b45bb 100644 --- a/arch/powerpc/kernel/paca.c +++ b/arch/powerpc/kernel/paca.c @@ -18,6 +18,8 @@ #include #include #include +#include +#include #include "setup.h" @@ -58,6 +60,41 @@ static void *__init alloc_paca_data(unsigned long size, unsigned long align, #define LPPACA_SIZE 0x400 +static void *__init alloc_shared_lppaca(unsigned long size, unsigned long align, + unsigned long limit, int cpu) +{ + size_t shared_lppaca_total_size = PAGE_ALIGN(nr_cpu_ids * LPPACA_SIZE); + static unsigned long shared_lppaca_size; + static void *shared_lppaca; + void *ptr; + + if (!shared_lppaca) { + memblock_set_bottom_up(true); + + shared_lppaca = + memblock_alloc_try_nid(shared_lppaca_total_size, + PAGE_SIZE, MEMBLOCK_LOW_LIMIT, + limit, NUMA_NO_NODE); + if (!shared_lppaca) + panic("cannot allocate shared data"); + + memblock_set_bottom_up(false); + uv_share_page(PHYS_PFN(__pa(shared_lppaca)), + shared_lppaca_total_size >> PAGE_SHIFT); + } + + ptr = shared_lppaca + shared_lppaca_size; + shared_lppaca_size += size; + + /* +* This is very early in boot, so no harm done if the kernel crashes at +* this point. +*/ + BUG_ON(shared_lppaca_size >= shared_lppaca_total_size); + + return ptr; +} + /* * See asm/lppaca.h for more detail. * @@ -87,7 +124,11 @@ static struct lppaca * __init new_lppaca(int cpu, unsigned long limit) if (early_cpu_has_feature(CPU_FTR_HVMODE)) return NULL; - lp = alloc_paca_data(LPPACA_SIZE, 0x400, limit, cpu); + if (is_secure_guest()) + lp = alloc_shared_lppaca(LPPACA_SIZE, 0x400, limit, cpu); + else + lp = alloc_paca_data(LPPACA_SIZE, 0x400, limit, cpu); + init_lppaca(lp); return lp;
[PATCH 05/12] powerpc/pseries: Add and use LPPACA_SIZE constant
Helps document what the hard-coded number means. Also take the opportunity to fix an #endif comment. Suggested-by: Alexey Kardashevskiy Signed-off-by: Thiago Jung Bauermann --- arch/powerpc/kernel/paca.c | 11 ++- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c index 9cc91d03ab62..854105db5cff 100644 --- a/arch/powerpc/kernel/paca.c +++ b/arch/powerpc/kernel/paca.c @@ -56,6 +56,8 @@ static void *__init alloc_paca_data(unsigned long size, unsigned long align, #ifdef CONFIG_PPC_PSERIES +#define LPPACA_SIZE 0x400 + /* * See asm/lppaca.h for more detail. * @@ -69,7 +71,7 @@ static inline void init_lppaca(struct lppaca *lppaca) *lppaca = (struct lppaca) { .desc = cpu_to_be32(0xd397d781),/* "LpPa" */ - .size = cpu_to_be16(0x400), + .size = cpu_to_be16(LPPACA_SIZE), .fpregs_in_use = 1, .slb_count = cpu_to_be16(64), .vmxregs_in_use = 0, @@ -79,19 +81,18 @@ static inline void init_lppaca(struct lppaca *lppaca) static struct lppaca * __init new_lppaca(int cpu, unsigned long limit) { struct lppaca *lp; - size_t size = 0x400; - BUILD_BUG_ON(size < sizeof(struct lppaca)); + BUILD_BUG_ON(sizeof(struct lppaca) > LPPACA_SIZE); if (early_cpu_has_feature(CPU_FTR_HVMODE)) return NULL; - lp = alloc_paca_data(size, 0x400, limit, cpu); + lp = alloc_paca_data(LPPACA_SIZE, 0x400, limit, cpu); init_lppaca(lp); return lp; } -#endif /* CONFIG_PPC_BOOK3S */ +#endif /* CONFIG_PPC_PSERIES */ #ifdef CONFIG_PPC_BOOK3S_64
[PATCH 00/12] Secure Virtual Machine Enablement
This series enables Secure Virtual Machines (SVMs) on powerpc. SVMs use the Protected Execution Facility (PEF) and request to be migrated to secure memory during prom_init() so by default all of their memory is inaccessible to the hypervisor. There is an Ultravisor call that the VM can use to request certain pages to be made accessible to (or shared with) the hypervisor. The objective of these patches is to have the guest perform this request for buffers that need to be accessed by the hypervisor such as the LPPACAs, the SWIOTLB memory and the Debug Trace Log. The patch set applies on top of Claudio Carvalho's "kvmppc: Paravirtualize KVM to support ultravisor" series: https://lore.kernel.org/linuxppc-dev/20190518142524.28528-1-cclau...@linux.ibm.com/ I only need the following two patches from his series: [RFC PATCH v2 02/10] KVM: PPC: Ultravisor: Introduce the MSR_S bit [RFC PATCH v2 04/10] KVM: PPC: Ultravisor: Add generic ultravisor call handler Patches 2 and 3 are posted as RFC because we are still finalizing the details on how the ESM blob will be passed to the kernel. All other patches are (hopefully) in upstreamable shape. Unfortunately this series still doesn't enable the use of virtio devices in the secure guest. This support depends on a discussion that is currently ongoing with the virtio community: https://lore.kernel.org/linuxppc-dev/87womn8inf.fsf@morokweng.localdomain/ This was the last time I posted this patch set: https://lore.kernel.org/linuxppc-dev/20180824162535.22798-1-bauer...@linux.ibm.com/ At that time, it wasn't possible to launch a real secure guest because the Ultravisor was still in very early development. Now there is a relatively mature Ultravisor and I was able to test it using Claudio's patches in the host kernel, booting normally using an initramfs for the root filesystem. This is the command used to start up the guest with QEMU 4.0: qemu-system-ppc64 \ -nodefaults \ -cpu host \ -machine pseries,accel=kvm,kvm-type=HV,cap-htm=off,cap-cfpc=broken,cap-sbbc=broken,cap-ibs=broken \ -display none \ -serial mon:stdio \ -smp 1 \ -m 4G \ -kernel /home/bauermann/vmlinux \ -initrd /home/bauermann/fs_small.cpio \ -append 'debug' Changelog since the RFC from August: - Patch "powerpc/pseries: Introduce option to build secure virtual machines" - New patch. - Patch "powerpc: Add support for adding an ESM blob to the zImage wrapper" - Patch from Benjamin Herrenschmidt, first posted here: https://lore.kernel.org/linuxppc-dev/20180531043417.25073-1-b...@kernel.crashing.org/ - Made minor adjustments to some comments. Code is unchanged. - Patch "powerpc/prom_init: Add the ESM call to prom_init" - New patch from Ram Pai and Michael Anderson. - Patch "powerpc/pseries/svm: Add helpers for UV_SHARE_PAGE and UV_UNSHARE_PAGE" - New patch from Ram Pai. - Patch "powerpc/pseries: Add and use LPPACA_SIZE constant" - Moved LPPACA_SIZE macro inside the CONFIG_PPC_PSERIES #ifdef. - Put sizeof() operand left of comparison operator in BUILD_BUG_ON() macro to appease a checkpatch warning. - Patch "powerpc/pseries/svm: Use shared memory for LPPACA structures" - Moved definition of is_secure_guest() helper to this patch. - Changed shared_lppaca and shared_lppaca_size from globals to static variables inside alloc_shared_lppaca(). - Changed shared_lppaca to hold virtual address instead of physical address. - Patch "powerpc/pseries/svm: Use shared memory for Debug Trace Log (DTL)" - Add get_dtl_cache_ctor() macro. Suggested by Ram Pai. - Patch "powerpc/pseries/svm: Export guest SVM status to user space via sysfs" - New patch from Ryan Grimm. - Patch "powerpc/pseries/svm: Disable doorbells in SVM guests" - New patch from Sukadev Bhattiprolu. - Patch "powerpc/pseries/iommu: Don't use dma_iommu_ops on secure guests" - New patch. - Patch "powerpc/pseries/svm: Force SWIOTLB for secure guests" - New patch with code that was previously in other patches. - Patch "powerpc/configs: Enable secure guest support in pseries and ppc64 defconfigs" - New patch from Ryan Grimm. - Patch "powerpc/pseries/svm: Detect Secure Virtual Machine (SVM) platform" - Dropped this patch by moving its code to other patches. - Patch "powerpc/svm: Select CONFIG_DMA_DIRECT_OPS and CONFIG_SWIOTLB" - No need to select CONFIG_DMA_DIRECT_OPS anymore. The CONFIG_SWIOTLB change was moved to another patch and this patch was dropped. - Patch "powerpc/pseries/svm: Add memory conversion (shared/secure) helper functions" - Dropped patch since the helper functions were unnecessary wrappers around uv_share_page() and uv_unshare_page(). - Patch "powerpc/svm: Convert SWIOTLB buffers
[PATCH 04/12] powerpc/pseries/svm: Add helpers for UV_SHARE_PAGE and UV_UNSHARE_PAGE
From: Ram Pai These functions are used when the guest wants to grant the hypervisor access to certain pages. Signed-off-by: Ram Pai Signed-off-by: Thiago Jung Bauermann --- arch/powerpc/include/asm/ultravisor-api.h | 2 ++ arch/powerpc/include/asm/ultravisor.h | 14 ++ 2 files changed, 16 insertions(+) diff --git a/arch/powerpc/include/asm/ultravisor-api.h b/arch/powerpc/include/asm/ultravisor-api.h index 0e8b72081718..ed68b02869fd 100644 --- a/arch/powerpc/include/asm/ultravisor-api.h +++ b/arch/powerpc/include/asm/ultravisor-api.h @@ -20,6 +20,8 @@ /* opcodes */ #define UV_WRITE_PATE 0xF104 #define UV_ESM 0xF110 +#define UV_SHARE_PAGE 0xF130 +#define UV_UNSHARE_PAGE0xF134 #define UV_RETURN 0xF11C #endif /* _ASM_POWERPC_ULTRAVISOR_API_H */ diff --git a/arch/powerpc/include/asm/ultravisor.h b/arch/powerpc/include/asm/ultravisor.h index 09e0a615d96f..537f7717d21a 100644 --- a/arch/powerpc/include/asm/ultravisor.h +++ b/arch/powerpc/include/asm/ultravisor.h @@ -44,6 +44,20 @@ static inline int uv_register_pate(u64 lpid, u64 dw0, u64 dw1) return ucall(UV_WRITE_PATE, retbuf, lpid, dw0, dw1); } +static inline int uv_share_page(u64 pfn, u64 npages) +{ + unsigned long retbuf[UCALL_BUFSIZE]; + + return ucall(UV_SHARE_PAGE, retbuf, pfn, npages); +} + +static inline int uv_unshare_page(u64 pfn, u64 npages) +{ + unsigned long retbuf[UCALL_BUFSIZE]; + + return ucall(UV_UNSHARE_PAGE, retbuf, pfn, npages); +} + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_POWERPC_ULTRAVISOR_H */
[RFC PATCH 03/12] powerpc/prom_init: Add the ESM call to prom_init
From: Ram Pai Make the Enter-Secure-Mode (ESM) ultravisor call to switch the VM to secure mode. Add "svm=" command line option to turn off switching to secure mode. Introduce CONFIG_PPC_SVM to control support for secure guests. Signed-off-by: Ram Pai [ Generate an RTAS os-term hcall when the ESM ucall fails. ] Signed-off-by: Michael Anderson [ Cleaned up the code a bit. ] Signed-off-by: Thiago Jung Bauermann --- .../admin-guide/kernel-parameters.txt | 5 + arch/powerpc/include/asm/ultravisor-api.h | 1 + arch/powerpc/kernel/prom_init.c | 124 ++ 3 files changed, 130 insertions(+) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index c45a19d654f3..7237d86b25c6 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4501,6 +4501,11 @@ /sys/power/pm_test). Only available when CONFIG_PM_DEBUG is set. Default value is 5. + svm=[PPC] + Format: { on | off | y | n | 1 | 0 } + This parameter controls use of the Protected + Execution Facility on pSeries. + swapaccount=[0|1] [KNL] Enable accounting of swap in memory resource controller if no parameter or 1 is given or disable diff --git a/arch/powerpc/include/asm/ultravisor-api.h b/arch/powerpc/include/asm/ultravisor-api.h index 15e6ce77a131..0e8b72081718 100644 --- a/arch/powerpc/include/asm/ultravisor-api.h +++ b/arch/powerpc/include/asm/ultravisor-api.h @@ -19,6 +19,7 @@ /* opcodes */ #define UV_WRITE_PATE 0xF104 +#define UV_ESM 0xF110 #define UV_RETURN 0xF11C #endif /* _ASM_POWERPC_ULTRAVISOR_API_H */ diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index 523bb99d7676..5d8a3efb54f2 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -44,6 +44,7 @@ #include #include #include +#include #include @@ -174,6 +175,10 @@ static unsigned long __prombss prom_tce_alloc_end; static bool __prombss prom_radix_disable; #endif +#ifdef CONFIG_PPC_SVM +static bool __prombss prom_svm_disable; +#endif + struct platform_support { bool hash_mmu; bool radix_mmu; @@ -809,6 +814,17 @@ static void __init early_cmdline_parse(void) if (prom_radix_disable) prom_debug("Radix disabled from cmdline\n"); #endif /* CONFIG_PPC_PSERIES */ + +#ifdef CONFIG_PPC_SVM + opt = prom_strstr(prom_cmd_line, "svm="); + if (opt) { + bool val; + + opt += sizeof("svm=") - 1; + if (!prom_strtobool(opt, )) + prom_svm_disable = !val; + } +#endif /* CONFIG_PPC_SVM */ } #ifdef CONFIG_PPC_PSERIES @@ -1707,6 +1723,43 @@ static void __init prom_close_stdin(void) } } +#ifdef CONFIG_PPC_SVM +static int prom_rtas_os_term_hcall(uint64_t args) +{ + register uint64_t arg1 asm("r3") = 0xf000; + register uint64_t arg2 asm("r4") = args; + + asm volatile("sc 1\n" : "=r" (arg1) : + "r" (arg1), + "r" (arg2) :); + return arg1; +} + +static struct rtas_args __prombss os_term_args; + +static void __init prom_rtas_os_term(char *str) +{ + phandle rtas_node; + __be32 val; + u32 token; + + prom_printf("%s: start...\n", __func__); + rtas_node = call_prom("finddevice", 1, 1, ADDR("/rtas")); + prom_printf("rtas_node: %x\n", rtas_node); + if (!PHANDLE_VALID(rtas_node)) + return; + + val = 0; + prom_getprop(rtas_node, "ibm,os-term", , sizeof(val)); + token = be32_to_cpu(val); + prom_printf("ibm,os-term: %x\n", token); + if (token == 0) + prom_panic("Could not get token for ibm,os-term\n"); + os_term_args.token = cpu_to_be32(token); + prom_rtas_os_term_hcall((uint64_t)_term_args); +} +#endif /* CONFIG_PPC_SVM */ + /* * Allocate room for and instantiate RTAS */ @@ -3162,6 +3215,74 @@ static void unreloc_toc(void) #endif #endif +#ifdef CONFIG_PPC_SVM +/* + * The ESM blob is a data structure with information needed by the Ultravisor to + * validate the integrity of the secure guest. + */ +static void *get_esm_blob(void) +{ + /* +* FIXME: We are still finalizing the details on how prom_init will grab +* the ESM blob. When that is done, this function will be updated. +*/ + return (void *)0xdeadbeef; +} + +/* + * Perform the Enter Secure Mode ultracall. + */ +static int enter_secure_mode(void *esm_blob, void *retaddr, void *fdt) +{ + register uint64_t func asm("r0") = UV_ESM; + register uint64_t arg1 asm("r3") = (uint64_t)esm_blob; + register uint64_t arg2
[RFC PATCH 02/12] powerpc: Add support for adding an ESM blob to the zImage wrapper
From: Benjamin Herrenschmidt For secure VMs, the signing tool will create a ticket called the "ESM blob" for the Enter Secure Mode ultravisor call with the signatures of the kernel and initrd among other things. This adds support to the wrapper script for adding that blob via the "-e" option to the zImage.pseries. It also adds code to the zImage wrapper itself to retrieve and if necessary relocate the blob, and pass its address to Linux via the device-tree, to be later consumed by prom_init. Signed-off-by: Benjamin Herrenschmidt [ Minor adjustments to some comments. ] Signed-off-by: Thiago Jung Bauermann --- arch/powerpc/boot/main.c | 41 ++ arch/powerpc/boot/ops.h| 2 ++ arch/powerpc/boot/wrapper | 24 +--- arch/powerpc/boot/zImage.lds.S | 8 +++ 4 files changed, 72 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/boot/main.c b/arch/powerpc/boot/main.c index 78aaf4ffd7ab..ca612efd3e81 100644 --- a/arch/powerpc/boot/main.c +++ b/arch/powerpc/boot/main.c @@ -150,6 +150,46 @@ static struct addr_range prep_initrd(struct addr_range vmlinux, void *chosen, return (struct addr_range){(void *)initrd_addr, initrd_size}; } +#ifdef __powerpc64__ +static void prep_esm_blob(struct addr_range vmlinux, void *chosen) +{ + unsigned long esm_blob_addr, esm_blob_size; + + /* Do we have an ESM (Enter Secure Mode) blob? */ + if (_esm_blob_end <= _esm_blob_start) + return; + + printf("Attached ESM blob at 0x%p-0x%p\n\r", + _esm_blob_start, _esm_blob_end); + esm_blob_addr = (unsigned long)_esm_blob_start; + esm_blob_size = _esm_blob_end - _esm_blob_start; + + /* +* If the ESM blob is too low it will be clobbered when the +* kernel relocates to its final location. In this case, +* allocate a safer place and move it. +*/ + if (esm_blob_addr < vmlinux.size) { + void *old_addr = (void *)esm_blob_addr; + + printf("Allocating 0x%lx bytes for esm_blob ...\n\r", + esm_blob_size); + esm_blob_addr = (unsigned long)malloc(esm_blob_size); + if (!esm_blob_addr) + fatal("Can't allocate memory for ESM blob !\n\r"); + printf("Relocating ESM blob 0x%lx <- 0x%p (0x%lx bytes)\n\r", + esm_blob_addr, old_addr, esm_blob_size); + memmove((void *)esm_blob_addr, old_addr, esm_blob_size); + } + + /* Tell the kernel ESM blob address via device tree. */ + setprop_val(chosen, "linux,esm-blob-start", (u32)(esm_blob_addr)); + setprop_val(chosen, "linux,esm-blob-end", (u32)(esm_blob_addr + esm_blob_size)); +} +#else +static inline void prep_esm_blob(struct addr_range vmlinux, void *chosen) { } +#endif + /* A buffer that may be edited by tools operating on a zImage binary so as to * edit the command line passed to vmlinux (by setting /chosen/bootargs). * The buffer is put in it's own section so that tools may locate it easier. @@ -218,6 +258,7 @@ void start(void) vmlinux = prep_kernel(); initrd = prep_initrd(vmlinux, chosen, loader_info.initrd_addr, loader_info.initrd_size); + prep_esm_blob(vmlinux, chosen); prep_cmdline(chosen); printf("Finalizing device tree..."); diff --git a/arch/powerpc/boot/ops.h b/arch/powerpc/boot/ops.h index cd043726ed88..e0606766480f 100644 --- a/arch/powerpc/boot/ops.h +++ b/arch/powerpc/boot/ops.h @@ -251,6 +251,8 @@ extern char _initrd_start[]; extern char _initrd_end[]; extern char _dtb_start[]; extern char _dtb_end[]; +extern char _esm_blob_start[]; +extern char _esm_blob_end[]; static inline __attribute__((const)) int __ilog2_u32(u32 n) diff --git a/arch/powerpc/boot/wrapper b/arch/powerpc/boot/wrapper index f9141eaec6ff..36b2ad6cd5b7 100755 --- a/arch/powerpc/boot/wrapper +++ b/arch/powerpc/boot/wrapper @@ -14,6 +14,7 @@ # -i initrdspecify initrd file # -d devtree specify device-tree blob # -s tree.dts specify device-tree source file (needs dtc installed) +# -e esm_blob specify ESM blob for secure images # -c cache $kernel.strip.gz (use if present & newer, else make) # -C prefixspecify command prefix for cross-building tools # (strip, objcopy, ld) @@ -38,6 +39,7 @@ platform=of initrd= dtb= dts= +esm_blob= cacheit= binary= compression=.gz @@ -60,9 +62,9 @@ tmpdir=. usage() { echo 'Usage: wrapper [-o output] [-p platform] [-i initrd]' >&2 -echo ' [-d devtree] [-s tree.dts] [-c] [-C cross-prefix]' >&2 -echo ' [-D datadir] [-W workingdir] [-Z (gz|xz|none)]' >&2 -echo ' [--no-compression] [vmlinux]' >&2 +echo ' [-d devtree] [-s tree.dts] [-e esm_blob]' >&2 +echo ' [-c] [-C cross-prefix] [-D datadir] [-W workingdir]' >&2 +echo ' [-Z (gz|xz|none)] [--no-compression]
[PATCH 01/12] powerpc/pseries: Introduce option to build secure virtual machines
Introduce CONFIG_PPC_SVM to control support for secure guests and include Ultravisor-related helpers when it is selected Signed-off-by: Thiago Jung Bauermann --- arch/powerpc/include/asm/ultravisor.h | 2 +- arch/powerpc/kernel/Makefile | 4 +++- arch/powerpc/platforms/pseries/Kconfig | 12 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/ultravisor.h b/arch/powerpc/include/asm/ultravisor.h index 4ffec7a36acd..09e0a615d96f 100644 --- a/arch/powerpc/include/asm/ultravisor.h +++ b/arch/powerpc/include/asm/ultravisor.h @@ -28,7 +28,7 @@ extern int early_init_dt_scan_ultravisor(unsigned long node, const char *uname, * This call supports up to 6 arguments and 4 return arguments. Use * UCALL_BUFSIZE to size the return argument buffer. */ -#if defined(CONFIG_PPC_UV) +#if defined(CONFIG_PPC_UV) || defined(CONFIG_PPC_SVM) long ucall(unsigned long opcode, unsigned long *retbuf, ...); #else static long ucall(unsigned long opcode, unsigned long *retbuf, ...) diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 43ff4546e469..1e9b721634c8 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -154,7 +154,9 @@ endif obj-$(CONFIG_EPAPR_PARAVIRT) += epapr_paravirt.o epapr_hcalls.o obj-$(CONFIG_KVM_GUEST)+= kvm.o kvm_emul.o -obj-$(CONFIG_PPC_UV) += ultravisor.o ucall.o +ifneq ($(CONFIG_PPC_UV)$(CONFIG_PPC_SVM),) +obj-y += ultravisor.o ucall.o +endif # Disable GCOV, KCOV & sanitizers in odd or sensitive code GCOV_PROFILE_prom_init.o := n diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig index 9c6b3d860518..82c16aa4f1ce 100644 --- a/arch/powerpc/platforms/pseries/Kconfig +++ b/arch/powerpc/platforms/pseries/Kconfig @@ -144,3 +144,15 @@ config PAPR_SCM tristate "Support for the PAPR Storage Class Memory interface" help Enable access to hypervisor provided storage class memory. + +config PPC_SVM + bool "Secure virtual machine (SVM) support for POWER" + depends on PPC_PSERIES + default n + help +Support secure guests on POWER. There are certain POWER platforms which +support secure guests using the Protected Execution Facility, with the +help of an Ultravisor executing below the hypervisor layer. This +enables the support for those guests. + +If unsure, say "N".
Re: [RFC PATCH] powerpc/mm: Implement STRICT_MODULE_RWX
On Wed, 2019-05-15 at 06:20 +, Christophe Leroy wrote: Confirming this works on hash and radix book3s64. > + > + // only operate on VM areas for now > + area = find_vm_area((void *)addr); > + if (!area || end > (unsigned long)area->addr + area->size || > + !(area->flags & VM_ALLOC)) > + return -EINVAL; https://lore.kernel.org/patchwork/project/lkml/list/?series=391470 With this patch, the above series causes crashes on (at least) Hash, since it adds another user of change_page_rw() and change_page_nx() that for reasons I don't understand yet, we can't handle. I can work around this with: if (area->flags & VM_FLUSH_RESET_PERMS) return 0; so this is broken on at least one platform as of 5.2-rc1. We're going to look into this more to see if there's anything else we have to do as a result of this series before the next merge window, or if just working around it like this is good enough. - Russell
Re: [PATCH] crypto: vmx - convert to SPDX license identifiers
Eric Biggers writes: > From: Eric Biggers > > Remove the boilerplate license text and replace it with the equivalent > SPDX license identifier. > > Signed-off-by: Eric Biggers > --- > drivers/crypto/vmx/aes.c | 14 +- > drivers/crypto/vmx/aes_cbc.c | 14 +- > drivers/crypto/vmx/aes_ctr.c | 14 +- > drivers/crypto/vmx/aes_xts.c | 14 +- > drivers/crypto/vmx/vmx.c | 14 +- > 5 files changed, 5 insertions(+), 65 deletions(-) Looks good to me. Reviewed-by: Michael Ellerman cheers
Re: [PATCH 10/10] docs: fix broken documentation links
On Mon, May 20, 2019 at 11:47:39AM -0300, Mauro Carvalho Chehab wrote: > Mostly due to x86 and acpi conversion, several documentation > links are still pointing to the old file. Fix them. > > Signed-off-by: Mauro Carvalho Chehab Thanks, didn't notice that. > Documentation/i2c/instantiating-devices | 2 +- ... > diff --git a/Documentation/i2c/instantiating-devices > b/Documentation/i2c/instantiating-devices > index 0d85ac1935b7..5a3e2f331e8c 100644 > --- a/Documentation/i2c/instantiating-devices > +++ b/Documentation/i2c/instantiating-devices > @@ -85,7 +85,7 @@ Method 1c: Declare the I2C devices via ACPI > --- > > ACPI can also describe I2C devices. There is special documentation for this > -which is currently located at Documentation/acpi/enumeration.txt. > +which is currently located at > Documentation/firmware-guide/acpi/enumeration.rst. > > > Method 2: Instantiate the devices explicitly For this I2C part: Reviewed-by: Wolfram Sang signature.asc Description: PGP signature
Re: [PATCH] misc: remove redundant 'default n' from Kconfig-s
On Mon, May 20, 2019 at 04:10:46PM +0200, Bartlomiej Zolnierkiewicz wrote: > 'default n' is the default value for any bool or tristate Kconfig > setting so there is no need to write it explicitly. > > Also since commit f467c5640c29 ("kconfig: only write '# CONFIG_FOO > is not set' for visible symbols") the Kconfig behavior is the same > regardless of 'default n' being present or not: > > ... > One side effect of (and the main motivation for) this change is making > the following two definitions behave exactly the same: > > config FOO > bool > > config FOO > bool > default n > > With this change, neither of these will generate a > '# CONFIG_FOO is not set' line (assuming FOO isn't selected/implied). > That might make it clearer to people that a bare 'default n' is > redundant. > ... > > Signed-off-by: Bartlomiej Zolnierkiewicz [...] > drivers/misc/cb710/Kconfig|1 - Acked-by: Michał Mirosław
[PATCH] crypto: testmgr - fix length truncation with large page size
From: Eric Biggers On PowerPC with CONFIG_CRYPTO_MANAGER_EXTRA_TESTS=y, there is sometimes a crash in generate_random_aead_testvec(). The problem is that the generated test vectors use data lengths of up to about 2 * PAGE_SIZE, which is 128 KiB on PowerPC; however, the data length fields in the test vectors are 'unsigned short', so the lengths get truncated. Fix this by changing the relevant fields to 'unsigned int'. Fixes: 40153b10d91c ("crypto: testmgr - fuzz AEADs against their generic implementation") Signed-off-by: Eric Biggers --- crypto/testmgr.h | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/crypto/testmgr.h b/crypto/testmgr.h index 9a13c634b2077..fb2afdd544e00 100644 --- a/crypto/testmgr.h +++ b/crypto/testmgr.h @@ -43,7 +43,7 @@ struct hash_testvec { const char *key; const char *plaintext; const char *digest; - unsigned short psize; + unsigned int psize; unsigned short ksize; int setkey_error; int digest_error; @@ -74,7 +74,7 @@ struct cipher_testvec { const char *ctext; unsigned char wk; /* weak key flag */ unsigned short klen; - unsigned short len; + unsigned int len; bool fips_skip; bool generates_iv; int setkey_error; @@ -110,9 +110,9 @@ struct aead_testvec { unsigned char novrfy; unsigned char wk; unsigned char klen; - unsigned short plen; - unsigned short clen; - unsigned short alen; + unsigned int plen; + unsigned int clen; + unsigned int alen; int setkey_error; int setauthsize_error; int crypt_error; -- 2.21.0.1020.gf2820cf01a-goog
[PATCH] crypto: vmx - convert to skcipher API
From: Eric Biggers Convert the VMX implementations of AES-CBC, AES-CTR, and AES-XTS from the deprecated "blkcipher" API to the "skcipher" API. As part of this, I moved the skcipher_request for the fallback algorithm off the stack and into the request context of the parent algorithm. I tested this in a PowerPC VM with CONFIG_CRYPTO_MANAGER_EXTRA_TESTS=y. Signed-off-by: Eric Biggers --- drivers/crypto/vmx/aes_cbc.c | 183 - drivers/crypto/vmx/aes_ctr.c | 165 + drivers/crypto/vmx/aes_xts.c | 175 ++- drivers/crypto/vmx/aesp8-ppc.h | 2 - drivers/crypto/vmx/vmx.c | 72 +++-- 5 files changed, 252 insertions(+), 345 deletions(-) diff --git a/drivers/crypto/vmx/aes_cbc.c b/drivers/crypto/vmx/aes_cbc.c index dae8af3c46dce..92e75a05d6a9e 100644 --- a/drivers/crypto/vmx/aes_cbc.c +++ b/drivers/crypto/vmx/aes_cbc.c @@ -7,64 +7,52 @@ * Author: Marcelo Henrique Cerri */ -#include -#include -#include -#include #include #include #include #include -#include -#include +#include #include "aesp8-ppc.h" struct p8_aes_cbc_ctx { - struct crypto_sync_skcipher *fallback; + struct crypto_skcipher *fallback; struct aes_key enc_key; struct aes_key dec_key; }; -static int p8_aes_cbc_init(struct crypto_tfm *tfm) +static int p8_aes_cbc_init(struct crypto_skcipher *tfm) { - const char *alg = crypto_tfm_alg_name(tfm); - struct crypto_sync_skcipher *fallback; - struct p8_aes_cbc_ctx *ctx = crypto_tfm_ctx(tfm); - - fallback = crypto_alloc_sync_skcipher(alg, 0, - CRYPTO_ALG_NEED_FALLBACK); + struct p8_aes_cbc_ctx *ctx = crypto_skcipher_ctx(tfm); + struct crypto_skcipher *fallback; + fallback = crypto_alloc_skcipher("cbc(aes)", 0, +CRYPTO_ALG_NEED_FALLBACK | +CRYPTO_ALG_ASYNC); if (IS_ERR(fallback)) { - printk(KERN_ERR - "Failed to allocate transformation for '%s': %ld\n", - alg, PTR_ERR(fallback)); + pr_err("Failed to allocate cbc(aes) fallback: %ld\n", + PTR_ERR(fallback)); return PTR_ERR(fallback); } - crypto_sync_skcipher_set_flags( - fallback, - crypto_skcipher_get_flags((struct crypto_skcipher *)tfm)); + crypto_skcipher_set_reqsize(tfm, sizeof(struct skcipher_request) + + crypto_skcipher_reqsize(fallback)); ctx->fallback = fallback; - return 0; } -static void p8_aes_cbc_exit(struct crypto_tfm *tfm) +static void p8_aes_cbc_exit(struct crypto_skcipher *tfm) { - struct p8_aes_cbc_ctx *ctx = crypto_tfm_ctx(tfm); + struct p8_aes_cbc_ctx *ctx = crypto_skcipher_ctx(tfm); - if (ctx->fallback) { - crypto_free_sync_skcipher(ctx->fallback); - ctx->fallback = NULL; - } + crypto_free_skcipher(ctx->fallback); } -static int p8_aes_cbc_setkey(struct crypto_tfm *tfm, const u8 *key, +static int p8_aes_cbc_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { + struct p8_aes_cbc_ctx *ctx = crypto_skcipher_ctx(tfm); int ret; - struct p8_aes_cbc_ctx *ctx = crypto_tfm_ctx(tfm); preempt_disable(); pagefault_disable(); @@ -75,108 +63,71 @@ static int p8_aes_cbc_setkey(struct crypto_tfm *tfm, const u8 *key, pagefault_enable(); preempt_enable(); - ret |= crypto_sync_skcipher_setkey(ctx->fallback, key, keylen); + ret |= crypto_skcipher_setkey(ctx->fallback, key, keylen); return ret ? -EINVAL : 0; } -static int p8_aes_cbc_encrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int p8_aes_cbc_crypt(struct skcipher_request *req, int enc) { + struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); + const struct p8_aes_cbc_ctx *ctx = crypto_skcipher_ctx(tfm); + struct skcipher_walk walk; + unsigned int nbytes; int ret; - struct blkcipher_walk walk; - struct p8_aes_cbc_ctx *ctx = - crypto_tfm_ctx(crypto_blkcipher_tfm(desc->tfm)); if (!crypto_simd_usable()) { - SYNC_SKCIPHER_REQUEST_ON_STACK(req, ctx->fallback); - skcipher_request_set_sync_tfm(req, ctx->fallback); - skcipher_request_set_callback(req, desc->flags, NULL, NULL); - skcipher_request_set_crypt(req, src, dst, nbytes, desc->info); - ret = crypto_skcipher_encrypt(req); - skcipher_request_zero(req); - } else { - blkcipher_walk_init(, dst, src, nbytes); - ret
[PATCH] crypto: vmx - convert to SPDX license identifiers
From: Eric Biggers Remove the boilerplate license text and replace it with the equivalent SPDX license identifier. Signed-off-by: Eric Biggers --- drivers/crypto/vmx/aes.c | 14 +- drivers/crypto/vmx/aes_cbc.c | 14 +- drivers/crypto/vmx/aes_ctr.c | 14 +- drivers/crypto/vmx/aes_xts.c | 14 +- drivers/crypto/vmx/vmx.c | 14 +- 5 files changed, 5 insertions(+), 65 deletions(-) diff --git a/drivers/crypto/vmx/aes.c b/drivers/crypto/vmx/aes.c index 603a620819941..2e9476158df49 100644 --- a/drivers/crypto/vmx/aes.c +++ b/drivers/crypto/vmx/aes.c @@ -1,21 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /** * AES routines supporting VMX instructions on the Power 8 * * Copyright (C) 2015 International Business Machines Inc. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; version 2 only. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * * Author: Marcelo Henrique Cerri */ diff --git a/drivers/crypto/vmx/aes_cbc.c b/drivers/crypto/vmx/aes_cbc.c index a1a9a6f0d42cf..dae8af3c46dce 100644 --- a/drivers/crypto/vmx/aes_cbc.c +++ b/drivers/crypto/vmx/aes_cbc.c @@ -1,21 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /** * AES CBC routines supporting VMX instructions on the Power 8 * * Copyright (C) 2015 International Business Machines Inc. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; version 2 only. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * * Author: Marcelo Henrique Cerri */ diff --git a/drivers/crypto/vmx/aes_ctr.c b/drivers/crypto/vmx/aes_ctr.c index 192a53512f5e8..dc31101178446 100644 --- a/drivers/crypto/vmx/aes_ctr.c +++ b/drivers/crypto/vmx/aes_ctr.c @@ -1,21 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /** * AES CTR routines supporting VMX instructions on the Power 8 * * Copyright (C) 2015 International Business Machines Inc. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; version 2 only. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * * Author: Marcelo Henrique Cerri */ diff --git a/drivers/crypto/vmx/aes_xts.c b/drivers/crypto/vmx/aes_xts.c index 00d412d811ae6..aee1339f134ec 100644 --- a/drivers/crypto/vmx/aes_xts.c +++ b/drivers/crypto/vmx/aes_xts.c @@ -1,21 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /** * AES XTS routines supporting VMX In-core instructions on Power 8 * * Copyright (C) 2015 International Business Machines Inc. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundations; version 2 only. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY of FITNESS FOR A PARTICUPAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - * * Author: Leonidas S. Barbosa */ diff --git a/drivers/crypto/vmx/vmx.c b/drivers/crypto/vmx/vmx.c index a9f5198306155..abd89c2bcec4d 100644 --- a/drivers/crypto/vmx/vmx.c +++ b/drivers/crypto/vmx/vmx.c @@ -1,21 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /** * Routines supporting VMX instructions on
Re: [PATCH] crypto: vmx - CTR: always increment IV as quadword
On Mon, May 20, 2019 at 11:59:05AM +1000, Daniel Axtens wrote: > Daniel Axtens writes: > > > The kernel self-tests picked up an issue with CTR mode: > > alg: skcipher: p8_aes_ctr encryption test failed (wrong result) on test > > vector 3, cfg="uneven misaligned splits, may sleep" > > > > Test vector 3 has an IV of FFFD, so > > after 3 increments it should wrap around to 0. > > > > In the aesp8-ppc code from OpenSSL, there are two paths that > > increment IVs: the bulk (8 at a time) path, and the individual > > path which is used when there are fewer than 8 AES blocks to > > process. > > > > In the bulk path, the IV is incremented with vadduqm: "Vector > > Add Unsigned Quadword Modulo", which does 128-bit addition. > > > > In the individual path, however, the IV is incremented with > > vadduwm: "Vector Add Unsigned Word Modulo", which instead > > does 4 32-bit additions. Thus the IV would instead become > > , throwing off the result. > > > > Use vadduqm. > > > > This was probably a typo originally, what with q and w being > > adjacent. It is a pretty narrow edge case: I am really > > impressed by the quality of the kernel self-tests! > > > > Fixes: 5c380d623ed3 ("crypto: vmx - Add support for VMS instructions by > > ASM") > > Cc: sta...@vger.kernel.org > > Signed-off-by: Daniel Axtens > > > > --- > > > > I'll pass this along internally to get it into OpenSSL as well. > > I passed this along to OpenSSL and got pretty comprehensively schooled: > https://github.com/openssl/openssl/pull/8942 > > It seems we tweak the openssl code to use a 128-bit counter, whereas > the original code was in fact designed for a 32-bit counter. We must > have changed the vaddu instruction in the bulk path but not in the > individual path, as they're both vadduwm (4x32-bit) upstream. > > I think this change is still correct with regards to the kernel, > but I guess it's probably something where I should have done a more > thorough read of the documentation before diving in to the code, and > perhaps we should note it in the code somewhere too. Ah well. > > Regards, > Daniel > Ah, I didn't realize there are multiple conventions for CTR. Yes, all CTR implementations in the kernel follow the 128-bit convention, and evidently the test vectors test for that. Apparently the VMX OpenSSL code got incompletely changed from the 32-bit convention by this commit, so that's what you're fixing: commit 1d4aa0b4c1816e8ca92a6aadb0d8f6b43c56c0d0 Author: Leonidas Da Silva Barbosa Date: Fri Aug 14 10:12:22 2015 -0300 crypto: vmx - Fixing AES-CTR counter bug AES-CTR is using a counter 8bytes-8bytes what miss match with kernel specs. In the previous code a vadduwm was done to increment counter. Replacing this for a vadduqm now considering both cases counter 8-8 bytes and full 16bytes. A comment in the code would indeed be helpful. Note that the kernel doesn't currently need a 32-bit CTR implementation for GCM like OpenSSL does, because the kernel currently only supports 12-byte IVs with GCM. So the low 32 bits of the counter start at 1 and don't overflow, regardless of whether the counter is incremented mod 2^32 or mod 2^128. - Eric
Re: [PATCH 5/5] asm-generic: remove ptrace.h
On 05/20, Christoph Hellwig wrote: > > No one is using this header anymore. > > Signed-off-by: Christoph Hellwig > Acked-by: Arnd Bergmann > --- > MAINTAINERS| 1 - > arch/mips/include/asm/ptrace.h | 5 --- > include/asm-generic/ptrace.h | 74 -- > 3 files changed, 80 deletions(-) > delete mode 100644 include/asm-generic/ptrace.h Acked-by: Oleg Nesterov
Re: [PATCH 4/5] x86: don't use asm-generic/ptrace.h
On 05/20, Christoph Hellwig wrote: > > Doing the indirection through macros for the regs accessors just > makes them harder to read, so implement the helpers directly. Acked-by: Oleg Nesterov
[PATCH v3 1/2] pid: add pidfd_open()
This adds the pidfd_open() syscall. It allows a caller to retrieve pollable pidfds for a process which did not get created via CLONE_PIDFD, i.e. for a process that is created via traditional fork()/clone() calls that is only referenced by a PID: int pidfd = pidfd_open(1234, 0); ret = pidfd_send_signal(pidfd, SIGSTOP, NULL, 0); With the introduction of pidfds through CLONE_PIDFD it is possible to created pidfds at process creation time. However, a lot of processes get created with traditional PID-based calls such as fork() or clone() (without CLONE_PIDFD). For these processes a caller can currently not create a pollable pidfd. This is a problem for Android's low memory killer (LMK) and service managers such as systemd. Both are examples of tools that want to make use of pidfds to get reliable notification of process exit for non-parents (pidfd polling) and race-free signal sending (pidfd_send_signal()). They intend to switch to this API for process supervision/management as soon as possible. Having no way to get pollable pidfds from PID-only processes is one of the biggest blockers for them in adopting this api. With pidfd_open() making it possible to retrieve pidfds for PID-based processes we enable them to adopt this api. In line with Arnd's recent changes to consolidate syscall numbers across architectures, I have added the pidfd_open() syscall to all architectures at the same time. Signed-off-by: Christian Brauner Reviewed-by: Oleg Nesterov Acked-by: Arnd Bergmann Cc: "Eric W. Biederman" Cc: Kees Cook Cc: Joel Fernandes (Google) Cc: Thomas Gleixner Cc: Jann Horn Cc: David Howells Cc: Andy Lutomirsky Cc: Andrew Morton Cc: Aleksa Sarai Cc: Linus Torvalds Cc: Al Viro Cc: linux-...@vger.kernel.org --- v1: - kbuild test robot : - add missing entry for pidfd_open to arch/arm/tools/syscall.tbl - Oleg Nesterov : - use simpler thread-group leader check v2: - Oleg Nesterov : - avoid using additional variable - remove unneeded comment - Arnd Bergmann : - switch from 428 to 434 since the new mount api has taken it - bump syscall numbers in arch/arm64/include/asm/unistd.h - Joel Fernandes (Google) : - switch from ESRCH to EINVAL when the passed-in pid does not refer to a thread-group leader - Christian Brauner : - rebase on v5.2-rc1 - adapt syscall number to account for new mount api syscalls v3: - Arnd Bergmann : - add missing syscall entries for mips-o32 and mips-n64 --- arch/alpha/kernel/syscalls/syscall.tbl | 1 + arch/arm/tools/syscall.tbl | 1 + arch/arm64/include/asm/unistd.h | 2 +- arch/arm64/include/asm/unistd32.h | 2 + arch/ia64/kernel/syscalls/syscall.tbl | 1 + arch/m68k/kernel/syscalls/syscall.tbl | 1 + arch/microblaze/kernel/syscalls/syscall.tbl | 1 + arch/mips/kernel/syscalls/syscall_n32.tbl | 1 + arch/mips/kernel/syscalls/syscall_n64.tbl | 1 + arch/mips/kernel/syscalls/syscall_o32.tbl | 1 + arch/parisc/kernel/syscalls/syscall.tbl | 1 + arch/powerpc/kernel/syscalls/syscall.tbl| 1 + arch/s390/kernel/syscalls/syscall.tbl | 1 + arch/sh/kernel/syscalls/syscall.tbl | 1 + arch/sparc/kernel/syscalls/syscall.tbl | 1 + arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + arch/xtensa/kernel/syscalls/syscall.tbl | 1 + include/linux/pid.h | 1 + include/linux/syscalls.h| 1 + include/uapi/asm-generic/unistd.h | 4 +- kernel/fork.c | 2 +- kernel/pid.c| 43 + 23 files changed, 68 insertions(+), 3 deletions(-) diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index 9e7704e44f6d..1db9bbcfb84e 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -473,3 +473,4 @@ 541common fsconfigsys_fsconfig 542common fsmount sys_fsmount 543common fspick sys_fspick +544common pidfd_open sys_pidfd_open diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index aaf479a9e92d..81e6e1817c45 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -447,3 +447,4 @@ 431common fsconfigsys_fsconfig 432common fsmount sys_fsmount 433common fspick sys_fspick +434common pidfd_open sys_pidfd_open diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index 70e6882853c0..e8f7d95a1481 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -44,7 +44,7 @@ #define __ARM_NR_compat_set_tls(__ARM_NR_COMPAT_BASE + 5) #define __ARM_NR_COMPAT_END(__ARM_NR_COMPAT_BASE +
[PATCH v3 2/2] tests: add pidfd_open() tests
This adds testing for the new pidfd_open() syscalls. Specifically, we test: - that no invalid flags can be passed to pidfd_open() - that no invalid pid can be passed to pidfd_open() - that a pidfd can be retrieved with pidfd_open() - that the retrieved pidfd references the correct pid Signed-off-by: Christian Brauner Cc: Arnd Bergmann Cc: "Eric W. Biederman" Cc: Kees Cook Cc: Joel Fernandes (Google) Cc: Thomas Gleixner Cc: Jann Horn Cc: David Howells Cc: "Michael Kerrisk (man-pages)" Cc: Andy Lutomirsky Cc: Andrew Morton Cc: Oleg Nesterov Cc: Aleksa Sarai Cc: Linus Torvalds Cc: Al Viro Cc: linux-...@vger.kernel.org --- v1: unchanged v2: - Christian Brauner : - set number of planned tests via ksft_set_plan() v3: unchanged --- tools/testing/selftests/pidfd/.gitignore | 1 + tools/testing/selftests/pidfd/Makefile| 2 +- tools/testing/selftests/pidfd/pidfd.h | 57 ++ .../testing/selftests/pidfd/pidfd_open_test.c | 169 ++ tools/testing/selftests/pidfd/pidfd_test.c| 41 + 5 files changed, 229 insertions(+), 41 deletions(-) create mode 100644 tools/testing/selftests/pidfd/pidfd.h create mode 100644 tools/testing/selftests/pidfd/pidfd_open_test.c diff --git a/tools/testing/selftests/pidfd/.gitignore b/tools/testing/selftests/pidfd/.gitignore index 822a1e63d045..16d84d117bc0 100644 --- a/tools/testing/selftests/pidfd/.gitignore +++ b/tools/testing/selftests/pidfd/.gitignore @@ -1 +1,2 @@ +pidfd_open_test pidfd_test diff --git a/tools/testing/selftests/pidfd/Makefile b/tools/testing/selftests/pidfd/Makefile index deaf8073bc06..b36c0be70848 100644 --- a/tools/testing/selftests/pidfd/Makefile +++ b/tools/testing/selftests/pidfd/Makefile @@ -1,6 +1,6 @@ CFLAGS += -g -I../../../../usr/include/ -TEST_GEN_PROGS := pidfd_test +TEST_GEN_PROGS := pidfd_test pidfd_open_test include ../lib.mk diff --git a/tools/testing/selftests/pidfd/pidfd.h b/tools/testing/selftests/pidfd/pidfd.h new file mode 100644 index ..8452e910463f --- /dev/null +++ b/tools/testing/selftests/pidfd/pidfd.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __PIDFD_H +#define __PIDFD_H + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../kselftest.h" + +/* + * The kernel reserves 300 pids via RESERVED_PIDS in kernel/pid.c + * That means, when it wraps around any pid < 300 will be skipped. + * So we need to use a pid > 300 in order to test recycling. + */ +#define PID_RECYCLE 1000 + +/* + * Define a few custom error codes for the child process to clearly indicate + * what is happening. This way we can tell the difference between a system + * error, a test error, etc. + */ +#define PIDFD_PASS 0 +#define PIDFD_FAIL 1 +#define PIDFD_ERROR 2 +#define PIDFD_SKIP 3 +#define PIDFD_XFAIL 4 + +int wait_for_pid(pid_t pid) +{ + int status, ret; + +again: + ret = waitpid(pid, , 0); + if (ret == -1) { + if (errno == EINTR) + goto again; + + return -1; + } + + if (!WIFEXITED(status)) + return -1; + + return WEXITSTATUS(status); +} + + +#endif /* __PIDFD_H */ diff --git a/tools/testing/selftests/pidfd/pidfd_open_test.c b/tools/testing/selftests/pidfd/pidfd_open_test.c new file mode 100644 index ..0377133dd6dc --- /dev/null +++ b/tools/testing/selftests/pidfd/pidfd_open_test.c @@ -0,0 +1,169 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "pidfd.h" +#include "../kselftest.h" + +static inline int sys_pidfd_open(pid_t pid, unsigned int flags) +{ + return syscall(__NR_pidfd_open, pid, flags); +} + +static int safe_int(const char *numstr, int *converted) +{ + char *err = NULL; + long sli; + + errno = 0; + sli = strtol(numstr, , 0); + if (errno == ERANGE && (sli == LONG_MAX || sli == LONG_MIN)) + return -ERANGE; + + if (errno != 0 && sli == 0) + return -EINVAL; + + if (err == numstr || *err != '\0') + return -EINVAL; + + if (sli > INT_MAX || sli < INT_MIN) + return -ERANGE; + + *converted = (int)sli; + return 0; +} + +static int char_left_gc(const char *buffer, size_t len) +{ + size_t i; + + for (i = 0; i < len; i++) { + if (buffer[i] == ' ' || + buffer[i] == '\t') + continue; + + return i; + } + + return 0; +} + +static int char_right_gc(const char *buffer, size_t len) +{ + int i; + + for (i = len - 1; i >= 0; i--) { + if (buffer[i] == ' ' || + buffer[i] == '\t' || + buffer[i] ==
Re: [PATCH] mm: add account_locked_vm utility function
On Mon, May 20, 2019 at 04:19:34PM +1000, Alexey Kardashevskiy wrote: > On 04/05/2019 06:16, Daniel Jordan wrote: > > locked_vm accounting is done roughly the same way in five places, so > > unify them in a helper. Standardize the debug prints, which vary > > slightly. > > And I rather liked that prints were different and tell precisely which > one of three each printk is. I'm not following. One of three...callsites? But there were five callsites. Anyway, I added a _RET_IP_ to the debug print so you can differentiate. > I commented below but in general this seems working. > > Tested-by: Alexey Kardashevskiy Thanks! And for the review as well. > > diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c > > b/drivers/vfio/vfio_iommu_spapr_tce.c > > index 6b64e45a5269..d39a1b830d82 100644 > > --- a/drivers/vfio/vfio_iommu_spapr_tce.c > > +++ b/drivers/vfio/vfio_iommu_spapr_tce.c > > @@ -34,49 +35,13 @@ > > static void tce_iommu_detach_group(void *iommu_data, > > struct iommu_group *iommu_group); > > > > -static long try_increment_locked_vm(struct mm_struct *mm, long npages) > > +static int tce_account_locked_vm(struct mm_struct *mm, unsigned long > > npages, > > +bool inc) > > { > > - long ret = 0, locked, lock_limit; > > - > > if (WARN_ON_ONCE(!mm)) > > return -EPERM; > > > If this WARN_ON is the only reason for having tce_account_locked_vm() > instead of calling account_locked_vm() directly, you can then ditch the > check as I have never ever seen this triggered. Great, will do. > > diff --git a/drivers/vfio/vfio_iommu_type1.c > > b/drivers/vfio/vfio_iommu_type1.c > > index d0f731c9920a..15ac76171ccd 100644 > > --- a/drivers/vfio/vfio_iommu_type1.c > > +++ b/drivers/vfio/vfio_iommu_type1.c > > @@ -273,25 +273,14 @@ static int vfio_lock_acct(struct vfio_dma *dma, long > > npage, bool async) > > return -ESRCH; /* process exited */ > > > > ret = down_write_killable(>mmap_sem); > > - if (!ret) { > > - if (npage > 0) { > > - if (!dma->lock_cap) { > > - unsigned long limit; > > - > > - limit = task_rlimit(dma->task, > > - RLIMIT_MEMLOCK) >> PAGE_SHIFT; > > - > > - if (mm->locked_vm + npage > limit) > > - ret = -ENOMEM; > > - } > > - } > > + if (ret) > > + goto out; > > > A single "goto" to jump just 3 lines below seems unnecessary. No strong preference here, I'll take out the goto. > > +int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool > > inc, > > + struct task_struct *task, bool bypass_rlim) > > +{ > > + unsigned long locked_vm, limit; > > + int ret = 0; > > + > > + locked_vm = mm->locked_vm; > > + if (inc) { > > + if (!bypass_rlim) { > > + limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT; > > + if (locked_vm + pages > limit) { > > + ret = -ENOMEM; > > + goto out; > > + } > > + } > > Nit: > > if (!ret) > > and then you don't need "goto out". Ok, sure. > > + mm->locked_vm = locked_vm + pages; > > + } else { > > + WARN_ON_ONCE(pages > locked_vm); > > + mm->locked_vm = locked_vm - pages; > > > Can go negative here. Not a huge deal but inaccurate imo. I hear you, but setting a negative value to zero, as we had done previously, doesn't make much sense to me.
Re: PROBLEM: Power9: kernel oops on memory hotunplug from ppc64le guest
On Tue, May 21, 2019 at 12:55:49AM +1000, Nicholas Piggin wrote: > Bharata B Rao's on May 21, 2019 12:29 am: > > On Mon, May 20, 2019 at 01:50:35PM +0530, Bharata B Rao wrote: > >> On Mon, May 20, 2019 at 05:00:21PM +1000, Nicholas Piggin wrote: > >> > Bharata B Rao's on May 20, 2019 3:56 pm: > >> > > On Mon, May 20, 2019 at 02:48:35PM +1000, Nicholas Piggin wrote: > >> > >> >> > git bisect points to > >> > >> >> > > >> > >> >> > commit 4231aba000f5a4583dd9f67057aadb68c3eca99d > >> > >> >> > Author: Nicholas Piggin > >> > >> >> > Date: Fri Jul 27 21:48:17 2018 +1000 > >> > >> >> > > >> > >> >> > powerpc/64s: Fix page table fragment refcount race vs > >> > >> >> > speculative references > >> > >> >> > > >> > >> >> > The page table fragment allocator uses the main page > >> > >> >> > refcount racily > >> > >> >> > with respect to speculative references. A customer observed > >> > >> >> > a BUG due > >> > >> >> > to page table page refcount underflow in the fragment > >> > >> >> > allocator. This > >> > >> >> > can be caused by the fragment allocator set_page_count > >> > >> >> > stomping on a > >> > >> >> > speculative reference, and then the speculative failure > >> > >> >> > handler > >> > >> >> > decrements the new reference, and the underflow eventually > >> > >> >> > pops when > >> > >> >> > the page tables are freed. > >> > >> >> > > >> > >> >> > Fix this by using a dedicated field in the struct page for > >> > >> >> > the page > >> > >> >> > table fragment allocator. > >> > >> >> > > >> > >> >> > Fixes: 5c1f6ee9a31c ("powerpc: Reduce PTE table memory > >> > >> >> > wastage") > >> > >> >> > Cc: sta...@vger.kernel.org # v3.10+ > >> > >> >> > >> > >> >> That's the commit that added the BUG_ON(), so prior to that you > >> > >> >> won't > >> > >> >> see the crash. > >> > >> > > >> > >> > Right, but the commit says it fixes page table page refcount > >> > >> > underflow by > >> > >> > introducing a new field >pt_frag_refcount. Now we are hitting > >> > >> > the underflow > >> > >> > for this pt_frag_refcount. > >> > >> > >> > >> The fixed underflow is caused by a bug (race on page count) that got > >> > >> fixed by that patch. You are hitting a different underflow here. It's > >> > >> not certain my patch caused it, I'm just trying to reproduce now. > >> > > > >> > > Ok. > >> > > >> > Can't reproduce I'm afraid, tried adding and removing 8GB memory from a > >> > 4GB guest (via host adding / removing memory device), and it just works. > >> > >> Boot, add 8G, reboot, remove 8G is the sequence to reproduce. > >> > >> > > >> > It's likely to be an edge case like an off by one or rounding error > >> > that just happens to trigger in your config. Might be easiest if you > >> > could test with a debug patch. > >> > >> Sure, I will continue debugging. > > > > When the guest is rebooted after hotplug, the entire memory (which includes > > the hotplugged memory) gets remapped again freshly. However at this time > > since no slab is available yet, pt_frag_refcount never gets initialized as > > we > > never do pte_fragment_alloc() for these mappings. So we right away hit the > > underflow during the first unplug itself, it looks like. > > Nice catch, good debugging work. Thanks, with help from Aneesh. > > > I will check how this can be fixed. > > Tricky problem. What do you think? You might be able to make the early > page table allocations in the same pattern as the frag allocations, and > then fill in the struct page metadata when you have those. Will explore. > > Other option may be create a new set of page tables after mm comes up > to replace the early page tables with. That's a bigger hammer though. Will also check if similar scenario exists on x86 and if so, how and when pte frag data is fixed there. Regards, Bharata.
Re: PROBLEM: Power9: kernel oops on memory hotunplug from ppc64le guest
On 5/20/19 8:25 PM, Nicholas Piggin wrote: Bharata B Rao's on May 21, 2019 12:29 am: On Mon, May 20, 2019 at 01:50:35PM +0530, Bharata B Rao wrote: On Mon, May 20, 2019 at 05:00:21PM +1000, Nicholas Piggin wrote: Bharata B Rao's on May 20, 2019 3:56 pm: On Mon, May 20, 2019 at 02:48:35PM +1000, Nicholas Piggin wrote: git bisect points to commit 4231aba000f5a4583dd9f67057aadb68c3eca99d Author: Nicholas Piggin Date: Fri Jul 27 21:48:17 2018 +1000 powerpc/64s: Fix page table fragment refcount race vs speculative references The page table fragment allocator uses the main page refcount racily with respect to speculative references. A customer observed a BUG due to page table page refcount underflow in the fragment allocator. This can be caused by the fragment allocator set_page_count stomping on a speculative reference, and then the speculative failure handler decrements the new reference, and the underflow eventually pops when the page tables are freed. Fix this by using a dedicated field in the struct page for the page table fragment allocator. Fixes: 5c1f6ee9a31c ("powerpc: Reduce PTE table memory wastage") Cc: sta...@vger.kernel.org # v3.10+ That's the commit that added the BUG_ON(), so prior to that you won't see the crash. Right, but the commit says it fixes page table page refcount underflow by introducing a new field >pt_frag_refcount. Now we are hitting the underflow for this pt_frag_refcount. The fixed underflow is caused by a bug (race on page count) that got fixed by that patch. You are hitting a different underflow here. It's not certain my patch caused it, I'm just trying to reproduce now. Ok. Can't reproduce I'm afraid, tried adding and removing 8GB memory from a 4GB guest (via host adding / removing memory device), and it just works. Boot, add 8G, reboot, remove 8G is the sequence to reproduce. It's likely to be an edge case like an off by one or rounding error that just happens to trigger in your config. Might be easiest if you could test with a debug patch. Sure, I will continue debugging. When the guest is rebooted after hotplug, the entire memory (which includes the hotplugged memory) gets remapped again freshly. However at this time since no slab is available yet, pt_frag_refcount never gets initialized as we never do pte_fragment_alloc() for these mappings. So we right away hit the underflow during the first unplug itself, it looks like. Nice catch, good debugging work. I will check how this can be fixed. Tricky problem. What do you think? You might be able to make the early page table allocations in the same pattern as the frag allocations, and then fill in the struct page metadata when you have those. I guess we need to do something similar to what x86 does. We need to walk the init_mm page table again and re-init struct page and other data structures backing the tables? -aneesh
Re: [PATCH 1/3] powerpc/pseries: Simplify cpu readd to use drc_index
Tyrel Datwyler writes: > On 05/16/2019 12:17 PM, Nathan Lynch wrote: >> Tyrel Datwyler writes: >>> The current dlpar_cpu_readd() takes in a cpu_id and uses that to look up >>> the cpus device_node so that we can get at the ibm,my-drc-index >>> property. The only user of cpu readd is an OF notifier call back. This >>> call back already has a reference to the device_node and therefore can >>> retrieve the drc_index from the device_node. >> >> dlpar_cpu_readd is a hack to try to change the CPU-node relationship at >> runtime without destabilizing the system. It doesn't accomplish that and >> it should just be removed (and I'm working on that). >> > > I will politely disagree. We've done exactly this from userspace for > years. My experience still suggests that memory affinity is the > problem area, and that the work to push this all into the kernel > originally was poorly tested. Kernel implementation details aside, how do you change the cpu-node relationship at runtime without breaking NUMA-aware applications? Is this not a fundamental issue to address before adding code like this?
Re: [PATCH v2 1/2] pid: add pidfd_open()
On Mon, May 20, 2019 at 4:48 PM Christian Brauner wrote: > > On Mon, May 20, 2019 at 04:37:03PM +0200, Arnd Bergmann wrote: > > On Mon, May 20, 2019 at 3:46 PM Christian Brauner > > wrote: > > > > > > In line with Arnd's recent changes to consolidate syscall numbers across > > > architectures, I have added the pidfd_open() syscall to all architectures > > > at the same time. > > > > Thanks! I've checked that the ones you have added are all > > done correctly. However, double-checking that you got all of them, > > I noticed that you missed mips-o32 and mips-n64. With those added: > > > > Acked-by: Arnd Bergmann > > Perfect, will plumb mips-o32 and mips-n64 and resend once more with your > ack added. > Sidenote: You plan on merging the common syscall tables or will there be > a script to do this work per-arch in the future? David Howells also asked about this. I think having a common table will be best in the long run, patches welcome. As you noticed, there are still a few minor differences between the files on mips, arm, x86, alpha and s390, and we are missing the .tbl files for arm-compat and asm-generic, as well as an architecture independent script. Once that is all taken care of, we can move the entries for syscall 403 and higher into a common file, and change the script to pick up the contents from there in addition to the architecture specific file. Arnd
Re: PROBLEM: Power9: kernel oops on memory hotunplug from ppc64le guest
Bharata B Rao's on May 21, 2019 12:29 am: > On Mon, May 20, 2019 at 01:50:35PM +0530, Bharata B Rao wrote: >> On Mon, May 20, 2019 at 05:00:21PM +1000, Nicholas Piggin wrote: >> > Bharata B Rao's on May 20, 2019 3:56 pm: >> > > On Mon, May 20, 2019 at 02:48:35PM +1000, Nicholas Piggin wrote: >> > >> >> > git bisect points to >> > >> >> > >> > >> >> > commit 4231aba000f5a4583dd9f67057aadb68c3eca99d >> > >> >> > Author: Nicholas Piggin >> > >> >> > Date: Fri Jul 27 21:48:17 2018 +1000 >> > >> >> > >> > >> >> > powerpc/64s: Fix page table fragment refcount race vs >> > >> >> > speculative references >> > >> >> > >> > >> >> > The page table fragment allocator uses the main page refcount >> > >> >> > racily >> > >> >> > with respect to speculative references. A customer observed a >> > >> >> > BUG due >> > >> >> > to page table page refcount underflow in the fragment >> > >> >> > allocator. This >> > >> >> > can be caused by the fragment allocator set_page_count >> > >> >> > stomping on a >> > >> >> > speculative reference, and then the speculative failure handler >> > >> >> > decrements the new reference, and the underflow eventually >> > >> >> > pops when >> > >> >> > the page tables are freed. >> > >> >> > >> > >> >> > Fix this by using a dedicated field in the struct page for the >> > >> >> > page >> > >> >> > table fragment allocator. >> > >> >> > >> > >> >> > Fixes: 5c1f6ee9a31c ("powerpc: Reduce PTE table memory >> > >> >> > wastage") >> > >> >> > Cc: sta...@vger.kernel.org # v3.10+ >> > >> >> >> > >> >> That's the commit that added the BUG_ON(), so prior to that you won't >> > >> >> see the crash. >> > >> > >> > >> > Right, but the commit says it fixes page table page refcount >> > >> > underflow by >> > >> > introducing a new field >pt_frag_refcount. Now we are hitting >> > >> > the underflow >> > >> > for this pt_frag_refcount. >> > >> >> > >> The fixed underflow is caused by a bug (race on page count) that got >> > >> fixed by that patch. You are hitting a different underflow here. It's >> > >> not certain my patch caused it, I'm just trying to reproduce now. >> > > >> > > Ok. >> > >> > Can't reproduce I'm afraid, tried adding and removing 8GB memory from a >> > 4GB guest (via host adding / removing memory device), and it just works. >> >> Boot, add 8G, reboot, remove 8G is the sequence to reproduce. >> >> > >> > It's likely to be an edge case like an off by one or rounding error >> > that just happens to trigger in your config. Might be easiest if you >> > could test with a debug patch. >> >> Sure, I will continue debugging. > > When the guest is rebooted after hotplug, the entire memory (which includes > the hotplugged memory) gets remapped again freshly. However at this time > since no slab is available yet, pt_frag_refcount never gets initialized as we > never do pte_fragment_alloc() for these mappings. So we right away hit the > underflow during the first unplug itself, it looks like. Nice catch, good debugging work. > I will check how this can be fixed. Tricky problem. What do you think? You might be able to make the early page table allocations in the same pattern as the frag allocations, and then fill in the struct page metadata when you have those. Other option may be create a new set of page tables after mm comes up to replace the early page tables with. That's a bigger hammer though. Thanks, Nick
Re: [RFC PATCH 2/4] x86/ftrace: Fix use of flags in ftrace_replace_code()
On Mon, 20 May 2019 20:12:48 +0530 "Naveen N. Rao" wrote: > Thanks, that definitely helps make things clearer. A very small nit from > your first patch -- it would be good to also convert the calls to > ftrace_check_record() to use 'true' or 'false' for the 'update' field. Heh, I was so focused on the "enable" part, I did the "update" as a second thought, and forgot to update the callers. Thanks for pointing that out. > > I will test my series in more detail and post a v1. Great. -- Steve
Re: [PATCH v2 1/2] pid: add pidfd_open()
On Mon, May 20, 2019 at 04:37:03PM +0200, Arnd Bergmann wrote: > On Mon, May 20, 2019 at 3:46 PM Christian Brauner > wrote: > > > > In line with Arnd's recent changes to consolidate syscall numbers across > > architectures, I have added the pidfd_open() syscall to all architectures > > at the same time. > > Thanks! I've checked that the ones you have added are all > done correctly. However, double-checking that you got all of them, > I noticed that you missed mips-o32 and mips-n64. With those added: > > Acked-by: Arnd Bergmann Perfect, will plumb mips-o32 and mips-n64 and resend once more with your ack added. Sidenote: You plan on merging the common syscall tables or will there be a script to do this work per-arch in the future?
[PATCH 10/10] docs: fix broken documentation links
Mostly due to x86 and acpi conversion, several documentation links are still pointing to the old file. Fix them. Signed-off-by: Mauro Carvalho Chehab --- Documentation/acpi/dsd/leds.txt | 2 +- Documentation/admin-guide/kernel-parameters.rst | 6 +++--- Documentation/admin-guide/kernel-parameters.txt | 16 Documentation/admin-guide/ras.rst| 2 +- .../devicetree/bindings/net/fsl-enetc.txt| 7 +++ .../bindings/pci/amlogic,meson-pcie.txt | 2 +- .../bindings/regulator/qcom,rpmh-regulator.txt | 2 +- Documentation/devicetree/booting-without-of.txt | 2 +- Documentation/driver-api/gpio/board.rst | 2 +- Documentation/driver-api/gpio/consumer.rst | 2 +- .../firmware-guide/acpi/enumeration.rst | 2 +- .../firmware-guide/acpi/method-tracing.rst | 2 +- Documentation/i2c/instantiating-devices | 2 +- Documentation/sysctl/kernel.txt | 4 ++-- .../translations/it_IT/process/4.Coding.rst | 2 +- .../translations/it_IT/process/howto.rst | 2 +- .../it_IT/process/stable-kernel-rules.rst| 4 ++-- .../translations/zh_CN/process/4.Coding.rst | 2 +- Documentation/x86/x86_64/5level-paging.rst | 2 +- Documentation/x86/x86_64/boot-options.rst| 4 ++-- .../x86/x86_64/fake-numa-for-cpusets.rst | 2 +- MAINTAINERS | 6 +++--- arch/arm/Kconfig | 2 +- arch/arm64/kernel/kexec_image.c | 2 +- arch/powerpc/Kconfig | 2 +- arch/x86/Kconfig | 16 arch/x86/Kconfig.debug | 2 +- arch/x86/boot/header.S | 2 +- arch/x86/entry/entry_64.S| 2 +- arch/x86/include/asm/bootparam_utils.h | 2 +- arch/x86/include/asm/page_64_types.h | 2 +- arch/x86/include/asm/pgtable_64_types.h | 2 +- arch/x86/kernel/cpu/microcode/amd.c | 2 +- arch/x86/kernel/kexec-bzimage64.c| 2 +- arch/x86/kernel/pci-dma.c| 2 +- arch/x86/mm/tlb.c| 2 +- arch/x86/platform/pvh/enlighten.c| 2 +- drivers/acpi/Kconfig | 10 +- drivers/net/ethernet/faraday/ftgmac100.c | 2 +- .../fieldbus/Documentation/fieldbus_dev.txt | 4 ++-- drivers/vhost/vhost.c| 2 +- include/acpi/acpi_drivers.h | 2 +- include/linux/fs_context.h | 2 +- include/linux/lsm_hooks.h| 2 +- mm/Kconfig | 2 +- security/Kconfig | 2 +- tools/include/linux/err.h| 2 +- tools/objtool/Documentation/stack-validation.txt | 4 ++-- tools/testing/selftests/x86/protection_keys.c| 2 +- 49 files changed, 78 insertions(+), 79 deletions(-) diff --git a/Documentation/acpi/dsd/leds.txt b/Documentation/acpi/dsd/leds.txt index 81a63af42ed2..cc58b1a574c5 100644 --- a/Documentation/acpi/dsd/leds.txt +++ b/Documentation/acpi/dsd/leds.txt @@ -96,4 +96,4 @@ where http://www.uefi.org/sites/default/files/resources/_DSD-hierarchical-data-extension-UUID-v1.1.pdf>, referenced 2019-02-21. -[7] Documentation/acpi/dsd/data-node-reference.txt +[7] Documentation/firmware-guide/acpi/dsd/data-node-references.rst diff --git a/Documentation/admin-guide/kernel-parameters.rst b/Documentation/admin-guide/kernel-parameters.rst index 0124980dca2d..8d3273e32eb1 100644 --- a/Documentation/admin-guide/kernel-parameters.rst +++ b/Documentation/admin-guide/kernel-parameters.rst @@ -167,7 +167,7 @@ parameter is applicable:: X86-32 X86-32, aka i386 architecture is enabled. X86-64 X86-64 architecture is enabled. More X86-64 boot options can be found in - Documentation/x86/x86_64/boot-options.txt . + Documentation/x86/x86_64/boot-options.rst. X86 Either 32-bit or 64-bit x86 (same as X86-32+X86-64) X86_UV SGI UV support is enabled. XEN Xen support is enabled @@ -181,10 +181,10 @@ In addition, the following text indicates that the option:: Parameters denoted with BOOT are actually interpreted by the boot loader, and have no meaning to the kernel directly. Do not modify the syntax of boot loader parameters without extreme -need or coordination with . +need or coordination with . There are also arch-specific kernel-parameters not documented here. -See for example . +See for example . Note that ALL kernel parameters listed below are CASE SENSITIVE, and that a trailing = on the name of any parameter states that that parameter will diff --git a/Documentation/admin-guide/kernel-parameters.txt
Re: [RFC PATCH 2/4] x86/ftrace: Fix use of flags in ftrace_replace_code()
Hi Steven, Steven Rostedt wrote: On Mon, 20 May 2019 09:13:20 -0400 Steven Rostedt wrote: > I haven't yet tested this patch on x86, but this looked wrong so sending > this as a RFC. This code has been through a bit of updates, and I need to go through and clean it up. I'll have to take a look and convert "int" to "bool" so that "enable" is not confusing. Thanks, I think I'll try to do a clean up first, and then this patch shouldn't "look wrong" after that. I'm going to apply the attached two patches. There may be some conflicts between yours here and these, but nothing that Linus can't figure out. Do you feel more comfortable with this code, if these patches are applied? Thanks, that definitely helps make things clearer. A very small nit from your first patch -- it would be good to also convert the calls to ftrace_check_record() to use 'true' or 'false' for the 'update' field. I will test my series in more detail and post a v1. - Naveen
Re: [PATCH v2 1/2] pid: add pidfd_open()
On Mon, May 20, 2019 at 3:46 PM Christian Brauner wrote: > > In line with Arnd's recent changes to consolidate syscall numbers across > architectures, I have added the pidfd_open() syscall to all architectures > at the same time. Thanks! I've checked that the ones you have added are all done correctly. However, double-checking that you got all of them, I noticed that you missed mips-o32 and mips-n64. With those added: Acked-by: Arnd Bergmann
Re: PROBLEM: Power9: kernel oops on memory hotunplug from ppc64le guest
On Mon, May 20, 2019 at 01:50:35PM +0530, Bharata B Rao wrote: > On Mon, May 20, 2019 at 05:00:21PM +1000, Nicholas Piggin wrote: > > Bharata B Rao's on May 20, 2019 3:56 pm: > > > On Mon, May 20, 2019 at 02:48:35PM +1000, Nicholas Piggin wrote: > > >> >> > git bisect points to > > >> >> > > > >> >> > commit 4231aba000f5a4583dd9f67057aadb68c3eca99d > > >> >> > Author: Nicholas Piggin > > >> >> > Date: Fri Jul 27 21:48:17 2018 +1000 > > >> >> > > > >> >> > powerpc/64s: Fix page table fragment refcount race vs > > >> >> > speculative references > > >> >> > > > >> >> > The page table fragment allocator uses the main page refcount > > >> >> > racily > > >> >> > with respect to speculative references. A customer observed a > > >> >> > BUG due > > >> >> > to page table page refcount underflow in the fragment > > >> >> > allocator. This > > >> >> > can be caused by the fragment allocator set_page_count stomping > > >> >> > on a > > >> >> > speculative reference, and then the speculative failure handler > > >> >> > decrements the new reference, and the underflow eventually pops > > >> >> > when > > >> >> > the page tables are freed. > > >> >> > > > >> >> > Fix this by using a dedicated field in the struct page for the > > >> >> > page > > >> >> > table fragment allocator. > > >> >> > > > >> >> > Fixes: 5c1f6ee9a31c ("powerpc: Reduce PTE table memory wastage") > > >> >> > Cc: sta...@vger.kernel.org # v3.10+ > > >> >> > > >> >> That's the commit that added the BUG_ON(), so prior to that you won't > > >> >> see the crash. > > >> > > > >> > Right, but the commit says it fixes page table page refcount underflow > > >> > by > > >> > introducing a new field >pt_frag_refcount. Now we are hitting > > >> > the underflow > > >> > for this pt_frag_refcount. > > >> > > >> The fixed underflow is caused by a bug (race on page count) that got > > >> fixed by that patch. You are hitting a different underflow here. It's > > >> not certain my patch caused it, I'm just trying to reproduce now. > > > > > > Ok. > > > > Can't reproduce I'm afraid, tried adding and removing 8GB memory from a > > 4GB guest (via host adding / removing memory device), and it just works. > > Boot, add 8G, reboot, remove 8G is the sequence to reproduce. > > > > > It's likely to be an edge case like an off by one or rounding error > > that just happens to trigger in your config. Might be easiest if you > > could test with a debug patch. > > Sure, I will continue debugging. When the guest is rebooted after hotplug, the entire memory (which includes the hotplugged memory) gets remapped again freshly. However at this time since no slab is available yet, pt_frag_refcount never gets initialized as we never do pte_fragment_alloc() for these mappings. So we right away hit the underflow during the first unplug itself, it looks like. I will check how this can be fixed. > > Regards, > Bharata.
[PATCH] misc: remove redundant 'default n' from Kconfig-s
'default n' is the default value for any bool or tristate Kconfig setting so there is no need to write it explicitly. Also since commit f467c5640c29 ("kconfig: only write '# CONFIG_FOO is not set' for visible symbols") the Kconfig behavior is the same regardless of 'default n' being present or not: ... One side effect of (and the main motivation for) this change is making the following two definitions behave exactly the same: config FOO bool config FOO bool default n With this change, neither of these will generate a '# CONFIG_FOO is not set' line (assuming FOO isn't selected/implied). That might make it clearer to people that a bare 'default n' is redundant. ... Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/misc/Kconfig | 10 -- drivers/misc/altera-stapl/Kconfig |1 - drivers/misc/c2port/Kconfig |2 -- drivers/misc/cb710/Kconfig|1 - drivers/misc/cxl/Kconfig |3 --- drivers/misc/echo/Kconfig |1 - drivers/misc/genwqe/Kconfig |1 - drivers/misc/lis3lv02d/Kconfig|2 -- drivers/misc/ocxl/Kconfig |1 - 9 files changed, 22 deletions(-) Index: b/drivers/misc/Kconfig === --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -8,7 +8,6 @@ config SENSORS_LIS3LV02D tristate depends on INPUT select INPUT_POLLDEV - default n config AD525X_DPOT tristate "Analog Devices Digital Potentiometers" @@ -61,7 +60,6 @@ config ATMEL_TCLIB config DUMMY_IRQ tristate "Dummy IRQ handler" - default n ---help--- This module accepts a single 'irq' parameter, which it should register for. The sole purpose of this module is to help with debugging of systems on @@ -117,7 +115,6 @@ config PHANTOM config INTEL_MID_PTI tristate "Parallel Trace Interface for MIPI P1149.7 cJTAG standard" depends on PCI && TTY && (X86_INTEL_MID || COMPILE_TEST) - default n help The PTI (Parallel Trace Interface) driver directs trace data routed from various parts in the system out @@ -193,7 +190,6 @@ config ATMEL_SSC config ENCLOSURE_SERVICES tristate "Enclosure Services" - default n help Provides support for intelligent enclosures (bays which contain storage devices). You also need either a host @@ -217,7 +213,6 @@ config SGI_XP config CS5535_MFGPT tristate "CS5535/CS5536 Geode Multi-Function General Purpose Timer (MFGPT) support" depends on MFD_CS5535 - default n help This driver provides access to MFGPT functionality for other drivers that need timers. MFGPTs are available in the CS5535 and @@ -250,7 +245,6 @@ config CS5535_CLOCK_EVENT_SRC config HP_ILO tristate "Channel interface driver for the HP iLO processor" depends on PCI - default n help The channel interface driver allows applications to communicate with iLO management processors present on HP ProLiant servers. @@ -285,7 +279,6 @@ config QCOM_FASTRPC config SGI_GRU tristate "SGI GRU driver" depends on X86_UV && SMP - default n select MMU_NOTIFIER ---help--- The GRU is a hardware resource located in the system chipset. The GRU @@ -300,7 +293,6 @@ config SGI_GRU config SGI_GRU_DEBUG bool "SGI GRU driver debug" depends on SGI_GRU - default n ---help--- This option enables additional debugging code for the SGI GRU driver. If you are unsure, say N. @@ -358,7 +350,6 @@ config SENSORS_BH1770 config SENSORS_APDS990X tristate "APDS990X combined als and proximity sensors" depends on I2C -default n ---help--- Say Y here if you want to build a driver for Avago APDS990x combined ambient light and proximity sensor chip. @@ -386,7 +377,6 @@ config DS1682 config SPEAR13XX_PCIE_GADGET bool "PCIe gadget support for SPEAr13XX platform" depends on ARCH_SPEAR13XX && BROKEN - default n help This option enables gadget support for PCIe controller. If board file defines any controller as PCIe endpoint then a sysfs Index: b/drivers/misc/altera-stapl/Kconfig === --- a/drivers/misc/altera-stapl/Kconfig +++ b/drivers/misc/altera-stapl/Kconfig @@ -4,6 +4,5 @@ comment "Altera FPGA firmware download m config ALTERA_STAPL tristate "Altera FPGA firmware download module" depends on I2C - default n help An Altera FPGA module. Say Y when you want to support this tool. Index: b/drivers/misc/c2port/Kconfig
Re: [PATCH v2 1/2] pid: add pidfd_open()
On Mon, May 20, 2019 at 3:46 PM Christian Brauner wrote: > This adds the pidfd_open() syscall. It allows a caller to retrieve pollable > pidfds for a process which did not get created via CLONE_PIDFD, i.e. for a > process that is created via traditional fork()/clone() calls that is only > referenced by a PID: > > int pidfd = pidfd_open(1234, 0); > ret = pidfd_send_signal(pidfd, SIGSTOP, NULL, 0); > > With the introduction of pidfds through CLONE_PIDFD it is possible to > created pidfds at process creation time. > However, a lot of processes get created with traditional PID-based calls > such as fork() or clone() (without CLONE_PIDFD). For these processes a > caller can currently not create a pollable pidfd. This is a problem for > Android's low memory killer (LMK) and service managers such as systemd. > Both are examples of tools that want to make use of pidfds to get reliable > notification of process exit for non-parents (pidfd polling) and race-free > signal sending (pidfd_send_signal()). They intend to switch to this API for > process supervision/management as soon as possible. Having no way to get > pollable pidfds from PID-only processes is one of the biggest blockers for > them in adopting this api. With pidfd_open() making it possible to retrieve > pidfds for PID-based processes we enable them to adopt this api. > > In line with Arnd's recent changes to consolidate syscall numbers across > architectures, I have added the pidfd_open() syscall to all architectures > at the same time. > > Signed-off-by: Christian Brauner > Reviewed-by: Oleg Nesterov > arch/m68k/kernel/syscalls/syscall.tbl | 1 + Acked-by: Geert Uytterhoeven Gr{oetje,eeting}s, Geert -- Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- ge...@linux-m68k.org In personal conversations with technical people, I call myself a hacker. But when I'm talking to journalists I just say "programmer" or something like that. -- Linus Torvalds
[PATCH v7 1/1] iommu: enhance IOMMU dma mode build options
First, add build option IOMMU_DEFAULT_{LAZY|STRICT}, so that we have the opportunity to set {lazy|strict} mode as default at build time. Then put the three config options in an choice, make people can only choose one of the three at a time. The default IOMMU dma modes on each ARCHs have no change. Signed-off-by: Zhen Lei --- arch/ia64/kernel/pci-dma.c| 2 +- arch/powerpc/platforms/powernv/pci-ioda.c | 3 ++- arch/s390/pci/pci_dma.c | 2 +- arch/x86/kernel/pci-dma.c | 7 ++--- drivers/iommu/Kconfig | 44 ++- drivers/iommu/amd_iommu_init.c| 3 ++- drivers/iommu/intel-iommu.c | 2 +- drivers/iommu/iommu.c | 3 ++- 8 files changed, 48 insertions(+), 18 deletions(-) diff --git a/arch/ia64/kernel/pci-dma.c b/arch/ia64/kernel/pci-dma.c index fe988c49f01ce6a..655511dbf3c3b34 100644 --- a/arch/ia64/kernel/pci-dma.c +++ b/arch/ia64/kernel/pci-dma.c @@ -22,7 +22,7 @@ int force_iommu __read_mostly; #endif -int iommu_pass_through; +int iommu_pass_through = IS_ENABLED(CONFIG_IOMMU_DEFAULT_PASSTHROUGH); static int __init pci_iommu_init(void) { diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 3ead4c237ed0ec9..383e082a9bb985c 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -85,7 +85,8 @@ void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level, va_end(args); } -static bool pnv_iommu_bypass_disabled __read_mostly; +static bool pnv_iommu_bypass_disabled __read_mostly = + !IS_ENABLED(CONFIG_IOMMU_DEFAULT_PASSTHROUGH); static bool pci_reset_phbs __read_mostly; static int __init iommu_setup(char *str) diff --git a/arch/s390/pci/pci_dma.c b/arch/s390/pci/pci_dma.c index 9e52d1527f71495..784ad1e0acecfb1 100644 --- a/arch/s390/pci/pci_dma.c +++ b/arch/s390/pci/pci_dma.c @@ -17,7 +17,7 @@ static struct kmem_cache *dma_region_table_cache; static struct kmem_cache *dma_page_table_cache; -static int s390_iommu_strict; +static int s390_iommu_strict = IS_ENABLED(CONFIG_IOMMU_DEFAULT_STRICT); static int zpci_refresh_global(struct zpci_dev *zdev) { diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index d460998ae828514..fb2bab42a0a3173 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -43,11 +43,8 @@ * It is also possible to disable by default in kernel config, and enable with * iommu=nopt at boot time. */ -#ifdef CONFIG_IOMMU_DEFAULT_PASSTHROUGH -int iommu_pass_through __read_mostly = 1; -#else -int iommu_pass_through __read_mostly; -#endif +int iommu_pass_through __read_mostly = + IS_ENABLED(CONFIG_IOMMU_DEFAULT_PASSTHROUGH); extern struct iommu_table_entry __iommu_table[], __iommu_table_end[]; diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 6f07f3b21816c64..8a1f1793cde76b4 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -74,17 +74,47 @@ config IOMMU_DEBUGFS debug/iommu directory, and then populate a subdirectory with entries as required. -config IOMMU_DEFAULT_PASSTHROUGH - bool "IOMMU passthrough by default" +choice + prompt "IOMMU default DMA mode" depends on IOMMU_API -help - Enable passthrough by default, removing the need to pass in - iommu.passthrough=on or iommu=pt through command line. If this - is enabled, you can still disable with iommu.passthrough=off - or iommu=nopt depending on the architecture. + default IOMMU_DEFAULT_PASSTHROUGH if (PPC_POWERNV && PCI) + default IOMMU_DEFAULT_LAZY if (AMD_IOMMU || INTEL_IOMMU || S390_IOMMU) + default IOMMU_DEFAULT_STRICT + help + This option allows IOMMU DMA mode to be chose at build time, to + override the default DMA mode of each ARCHs, removing the need to + pass in kernel parameters through command line. You can still use + ARCHs specific boot options to override this option again. + +config IOMMU_DEFAULT_PASSTHROUGH + bool "passthrough" + help + In this mode, the DMA access through IOMMU without any addresses + translation. That means, the wrong or illegal DMA access can not + be caught, no error information will be reported. If unsure, say N here. +config IOMMU_DEFAULT_LAZY + bool "lazy" + help + Support lazy mode, where for every IOMMU DMA unmap operation, the + flush operation of IOTLB and the free operation of IOVA are deferred. + They are only guaranteed to be done before the related IOVA will be + reused. + +config IOMMU_DEFAULT_STRICT + bool "strict" + help + For every IOMMU DMA unmap operation, the flush operation of IOTLB and + the free operation of IOVA are guaranteed to be done in the
[PATCH v7 0/1] iommu: enhance IOMMU dma mode build options
v6 --> v7: 1. Fix some text editing errors v5 --> v6: 1. give up adding boot option iommu.dma_mode v4 --> v5: As Hanjun and Thomas Gleixner's suggestion: 1. Keep the old ARCH specific boot options no change. 2. Keep build option CONFIG_IOMMU_DEFAULT_PASSTHROUGH no change. v4: As Robin Murphy's suggestion: "It's also not necessarily obvious to the user how this interacts with IOMMU_DEFAULT_PASSTHROUGH, so if we really do go down this route, maybe it would be better to refactor the whole lot into a single selection of something like IOMMU_DEFAULT_MODE anyway." In this version, I tried to normalize the IOMMU dma mode boot options for all ARCHs. When IOMMU is enabled, there are 3 dma modes: paasthrough(bypass), lazy(mapping but defer the IOTLB invalidation), strict. But currently each ARCHs defined their private boot options, different with each other. For example, to enable/disable "passthrough", ARM64 use iommu.passthrough=1/0, X86 use iommu=pt/nopt, PPC/POWERNV use iommu=nobypass. Zhen Lei (1): iommu: enhance IOMMU dma mode build options arch/ia64/kernel/pci-dma.c| 2 +- arch/powerpc/platforms/powernv/pci-ioda.c | 3 ++- arch/s390/pci/pci_dma.c | 2 +- arch/x86/kernel/pci-dma.c | 7 ++--- drivers/iommu/Kconfig | 44 ++- drivers/iommu/amd_iommu_init.c| 3 ++- drivers/iommu/intel-iommu.c | 2 +- drivers/iommu/iommu.c | 3 ++- 8 files changed, 48 insertions(+), 18 deletions(-) -- 1.8.3
[PATCH v2 2/2] tests: add pidfd_open() tests
This adds testing for the new pidfd_open() syscalls. Specifically, we test: - that no invalid flags can be passed to pidfd_open() - that no invalid pid can be passed to pidfd_open() - that a pidfd can be retrieved with pidfd_open() - that the retrieved pidfd references the correct pid Signed-off-by: Christian Brauner Cc: Arnd Bergmann Cc: "Eric W. Biederman" Cc: Kees Cook Cc: Joel Fernandes (Google) Cc: Thomas Gleixner Cc: Jann Horn Cc: David Howells Cc: "Michael Kerrisk (man-pages)" Cc: Andy Lutomirsky Cc: Andrew Morton Cc: Oleg Nesterov Cc: Aleksa Sarai Cc: Linus Torvalds Cc: Al Viro Cc: linux-...@vger.kernel.org --- v1: unchanged v2: - Christian Brauner : - set number of planned tests via ksft_set_plan() --- tools/testing/selftests/pidfd/Makefile| 2 +- tools/testing/selftests/pidfd/pidfd.h | 57 ++ .../testing/selftests/pidfd/pidfd_open_test.c | 169 ++ tools/testing/selftests/pidfd/pidfd_test.c| 41 + 4 files changed, 228 insertions(+), 41 deletions(-) create mode 100644 tools/testing/selftests/pidfd/pidfd.h create mode 100644 tools/testing/selftests/pidfd/pidfd_open_test.c diff --git a/tools/testing/selftests/pidfd/Makefile b/tools/testing/selftests/pidfd/Makefile index deaf8073bc06..b36c0be70848 100644 --- a/tools/testing/selftests/pidfd/Makefile +++ b/tools/testing/selftests/pidfd/Makefile @@ -1,6 +1,6 @@ CFLAGS += -g -I../../../../usr/include/ -TEST_GEN_PROGS := pidfd_test +TEST_GEN_PROGS := pidfd_test pidfd_open_test include ../lib.mk diff --git a/tools/testing/selftests/pidfd/pidfd.h b/tools/testing/selftests/pidfd/pidfd.h new file mode 100644 index ..8452e910463f --- /dev/null +++ b/tools/testing/selftests/pidfd/pidfd.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __PIDFD_H +#define __PIDFD_H + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../kselftest.h" + +/* + * The kernel reserves 300 pids via RESERVED_PIDS in kernel/pid.c + * That means, when it wraps around any pid < 300 will be skipped. + * So we need to use a pid > 300 in order to test recycling. + */ +#define PID_RECYCLE 1000 + +/* + * Define a few custom error codes for the child process to clearly indicate + * what is happening. This way we can tell the difference between a system + * error, a test error, etc. + */ +#define PIDFD_PASS 0 +#define PIDFD_FAIL 1 +#define PIDFD_ERROR 2 +#define PIDFD_SKIP 3 +#define PIDFD_XFAIL 4 + +int wait_for_pid(pid_t pid) +{ + int status, ret; + +again: + ret = waitpid(pid, , 0); + if (ret == -1) { + if (errno == EINTR) + goto again; + + return -1; + } + + if (!WIFEXITED(status)) + return -1; + + return WEXITSTATUS(status); +} + + +#endif /* __PIDFD_H */ diff --git a/tools/testing/selftests/pidfd/pidfd_open_test.c b/tools/testing/selftests/pidfd/pidfd_open_test.c new file mode 100644 index ..0377133dd6dc --- /dev/null +++ b/tools/testing/selftests/pidfd/pidfd_open_test.c @@ -0,0 +1,169 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "pidfd.h" +#include "../kselftest.h" + +static inline int sys_pidfd_open(pid_t pid, unsigned int flags) +{ + return syscall(__NR_pidfd_open, pid, flags); +} + +static int safe_int(const char *numstr, int *converted) +{ + char *err = NULL; + long sli; + + errno = 0; + sli = strtol(numstr, , 0); + if (errno == ERANGE && (sli == LONG_MAX || sli == LONG_MIN)) + return -ERANGE; + + if (errno != 0 && sli == 0) + return -EINVAL; + + if (err == numstr || *err != '\0') + return -EINVAL; + + if (sli > INT_MAX || sli < INT_MIN) + return -ERANGE; + + *converted = (int)sli; + return 0; +} + +static int char_left_gc(const char *buffer, size_t len) +{ + size_t i; + + for (i = 0; i < len; i++) { + if (buffer[i] == ' ' || + buffer[i] == '\t') + continue; + + return i; + } + + return 0; +} + +static int char_right_gc(const char *buffer, size_t len) +{ + int i; + + for (i = len - 1; i >= 0; i--) { + if (buffer[i] == ' ' || + buffer[i] == '\t' || + buffer[i] == '\n' || + buffer[i] == '\0') + continue; + + return i + 1; + } + + return 0; +} + +static char *trim_whitespace_in_place(char *buffer) +{ + buffer += char_left_gc(buffer, strlen(buffer)); + buffer[char_right_gc(buffer, strlen(buffer))] = '\0'; + return buffer;
[PATCH v2 1/2] pid: add pidfd_open()
This adds the pidfd_open() syscall. It allows a caller to retrieve pollable pidfds for a process which did not get created via CLONE_PIDFD, i.e. for a process that is created via traditional fork()/clone() calls that is only referenced by a PID: int pidfd = pidfd_open(1234, 0); ret = pidfd_send_signal(pidfd, SIGSTOP, NULL, 0); With the introduction of pidfds through CLONE_PIDFD it is possible to created pidfds at process creation time. However, a lot of processes get created with traditional PID-based calls such as fork() or clone() (without CLONE_PIDFD). For these processes a caller can currently not create a pollable pidfd. This is a problem for Android's low memory killer (LMK) and service managers such as systemd. Both are examples of tools that want to make use of pidfds to get reliable notification of process exit for non-parents (pidfd polling) and race-free signal sending (pidfd_send_signal()). They intend to switch to this API for process supervision/management as soon as possible. Having no way to get pollable pidfds from PID-only processes is one of the biggest blockers for them in adopting this api. With pidfd_open() making it possible to retrieve pidfds for PID-based processes we enable them to adopt this api. In line with Arnd's recent changes to consolidate syscall numbers across architectures, I have added the pidfd_open() syscall to all architectures at the same time. Signed-off-by: Christian Brauner Reviewed-by: Oleg Nesterov Cc: Arnd Bergmann Cc: "Eric W. Biederman" Cc: Kees Cook Cc: Joel Fernandes (Google) Cc: Thomas Gleixner Cc: Jann Horn Cc: David Howells Cc: Andy Lutomirsky Cc: Andrew Morton Cc: Aleksa Sarai Cc: Linus Torvalds Cc: Al Viro Cc: linux-...@vger.kernel.org --- v1: - kbuild test robot : - add missing entry for pidfd_open to arch/arm/tools/syscall.tbl - Oleg Nesterov : - use simpler thread-group leader check v2: - Oleg Nesterov : - avoid using additional variable - remove unneeded comment - Arnd Bergmann : - switch from 428 to 434 since the new mount api has taken it - bump syscall numbers in arch/arm64/include/asm/unistd.h - Joel Fernandes (Google) : - switch from ESRCH to EINVAL when the passed-in pid does not refer to a thread-group leader - Christian Brauner : - rebase on v5.2-rc1 - adapt syscall number to account for new mount api syscalls --- arch/alpha/kernel/syscalls/syscall.tbl | 1 + arch/arm/tools/syscall.tbl | 1 + arch/arm64/include/asm/unistd.h | 2 +- arch/arm64/include/asm/unistd32.h | 2 + arch/ia64/kernel/syscalls/syscall.tbl | 1 + arch/m68k/kernel/syscalls/syscall.tbl | 1 + arch/microblaze/kernel/syscalls/syscall.tbl | 1 + arch/mips/kernel/syscalls/syscall_n32.tbl | 1 + arch/parisc/kernel/syscalls/syscall.tbl | 1 + arch/powerpc/kernel/syscalls/syscall.tbl| 1 + arch/s390/kernel/syscalls/syscall.tbl | 1 + arch/sh/kernel/syscalls/syscall.tbl | 1 + arch/sparc/kernel/syscalls/syscall.tbl | 1 + arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + arch/xtensa/kernel/syscalls/syscall.tbl | 1 + include/linux/pid.h | 1 + include/linux/syscalls.h| 1 + include/uapi/asm-generic/unistd.h | 4 +- kernel/fork.c | 2 +- kernel/pid.c| 43 + 21 files changed, 66 insertions(+), 3 deletions(-) diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index 9e7704e44f6d..1db9bbcfb84e 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -473,3 +473,4 @@ 541common fsconfigsys_fsconfig 542common fsmount sys_fsmount 543common fspick sys_fspick +544common pidfd_open sys_pidfd_open diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index aaf479a9e92d..81e6e1817c45 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -447,3 +447,4 @@ 431common fsconfigsys_fsconfig 432common fsmount sys_fsmount 433common fspick sys_fspick +434common pidfd_open sys_pidfd_open diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index 70e6882853c0..e8f7d95a1481 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -44,7 +44,7 @@ #define __ARM_NR_compat_set_tls(__ARM_NR_COMPAT_BASE + 5) #define __ARM_NR_COMPAT_END(__ARM_NR_COMPAT_BASE + 0x800) -#define __NR_compat_syscalls 434 +#define __NR_compat_syscalls 435 #endif #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm64/include/asm/unistd32.h
Re: [RFC PATCH 2/4] x86/ftrace: Fix use of flags in ftrace_replace_code()
On Mon, 20 May 2019 09:13:20 -0400 Steven Rostedt wrote: > > I haven't yet tested this patch on x86, but this looked wrong so sending > > this as a RFC. > > This code has been through a bit of updates, and I need to go through > and clean it up. I'll have to take a look and convert "int" to "bool" > so that "enable" is not confusing. > > Thanks, I think I'll try to do a clean up first, and then this patch > shouldn't "look wrong" after that. > I'm going to apply the attached two patches. There may be some conflicts between yours here and these, but nothing that Linus can't figure out. Do you feel more comfortable with this code, if these patches are applied? -- Steve >From d6b694e350e61259ad18fe2b912911b52ff4767a Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 20 May 2019 09:26:24 -0400 Subject: [PATCH 1/2] ftrace: Make enable and update parameters bool when applicable The code modification functions have "enable" and "update" variables that are sometimes "int" but used as "bool". Remove the ambiguity and make them "bool" when they are only used for true or false values. Link: http://lkml.kernel.org/r/e1429923d9eda92a3cf5ee9e33c7eacce539781d.1558115654.git.naveen.n@linux.vnet.ibm.com Reported-by: "Naveen N. Rao" Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 4 ++-- kernel/trace/ftrace.c | 16 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 25e2995d4a4c..8a8cb3c401b2 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -427,8 +427,8 @@ struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter); iter = ftrace_rec_iter_next(iter)) -int ftrace_update_record(struct dyn_ftrace *rec, int enable); -int ftrace_test_record(struct dyn_ftrace *rec, int enable); +int ftrace_update_record(struct dyn_ftrace *rec, bool enable); +int ftrace_test_record(struct dyn_ftrace *rec, bool enable); void ftrace_run_stop_machine(int command); unsigned long ftrace_location(unsigned long ip); unsigned long ftrace_location_range(unsigned long start, unsigned long end); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a12aff849c04..47b41502a24c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1768,7 +1768,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, count++; /* Must match FTRACE_UPDATE_CALLS in ftrace_modify_all_code() */ - update |= ftrace_test_record(rec, 1) != FTRACE_UPDATE_IGNORE; + update |= ftrace_test_record(rec, true) != FTRACE_UPDATE_IGNORE; /* Shortcut, if we handled all records, we are done. */ if (!all && count == hash->count) @@ -2047,7 +2047,7 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec) } } -static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) +static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update) { unsigned long flag = 0UL; @@ -2146,12 +2146,12 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) /** * ftrace_update_record, set a record that now is tracing or not * @rec: the record to update - * @enable: set to 1 if the record is tracing, zero to force disable + * @enable: set to true if the record is tracing, false to force disable * * The records that represent all functions that can be traced need * to be updated when tracing has been enabled. */ -int ftrace_update_record(struct dyn_ftrace *rec, int enable) +int ftrace_update_record(struct dyn_ftrace *rec, bool enable) { return ftrace_check_record(rec, enable, 1); } @@ -2159,13 +2159,13 @@ int ftrace_update_record(struct dyn_ftrace *rec, int enable) /** * ftrace_test_record, check if the record has been enabled or not * @rec: the record to test - * @enable: set to 1 to check if enabled, 0 if it is disabled + * @enable: set to true to check if enabled, false if it is disabled * * The arch code may need to test if a record is already set to * tracing to determine how to modify the function code that it * represents. */ -int ftrace_test_record(struct dyn_ftrace *rec, int enable) +int ftrace_test_record(struct dyn_ftrace *rec, bool enable) { return ftrace_check_record(rec, enable, 0); } @@ -2356,7 +2356,7 @@ unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec) } static int -__ftrace_replace_code(struct dyn_ftrace *rec, int enable) +__ftrace_replace_code(struct dyn_ftrace *rec, bool enable) { unsigned long ftrace_old_addr; unsigned long ftrace_addr; @@ -2395,7 +2395,7 @@ void __weak ftrace_replace_code(int mod_flags) { struct dyn_ftrace *rec; struct ftrace_page *pg; - int enable = mod_flags & FTRACE_MODIFY_ENABLE_FL; + bool enable = mod_flags & FTRACE_MODIFY_ENABLE_FL; int schedulable = mod_flags & FTRACE_MODIFY_MAY_SLEEP_FL; int failed; -- 2.20.1 >From a15f59dc19e1fdf7eda74e7aec386975740d1515 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt
Re: [RFC PATCH] virtio_ring: Use DMA API if guest memory is encrypted
On Wed, Apr 17, 2019 at 06:42:00PM -0300, Thiago Jung Bauermann wrote: > I rephrased it in terms of address translation. What do you think of > this version? The flag name is slightly different too: > > > VIRTIO_F_ACCESS_PLATFORM_NO_TRANSLATION This feature has the same > meaning as VIRTIO_F_ACCESS_PLATFORM both when set and when not set, > with the exception that address translation is guaranteed to be > unnecessary when accessing memory addresses supplied to the device > by the driver. Which is to say, the device will always use physical > addresses matching addresses used by the driver (typically meaning > physical addresses used by the CPU) and not translated further. This > flag should be set by the guest if offered, but to allow for > backward-compatibility device implementations allow for it to be > left unset by the guest. It is an error to set both this flag and > VIRTIO_F_ACCESS_PLATFORM. OK so VIRTIO_F_ACCESS_PLATFORM is designed to allow unpriveledged drivers. This is why devices fail when it's not negotiated. This confuses me. If driver is unpriveledged then what happens with this flag? It can supply any address it wants. Will that corrupt kernel memory? -- MST
Re: [RFC PATCH 2/4] x86/ftrace: Fix use of flags in ftrace_replace_code()
On Sat, 18 May 2019 00:32:46 +0530 "Naveen N. Rao" wrote: > In commit a0572f687fb3c ("ftrace: Allow ftrace_replace_code() to be > schedulable), the generic ftrace_replace_code() function was modified to > accept a flags argument in place of a single 'enable' flag. However, the > x86 version of this function was not updated. Fix the same. > > Fixes: a0572f687fb3c ("ftrace: Allow ftrace_replace_code() to be schedulable") > Signed-off-by: Naveen N. Rao > --- > I haven't yet tested this patch on x86, but this looked wrong so sending > this as a RFC. This code has been through a bit of updates, and I need to go through and clean it up. I'll have to take a look and convert "int" to "bool" so that "enable" is not confusing. Thanks, I think I'll try to do a clean up first, and then this patch shouldn't "look wrong" after that. -- Steve > > - Naveen > > > arch/x86/kernel/ftrace.c | 3 ++- > 1 file changed, 2 insertions(+), 1 deletion(-) > > diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c > index 0caf8122d680..0c01b344ba16 100644 > --- a/arch/x86/kernel/ftrace.c > +++ b/arch/x86/kernel/ftrace.c > @@ -554,8 +554,9 @@ static void run_sync(void) > local_irq_disable(); > } > > -void ftrace_replace_code(int enable) > +void ftrace_replace_code(int mod_flags) > { > + int enable = mod_flags & FTRACE_MODIFY_ENABLE_FL; > struct ftrace_rec_iter *iter; > struct dyn_ftrace *rec; > const char *report = "adding breakpoints";
Re: [RFC PATCH] virtio_ring: Use DMA API if guest memory is encrypted
On Fri, Apr 26, 2019 at 08:56:43PM -0300, Thiago Jung Bauermann wrote: > > Michael S. Tsirkin writes: > > > On Wed, Apr 24, 2019 at 10:01:56PM -0300, Thiago Jung Bauermann wrote: > >> > >> Michael S. Tsirkin writes: > >> > >> > On Wed, Apr 17, 2019 at 06:42:00PM -0300, Thiago Jung Bauermann wrote: > >> >> > >> >> Michael S. Tsirkin writes: > >> >> > >> >> > On Thu, Mar 21, 2019 at 09:05:04PM -0300, Thiago Jung Bauermann wrote: > >> >> >> > >> >> >> Michael S. Tsirkin writes: > >> >> >> > >> >> >> > On Wed, Mar 20, 2019 at 01:13:41PM -0300, Thiago Jung Bauermann > >> >> >> > wrote: > >> >> >> >> >From what I understand of the ACCESS_PLATFORM definition, the > >> >> >> >> >host will > >> >> >> >> only ever try to access memory addresses that are supplied to it > >> >> >> >> by the > >> >> >> >> guest, so all of the secure guest memory that the host cares > >> >> >> >> about is > >> >> >> >> accessible: > >> >> >> >> > >> >> >> >> If this feature bit is set to 0, then the device has same > >> >> >> >> access to > >> >> >> >> memory addresses supplied to it as the driver has. In > >> >> >> >> particular, > >> >> >> >> the device will always use physical addresses matching > >> >> >> >> addresses > >> >> >> >> used by the driver (typically meaning physical addresses used > >> >> >> >> by the > >> >> >> >> CPU) and not translated further, and can access any address > >> >> >> >> supplied > >> >> >> >> to it by the driver. When clear, this overrides any > >> >> >> >> platform-specific description of whether device access is > >> >> >> >> limited or > >> >> >> >> translated in any way, e.g. whether an IOMMU may be present. > >> >> >> >> > >> >> >> >> All of the above is true for POWER guests, whether they are secure > >> >> >> >> guests or not. > >> >> >> >> > >> >> >> >> Or are you saying that a virtio device may want to access memory > >> >> >> >> addresses that weren't supplied to it by the driver? > >> >> >> > > >> >> >> > Your logic would apply to IOMMUs as well. For your mode, there are > >> >> >> > specific encrypted memory regions that driver has access to but > >> >> >> > device > >> >> >> > does not. that seems to violate the constraint. > >> >> >> > >> >> >> Right, if there's a pre-configured 1:1 mapping in the IOMMU such that > >> >> >> the device can ignore the IOMMU for all practical purposes I would > >> >> >> indeed say that the logic would apply to IOMMUs as well. :-) > >> >> >> > >> >> >> I guess I'm still struggling with the purpose of signalling to the > >> >> >> driver that the host may not have access to memory addresses that it > >> >> >> will never try to access. > >> >> > > >> >> > For example, one of the benefits is to signal to host that driver does > >> >> > not expect ability to access all memory. If it does, host can > >> >> > fail initialization gracefully. > >> >> > >> >> But why would the ability to access all memory be necessary or even > >> >> useful? When would the host access memory that the driver didn't tell it > >> >> to access? > >> > > >> > When I say all memory I mean even memory not allowed by the IOMMU. > >> > >> Yes, but why? How is that memory relevant? > > > > It's relevant when driver is not trusted to only supply correct > > addresses. The feature was originally designed to support userspace > > drivers within guests. > > Ah, thanks for clarifying. I don't think that's a problem in our case. > If the guest provides an incorrect address, the hardware simply won't > allow the host to access it. > > >> >> >> > Another idea is maybe something like virtio-iommu? > >> >> >> > >> >> >> You mean, have legacy guests use virtio-iommu to request an IOMMU > >> >> >> bypass? If so, it's an interesting idea for new guests but it doesn't > >> >> >> help with guests that are out today in the field, which don't have A > >> >> >> virtio-iommu driver. > >> >> > > >> >> > I presume legacy guests don't use encrypted memory so why do we > >> >> > worry about them at all? > >> >> > >> >> They don't use encrypted memory, but a host machine will run a mix of > >> >> secure and legacy guests. And since the hypervisor doesn't know whether > >> >> a guest will be secure or not at the time it is launched, legacy guests > >> >> will have to be launched with the same configuration as secure guests. > >> > > >> > OK and so I think the issue is that hosts generally fail if they set > >> > ACCESS_PLATFORM and guests do not negotiate it. > >> > So you can not just set ACCESS_PLATFORM for everyone. > >> > Is that the issue here? > >> > >> Yes, that is one half of the issue. The other is that even if hosts > >> didn't fail, existing legacy guests wouldn't "take the initiative" of > >> not negotiating ACCESS_PLATFORM to get the improved performance. They'd > >> have to be modified to do that. > > > > So there's a non-encrypted guest, hypervisor wants to set > > ACCESS_PLATFORM to allow encrypted guests but that will slow down legacy > > guests since
Re: [RFC PATCH 1/4] ftrace: Expose flags used for ftrace_replace_code()
On Sat, 18 May 2019 00:32:45 +0530 "Naveen N. Rao" wrote: > Since ftrace_replace_code() is a __weak function and can be overridden, > we need to expose the flags that can be set. So, move the flags enum to > the header file. > > Signed-off-by: Naveen N. Rao Reviewed-by: Steven Rostedt (VMware) -- Steve > --- > include/linux/ftrace.h | 5 + > kernel/trace/ftrace.c | 5 - > 2 files changed, 5 insertions(+), 5 deletions(-) > > diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h > index 20899919ead8..835e761f63b0 100644 > --- a/include/linux/ftrace.h > +++ b/include/linux/ftrace.h > @@ -162,6 +162,11 @@ enum { > FTRACE_OPS_FL_TRACE_ARRAY = 1 << 15, > }; > > +enum { > + FTRACE_MODIFY_ENABLE_FL = (1 << 0), > + FTRACE_MODIFY_MAY_SLEEP_FL = (1 << 1), > +}; > + > #ifdef CONFIG_DYNAMIC_FTRACE > /* The hash used to know what functions callbacks trace */ > struct ftrace_ops_hash { > diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c > index b920358dd8f7..38c15cd27fc4 100644 > --- a/kernel/trace/ftrace.c > +++ b/kernel/trace/ftrace.c > @@ -78,11 +78,6 @@ > #define ASSIGN_OPS_HASH(opsname, val) > #endif > > -enum { > - FTRACE_MODIFY_ENABLE_FL = (1 << 0), > - FTRACE_MODIFY_MAY_SLEEP_FL = (1 << 1), > -}; > - > struct ftrace_ops ftrace_list_end __read_mostly = { > .func = ftrace_stub, > .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB,
Re: Linux 5.1-rc5
On Thu, May 02, 2019 at 05:10:55PM +0200, Martin Schwidefsky wrote: > On Thu, 2 May 2019 16:31:10 +0200 > Greg KH wrote: > > > On Thu, May 02, 2019 at 04:17:58PM +0200, Martin Schwidefsky wrote: > > > On Thu, 2 May 2019 14:21:28 +0200 > > > Greg KH wrote: > > > > > > > On Mon, Apr 15, 2019 at 09:17:10AM -0700, Linus Torvalds wrote: > > > > > On Sun, Apr 14, 2019 at 10:19 PM Christoph Hellwig > > > > > wrote: > > > > > > > > > > > > Can we please have the page refcount overflow fixes out on the list > > > > > > for review, even if it is after the fact? > > > > > > > > > > They were actually on a list for review long before the fact, but it > > > > > was the security mailing list. The issue actually got discussed back > > > > > in January along with early versions of the patches, but then we > > > > > dropped the ball because it just wasn't on anybody's radar and it got > > > > > resurrected late March. Willy wrote a rather bigger patch-series, and > > > > > review of that is what then resulted in those commits. So they may > > > > > look recent, but that's just because the original patches got > > > > > seriously edited down and rewritten. > > > > > > > > > > That said, powerpc and s390 should at least look at maybe adding a > > > > > check for the page ref in their gup paths too. Powerpc has the special > > > > > gup_hugepte() case, and s390 has its own version of gup entirely. I > > > > > was actually hoping the s390 guys would look at using the generic gup > > > > > code. > > > > > > > > > > I ruthlessly also entirely ignored MIPS, SH and sparc, since they seem > > > > > largely irrelevant, partly since even theoretically this whole issue > > > > > needs a _lot_ of memory. > > > > > > > > > > Michael, Martin, see commit 6b3a70773630 ("Merge branch 'page-refs' > > > > > (page ref overflow)"). You may or may not really care. > > > > > > > > I've now queued these patches up for the next round of stable releases, > > > > as some people seem to care about these. > > > > > > > > I didn't see any follow-on patches for s390 or ppc64 hit the tree for > > > > these changes, am I just missing them and should also queue up a few > > > > more to handle this issue on those platforms? > > > > > > I fixed that with a different approach. The following two patches are > > > queued for the next merge window: > > > > > > d1874a0c2805 "s390/mm: make the pxd_offset functions more robust" > > > 1a42010cdc26 "s390/mm: convert to the generic get_user_pages_fast code" > > > > > > With these two s390 now uses the generic gup code in mm/gup.c > > > > Nice! Do you want me to queue those up for the stable backports once > > they hit a public -rc release? > > Yes please! Now queued up to 5.0 and 5.1, but did not apply to 4.19 or older :( thanks, greg k-h
[PATCH] selftests/powerpc: Add a test of spectre_v2 mitigations
This test uses the PMU to count branch prediction hits/misses for a known loop, and compare the result to the reported spectre v2 mitigation. This gives us a way of sanity checking that the reported mitigation is actually in effect. Sample output for some cases, eg: Power9: sysfs reports: 'Vulnerable' PM_BR_PRED_CCACHE: result368 running/enabled 5792777124 PM_BR_MPRED_CCACHE: result319 running/enabled 5792775546 PM_BR_PRED_PCACHE: result 2147483281 running/enabled 5792773128 PM_BR_MPRED_PCACHE: result 213604201 running/enabled 5792771640 Miss percent 9 % OK - Measured branch prediction rates match reported spectre v2 mitigation. sysfs reports: 'Mitigation: Indirect branch serialisation (kernel only)' PM_BR_PRED_CCACHE: result895 running/enabled 5780320920 PM_BR_MPRED_CCACHE: result822 running/enabled 5780312414 PM_BR_PRED_PCACHE: result 2147482754 running/enabled 5780308836 PM_BR_MPRED_PCACHE: result 213639731 running/enabled 5780307912 Miss percent 9 % OK - Measured branch prediction rates match reported spectre v2 mitigation. sysfs reports: 'Mitigation: Indirect branch cache disabled' PM_BR_PRED_CCACHE: result 2147483649 running/enabled 20540186160 PM_BR_MPRED_CCACHE: result 2147483649 running/enabled 20540180056 PM_BR_PRED_PCACHE: result 0 running/enabled 20540176090 PM_BR_MPRED_PCACHE: result 0 running/enabled 20540174182 Miss percent 100 % OK - Measured branch prediction rates match reported spectre v2 mitigation. Power8: sysfs reports: 'Vulnerable' PM_BR_PRED_CCACHE: result 2147483649 running/enabled 3505888142 PM_BR_MPRED_CCACHE: result 9 running/enabled 3505882788 Miss percent 0 % OK - Measured branch prediction rates match reported spectre v2 mitigation. sysfs reports: 'Mitigation: Indirect branch cache disabled' PM_BR_PRED_CCACHE: result 2147483649 running/enabled 16931421988 PM_BR_MPRED_CCACHE: result 2147483649 running/enabled 16931416478 Miss percent 100 % OK - Measured branch prediction rates match reported spectre v2 mitigation. success: spectre_v2 Signed-off-by: Michael Ellerman --- .../testing/selftests/powerpc/include/utils.h | 1 + .../selftests/powerpc/security/Makefile | 3 +- .../selftests/powerpc/security/branch_loops.S | 82 +++ .../selftests/powerpc/security/spectre_v2.c | 218 ++ tools/testing/selftests/powerpc/utils.c | 20 ++ 5 files changed, 323 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/powerpc/security/branch_loops.S create mode 100644 tools/testing/selftests/powerpc/security/spectre_v2.c diff --git a/tools/testing/selftests/powerpc/include/utils.h b/tools/testing/selftests/powerpc/include/utils.h index 7636bf45d5d5..be493b4823f7 100644 --- a/tools/testing/selftests/powerpc/include/utils.h +++ b/tools/testing/selftests/powerpc/include/utils.h @@ -34,6 +34,7 @@ int pick_online_cpu(void); int read_debugfs_file(char *debugfs_file, int *result); int write_debugfs_file(char *debugfs_file, int result); +int read_sysfs_file(char *debugfs_file, char *result, size_t result_size); void set_dscr(unsigned long val); int perf_event_open_counter(unsigned int type, unsigned long config, int group_fd); diff --git a/tools/testing/selftests/powerpc/security/Makefile b/tools/testing/selftests/powerpc/security/Makefile index 85861c46b445..d68e6031695c 100644 --- a/tools/testing/selftests/powerpc/security/Makefile +++ b/tools/testing/selftests/powerpc/security/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0+ -TEST_GEN_PROGS := rfi_flush +TEST_GEN_PROGS := rfi_flush spectre_v2 top_srcdir = ../../../../.. CFLAGS += -I../../../../../usr/include @@ -8,3 +8,4 @@ CFLAGS += -I../../../../../usr/include include ../../lib.mk $(TEST_GEN_PROGS): ../harness.c ../utils.c +$(OUTPUT)/spectre_v2: ../pmu/event.c branch_loops.S diff --git a/tools/testing/selftests/powerpc/security/branch_loops.S b/tools/testing/selftests/powerpc/security/branch_loops.S new file mode 100644 index ..22e9204e3421 --- /dev/null +++ b/tools/testing/selftests/powerpc/security/branch_loops.S @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: GPL-2.0+ + +/* + * Copyright 2019, Michael Ellerman, IBM Corp. + */ + +#include + + .data + +jump_table: + .long 0x0 + .long (.Lstate_1 - .Lstate_0) + .long (.Lstate_2 - .Lstate_0) + .long (.Lstate_3 - .Lstate_0) + .long (.Lstate_4 - .Lstate_0) + .long (.Lstate_5 - .Lstate_0) + .long (.Lstate_6 - .Lstate_0) + .long (.Lstate_7 - .Lstate_0) + + .text + +#define ITER_SHIFT 31 + +.macro state number + .balign 32 +.Lstate_\number: + .if \number==7 + li r3, 0 + .else + li r3, \number+1 + .endif + b .Lloop +.endm +
[PATCH v2] selftests/powerpc: Add a test of bad (out-of-range) accesses
Userspace isn't allowed to access certain address ranges, make sure we actually test that to at least some degree. This would have caught the recent bug where the SLB fault handler was incorrectly called on an out-of-range access when using the Radix MMU. It also would have caught the bug we had in get_region_id() where we were inserting SLB entries for bad addresses. Signed-off-by: Michael Ellerman Signed-off-by: Nicholas Piggin --- v2: Incorporate changes from Nick's version. Check the faulted address, and make sure we get BNDERR for things that are completely out of bounds. tools/testing/selftests/powerpc/mm/.gitignore | 3 +- tools/testing/selftests/powerpc/mm/Makefile | 4 +- .../selftests/powerpc/mm/bad_accesses.c | 167 ++ 3 files changed, 172 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/powerpc/mm/bad_accesses.c diff --git a/tools/testing/selftests/powerpc/mm/.gitignore b/tools/testing/selftests/powerpc/mm/.gitignore index ba919308fe30..c189170ba041 100644 --- a/tools/testing/selftests/powerpc/mm/.gitignore +++ b/tools/testing/selftests/powerpc/mm/.gitignore @@ -3,4 +3,5 @@ subpage_prot tempfile prot_sao segv_errors -wild_bctr \ No newline at end of file +wild_bctr +bad_accesses \ No newline at end of file diff --git a/tools/testing/selftests/powerpc/mm/Makefile b/tools/testing/selftests/powerpc/mm/Makefile index 43d68420e363..f9667f980241 100644 --- a/tools/testing/selftests/powerpc/mm/Makefile +++ b/tools/testing/selftests/powerpc/mm/Makefile @@ -2,7 +2,8 @@ noarg: $(MAKE) -C ../ -TEST_GEN_PROGS := hugetlb_vs_thp_test subpage_prot prot_sao segv_errors wild_bctr +TEST_GEN_PROGS := hugetlb_vs_thp_test subpage_prot prot_sao segv_errors \ + wild_bctr bad_accesses TEST_GEN_FILES := tempfile top_srcdir = ../../../../.. @@ -13,6 +14,7 @@ $(TEST_GEN_PROGS): ../harness.c $(OUTPUT)/prot_sao: ../utils.c $(OUTPUT)/wild_bctr: CFLAGS += -m64 +$(OUTPUT)/bad_accesses: CFLAGS += -m64 $(OUTPUT)/tempfile: dd if=/dev/zero of=$@ bs=64k count=1 diff --git a/tools/testing/selftests/powerpc/mm/bad_accesses.c b/tools/testing/selftests/powerpc/mm/bad_accesses.c new file mode 100644 index ..f18b2b91b263 --- /dev/null +++ b/tools/testing/selftests/powerpc/mm/bad_accesses.c @@ -0,0 +1,167 @@ +// SPDX-License-Identifier: GPL-2.0+ +// +// Copyright 2019, Michael Ellerman, IBM Corp. +// +// Test that out-of-bounds reads/writes behave as expected. + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "utils.h" + + +// 64-bit kernel is always here +#define PAGE_OFFSET(0xcul << 60) + +static unsigned long kernel_virt_end; + +static volatile int fault_code; +static volatile unsigned long fault_addr; +static jmp_buf setjmp_env; + +static void segv_handler(int n, siginfo_t *info, void *ctxt_v) +{ + fault_code = info->si_code; + fault_addr = (unsigned long)info->si_addr; + siglongjmp(setjmp_env, 1); +} + +int bad_access(char *p, bool write) +{ + char x; + + fault_code = 0; + fault_addr = 0; + + if (sigsetjmp(setjmp_env, 1) == 0) { + if (write) + *p = 1; + else + x = *p; + + printf("Bad - no SEGV! (%c)\n", x); + return 1; + } + + // If we see MAPERR that means we took a page fault rather than an SLB + // miss. We only expect to take page faults for addresses within the + // valid kernel range. + FAIL_IF(fault_code == SEGV_MAPERR && \ + (fault_addr < PAGE_OFFSET || fault_addr >= kernel_virt_end)); + + FAIL_IF(fault_code != SEGV_MAPERR && fault_code != SEGV_BNDERR); + + return 0; +} + +static int using_hash_mmu(bool *using_hash) +{ + char line[128]; + FILE *f; + int rc; + + f = fopen("/proc/cpuinfo", "r"); + FAIL_IF(!f); + + rc = 0; + while (fgets(line, sizeof(line), f) != NULL) { + if (strcmp(line, "MMU : Hash\n") == 0) { + *using_hash = true; + goto out; + } + + if (strcmp(line, "MMU : Radix\n") == 0) { + *using_hash = false; + goto out; + } + } + + rc = -1; +out: + fclose(f); + return rc; +} + +static int test(void) +{ + unsigned long i, j, addr, region_shift, page_shift, page_size; + struct sigaction sig; + bool hash_mmu; + + sig = (struct sigaction) { + .sa_sigaction = segv_handler, + .sa_flags = SA_SIGINFO, + }; + + FAIL_IF(sigaction(SIGSEGV, , NULL) != 0); + + FAIL_IF(using_hash_mmu(_mmu)); + + page_size = sysconf(_SC_PAGESIZE); + if (page_size == (64 * 1024)) + page_shift = 16; + else + page_shift = 12; + +
[PATCH v2] selftests/powerpc: Add a test of bad (out-of-range) accesses
Userspace isn't allowed to access certain address ranges, make sure we actually test that to at least some degree. This would have caught the recent bug where the SLB fault handler was incorrectly called on an out-of-range access when using the Radix MMU. It also would have caught the bug we had in get_region_id() where we were inserting SLB entries for bad addresses. Signed-off-by: Michael Ellerman Signed-off-by: Nicholas Piggin --- v2: Incorporate changes from Nick's version. Check the faulted address, and make sure we get BNDERR for things that are completely out of bounds. tools/testing/selftests/powerpc/mm/.gitignore | 3 +- tools/testing/selftests/powerpc/mm/Makefile | 4 +- .../selftests/powerpc/mm/bad_accesses.c | 167 ++ 3 files changed, 172 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/powerpc/mm/bad_accesses.c diff --git a/tools/testing/selftests/powerpc/mm/.gitignore b/tools/testing/selftests/powerpc/mm/.gitignore index ba919308fe30..c189170ba041 100644 --- a/tools/testing/selftests/powerpc/mm/.gitignore +++ b/tools/testing/selftests/powerpc/mm/.gitignore @@ -3,4 +3,5 @@ subpage_prot tempfile prot_sao segv_errors -wild_bctr \ No newline at end of file +wild_bctr +bad_accesses \ No newline at end of file diff --git a/tools/testing/selftests/powerpc/mm/Makefile b/tools/testing/selftests/powerpc/mm/Makefile index 43d68420e363..f9667f980241 100644 --- a/tools/testing/selftests/powerpc/mm/Makefile +++ b/tools/testing/selftests/powerpc/mm/Makefile @@ -2,7 +2,8 @@ noarg: $(MAKE) -C ../ -TEST_GEN_PROGS := hugetlb_vs_thp_test subpage_prot prot_sao segv_errors wild_bctr +TEST_GEN_PROGS := hugetlb_vs_thp_test subpage_prot prot_sao segv_errors \ + wild_bctr bad_accesses TEST_GEN_FILES := tempfile top_srcdir = ../../../../.. @@ -13,6 +14,7 @@ $(TEST_GEN_PROGS): ../harness.c $(OUTPUT)/prot_sao: ../utils.c $(OUTPUT)/wild_bctr: CFLAGS += -m64 +$(OUTPUT)/bad_accesses: CFLAGS += -m64 $(OUTPUT)/tempfile: dd if=/dev/zero of=$@ bs=64k count=1 diff --git a/tools/testing/selftests/powerpc/mm/bad_accesses.c b/tools/testing/selftests/powerpc/mm/bad_accesses.c new file mode 100644 index ..f18b2b91b263 --- /dev/null +++ b/tools/testing/selftests/powerpc/mm/bad_accesses.c @@ -0,0 +1,167 @@ +// SPDX-License-Identifier: GPL-2.0+ +// +// Copyright 2019, Michael Ellerman, IBM Corp. +// +// Test that out-of-bounds reads/writes behave as expected. + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "utils.h" + + +// 64-bit kernel is always here +#define PAGE_OFFSET(0xcul << 60) + +static unsigned long kernel_virt_end; + +static volatile int fault_code; +static volatile unsigned long fault_addr; +static jmp_buf setjmp_env; + +static void segv_handler(int n, siginfo_t *info, void *ctxt_v) +{ + fault_code = info->si_code; + fault_addr = (unsigned long)info->si_addr; + siglongjmp(setjmp_env, 1); +} + +int bad_access(char *p, bool write) +{ + char x; + + fault_code = 0; + fault_addr = 0; + + if (sigsetjmp(setjmp_env, 1) == 0) { + if (write) + *p = 1; + else + x = *p; + + printf("Bad - no SEGV! (%c)\n", x); + return 1; + } + + // If we see MAPERR that means we took a page fault rather than an SLB + // miss. We only expect to take page faults for addresses within the + // valid kernel range. + FAIL_IF(fault_code == SEGV_MAPERR && \ + (fault_addr < PAGE_OFFSET || fault_addr >= kernel_virt_end)); + + FAIL_IF(fault_code != SEGV_MAPERR && fault_code != SEGV_BNDERR); + + return 0; +} + +static int using_hash_mmu(bool *using_hash) +{ + char line[128]; + FILE *f; + int rc; + + f = fopen("/proc/cpuinfo", "r"); + FAIL_IF(!f); + + rc = 0; + while (fgets(line, sizeof(line), f) != NULL) { + if (strcmp(line, "MMU : Hash\n") == 0) { + *using_hash = true; + goto out; + } + + if (strcmp(line, "MMU : Radix\n") == 0) { + *using_hash = false; + goto out; + } + } + + rc = -1; +out: + fclose(f); + return rc; +} + +static int test(void) +{ + unsigned long i, j, addr, region_shift, page_shift, page_size; + struct sigaction sig; + bool hash_mmu; + + sig = (struct sigaction) { + .sa_sigaction = segv_handler, + .sa_flags = SA_SIGINFO, + }; + + FAIL_IF(sigaction(SIGSEGV, , NULL) != 0); + + FAIL_IF(using_hash_mmu(_mmu)); + + page_size = sysconf(_SC_PAGESIZE); + if (page_size == (64 * 1024)) + page_shift = 16; + else + page_shift = 12; + +
[PATCH v4 1/3] PM: wakeup: Add routine to help fetch wakeup source object.
Some user might want to go through all registered wakeup sources and doing things accordingly. For example, SoC PM driver might need to do HW programming to prevent powering down specific IP which wakeup source depending on. And is user's responsibility to identify if this wakeup source he is interested in. Signed-off-by: Ran Wang --- Change in v4: - None. Change in v3: - Adjust indentation of *attached_dev;. Change in v2: - None. drivers/base/power/wakeup.c | 18 ++ include/linux/pm_wakeup.h |3 +++ 2 files changed, 21 insertions(+), 0 deletions(-) diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c index 5b2b6a0..6904485 100644 --- a/drivers/base/power/wakeup.c +++ b/drivers/base/power/wakeup.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -226,6 +227,22 @@ void wakeup_source_unregister(struct wakeup_source *ws) } } EXPORT_SYMBOL_GPL(wakeup_source_unregister); +/** + * wakeup_source_get_next - Get next wakeup source from the list + * @ws: Previous wakeup source object, null means caller want first one. + */ +struct wakeup_source *wakeup_source_get_next(struct wakeup_source *ws) +{ + struct list_head *ws_head = _sources; + + if (ws) + return list_next_or_null_rcu(ws_head, >entry, + struct wakeup_source, entry); + else + return list_entry_rcu(ws_head->next, + struct wakeup_source, entry); +} +EXPORT_SYMBOL_GPL(wakeup_source_get_next); /** * device_wakeup_attach - Attach a wakeup source object to a device object. @@ -242,6 +259,7 @@ static int device_wakeup_attach(struct device *dev, struct wakeup_source *ws) return -EEXIST; } dev->power.wakeup = ws; + ws->attached_dev = dev; if (dev->power.wakeirq) device_wakeup_attach_irq(dev, dev->power.wakeirq); spin_unlock_irq(>power.lock); diff --git a/include/linux/pm_wakeup.h b/include/linux/pm_wakeup.h index 0ff134d..913b2fb 100644 --- a/include/linux/pm_wakeup.h +++ b/include/linux/pm_wakeup.h @@ -50,6 +50,7 @@ * @wakeup_count: Number of times the wakeup source might abort suspend. * @active: Status of the wakeup source. * @has_timeout: The wakeup source has been activated with a timeout. + * @attached_dev: The device it attached to */ struct wakeup_source { const char *name; @@ -70,6 +71,7 @@ struct wakeup_source { unsigned long wakeup_count; boolactive:1; boolautosleep_enabled:1; + struct device *attached_dev; }; #ifdef CONFIG_PM_SLEEP @@ -101,6 +103,7 @@ static inline void device_set_wakeup_path(struct device *dev) extern void wakeup_source_remove(struct wakeup_source *ws); extern struct wakeup_source *wakeup_source_register(const char *name); extern void wakeup_source_unregister(struct wakeup_source *ws); +extern struct wakeup_source *wakeup_source_get_next(struct wakeup_source *ws); extern int device_wakeup_enable(struct device *dev); extern int device_wakeup_disable(struct device *dev); extern void device_set_wakeup_capable(struct device *dev, bool capable); -- 1.7.1
[PATCH v4 3/3] soc: fsl: add RCPM driver
The NXP's QorIQ Processors based on ARM Core have RCPM module (Run Control and Power Management), which performs all device-level tasks associated with power management such as wakeup source control. This driver depends on PM wakeup source framework which help to collect wake information. Signed-off-by: Ran Wang Acked-by: Pavel Machek --- Change in v4: - Remove extra ',' in author line of rcpm.c - Update usage of wakeup_source_get_next() to be less confusing to the reader, code logic remain the same. Change in v3: - Some whitespace ajdustment. Change in v2: - Rebase Kconfig and Makefile update to latest mainline. drivers/soc/fsl/Kconfig |8 +++ drivers/soc/fsl/Makefile |1 + drivers/soc/fsl/rcpm.c | 122 ++ 3 files changed, 131 insertions(+), 0 deletions(-) create mode 100644 drivers/soc/fsl/rcpm.c diff --git a/drivers/soc/fsl/Kconfig b/drivers/soc/fsl/Kconfig index 61f8e14..8e84e40 100644 --- a/drivers/soc/fsl/Kconfig +++ b/drivers/soc/fsl/Kconfig @@ -29,4 +29,12 @@ config FSL_MC_DPIO other DPAA2 objects. This driver does not expose the DPIO objects individually, but groups them under a service layer API. + +config FSL_RCPM + bool "Freescale RCPM support" + depends on PM_SLEEP + help + The NXP's QorIQ Processors based on ARM Core have RCPM module + (Run Control and Power Management), which performs all device-level + tasks associated with power management, such as wakeup source control. endmenu diff --git a/drivers/soc/fsl/Makefile b/drivers/soc/fsl/Makefile index 803ef1b..c1be6ee 100644 --- a/drivers/soc/fsl/Makefile +++ b/drivers/soc/fsl/Makefile @@ -7,3 +7,4 @@ obj-$(CONFIG_QUICC_ENGINE) += qe/ obj-$(CONFIG_CPM) += qe/ obj-$(CONFIG_FSL_GUTS) += guts.o obj-$(CONFIG_FSL_MC_DPIO) += dpio/ +obj-$(CONFIG_FSL_RCPM) += rcpm.o diff --git a/drivers/soc/fsl/rcpm.c b/drivers/soc/fsl/rcpm.c new file mode 100644 index 000..ff27adc --- /dev/null +++ b/drivers/soc/fsl/rcpm.c @@ -0,0 +1,122 @@ +// SPDX-License-Identifier: GPL-2.0 +// +// rcpm.c - Freescale QorIQ RCPM driver +// +// Copyright 2019 NXP +// +// Author: Ran Wang + +#include +#include +#include +#include +#include +#include +#include + +#define RCPM_WAKEUP_CELL_MAX_SIZE 7 + +struct rcpm { + unsigned intwakeup_cells; + void __iomem*ippdexpcr_base; + boollittle_endian; +}; + +static int rcpm_pm_prepare(struct device *dev) +{ + struct device_node *np = dev->of_node; + struct wakeup_source*ws; + struct rcpm *rcpm; + u32 value[RCPM_WAKEUP_CELL_MAX_SIZE + 1], tmp; + int i, ret; + + rcpm = dev_get_drvdata(dev); + if (!rcpm) + return -EINVAL; + + /* Begin with first registered wakeup source */ + ws = NULL; + while (wakeup_source_get_next(ws)) { + ret = device_property_read_u32_array(ws->attached_dev, + "fsl,rcpm-wakeup", value, rcpm->wakeup_cells + 1); + + /* Wakeup source should refer to current rcpm device */ + if (ret || (np->phandle != value[0])) { + dev_info(dev, "%s doesn't refer to this rcpm\n", + ws->name); + continue; + } + + for (i = 0; i < rcpm->wakeup_cells; i++) { + /* We can only OR related bits */ + if (value[i + 1]) { + if (rcpm->little_endian) { + tmp = ioread32(rcpm->ippdexpcr_base + i * 4); + tmp |= value[i + 1]; + iowrite32(tmp, rcpm->ippdexpcr_base + i * 4); + } else { + tmp = ioread32be(rcpm->ippdexpcr_base + i * 4); + tmp |= value[i + 1]; + iowrite32be(tmp, rcpm->ippdexpcr_base + i * 4); + } + } + } + } + + return 0; +} + +static const struct dev_pm_ops rcpm_pm_ops = { + .prepare = rcpm_pm_prepare, +}; + +static int rcpm_probe(struct platform_device *pdev) +{ + struct device *dev = >dev; + struct resource *r; + struct rcpm *rcpm; + int ret; + + rcpm = devm_kzalloc(dev, sizeof(*rcpm), GFP_KERNEL); + if (!rcpm) + return -ENOMEM; + + r = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!r) + return -ENODEV; + + rcpm->ippdexpcr_base = devm_ioremap_resource(>dev, r); + if (IS_ERR(rcpm->ippdexpcr_base)) { + ret = PTR_ERR(rcpm->ippdexpcr_base);
[PATCH v4 2/3] Documentation: dt: binding: fsl: Add 'little-endian' and update Chassis define
By default, QorIQ SoC's RCPM register block is Big Endian. But there are some exceptions, such as LS1088A and LS2088A, are Little Endian. So add this optional property to help identify them. Actually LS2021A and other Layerscapes won't totally follow Chassis 2.1, so separate them from powerpc SoC. Signed-off-by: Ran Wang --- Change in v4: - Adjust indectation of 'ls1021a, ls1012a, ls1043a, ls1046a'. Change in v3: - None. Change in v2: - None. Documentation/devicetree/bindings/soc/fsl/rcpm.txt |8 +++- 1 files changed, 7 insertions(+), 1 deletions(-) diff --git a/Documentation/devicetree/bindings/soc/fsl/rcpm.txt b/Documentation/devicetree/bindings/soc/fsl/rcpm.txt index e284e4e..058154c 100644 --- a/Documentation/devicetree/bindings/soc/fsl/rcpm.txt +++ b/Documentation/devicetree/bindings/soc/fsl/rcpm.txt @@ -20,6 +20,7 @@ Required properites: * "fsl,qoriq-rcpm-1.0": for chassis 1.0 rcpm * "fsl,qoriq-rcpm-2.0": for chassis 2.0 rcpm * "fsl,qoriq-rcpm-2.1": for chassis 2.1 rcpm + * "fsl,qoriq-rcpm-2.1+": for chassis 2.1+ rcpm All references to "1.0" and "2.0" refer to the QorIQ chassis version to which the chip complies. @@ -27,7 +28,12 @@ Chassis Version Example Chips ------ 1.0p4080, p5020, p5040, p2041, p3041 2.0t4240, b4860, b4420 -2.1t1040, ls1021 +2.1t1040, +2.1+ ls1021a, ls1012a, ls1043a, ls1046a + +Optional properties: + - little-endian : RCPM register block is Little Endian. Without it RCPM + will be Big Endian (default case). Example: The RCPM node for T4240: -- 1.7.1
Re: [PATCH] kbuild: do not check name uniqueness of builtin modules
On Mon, May 20, 2019 at 11:54:37AM +0900, Masahiro Yamada wrote: > I just thought it was a good idea to scan builtin.modules in the name > uniqueness checking, but Stephen reported a false positive. > > ppc64_defconfig produces: > > warning: same basename if the following are built as modules: > arch/powerpc/platforms/powermac/nvram.ko > drivers/char/nvram.ko > > ..., which is a false positive because the former is never built as > a module as you see in arch/powerpc/platforms/powermac/Makefile: > > # CONFIG_NVRAM is an arch. independent tristate symbol, for pmac32 we really > # need this to be a bool. Cheat here and pretend CONFIG_NVRAM=m is really > # CONFIG_NVRAM=y > obj-$(CONFIG_NVRAM:m=y) += nvram.o > > Since we cannot predict how tricky Makefiles are written in wild, > builtin.modules may potentially contain false positives. I do not > think it is a big deal as far as kmod is concerned, but false positive > warnings in the kernel build makes people upset. It is better to not > do it. > > Even without checking builtin.modules, we have enough (and more solid) > test coverage with allmodconfig. > > While I touched this part, I replaced the sed code with neater one > provided by Stephen. > > Link: https://lkml.org/lkml/2019/5/19/120 > Link: https://lkml.org/lkml/2019/5/19/123 > Fixes: 3a48a91901c5 ("kbuild: check uniqueness of module names") > Reported-by: Stephen Rothwell > Signed-off-by: Masahiro Yamada > --- Reviewed-by: Greg Kroah-Hartman
Re: [PATCH] kbuild: do not check name uniqueness of builtin modules
On Mon, May 20, 2019 at 4:57 AM Masahiro Yamada wrote: > > I just thought it was a good idea to scan builtin.modules in the name > uniqueness checking, but Stephen reported a false positive. > > ppc64_defconfig produces: > > warning: same basename if the following are built as modules: > arch/powerpc/platforms/powermac/nvram.ko > drivers/char/nvram.ko > > ..., which is a false positive because the former is never built as > a module as you see in arch/powerpc/platforms/powermac/Makefile: > > # CONFIG_NVRAM is an arch. independent tristate symbol, for pmac32 we really > # need this to be a bool. Cheat here and pretend CONFIG_NVRAM=m is really > # CONFIG_NVRAM=y > obj-$(CONFIG_NVRAM:m=y) += nvram.o > > Since we cannot predict how tricky Makefiles are written in wild, > builtin.modules may potentially contain false positives. I do not > think it is a big deal as far as kmod is concerned, but false positive > warnings in the kernel build makes people upset. It is better to not > do it. > > Even without checking builtin.modules, we have enough (and more solid) > test coverage with allmodconfig. > > While I touched this part, I replaced the sed code with neater one > provided by Stephen. > > Link: https://lkml.org/lkml/2019/5/19/120 > Link: https://lkml.org/lkml/2019/5/19/123 > Fixes: 3a48a91901c5 ("kbuild: check uniqueness of module names") > Reported-by: Stephen Rothwell > Signed-off-by: Masahiro Yamada Looks good to me Acked-by: Arnd Bergmann > --- > > scripts/modules-check.sh | 6 +++--- > 1 file changed, 3 insertions(+), 3 deletions(-) > > diff --git a/scripts/modules-check.sh b/scripts/modules-check.sh > index 2f659530e1ec..39e8cb36ba19 100755 > --- a/scripts/modules-check.sh > +++ b/scripts/modules-check.sh > @@ -6,10 +6,10 @@ set -e > # Check uniqueness of module names > check_same_name_modules() > { > - for m in $(sed 's:.*/::' modules.order modules.builtin | sort | uniq > -d) > + for m in $(sed 's:.*/::' modules.order | sort | uniq -d) > do > - echo "warning: same basename if the following are built as > modules:" >&2 > - sed "/\/$m/!d;s:^kernel/: :" modules.order modules.builtin > >&2 > + echo "warning: same module names found:" >&2 > + sed -n "/\/$m/s:^kernel/: :p" modules.order >&2 > done > } > > -- > 2.17.1 >
Re: [PATCH V2 3/3] soc: fsl: add RCPM driver
Hi! > > > > You are right, but the current code is "interesting". What about > > > > > > > > ws = NULL; > > > > while (ws = wakeup_source_get_next(NULL)) ... > > > > > > > > then? > > > > > > Did you mean: > > > ws = NULL; > > > while (ws = wakeup_source_get_next(ws)) ... > > > > > >Yes, that will be the same to my original logic, do you recommend > > > to change to this? :) > > > > Yes please. It will be less confusing to the reader. > > OK, if no other comment, I will work out v4, fix this and extra ',' > > > Thanks (and sorry for cross-talk), > > That's OK, thanks for your time. You can add Acked-by: Pavel Machek to that version. Best regards, Pavel -- DENX Software Engineering GmbH, Managing Director: Wolfgang Denk HRB 165235 Munich, Office: Kirchenstr.5, D-82194 Groebenzell, Germany signature.asc Description: Digital signature
Re: [PATCH] powerpc/perf: Use cpumask_last() to determine the
Hi, Somehow the subject of this patch didn't appear completely here. The Subject of this patch is as follows, `Subject [PATCH] powerpc/perf: Use cpumask_last() to determine the designated cpu for nest/core units.` Thanks, Anju On 5/20/19 2:35 PM, Anju T Sudhakar wrote: Nest and core imc(In-memory Collection counters) assigns a particular cpu as the designated target for counter data collection. During system boot, the first online cpu in a chip gets assigned as the designated cpu for that chip(for nest-imc) and the first online cpu in a core gets assigned as the designated cpu for that core(for core-imc). If the designated cpu goes offline, the next online cpu from the same chip(for nest-imc)/core(for core-imc) is assigned as the next target, and the event context is migrated to the target cpu. Currently, cpumask_any_but() function is used to find the target cpu. Though this function is expected to return a `random` cpu, this always returns the next online cpu. If all cpus in a chip/core is offlined in a sequential manner, starting from the first cpu, the event migration has to happen for all the cpus which goes offline. Since the migration process involves a grace period, the total time taken to offline all the cpus will be significantly high.
RE: [PATCH V2 3/3] soc: fsl: add RCPM driver
Hi Pavel, On Monday, May 20, 2019 17:08 Pavel Machek wrote: > > > Hi! > > > > > > > > > +static int rcpm_pm_prepare(struct device *dev) { > > > > > > + struct device_node *np = dev->of_node; > > > > > > + struct wakeup_source *ws; > > > > > > + struct rcpm *rcpm; > > > > > > + u32 value[RCPM_WAKEUP_CELL_MAX_SIZE + 1], tmp; > > > > > > + int i, ret; > > > > > > + > > > > > > + rcpm = dev_get_drvdata(dev); > > > > > > + if (!rcpm) > > > > > > + return -EINVAL; > > > > > > + > > > > > > + /* Begin with first registered wakeup source */ > > > > > > + ws = wakeup_source_get_next(NULL); > > > > > > + while (ws) { > > > > > > > > > > while (ws = wakeup_source_get_next(NULL)) ? > > > > > > > > Actually, we only pass NULL to wakeup_source_get_next() at very > > > > first call to get 1st wakeup source. Then in the while loop, we > > > > will fetch next source but not 1st, that's different. I am afraid > > > > your suggestion is not quite correct. > > > > > > Sorry, I seen your next version before seeing this explanation. > > > > > > You are right, but the current code is "interesting". What about > > > > > > ws = NULL; > > > while (ws = wakeup_source_get_next(NULL)) ... > > > > > > then? > > > > Did you mean: > > ws = NULL; > > while (ws = wakeup_source_get_next(ws)) ... > > > >Yes, that will be the same to my original logic, do you recommend > > to change to this? :) > > Yes please. It will be less confusing to the reader. OK, if no other comment, I will work out v4, fix this and extra ',' > Thanks (and sorry for cross-talk), That's OK, thanks for your time. Regards, Ran
Re: [PATCH v2] ocxl: Fix potential memory leak on context creation
On Mon, 20 May 2019 09:16:18 +0200 Frederic Barrat wrote: > If we couldn't fully init a context, we were leaking memory. > > Fixes: b9721d275cc2 ("ocxl: Allow external drivers to use OpenCAPI contexts") Oops... missed that during review :-\ > Signed-off-by: Frederic Barrat > --- > > Changelog: > v2: reset context pointer in case of allocation failure (Andrew) > Alternatively you could change the code to do: ctx = kzalloc(sizeof(struct ocxl_context), GFP_KERNEL); if (!ctx) return -ENOMEM; . . . if (pasid < 0) { mutex_unlock(>contexts_lock); kfree(ctx); return pasid; } . . . *context = ctx; return 0; } This has the advantage of clearing any risk of side-effect with *context forever, which is a safer practice IMHO. Patch is correct anyway, so: Reviewed-by: Greg Kurz > drivers/misc/ocxl/context.c | 2 ++ > 1 file changed, 2 insertions(+) > > diff --git a/drivers/misc/ocxl/context.c b/drivers/misc/ocxl/context.c > index bab9c9364184..24e4fb010275 100644 > --- a/drivers/misc/ocxl/context.c > +++ b/drivers/misc/ocxl/context.c > @@ -22,6 +22,8 @@ int ocxl_context_alloc(struct ocxl_context **context, > struct ocxl_afu *afu, > afu->pasid_base + afu->pasid_max, GFP_KERNEL); > if (pasid < 0) { > mutex_unlock(>contexts_lock); > + kfree(*context); > + *context = NULL; > return pasid; > } > afu->pasid_count++;
Re: [PATCH V2 3/3] soc: fsl: add RCPM driver
On Mon 2019-05-20 09:03:50, Ran Wang wrote: > Hi Pavel, > > On Monday, May 20, 2019 16:57, Pavel Machek wrote: > > > > Hi! > > > > > > > +static int rcpm_pm_prepare(struct device *dev) { > > > > > + struct device_node *np = dev->of_node; > > > > > + struct wakeup_source *ws; > > > > > + struct rcpm *rcpm; > > > > > + u32 value[RCPM_WAKEUP_CELL_MAX_SIZE + 1], tmp; > > > > > + int i, ret; > > > > > + > > > > > + rcpm = dev_get_drvdata(dev); > > > > > + if (!rcpm) > > > > > + return -EINVAL; > > > > > + > > > > > + /* Begin with first registered wakeup source */ > > > > > + ws = wakeup_source_get_next(NULL); > > > > > + while (ws) { > > > > > > > > while (ws = wakeup_source_get_next(NULL)) ? > > > > > > Actually, we only pass NULL to wakeup_source_get_next() at very first > > > call to get 1st wakeup source. Then in the while loop, we will fetch > > > next source but not 1st, that's different. I am afraid your suggestion > > > is not quite correct. > > > > Sorry, I seen your next version before seeing this explanation. > > > > You are right, but the current code is "interesting". What about > > > > ws = NULL; > > while (ws = wakeup_source_get_next(NULL)) ... > > > > then? > > Did you mean: > ws = NULL; > while (ws = wakeup_source_get_next(ws)) ... > >Yes, that will be the same to my original logic, do you recommend to change > to this? :) Yes please. It will be less confusing to the reader. Thanks (and sorry for cross-talk), Pavel -- DENX Software Engineering GmbH, Managing Director: Wolfgang Denk HRB 165235 Munich, Office: Kirchenstr.5, D-82194 Groebenzell, Germany signature.asc Description: Digital signature
[PATCH] powerpc/perf: Use cpumask_last() to determine the
Nest and core imc(In-memory Collection counters) assigns a particular cpu as the designated target for counter data collection. During system boot, the first online cpu in a chip gets assigned as the designated cpu for that chip(for nest-imc) and the first online cpu in a core gets assigned as the designated cpu for that core(for core-imc). If the designated cpu goes offline, the next online cpu from the same chip(for nest-imc)/core(for core-imc) is assigned as the next target, and the event context is migrated to the target cpu. Currently, cpumask_any_but() function is used to find the target cpu. Though this function is expected to return a `random` cpu, this always returns the next online cpu. If all cpus in a chip/core is offlined in a sequential manner, starting from the first cpu, the event migration has to happen for all the cpus which goes offline. Since the migration process involves a grace period, the total time taken to offline all the cpus will be significantly high. Example: In a system which has 2 sockets, with NUMA node0 CPU(s): 0-87 NUMA node8 CPU(s): 88-175 Time taken to offline cpu 88-175: real2m56.099s user0m0.191s sys 0m0.000s Use cpumask_last() to choose the target cpu, when the designated cpu goes online, so the migration will happen only when the last_cpu in the mask goes offline. This way the time taken to offline all cpus in a chip/core can be reduced. With the patch, Time taken to offline cpu 88-175: real0m12.207s user0m0.171s sys 0m0.000s cpumask_last() is a better way to find the target cpu, since in most of the cases cpuhotplug is performed in an increasing order(even in ppc64_cpu). cpumask_any_but() can still be used to check the possibility of other online cpus from the same chip/core if the last cpu in the mask goes offline. Signed-off-by: Anju T Sudhakar --- arch/powerpc/perf/imc-pmu.c | 14 -- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 31fa753..fbfd6e7 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -366,7 +366,14 @@ static int ppc_nest_imc_cpu_offline(unsigned int cpu) */ nid = cpu_to_node(cpu); l_cpumask = cpumask_of_node(nid); - target = cpumask_any_but(l_cpumask, cpu); + target = cpumask_last(l_cpumask); + + /* +* If this(target) is the last cpu in the cpumask for this chip, +* check for any possible online cpu in the chip. +*/ + if (unlikely(target == cpu)) + target = cpumask_any_but(l_cpumask, cpu); /* * Update the cpumask with the target cpu and @@ -671,7 +678,10 @@ static int ppc_core_imc_cpu_offline(unsigned int cpu) return 0; /* Find any online cpu in that core except the current "cpu" */ - ncpu = cpumask_any_but(cpu_sibling_mask(cpu), cpu); + ncpu = cpumask_last(cpu_sibling_mask(cpu)); + + if (unlikely(ncpu == cpu)) + ncpu = cpumask_any_but(cpu_sibling_mask(cpu), cpu); if (ncpu >= 0 && ncpu < nr_cpu_ids) { cpumask_set_cpu(ncpu, _imc_cpumask); -- 1.8.3.1
RE: [PATCH V2 3/3] soc: fsl: add RCPM driver
Hi Pavel, On Monday, May 20, 2019 16:57, Pavel Machek wrote: > > Hi! > > > > > +static int rcpm_pm_prepare(struct device *dev) { > > > > + struct device_node *np = dev->of_node; > > > > + struct wakeup_source *ws; > > > > + struct rcpm *rcpm; > > > > + u32 value[RCPM_WAKEUP_CELL_MAX_SIZE + 1], tmp; > > > > + int i, ret; > > > > + > > > > + rcpm = dev_get_drvdata(dev); > > > > + if (!rcpm) > > > > + return -EINVAL; > > > > + > > > > + /* Begin with first registered wakeup source */ > > > > + ws = wakeup_source_get_next(NULL); > > > > + while (ws) { > > > > > > while (ws = wakeup_source_get_next(NULL)) ? > > > > Actually, we only pass NULL to wakeup_source_get_next() at very first > > call to get 1st wakeup source. Then in the while loop, we will fetch > > next source but not 1st, that's different. I am afraid your suggestion > > is not quite correct. > > Sorry, I seen your next version before seeing this explanation. > > You are right, but the current code is "interesting". What about > > ws = NULL; > while (ws = wakeup_source_get_next(NULL)) ... > > then? Did you mean: ws = NULL; while (ws = wakeup_source_get_next(ws)) ... Yes, that will be the same to my original logic, do you recommend to change to this? :) Regards, Ran
[PATCH] powerpc/powernv: Return for invalid IMC domain
Currently init_imc_pmu() can be failed either because an IMC unit with invalid domain(i.e an IMC node not supported by the kernel) is attempted a pmu-registration or something went wrong while registering a valid IMC unit. In both the cases kernel provides a 'Registration failed' error message. Example: Log message, when trace-imc node is not supported by the kernel, and the skiboot supports trace-imc node. So for kernel, trace-imc node is now an unknown domain. [1.731870] nest_phb5_imc performance monitor hardware support registered [1.731944] nest_powerbus0_imc performance monitor hardware support registered [1.734458] thread_imc performance monitor hardware support registered [1.734460] IMC Unknown Device type [1.734462] IMC PMU (null) Register failed [1.734558] nest_xlink0_imc performance monitor hardware support registered [1.734614] nest_xlink1_imc performance monitor hardware support registered [1.734670] nest_xlink2_imc performance monitor hardware support registered [1.747043] Initialise system trusted keyrings [1.747054] Key type blacklist registered To avoid ambiguity on the error message, return for invalid domain before attempting a pmu registration. Fixes: 8f95faaac56c1 (`powerpc/powernv: Detect and create IMC device`) Reported-by: Pavaman Subramaniyam Signed-off-by: Anju T Sudhakar --- arch/powerpc/platforms/powernv/opal-imc.c | 4 1 file changed, 4 insertions(+) diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c index 58a0794..4e8b0e1 100644 --- a/arch/powerpc/platforms/powernv/opal-imc.c +++ b/arch/powerpc/platforms/powernv/opal-imc.c @@ -161,6 +161,10 @@ static int imc_pmu_create(struct device_node *parent, int pmu_index, int domain) struct imc_pmu *pmu_ptr; u32 offset; + /* Return for unknown domain */ + if (domain < 0) + return -EINVAL; + /* memory for pmu */ pmu_ptr = kzalloc(sizeof(*pmu_ptr), GFP_KERNEL); if (!pmu_ptr) -- 1.8.3.1
Re: [RFC PATCH 4/4] powerpc/ftrace: Additionally nop out the preceding mflr with -mprofile-kernel
Nicholas Piggin wrote: Naveen N. Rao's on May 18, 2019 5:02 am: With -mprofile-kernel, gcc emits 'mflr r0', followed by 'bl _mcount' to enable function tracing and profiling. So far, with dynamic ftrace, we used to only patch out the branch to _mcount(). However, Nick Piggin points out that "mflr is executed by the branch unit that can only execute one per cycle on POWER9 and shared with branches, so it would be nice to avoid it where possible." We cannot simply nop out the mflr either. Michael Ellerman pointed out that when enabling function tracing, there can be a race if tracing is enabled when some thread was interrupted after executing a nop'ed out mflr. In this case, the thread would execute the now-patched-in branch to _mcount() without having executed the preceding mflr. To solve this, we now enable function tracing in 2 steps: patch in the mflr instruction, use synchronize_rcu_tasks() to ensure all existing threads make progress, and then patch in the branch to _mcount(). We override ftrace_replace_code() with a powerpc64 variant for this purpose. Signed-off-by: Nicholas Piggin Signed-off-by: Naveen N. Rao Nice! Thanks for doing a real patch. You needn't add my SOB there: my hack was obviously garbage :) Suggested-by if anything, then for clarity of changelog you can write the motivation directly rather than quote me. Thanks, I meant to call out the fact that I had added your SOB before sending the patch, but missed doing so. Your patch was perfectly fine ;) I don't know the ftrace subsystem well, but the powerpc instructions and patching sequence appears to match what we agreed is the right way to go. As a suggestion, I would perhaps add most of information from the second and third paragraphs of the changelog into comments (and also explain that the lone mflr r0 is harmless). But otherwise it looks good Reviewed-by: Nicholas Piggin Thanks, I will incorporate those changes. - Naveen
Re: [PATCH V2 3/3] soc: fsl: add RCPM driver
Hi! > > > +static int rcpm_pm_prepare(struct device *dev) { > > > + struct device_node *np = dev->of_node; > > > + struct wakeup_source *ws; > > > + struct rcpm *rcpm; > > > + u32 value[RCPM_WAKEUP_CELL_MAX_SIZE + 1], tmp; > > > + int i, ret; > > > + > > > + rcpm = dev_get_drvdata(dev); > > > + if (!rcpm) > > > + return -EINVAL; > > > + > > > + /* Begin with first registered wakeup source */ > > > + ws = wakeup_source_get_next(NULL); > > > + while (ws) { > > > > while (ws = wakeup_source_get_next(NULL)) ? > > Actually, we only pass NULL to wakeup_source_get_next() at very first > call to get 1st wakeup source. Then in the while loop, we will fetch > next source but not 1st, that's different. I am afraid your suggestion > is not quite correct. Sorry, I seen your next version before seeing this explanation. You are right, but the current code is "interesting". What about ws = NULL; while (ws = wakeup_source_get_next(NULL)) ... then? Best regards, Pavel -- DENX Software Engineering GmbH, Managing Director: Wolfgang Denk HRB 165235 Munich, Office: Kirchenstr.5, D-82194 Groebenzell, Germany signature.asc Description: Digital signature
Re: PROBLEM: Power9: kernel oops on memory hotunplug from ppc64le guest
On Mon, May 20, 2019 at 05:00:21PM +1000, Nicholas Piggin wrote: > Bharata B Rao's on May 20, 2019 3:56 pm: > > On Mon, May 20, 2019 at 02:48:35PM +1000, Nicholas Piggin wrote: > >> >> > git bisect points to > >> >> > > >> >> > commit 4231aba000f5a4583dd9f67057aadb68c3eca99d > >> >> > Author: Nicholas Piggin > >> >> > Date: Fri Jul 27 21:48:17 2018 +1000 > >> >> > > >> >> > powerpc/64s: Fix page table fragment refcount race vs speculative > >> >> > references > >> >> > > >> >> > The page table fragment allocator uses the main page refcount > >> >> > racily > >> >> > with respect to speculative references. A customer observed a BUG > >> >> > due > >> >> > to page table page refcount underflow in the fragment allocator. > >> >> > This > >> >> > can be caused by the fragment allocator set_page_count stomping > >> >> > on a > >> >> > speculative reference, and then the speculative failure handler > >> >> > decrements the new reference, and the underflow eventually pops > >> >> > when > >> >> > the page tables are freed. > >> >> > > >> >> > Fix this by using a dedicated field in the struct page for the > >> >> > page > >> >> > table fragment allocator. > >> >> > > >> >> > Fixes: 5c1f6ee9a31c ("powerpc: Reduce PTE table memory wastage") > >> >> > Cc: sta...@vger.kernel.org # v3.10+ > >> >> > >> >> That's the commit that added the BUG_ON(), so prior to that you won't > >> >> see the crash. > >> > > >> > Right, but the commit says it fixes page table page refcount underflow by > >> > introducing a new field >pt_frag_refcount. Now we are hitting the > >> > underflow > >> > for this pt_frag_refcount. > >> > >> The fixed underflow is caused by a bug (race on page count) that got > >> fixed by that patch. You are hitting a different underflow here. It's > >> not certain my patch caused it, I'm just trying to reproduce now. > > > > Ok. > > Can't reproduce I'm afraid, tried adding and removing 8GB memory from a > 4GB guest (via host adding / removing memory device), and it just works. Boot, add 8G, reboot, remove 8G is the sequence to reproduce. > > It's likely to be an edge case like an off by one or rounding error > that just happens to trigger in your config. Might be easiest if you > could test with a debug patch. Sure, I will continue debugging. Regards, Bharata.
[PATCH v2] mm/nvdimm: Pick the right alignment default when creating dax devices
Allow arch to provide the supported alignments and use hugepage alignment only if we support hugepage. Right now we depend on compile time configs whereas this patch switch this to runtime discovery. Architectures like ppc64 can have THP enabled in code, but then can have hugepage size disabled by the hypervisor. This allows us to create dax devices with PAGE_SIZE alignment in this case. Existing dax namespace with alignment larger than PAGE_SIZE will fail to initialize in this specific case. We still allow fsdax namespace initialization. With respect to identifying whether to enable hugepage fault for a dax device, if THP is enabled during compile, we default to taking hugepage fault and in dax fault handler if we find the fault size > alignment we retry with PAGE_SIZE fault size. This also addresses the below failure scenario on ppc64 ndctl create-namespace --mode=devdax | grep align "align":16777216, "align":16777216 cat /sys/devices/ndbus0/region0/dax0.0/supported_alignments 65536 16777216 daxio.static-debug -z -o /dev/dax0.0 Bus error (core dumped) $ dmesg | tail lpar: Failed hash pte insert with error -4 hash-mmu: mm: Hashing failure ! EA=0x7fff1700 access=0x8006 current=daxio hash-mmu: trap=0x300 vsid=0x22cb7a3 ssize=1 base psize=2 psize 10 pte=0xc00501002b86 daxio[3860]: bus error (7) at 7fff1700 nip 7fff973c007c lr 7fff973bff34 code 2 in libpmem.so.1.0.0[7fff973b+2] daxio[3860]: code: 792945e4 7d494b78 e95f0098 7d494b78 f93f00a0 4800012c e93f0088 f93f0120 daxio[3860]: code: e93f00a0 f93f0128 e93f0120 e95f0128 e93f0088 39290008 f93f0110 The failure was due to guest kernel using wrong page size. The namespaces created with 16M alignment will appear as below on a config with 16M page size disabled. $ ndctl list -Ni [ { "dev":"namespace0.1", "mode":"fsdax", "map":"dev", "size":5351931904, "uuid":"fc6e9667-461a-4718-82b4-69b24570bddb", "align":16777216, "blockdev":"pmem0.1", "supported_alignments":[ 65536 ] }, { "dev":"namespace0.0", "mode":"fsdax",< devdax 16M alignment marked disabled. "map":"mem", "size":5368709120, "uuid":"a4bdf81a-f2ee-4bc6-91db-7b87eddd0484", "state":"disabled" } ] Signed-off-by: Aneesh Kumar K.V --- Changes from V1: * Return -EOPNOTSUPP if we find that we can't initialize the namespace * Compare fix for DAX signature. arch/powerpc/include/asm/libnvdimm.h | 9 arch/powerpc/mm/Makefile | 1 + arch/powerpc/mm/nvdimm.c | 34 arch/x86/include/asm/libnvdimm.h | 19 drivers/nvdimm/nd.h | 6 - drivers/nvdimm/pfn_devs.c| 32 +- drivers/nvdimm/pmem.c| 26 + include/linux/huge_mm.h | 7 +- 8 files changed, 122 insertions(+), 12 deletions(-) create mode 100644 arch/powerpc/include/asm/libnvdimm.h create mode 100644 arch/powerpc/mm/nvdimm.c create mode 100644 arch/x86/include/asm/libnvdimm.h diff --git a/arch/powerpc/include/asm/libnvdimm.h b/arch/powerpc/include/asm/libnvdimm.h new file mode 100644 index ..d35fd7f48603 --- /dev/null +++ b/arch/powerpc/include/asm/libnvdimm.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_POWERPC_LIBNVDIMM_H +#define _ASM_POWERPC_LIBNVDIMM_H + +#define nd_pfn_supported_alignments nd_pfn_supported_alignments +extern unsigned long *nd_pfn_supported_alignments(void); +extern unsigned long nd_pfn_default_alignment(void); + +#endif diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index 0f499db315d6..42e4a399ba5d 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -20,3 +20,4 @@ obj-$(CONFIG_HIGHMEM) += highmem.o obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o obj-$(CONFIG_PPC_PTDUMP) += ptdump/ obj-$(CONFIG_KASAN)+= kasan/ +obj-$(CONFIG_NVDIMM_PFN) += nvdimm.o diff --git a/arch/powerpc/mm/nvdimm.c b/arch/powerpc/mm/nvdimm.c new file mode 100644 index ..a29a4510715e --- /dev/null +++ b/arch/powerpc/mm/nvdimm.c @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +#include +/* + * We support only pte and pmd mappings for now. + */ +const unsigned long *nd_pfn_supported_alignments(void) +{ + static unsigned long supported_alignments[3]; + + supported_alignments[0] = PAGE_SIZE; + + if (has_transparent_hugepage()) + supported_alignments[1] = HPAGE_PMD_SIZE; + else + supported_alignments[1] = 0; + + supported_alignments[2] = 0; + return supported_alignments; +} + +/* + * Use pmd mapping if supported as default alignment + */ +unsigned long nd_pfn_default_alignment(void) +{ + + if (has_transparent_hugepage()) + return HPAGE_PMD_SIZE; + return PAGE_SIZE; +} diff --git
RE: [EXT] Re: [PATCH 2/3] arm64: dts: ls1028a: Add PCIe controller DT nodes
Hi Arndt, -Original Message- From: Arnd Bergmann Sent: 2019年5月17日 16:59 To: Xiaowei Bao Cc: Bjorn Helgaas ; Rob Herring ; Mark Rutland ; Shawn Guo ; Leo Li ; Kishon ; Lorenzo Pieralisi ; gregkh ; M.h. Lian ; Mingkai Hu ; Roy Zang ; Kate Stewart ; Philippe Ombredanne ; Shawn Lin ; linux-pci ; DTML ; Linux Kernel Mailing List ; Linux ARM ; linuxppc-dev Subject: Re: [EXT] Re: [PATCH 2/3] arm64: dts: ls1028a: Add PCIe controller DT nodes Caution: EXT Email On Fri, May 17, 2019 at 5:21 AM Xiaowei Bao wrote: > -Original Message- > From: Arnd Bergmann > On Wed, May 15, 2019 at 9:36 AM Xiaowei Bao wrote: > > Signed-off-by: Xiaowei Bao > > --- > > arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi | 52 > > > > 1 files changed, 52 insertions(+), 0 deletions(-) > > > > diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi > > b/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi > > index b045812..50b579b 100644 > > --- a/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi > > +++ b/arch/arm64/boot/dts/freescale/fsl-ls1028a.dtsi > > @@ -398,6 +398,58 @@ > > status = "disabled"; > > }; > > > > + pcie@340 { > > + compatible = "fsl,ls1028a-pcie"; > > + reg = <0x00 0x0340 0x0 0x0010 /* > > controller registers */ > > + 0x80 0x 0x0 0x2000>; /* > > configuration space */ > > + reg-names = "regs", "config"; > > + interrupts = , /* > > PME interrupt */ > > +; /* > > aer interrupt */ > > + interrupt-names = "pme", "aer"; > > + #address-cells = <3>; > > + #size-cells = <2>; > > + device_type = "pci"; > > + dma-coherent; > > + num-lanes = <4>; > > + bus-range = <0x0 0xff>; > > + ranges = <0x8100 0x0 0x 0x80 0x0001 > > 0x0 0x0001 /* downstream I/O */ > > + 0x8200 0x0 0x4000 0x80 > > + 0x4000 0x0 0x4000>; /* non-prefetchable memory */ > > Are you sure there is no support for 64-bit BARs or prefetchable memory? > [Xiaowei Bao] sorry for late reply, Thought that our Layerscape platform has > not added prefetchable memory support in DTS, so this platform has not been > added, I will submit a separate patch to add prefetchable memory support for > all Layerscape platforms. Ok, thanks. > Of course, the prefetchable PCIE device can work in our boards, > because the RC will assign non-prefetchable memory for this device. We > reserve 1G no-prefetchable memory for PCIE device, it is enough for general > devices. Sure, many devices work just fine, this is mostly a question of supporting those devices that do require multiple gigabytes, or that need prefetchable memory semantics to get the expected performance. GPUs are the obvious example, but I think there are others (infiniband?). [Xiaowei Bao] sorry, I don't know much about infiniband and GPU, as you said, I think many devices works fine with this DTS, I will add the prefetchable memory entry in DTS future and submit another patch. Arnd
RE: [PATCH v3 3/3] soc: fsl: add RCPM driver
Hi Pavel, On Monday, May 20, 2019 15:27: Pavel Machek wrote: > > Hi! > > > The NXP's QorIQ Processors based on ARM Core have RCPM module (Run > > Control and Power Management), which performs all device-level tasks > > associated with power management such as wakeup source control. > > > > This driver depends on PM wakeup source framework which help to > > collect wake information. > > > > Signed-off-by: Ran Wang > > > +// Copyright 2019 NXP > > +// > > +// Author: Ran Wang , > > extra , OK, will update. > > + rcpm = dev_get_drvdata(dev); > > + if (!rcpm) > > + return -EINVAL; > > + > > + /* Begin with first registered wakeup source */ > > + ws = wakeup_source_get_next(NULL); > > + while (ws) { > > while (ws = wakeup_source_get_next(NULL)) > > ? I just answered this in v2 mail thread: "Actually, we only pass NULL to wakeup_source_get_next() at very first call to get 1st wakeup source. Then in the while loop, we will fetch next source but not 1st, that's different. I am afraid your suggestion is not quite correct." Regards Ran
Re: [PATCH v3 3/3] soc: fsl: add RCPM driver
Hi! > The NXP's QorIQ Processors based on ARM Core have RCPM module > (Run Control and Power Management), which performs all device-level > tasks associated with power management such as wakeup source control. > > This driver depends on PM wakeup source framework which help to > collect wake information. > > Signed-off-by: Ran Wang > +// Copyright 2019 NXP > +// > +// Author: Ran Wang , extra , > + rcpm = dev_get_drvdata(dev); > + if (!rcpm) > + return -EINVAL; > + > + /* Begin with first registered wakeup source */ > + ws = wakeup_source_get_next(NULL); > + while (ws) { while (ws = wakeup_source_get_next(NULL)) ? Pavel -- (english) http://www.livejournal.com/~pavelmachek (cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html signature.asc Description: Digital signature
Re: [PATCH v2] ocxl: Fix potential memory leak on context creation
Acked-by: Andrew Donnellan On 20/5/19 5:16 pm, Frederic Barrat wrote: If we couldn't fully init a context, we were leaking memory. Fixes: b9721d275cc2 ("ocxl: Allow external drivers to use OpenCAPI contexts") Signed-off-by: Frederic Barrat --- Changelog: v2: reset context pointer in case of allocation failure (Andrew) drivers/misc/ocxl/context.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/misc/ocxl/context.c b/drivers/misc/ocxl/context.c index bab9c9364184..24e4fb010275 100644 --- a/drivers/misc/ocxl/context.c +++ b/drivers/misc/ocxl/context.c @@ -22,6 +22,8 @@ int ocxl_context_alloc(struct ocxl_context **context, struct ocxl_afu *afu, afu->pasid_base + afu->pasid_max, GFP_KERNEL); if (pasid < 0) { mutex_unlock(>contexts_lock); + kfree(*context); + *context = NULL; return pasid; } afu->pasid_count++; -- Andrew Donnellan OzLabs, ADL Canberra a...@linux.ibm.com IBM Australia Limited
[PATCH v2] ocxl: Fix potential memory leak on context creation
If we couldn't fully init a context, we were leaking memory. Fixes: b9721d275cc2 ("ocxl: Allow external drivers to use OpenCAPI contexts") Signed-off-by: Frederic Barrat --- Changelog: v2: reset context pointer in case of allocation failure (Andrew) drivers/misc/ocxl/context.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/misc/ocxl/context.c b/drivers/misc/ocxl/context.c index bab9c9364184..24e4fb010275 100644 --- a/drivers/misc/ocxl/context.c +++ b/drivers/misc/ocxl/context.c @@ -22,6 +22,8 @@ int ocxl_context_alloc(struct ocxl_context **context, struct ocxl_afu *afu, afu->pasid_base + afu->pasid_max, GFP_KERNEL); if (pasid < 0) { mutex_unlock(>contexts_lock); + kfree(*context); + *context = NULL; return pasid; } afu->pasid_count++; -- 2.21.0
Re: PROBLEM: Power9: kernel oops on memory hotunplug from ppc64le guest
Bharata B Rao's on May 20, 2019 3:56 pm: > On Mon, May 20, 2019 at 02:48:35PM +1000, Nicholas Piggin wrote: >> >> > git bisect points to >> >> > >> >> > commit 4231aba000f5a4583dd9f67057aadb68c3eca99d >> >> > Author: Nicholas Piggin >> >> > Date: Fri Jul 27 21:48:17 2018 +1000 >> >> > >> >> > powerpc/64s: Fix page table fragment refcount race vs speculative >> >> > references >> >> > >> >> > The page table fragment allocator uses the main page refcount racily >> >> > with respect to speculative references. A customer observed a BUG >> >> > due >> >> > to page table page refcount underflow in the fragment allocator. >> >> > This >> >> > can be caused by the fragment allocator set_page_count stomping on a >> >> > speculative reference, and then the speculative failure handler >> >> > decrements the new reference, and the underflow eventually pops when >> >> > the page tables are freed. >> >> > >> >> > Fix this by using a dedicated field in the struct page for the page >> >> > table fragment allocator. >> >> > >> >> > Fixes: 5c1f6ee9a31c ("powerpc: Reduce PTE table memory wastage") >> >> > Cc: sta...@vger.kernel.org # v3.10+ >> >> >> >> That's the commit that added the BUG_ON(), so prior to that you won't >> >> see the crash. >> > >> > Right, but the commit says it fixes page table page refcount underflow by >> > introducing a new field >pt_frag_refcount. Now we are hitting the >> > underflow >> > for this pt_frag_refcount. >> >> The fixed underflow is caused by a bug (race on page count) that got >> fixed by that patch. You are hitting a different underflow here. It's >> not certain my patch caused it, I'm just trying to reproduce now. > > Ok. Can't reproduce I'm afraid, tried adding and removing 8GB memory from a 4GB guest (via host adding / removing memory device), and it just works. It's likely to be an edge case like an off by one or rounding error that just happens to trigger in your config. Might be easiest if you could test with a debug patch. Thanks, Nick
[PATCH v3 3/3] soc: fsl: add RCPM driver
The NXP's QorIQ Processors based on ARM Core have RCPM module (Run Control and Power Management), which performs all device-level tasks associated with power management such as wakeup source control. This driver depends on PM wakeup source framework which help to collect wake information. Signed-off-by: Ran Wang --- Change in v3: - Some whitespace ajdustment. Change in v2: - Rebase Kconfig and Makefile update to latest mainline. drivers/soc/fsl/Kconfig |8 +++ drivers/soc/fsl/Makefile |1 + drivers/soc/fsl/rcpm.c | 124 ++ 3 files changed, 133 insertions(+), 0 deletions(-) create mode 100644 drivers/soc/fsl/rcpm.c diff --git a/drivers/soc/fsl/Kconfig b/drivers/soc/fsl/Kconfig index 61f8e14..8e84e40 100644 --- a/drivers/soc/fsl/Kconfig +++ b/drivers/soc/fsl/Kconfig @@ -29,4 +29,12 @@ config FSL_MC_DPIO other DPAA2 objects. This driver does not expose the DPIO objects individually, but groups them under a service layer API. + +config FSL_RCPM + bool "Freescale RCPM support" + depends on PM_SLEEP + help + The NXP's QorIQ Processors based on ARM Core have RCPM module + (Run Control and Power Management), which performs all device-level + tasks associated with power management, such as wakeup source control. endmenu diff --git a/drivers/soc/fsl/Makefile b/drivers/soc/fsl/Makefile index 803ef1b..c1be6ee 100644 --- a/drivers/soc/fsl/Makefile +++ b/drivers/soc/fsl/Makefile @@ -7,3 +7,4 @@ obj-$(CONFIG_QUICC_ENGINE) += qe/ obj-$(CONFIG_CPM) += qe/ obj-$(CONFIG_FSL_GUTS) += guts.o obj-$(CONFIG_FSL_MC_DPIO) += dpio/ +obj-$(CONFIG_FSL_RCPM) += rcpm.o diff --git a/drivers/soc/fsl/rcpm.c b/drivers/soc/fsl/rcpm.c new file mode 100644 index 000..678d1cd --- /dev/null +++ b/drivers/soc/fsl/rcpm.c @@ -0,0 +1,124 @@ +// SPDX-License-Identifier: GPL-2.0 +// +// rcpm.c - Freescale QorIQ RCPM driver +// +// Copyright 2019 NXP +// +// Author: Ran Wang , + +#include +#include +#include +#include +#include +#include +#include + +#define RCPM_WAKEUP_CELL_MAX_SIZE 7 + +struct rcpm { + unsigned intwakeup_cells; + void __iomem*ippdexpcr_base; + boollittle_endian; +}; + +static int rcpm_pm_prepare(struct device *dev) +{ + struct device_node *np = dev->of_node; + struct wakeup_source*ws; + struct rcpm *rcpm; + u32 value[RCPM_WAKEUP_CELL_MAX_SIZE + 1], tmp; + int i, ret; + + rcpm = dev_get_drvdata(dev); + if (!rcpm) + return -EINVAL; + + /* Begin with first registered wakeup source */ + ws = wakeup_source_get_next(NULL); + while (ws) { + ret = device_property_read_u32_array(ws->attached_dev, + "fsl,rcpm-wakeup", value, rcpm->wakeup_cells + 1); + + /* Wakeup source should refer to current rcpm device */ + if (ret || (np->phandle != value[0])) { + dev_info(dev, "%s doesn't refer to this rcpm\n", + ws->name); + ws = wakeup_source_get_next(ws); + continue; + } + + for (i = 0; i < rcpm->wakeup_cells; i++) { + /* We can only OR related bits */ + if (value[i + 1]) { + if (rcpm->little_endian) { + tmp = ioread32(rcpm->ippdexpcr_base + i * 4); + tmp |= value[i + 1]; + iowrite32(tmp, rcpm->ippdexpcr_base + i * 4); + } else { + tmp = ioread32be(rcpm->ippdexpcr_base + i * 4); + tmp |= value[i + 1]; + iowrite32be(tmp, rcpm->ippdexpcr_base + i * 4); + } + } + } + ws = wakeup_source_get_next(ws); + } + + return 0; +} + +static const struct dev_pm_ops rcpm_pm_ops = { + .prepare = rcpm_pm_prepare, +}; + +static int rcpm_probe(struct platform_device *pdev) +{ + struct device *dev = >dev; + struct resource *r; + struct rcpm *rcpm; + int ret; + + rcpm = devm_kzalloc(dev, sizeof(*rcpm), GFP_KERNEL); + if (!rcpm) + return -ENOMEM; + + r = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!r) + return -ENODEV; + + rcpm->ippdexpcr_base = devm_ioremap_resource(>dev, r); + if (IS_ERR(rcpm->ippdexpcr_base)) { + ret = PTR_ERR(rcpm->ippdexpcr_base); + return ret; + } + + rcpm->little_endian = device_property_read_bool( +
[PATCH v3 1/3] PM: wakeup: Add routine to help fetch wakeup source object.
Some user might want to go through all registered wakeup sources and doing things accordingly. For example, SoC PM driver might need to do HW programming to prevent powering down specific IP which wakeup source depending on. And is user's responsibility to identify if this wakeup source he is interested in. Signed-off-by: Ran Wang --- Change in v3: - Adjust indentation of *attached_dev;. Change in v2: - None. drivers/base/power/wakeup.c | 18 ++ include/linux/pm_wakeup.h |3 +++ 2 files changed, 21 insertions(+), 0 deletions(-) diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c index 5b2b6a0..6904485 100644 --- a/drivers/base/power/wakeup.c +++ b/drivers/base/power/wakeup.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -226,6 +227,22 @@ void wakeup_source_unregister(struct wakeup_source *ws) } } EXPORT_SYMBOL_GPL(wakeup_source_unregister); +/** + * wakeup_source_get_next - Get next wakeup source from the list + * @ws: Previous wakeup source object, null means caller want first one. + */ +struct wakeup_source *wakeup_source_get_next(struct wakeup_source *ws) +{ + struct list_head *ws_head = _sources; + + if (ws) + return list_next_or_null_rcu(ws_head, >entry, + struct wakeup_source, entry); + else + return list_entry_rcu(ws_head->next, + struct wakeup_source, entry); +} +EXPORT_SYMBOL_GPL(wakeup_source_get_next); /** * device_wakeup_attach - Attach a wakeup source object to a device object. @@ -242,6 +259,7 @@ static int device_wakeup_attach(struct device *dev, struct wakeup_source *ws) return -EEXIST; } dev->power.wakeup = ws; + ws->attached_dev = dev; if (dev->power.wakeirq) device_wakeup_attach_irq(dev, dev->power.wakeirq); spin_unlock_irq(>power.lock); diff --git a/include/linux/pm_wakeup.h b/include/linux/pm_wakeup.h index 0ff134d..913b2fb 100644 --- a/include/linux/pm_wakeup.h +++ b/include/linux/pm_wakeup.h @@ -50,6 +50,7 @@ * @wakeup_count: Number of times the wakeup source might abort suspend. * @active: Status of the wakeup source. * @has_timeout: The wakeup source has been activated with a timeout. + * @attached_dev: The device it attached to */ struct wakeup_source { const char *name; @@ -70,6 +71,7 @@ struct wakeup_source { unsigned long wakeup_count; boolactive:1; boolautosleep_enabled:1; + struct device *attached_dev; }; #ifdef CONFIG_PM_SLEEP @@ -101,6 +103,7 @@ static inline void device_set_wakeup_path(struct device *dev) extern void wakeup_source_remove(struct wakeup_source *ws); extern struct wakeup_source *wakeup_source_register(const char *name); extern void wakeup_source_unregister(struct wakeup_source *ws); +extern struct wakeup_source *wakeup_source_get_next(struct wakeup_source *ws); extern int device_wakeup_enable(struct device *dev); extern int device_wakeup_disable(struct device *dev); extern void device_set_wakeup_capable(struct device *dev, bool capable); -- 1.7.1
[PATCH v3 2/3] Documentation: dt: binding: fsl: Add 'little-endian' and update Chassis define
By default, QorIQ SoC's RCPM register block is Big Endian. But there are some exceptions, such as LS1088A and LS2088A, are Little Endian. So add this optional property to help identify them. Actually LS2021A and other Layerscapes won't totally follow Chassis 2.1, so separate them from powerpc SoC. Signed-off-by: Ran Wang --- Change in v3: - None. Change in v2: - None. Documentation/devicetree/bindings/soc/fsl/rcpm.txt |8 +++- 1 files changed, 7 insertions(+), 1 deletions(-) diff --git a/Documentation/devicetree/bindings/soc/fsl/rcpm.txt b/Documentation/devicetree/bindings/soc/fsl/rcpm.txt index e284e4e..058154c 100644 --- a/Documentation/devicetree/bindings/soc/fsl/rcpm.txt +++ b/Documentation/devicetree/bindings/soc/fsl/rcpm.txt @@ -20,6 +20,7 @@ Required properites: * "fsl,qoriq-rcpm-1.0": for chassis 1.0 rcpm * "fsl,qoriq-rcpm-2.0": for chassis 2.0 rcpm * "fsl,qoriq-rcpm-2.1": for chassis 2.1 rcpm + * "fsl,qoriq-rcpm-2.1+": for chassis 2.1+ rcpm All references to "1.0" and "2.0" refer to the QorIQ chassis version to which the chip complies. @@ -27,7 +28,12 @@ Chassis Version Example Chips ------ 1.0p4080, p5020, p5040, p2041, p3041 2.0t4240, b4860, b4420 -2.1t1040, ls1021 +2.1t1040, +2.1+ ls1021a, ls1012a, ls1043a, ls1046a + +Optional properties: + - little-endian : RCPM register block is Little Endian. Without it RCPM + will be Big Endian (default case). Example: The RCPM node for T4240: -- 1.7.1
Re: [PATCH] ocxl: Fix potential memory leak on context creation
Le 20/05/2019 à 03:45, Andrew Donnellan a écrit : On 18/5/19 12:20 am, Frederic Barrat wrote: If we couldn't fully init a context, we were leaking memory. Fixes: b9721d275cc2 ("ocxl: Allow external drivers to use OpenCAPI contexts") Signed-off-by: Frederic Barrat Acked-by: Andrew Donnellan --- drivers/misc/ocxl/context.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/misc/ocxl/context.c b/drivers/misc/ocxl/context.c index bab9c9364184..ab93156aa83e 100644 --- a/drivers/misc/ocxl/context.c +++ b/drivers/misc/ocxl/context.c @@ -22,6 +22,7 @@ int ocxl_context_alloc(struct ocxl_context **context, struct ocxl_afu *afu, afu->pasid_base + afu->pasid_max, GFP_KERNEL); if (pasid < 0) { mutex_unlock(>contexts_lock); + kfree(*context); (defensive programming: set *context = NULL so that if the caller ignores the return code we get an obvious crash) Good point. v2 on its way return pasid; } afu->pasid_count++;
RE: [PATCH V2 3/3] soc: fsl: add RCPM driver
Hi Pavel, On Monday, May 20, 2019 05:39, Pavel Machek wrote: > > Hi! > > > > + > > +struct rcpm { > > + unsigned int wakeup_cells; > > + void __iomem *ippdexpcr_base; > > + boollittle_endian; > > +}; > > Inconsistent whitespace OK, will make them aligned. > > > +static int rcpm_pm_prepare(struct device *dev) { > > + struct device_node *np = dev->of_node; > > + struct wakeup_source *ws; > > + struct rcpm *rcpm; > > + u32 value[RCPM_WAKEUP_CELL_MAX_SIZE + 1], tmp; > > + int i, ret; > > + > > + rcpm = dev_get_drvdata(dev); > > + if (!rcpm) > > + return -EINVAL; > > + > > + /* Begin with first registered wakeup source */ > > + ws = wakeup_source_get_next(NULL); > > + while (ws) { > > while (ws = wakeup_source_get_next(NULL)) ? Actually, we only pass NULL to wakeup_source_get_next() at very first call to get 1st wakeup source. Then in the while loop, we will fetch next source but not 1st, that's different. I am afraid your suggestion is not quite correct. > > > +static int rcpm_probe(struct platform_device *pdev) { > > + struct device *dev = >dev; > > + struct resource *r; > > + struct rcpm *rcpm; > > + int ret; > > Whitespace. OK, will update, thanks for your review. Regards, Ran
Re: [RFC PATCH v2 09/10] KVM: PPC: Book3S HV: Fixed for running secure guests
On Sat, May 18, 2019 at 11:25:23AM -0300, Claudio Carvalho wrote: > From: Paul Mackerras > > - Pass SRR1 in r11 for UV_RETURN because SRR0 and SRR1 get set by > the sc 2 instruction. (Note r3 - r10 potentially have hcall return > values in them.) > > - Fix kvmppc_msr_interrupt to preserve the MSR_S bit. > > Signed-off-by: Paul Mackerras > Signed-off-by: Claudio Carvalho This should be folded into the previous patch. Paul.
Re: [RFC PATCH v2 08/10] KVM: PPC: Ultravisor: Return to UV for hcalls from SVM
On Sat, May 18, 2019 at 11:25:22AM -0300, Claudio Carvalho wrote: > From: Sukadev Bhattiprolu > > All hcalls from a secure VM go to the ultravisor from where they are > reflected into the HV. When we (HV) complete processing such hcalls, > we should return to the UV rather than to the guest kernel. This paragraph in the patch description, and the comment in book3s_hv_rmhandlers.S, are confusing and possibly misleading in focussing on returns from hcalls, when the change is needed for any sort of entry to the guest from the hypervisor, whether it is a return from an hcall, a return from a hypervisor interrupt, or the first time that a guest vCPU is run. This paragraph needs to explain that to enter a secure guest, we have to go through the ultravisor, therefore we do a ucall when we are entering a secure guest. [snip] > +/* > + * The hcall we just completed was from Ultravisor. Use UV_RETURN > + * ultra call to return to the Ultravisor. Results from the hcall > + * are already in the appropriate registers (r3:12), except for > + * R6,7 which we used as temporary registers above. Restore them, > + * and set R0 to the ucall number (UV_RETURN). > + */ This needs to say something like "We are entering a secure guest, so we have to invoke the ultravisor to do that. If we are returning from a hcall, the results are already ...". > +ret_to_ultra: > + lwz r6, VCPU_CR(r4) > + mtcrr6 > + LOAD_REG_IMMEDIATE(r0, UV_RETURN) > + ld r7, VCPU_GPR(R7)(r4) > + ld r6, VCPU_GPR(R6)(r4) > + ld r4, VCPU_GPR(R4)(r4) > + sc 2 > > /* > * Enter the guest on a P9 or later system where we have exactly > -- > 2.20.1 Paul.
Re: [RFC PATCH v2 07/10] KVM: PPC: Ultravisor: Restrict LDBAR access
On Sat, May 18, 2019 at 11:25:21AM -0300, Claudio Carvalho wrote: > From: Ram Pai > > When the ultravisor firmware is available, it takes control over the > LDBAR register. In this case, thread-imc updates and save/restore > operations on the LDBAR register are handled by ultravisor. > > Signed-off-by: Ram Pai > [Restrict LDBAR access in assembly code and some in C, update the commit > message] > Signed-off-by: Claudio Carvalho Some of the places that you are patching below are explicitly only executed on POWER8, which can't have an ultravisor, and therefore the change isn't needed: > --- > arch/powerpc/kvm/book3s_hv.c | 4 +- > arch/powerpc/kvm/book3s_hv_rmhandlers.S | 2 + > arch/powerpc/perf/imc-pmu.c | 64 > arch/powerpc/platforms/powernv/idle.c| 6 +- > arch/powerpc/platforms/powernv/subcore-asm.S | 4 ++ > 5 files changed, 52 insertions(+), 28 deletions(-) > > diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c > index 0fab0a201027..81f35f955d16 100644 > --- a/arch/powerpc/kvm/book3s_hv.c > +++ b/arch/powerpc/kvm/book3s_hv.c > @@ -75,6 +75,7 @@ > #include > #include > #include > +#include > > #include "book3s.h" > > @@ -3117,7 +3118,8 @@ static noinline void kvmppc_run_core(struct > kvmppc_vcore *vc) > subcore_size = MAX_SMT_THREADS / split; > split_info.rpr = mfspr(SPRN_RPR); > split_info.pmmar = mfspr(SPRN_PMMAR); > - split_info.ldbar = mfspr(SPRN_LDBAR); > + if (!firmware_has_feature(FW_FEATURE_ULTRAVISOR)) > + split_info.ldbar = mfspr(SPRN_LDBAR); This is inside an if (is_power8) statement. > diff --git a/arch/powerpc/platforms/powernv/subcore-asm.S > b/arch/powerpc/platforms/powernv/subcore-asm.S > index 39bb24aa8f34..e4383fa5e150 100644 > --- a/arch/powerpc/platforms/powernv/subcore-asm.S > +++ b/arch/powerpc/platforms/powernv/subcore-asm.S > @@ -44,7 +44,9 @@ _GLOBAL(split_core_secondary_loop) > > real_mode: > /* Grab values from unsplit SPRs */ > +BEGIN_FW_FTR_SECTION > mfspr r6, SPRN_LDBAR > +END_FW_FTR_SECTION_IFCLR(FW_FEATURE_ULTRAVISOR) > mfspr r7, SPRN_PMMAR > mfspr r8, SPRN_PMCR > mfspr r9, SPRN_RPR > @@ -77,7 +79,9 @@ real_mode: > mtspr SPRN_HDEC, r4 > > /* Restore SPR values now we are split */ > +BEGIN_FW_FTR_SECTION > mtspr SPRN_LDBAR, r6 > +END_FW_FTR_SECTION_IFCLR(FW_FEATURE_ULTRAVISOR) Only POWER8 supports split-core mode, so we can only get here on POWER8. Paul.
[WIP RFC PATCH 6/6] fwvarfs: Add opal_secvar backend
COMPILE TESTED ONLY. mount -t fwvarfs opal_secvar /fw Signed-off-by: Daniel Axtens --- Documentation/filesystems/fwvarfs.txt | 3 + fs/fwvarfs/Kconfig| 11 ++ fs/fwvarfs/Makefile | 1 + fs/fwvarfs/fwvarfs.c | 3 + fs/fwvarfs/fwvarfs.h | 4 + fs/fwvarfs/opal_secvar.c | 218 ++ 6 files changed, 240 insertions(+) create mode 100644 fs/fwvarfs/opal_secvar.c diff --git a/Documentation/filesystems/fwvarfs.txt b/Documentation/filesystems/fwvarfs.txt index 7c1e921e5c50..3ecc4b4428a5 100644 --- a/Documentation/filesystems/fwvarfs.txt +++ b/Documentation/filesystems/fwvarfs.txt @@ -36,6 +36,9 @@ Supported backends backend. Read-only with no creation or deletion, but sufficient that efivar --print --name works the same as efivarfs. + * opal_secvar - COMPILE-TESTED ONLY implementation against PowerNV + OPAL Secure Variable storage. + Usage - diff --git a/fs/fwvarfs/Kconfig b/fs/fwvarfs/Kconfig index e4474da11dbc..cb9cbc6f8fb3 100644 --- a/fs/fwvarfs/Kconfig +++ b/fs/fwvarfs/Kconfig @@ -34,3 +34,14 @@ config FWVAR_FS_EFI_BACKEND in the same way the do with efivarfs. Say N here unless you're exploring fwvarfs. + +config FWVAR_FS_OPAL_SECVAR_BACKEND + bool "OPAL Secure Variable backend" + depends on FWVAR_FS + depends on OPAL_SECVAR + help + Include a read-only, compile-tested-only, not up-to-date + backend for OPAL Secure Variables. This is really just + designed to show how the code would work, and you should + only select Y here if you are developing OPAL secure + variables. diff --git a/fs/fwvarfs/Makefile b/fs/fwvarfs/Makefile index 2ab9dfd650ca..8d258acdfef7 100644 --- a/fs/fwvarfs/Makefile +++ b/fs/fwvarfs/Makefile @@ -7,3 +7,4 @@ obj-$(CONFIG_FWVAR_FS) += fwvarfs.o obj-$(CONFIG_FWVAR_FS_MEM_BACKEND) += mem.o obj-$(CONFIG_FWVAR_FS_EFI_BACKEND) += efi.o +obj-$(CONFIG_FWVAR_FS_OPAL_SECVAR_BACKEND) += opal_secvar.o diff --git a/fs/fwvarfs/fwvarfs.c b/fs/fwvarfs/fwvarfs.c index 643ec6585b4d..3c93b6442e95 100644 --- a/fs/fwvarfs/fwvarfs.c +++ b/fs/fwvarfs/fwvarfs.c @@ -24,6 +24,9 @@ static struct fwvarfs_backend *fwvarfs_backends[] = { #endif #ifdef CONFIG_FWVAR_FS_EFI_BACKEND _efi_backend, +#endif +#ifdef CONFIG_FWVAR_FS_OPAL_SECVAR_BACKEND + _opal_secvar_backend, #endif NULL, }; diff --git a/fs/fwvarfs/fwvarfs.h b/fs/fwvarfs/fwvarfs.h index 49bde268401f..5780046dafae 100644 --- a/fs/fwvarfs/fwvarfs.h +++ b/fs/fwvarfs/fwvarfs.h @@ -117,4 +117,8 @@ extern struct fwvarfs_backend fwvarfs_mem_backend; extern struct fwvarfs_backend fwvarfs_efi_backend; #endif +#if defined(CONFIG_FWVAR_FS_OPAL_SECVAR_BACKEND) +extern struct fwvarfs_backend fwvarfs_opal_secvar_backend; +#endif + #endif /* FWVARFS_H */ diff --git a/fs/fwvarfs/opal_secvar.c b/fs/fwvarfs/opal_secvar.c new file mode 100644 index ..4a1749317ed9 --- /dev/null +++ b/fs/fwvarfs/opal_secvar.c @@ -0,0 +1,218 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019 IBM Corporation + * Author: Daniel Axtens + * + * Loosely based on efivarfs: + * Copyright (C) 2012 Red Hat, Inc. + * Copyright (C) 2012 Jeremy Kerr + * and drivers/firmware/efi/vars.c: + * Copyright (C) 2001,2003,2004 Dell + * Copyright (C) 2004 Intel Corporation + * + * We cheat by not allowing for case-insensitivity. + */ + +#include +#include +#include +#include "fwvarfs.h" + +#include +#include +#include +#include + +#define pr_fmt(fmt)"fwvarfs: opal_secvar: " fmt + +static LIST_HEAD(opal_secvar_file_list); + +struct fwvarfs_opal_secvar_file { + struct list_head list; + u16 *name; + guid_t vendor; +}; + +// stolen from efi.h +static inline char * +guid_to_str(guid_t *guid, char *out) +{ + sprintf(out, "%pUl", guid->b); + return out; +} + +// need a forward decl to pass down to register +struct fwvarfs_backend fwvarfs_opal_secvar_backend; + + +static ssize_t fwvarfs_opal_secvar_read(void *variable, char *buf, + size_t count, loff_t off) +{ + struct fwvarfs_opal_secvar_file *file_data = variable; + unsigned long datasize = 0; + u32 attributes; + void *data; + ssize_t size = 0; + loff_t ppos = off; + int rc; + + // get size + rc = opal_get_secure_variable(file_data->name, _data->vendor, + NULL, , NULL); + if (rc != OPAL_SUCCESS && rc != OPAL_PARTIAL) + return -EIO; + + data = kmalloc(datasize + sizeof(attributes), GFP_KERNEL); + + if (!data) + return -ENOMEM; + + rc = opal_get_secure_variable(file_data->name, _data->vendor, + , , data + sizeof(attributes)); + if (rc != OPAL_SUCCESS) { + size = -EIO; + goto out_free; + } + +
[WIP RFC PATCH 5/6] powerpc/powernv: Remove EFI support for OPAL secure variables
Replace it with a generic API. Compile tested only. Signed-off-by: Daniel Axtens --- arch/powerpc/include/asm/opal-secvar.h | 58 +++ arch/powerpc/platforms/Kconfig | 3 - arch/powerpc/platforms/powernv/Kconfig | 5 +- arch/powerpc/platforms/powernv/opal-secvar.c | 104 --- 4 files changed, 83 insertions(+), 87 deletions(-) create mode 100644 arch/powerpc/include/asm/opal-secvar.h diff --git a/arch/powerpc/include/asm/opal-secvar.h b/arch/powerpc/include/asm/opal-secvar.h new file mode 100644 index ..ba9bd52138d9 --- /dev/null +++ b/arch/powerpc/include/asm/opal-secvar.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * PowerNV code for secure variables + * + * Copyright (C) 2019 IBM Corporation + * Author: Claudio Carvalho + * Author: Daniel Axtens + * + */ + +#ifndef __OPAL_SECVAR_H +#define __OPAL_SECVAR_H + +#include +#include + +#ifdef CONFIG_OPAL_SECVAR +int opal_get_secure_variable(u16 *name, guid_t *vendor, u32 *attr, +unsigned long *data_size, void *data); + +int opal_get_next_secure_variable(unsigned long *name_size, u16 *name, + guid_t *vendor); + +int opal_set_secure_variable(u16 *name, guid_t *vendor, u32 attr, +unsigned long data_size, void *data); + +int opal_query_secure_variable_info(u32 attr, u64 *storage_space, + u64 *remaining_space, + u64 *max_variable_size); +#else + +static inline int opal_get_secure_variable(u16 *name, guid_t *vendor, + u32 *attr, unsigned long *data_size, void *data) +{ + return OPAL_UNSUPPORTED; +} + +static inline int opal_get_next_secure_variable(unsigned long *name_size, + u16 *name, guid_t *vendor) +{ + return OPAL_UNSUPPORTED; +} + +static inline int opal_set_secure_variable(u16 *name, guid_t *vendor, + u32 attr, unsigned long data_size, void *data) +{ + return OPAL_UNSUPPORTED; +} + +static inline int opal_query_secure_variable_info(u32 attr, + u64 *storage_space, u64 *remaining_space, + u64 *max_variable_size) +{ + return OPAL_UNSUPPORTED; +} + +#endif +#endif diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig index 8e30510bc0c1..f3fb79fccc72 100644 --- a/arch/powerpc/platforms/Kconfig +++ b/arch/powerpc/platforms/Kconfig @@ -326,7 +326,4 @@ config XILINX_PCI bool "Xilinx PCI host bridge support" depends on PCI && XILINX_VIRTEX -config EFI - bool - endmenu diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig index 879f8e766098..a71fc5daa60a 100644 --- a/arch/powerpc/platforms/powernv/Kconfig +++ b/arch/powerpc/platforms/powernv/Kconfig @@ -52,7 +52,6 @@ config OPAL_SECVAR bool "OPAL Secure Variables" depends on PPC_POWERNV && !CPU_BIG_ENDIAN select UCS2_STRING - select EFI help - This enables the kernel to access OPAL secure variables via EFI - runtime variable services. + This enables the kernel to access OPAL secure variables via + an API. diff --git a/arch/powerpc/platforms/powernv/opal-secvar.c b/arch/powerpc/platforms/powernv/opal-secvar.c index e333828bd0bc..af753b94cceb 100644 --- a/arch/powerpc/platforms/powernv/opal-secvar.c +++ b/arch/powerpc/platforms/powernv/opal-secvar.c @@ -8,62 +8,20 @@ */ #define pr_fmt(fmt) "secvar: "fmt -#include #include #include +#include +#include static bool opal_secvar_supported; -static efi_status_t opal_to_efi_status_log(int rc, const char *func_name) -{ - efi_status_t status; - - switch (rc) { - case OPAL_EMPTY: - status = EFI_NOT_FOUND; - break; - case OPAL_HARDWARE: - status = EFI_DEVICE_ERROR; - break; - case OPAL_NO_MEM: - pr_err("%s: No space in the volatile storage\n", func_name); - status = EFI_OUT_OF_RESOURCES; - break; - case OPAL_PARAMETER: - status = EFI_INVALID_PARAMETER; - break; - case OPAL_PARTIAL: - status = EFI_BUFFER_TOO_SMALL; - break; - case OPAL_PERMISSION: - status = EFI_WRITE_PROTECTED; - break; - case OPAL_RESOURCE: - pr_err("%s: No space in the non-volatile storage\n", func_name); - status = EFI_OUT_OF_RESOURCES; - break; - case OPAL_SUCCESS: - status = EFI_SUCCESS; - break; - default: - pr_err("%s: Unknown OPAL error %d\n", func_name, rc); - status = EFI_DEVICE_ERROR; - break; - } - - return status; -} - -#define opal_to_efi_status(rc) opal_to_efi_status_log(rc, __func__)
[WIP RFC PATCH 4/6] powerpc/powernv: Add support for OPAL secure variables
From: Claudio Carvalho [dja: this is a WIP version - a new version is coming that changes the interface. I also had to renumber the opal calls to get this to apply. Basically, this is an illustration of the concept: more work would be required to get this to actually function.] The X.509 certificates trusted by the platform and other information required to secure boot the host OS kernel are wrapped in secure variables, which are controlled by OPAL. The OPAL secure variables can be handled through the following OPAL calls. OPAL_SECVAR_GET: Returns the data for a given secure variable name and vendor GUID. OPAL_SECVAR_GET_NEXT: For a given secure variable, it returns the name and vendor GUID of the next variable. OPAL_SECVAR_ENQUEUE: Enqueue the supplied secure variable update so that it can be processed by OPAL in the next boot. Variable updates cannot be be processed right away because the variable storage is write locked at runtime. OPAL_SECVAR_INFO: Returns size information about the variable. This patch adds support for OPAL secure variables by setting up the EFI runtime variable services to make OPAL calls. This patch also introduces CONFIG_OPAL_SECVAR for enabling the OPAL secure variables support in the kernel. Since CONFIG_OPAL_SECVAR selects CONFIG_EFI, it also allow us to manage the OPAL secure variables from userspace via efivarfs. Signed-off-by: Claudio Carvalho Signed-off-by: Daniel Axtens --- arch/powerpc/include/asm/opal-api.h | 6 +- arch/powerpc/include/asm/opal.h | 10 ++ arch/powerpc/platforms/Kconfig | 3 + arch/powerpc/platforms/powernv/Kconfig | 9 + arch/powerpc/platforms/powernv/Makefile | 1 + arch/powerpc/platforms/powernv/opal-call.c | 4 + arch/powerpc/platforms/powernv/opal-secvar.c | 179 +++ 7 files changed, 211 insertions(+), 1 deletion(-) create mode 100644 arch/powerpc/platforms/powernv/opal-secvar.c diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index e1577cfa7186..8054e1e983ff 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -212,7 +212,11 @@ #define OPAL_HANDLE_HMI2 166 #defineOPAL_NX_COPROC_INIT 167 #define OPAL_XIVE_GET_VP_STATE 170 -#define OPAL_LAST 170 +#define OPAL_SECVAR_GET171 +#define OPAL_SECVAR_GET_NEXT 172 +#define OPAL_SECVAR_ENQUEUE173 +#define OPAL_SECVAR_INFO 174 +#define OPAL_LAST 174 #define QUIESCE_HOLD 1 /* Spin all calls at entry */ #define QUIESCE_REJECT 2 /* Fail all calls with OPAL_BUSY */ diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index 4cc37e708bc7..4b8046caaf4f 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -394,6 +394,16 @@ void opal_powercap_init(void); void opal_psr_init(void); void opal_sensor_groups_init(void); +extern int opal_secvar_get(uint64_t name, uint64_t vendor, uint64_t attr, + uint64_t data_size, uint64_t data); +extern int opal_secvar_get_next(uint64_t name_size, uint64_t name, + uint64_t vendor); +extern int opal_secvar_enqueue(uint64_t name, uint64_t vendor, uint64_t attr, + uint64_t data_size, uint64_t data); +extern int opal_secvar_info(uint64_t attr, uint64_t storage_space, + uint64_t remaining_space, + uint64_t max_variable_size); + #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_OPAL_H */ diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig index f3fb79fccc72..8e30510bc0c1 100644 --- a/arch/powerpc/platforms/Kconfig +++ b/arch/powerpc/platforms/Kconfig @@ -326,4 +326,7 @@ config XILINX_PCI bool "Xilinx PCI host bridge support" depends on PCI && XILINX_VIRTEX +config EFI + bool + endmenu diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig index 850eee860cf2..879f8e766098 100644 --- a/arch/powerpc/platforms/powernv/Kconfig +++ b/arch/powerpc/platforms/powernv/Kconfig @@ -47,3 +47,12 @@ config PPC_VAS VAS adapters are found in POWER9 based systems. If unsure, say N. + +config OPAL_SECVAR + bool "OPAL Secure Variables" + depends on PPC_POWERNV && !CPU_BIG_ENDIAN + select UCS2_STRING + select EFI + help + This enables the kernel to access OPAL secure variables via EFI + runtime variable services. diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile index da2e99efbd04..1511d836fd19 100644 --- a/arch/powerpc/platforms/powernv/Makefile +++ b/arch/powerpc/platforms/powernv/Makefile @@
[WIP RFC PATCH 3/6] fwvarfs: efi backend
Add a read-only EFI backend. This does not rely on efivarfs at all (although it does borrow heavily from the code). It only supports reading the variables, but it supports the same format as efivarfs so tools like efivar continue to work if you mount this filesystem in the same place. Two small quirks: - efivarfs (at least as configured on Ubuntu) allows users to access the files, here only root can. - efivarfs makes GUID comparison case-insensitive, this does not. Signed-off-by: Daniel Axtens --- Documentation/filesystems/fwvarfs.txt | 5 + fs/fwvarfs/Kconfig| 11 ++ fs/fwvarfs/Makefile | 1 + fs/fwvarfs/efi.c | 177 ++ fs/fwvarfs/fwvarfs.c | 4 +- fs/fwvarfs/fwvarfs.h | 4 + 6 files changed, 201 insertions(+), 1 deletion(-) create mode 100644 fs/fwvarfs/efi.c diff --git a/Documentation/filesystems/fwvarfs.txt b/Documentation/filesystems/fwvarfs.txt index bf1bccba6ab9..7c1e921e5c50 100644 --- a/Documentation/filesystems/fwvarfs.txt +++ b/Documentation/filesystems/fwvarfs.txt @@ -32,6 +32,10 @@ Supported backends operations. Files created persist across mount/unmount but as no hardware is involved they do not persist across reboots. + * efi - a partial reimplementation of efivarfs against the fwvarfs + backend. Read-only with no creation or deletion, but sufficient that + efivar --print --name works the same as efivarfs. + Usage - @@ -40,6 +44,7 @@ mount -t fwvarfs For example: mount -t fwvarfs mem /fw/mem/ +mount -t fwvarfs efi /sys/firmware/efi/efivars API --- diff --git a/fs/fwvarfs/Kconfig b/fs/fwvarfs/Kconfig index 62a47cddd4b5..e4474da11dbc 100644 --- a/fs/fwvarfs/Kconfig +++ b/fs/fwvarfs/Kconfig @@ -23,3 +23,14 @@ config FWVAR_FS_MEM_BACKEND demonstration of fwvarfs. You can safely say N here unless you're exploring fwvarfs. + +config FWVAR_FS_EFI_BACKEND + bool "EFI backend" + depends on FWVAR_FS + help + Include a read-only EFI backend, largely cribbed from + efivarfs. This is handy for demonstrating that the same + userspace tools can read from EFI variables over fwvarfs + in the same way the do with efivarfs. + + Say N here unless you're exploring fwvarfs. diff --git a/fs/fwvarfs/Makefile b/fs/fwvarfs/Makefile index f1585baccabe..2ab9dfd650ca 100644 --- a/fs/fwvarfs/Makefile +++ b/fs/fwvarfs/Makefile @@ -6,3 +6,4 @@ obj-$(CONFIG_FWVAR_FS) += fwvarfs.o obj-$(CONFIG_FWVAR_FS_MEM_BACKEND) += mem.o +obj-$(CONFIG_FWVAR_FS_EFI_BACKEND) += efi.o diff --git a/fs/fwvarfs/efi.c b/fs/fwvarfs/efi.c new file mode 100644 index ..7aa5c186d0c9 --- /dev/null +++ b/fs/fwvarfs/efi.c @@ -0,0 +1,177 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2019 IBM Corporation + * Author: Daniel Axtens + * + * Based on efivarfs: + * Copyright (C) 2012 Red Hat, Inc. + * Copyright (C) 2012 Jeremy Kerr + * + * We cheat by not allowing for case-insensitivity. + */ + +#include +#include +#include +#include "fwvarfs.h" + +#include +#include +#include + +static LIST_HEAD(efivar_list); +static LIST_HEAD(efivar_file_list); + +struct fwvarfs_efi_file { + bool is_removable; + struct list_head list; + struct efivar_entry *entry; +}; + +// need a forward decl to pass down to register +struct fwvarfs_backend fwvarfs_efi_backend; + + +static ssize_t fwvarfs_efi_read(void *variable, char *buf, + size_t count, loff_t off) +{ + struct fwvarfs_efi_file *file_data = variable; + struct efivar_entry *var = file_data->entry; + unsigned long datasize = 0; + u32 attributes; + void *data; + ssize_t size = 0; + loff_t ppos = off; + int err; + + err = efivar_entry_size(var, ); + + /* +* efivarfs represents uncommitted variables with +* zero-length files. Reading them should return EOF. +*/ + if (err == -ENOENT) + return 0; + else if (err) + return err; + + data = kmalloc(datasize + sizeof(attributes), GFP_KERNEL); + + if (!data) + return -ENOMEM; + + size = efivar_entry_get(var, , , + data + sizeof(attributes)); + if (size) + goto out_free; + + memcpy(data, , sizeof(attributes)); + size = memory_read_from_buffer(buf, count, , + data, datasize + sizeof(attributes)); +out_free: + kfree(data); + + return size; +} + +static int fwvarfs_efi_callback(efi_char16_t *name16, efi_guid_t vendor, + unsigned long name_size, void *data) +{ + struct efivar_entry *entry; + struct fwvarfs_efi_file *file_data; + unsigned long size = 0; + char *name; + int len; + int err = -ENOMEM; + +
[WIP RFC PATCH 2/6] fwvarfs: a generic firmware variable filesystem
Sometimes it is helpful to be able to access firmware variables as file, like efivarfs, but not all firmware is EFI. Create a framework that allows generic access to firmware variables exposed by a implementations of a simple backend API. Add a demonstration memory-based backend. Signed-off-by: Daniel Axtens --- Documentation/filesystems/fwvarfs.txt | 146 + fs/Kconfig| 1 + fs/Makefile | 1 + fs/fwvarfs/Kconfig| 25 +++ fs/fwvarfs/Makefile | 8 + fs/fwvarfs/fwvarfs.c | 289 ++ fs/fwvarfs/fwvarfs.h | 116 +++ fs/fwvarfs/mem.c | 113 ++ fs/kernfs/dir.c | 1 - include/uapi/linux/magic.h| 1 + 10 files changed, 700 insertions(+), 1 deletion(-) create mode 100644 Documentation/filesystems/fwvarfs.txt create mode 100644 fs/fwvarfs/Kconfig create mode 100644 fs/fwvarfs/Makefile create mode 100644 fs/fwvarfs/fwvarfs.c create mode 100644 fs/fwvarfs/fwvarfs.h create mode 100644 fs/fwvarfs/mem.c diff --git a/Documentation/filesystems/fwvarfs.txt b/Documentation/filesystems/fwvarfs.txt new file mode 100644 index ..bf1bccba6ab9 --- /dev/null +++ b/Documentation/filesystems/fwvarfs.txt @@ -0,0 +1,146 @@ +fwvarfs +=== + +fwvarfs is a generic firmware variable file-system. A platform +provides a backend implementing a few very simple callbacks, and +fwvarfs handles all the details required to present the variables as a +filesystem. + +The minimum functionality for a backend is the ability to enumerate +existing variables. Backends can optionally also allow: + - reading of variables + - writing of variables + - creation of variables + - deletion of variables + +Key assumptions +--- + + * Variables for each backend live in a single, flat directory - + there's no concept of subdirectories. + + * Files are created with mode 600 if the backend provides a write + hook, and 400 otherwise, and are owned by root:root. + + * The set of variables stored can be determined at boot time, and + nothing outside of fwvarfs creates new variables after boot. + +Supported backends +-- + + * mem - a memory-backed example filesystem supporting all + operations. Files created persist across mount/unmount but as no + hardware is involved they do not persist across reboots. + +Usage +- + +mount -t fwvarfs + +For example: + +mount -t fwvarfs mem /fw/mem/ + +API +--- + +A backend is installed by creating a fwvarfs_backend struct, +containing the name of the backend and various callbacks. The backend +must be registered by adding it to the list at the top of +fs/fwvarfs/fwvarfs.c + +The fwvarfs infrastructure provides the following function to backends: + + int fwvarfs_register_var(struct fwvarfs_backend *backend, const char *name, void *variable, size_t size); + + Register a variable with fwvarfs to allow it to be seen by users. + + backend: the backend to which to add this variable + + name: the name of file representing the variable. Must be a valid +filename, so no nulls or slashes. + + variable: data private to the backend representing the variable - +will be passed back to every callback + + size: the initial size of the variable + + +The backend must then provide the following functions: + +int (*enumerate)(void); + + Mandatory and called at init time, a backend must call + fwvarfs_register_var for all variables it wants to expose to + the user. + +void (*destroy)(void *variable); + + Mandatory if you provide a create or unlink hook, and may + become mandatory in the future for cleanup. + + Free backend data associated with variable. It will not be + referenced after this point by fwvarfs. + + +ssize_t (*read)(void *variable, char *dest, size_t bytes, loff_t off); + + Read from variable into the a kernel buffer. Similar semantics + to a usual read operation, except that off is not a pointer + (unlike the usual ppos). + + variable: the variable to read + dest: kernel buffer to read into + bytes: maximum number of bytes to read + off: offset to read from + + Returns the number of bytes read or an error. + If this hook is not provided, all reads will fail with -EPERM. + +ssize_t (*write)(void *variable, const char *src, size_t bytes, loff_t off); + + Write into the variable from the given kernel buffer. + + variable: the variable to write + src: kernel buffer with contents + bytes: write at most this many bytes + off: offset into the file to write at. + + Returns the number of bytes written or an error. + If this hook is not provided, all writes will fail with -EPERM. + + +void* (*create)(const char *name); + + Create a variable with the
[WIP RFC PATCH 1/6] kernfs: add create() and unlink() hooks
I'm building a generic firmware variable filesystem on top of kernfs and I'd like to be able to create and unlink files. The hooks are fairly straightforward. create() returns a kernfs_node*, which is safe with regards to cleanup on error paths, because there is no way that things can fail after that point in the current implementation. However, currently O_EXCL is not implemented and that may create failure paths, in which case we may need to revisit this later. Signed-off-by: Daniel Axtens --- fs/kernfs/dir.c| 55 ++ include/linux/kernfs.h | 3 +++ 2 files changed, 58 insertions(+) diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 016ba88f7335..74fe51dbd027 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -1175,6 +1175,59 @@ static int kernfs_iop_rename(struct inode *old_dir, struct dentry *old_dentry, return ret; } +static int kernfs_iop_create(struct inode *dir, struct dentry *dentry, +umode_t mode, bool excl) +{ + struct kernfs_node *parent = dir->i_private; + struct kernfs_node *kn; + struct kernfs_syscall_ops *scops = kernfs_root(parent)->syscall_ops; + + if (!scops || !scops->create) + return -EPERM; + + if (!kernfs_get_active(parent)) + return -ENODEV; + + // TODO: add some locking to ensure that scops->create + // is called only once, and possibly to handle the O_EXCL case + WARN_ONCE(excl, "excl unimplemented"); + + kn = scops->create(parent, dentry->d_name.name, mode); + + if (!kn) + return -EPERM; + + if (IS_ERR(kn)) + return PTR_ERR(kn); + + d_instantiate(dentry, kernfs_get_inode(dir->i_sb, kn)); + + return 0; +} + +static int kernfs_iop_unlink(struct inode *dir, struct dentry *dentry) +{ + struct kernfs_node *parent = dir->i_private; + struct kernfs_node *kn = d_inode(dentry)->i_private; + struct kernfs_syscall_ops *scops = kernfs_root(parent)->syscall_ops; + int ret; + + + if (!scops || !scops->unlink) + return -EPERM; + + if (!kernfs_get_active(parent)) + return -ENODEV; + + ret = scops->unlink(kn); + if (ret) + return ret; + + drop_nlink(d_inode(dentry)); + dput(dentry); + return 0; +}; + const struct inode_operations kernfs_dir_iops = { .lookup = kernfs_iop_lookup, .permission = kernfs_iop_permission, @@ -1185,6 +1238,8 @@ const struct inode_operations kernfs_dir_iops = { .mkdir = kernfs_iop_mkdir, .rmdir = kernfs_iop_rmdir, .rename = kernfs_iop_rename, + .create = kernfs_iop_create, + .unlink = kernfs_iop_unlink, }; static struct kernfs_node *kernfs_leftmost_descendant(struct kernfs_node *pos) diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h index 2bf477f86eb1..282b96acbd7e 100644 --- a/include/linux/kernfs.h +++ b/include/linux/kernfs.h @@ -179,6 +179,9 @@ struct kernfs_syscall_ops { const char *new_name); int (*show_path)(struct seq_file *sf, struct kernfs_node *kn, struct kernfs_root *root); + struct kernfs_node* (*create)(struct kernfs_node *parent, + const char *name, umode_t mode); + int (*unlink)(struct kernfs_node *kn); }; struct kernfs_root { -- 2.19.1
[WIP RFC PATCH 0/6] Generic Firmware Variable Filesystem
Hi all, As PowerNV moves towards secure boot, we need a place to put secure variables. One option that has been canvassed is to make our secure variables look like EFI variables. This is an early sketch of another approach where we create a generic firmware variable file system, fwvarfs, and an OPAL Secure Variable backend for it. In short, platforms provide a simple backend that can interface with the hardware, and fwvarfs deals with translating that into a filesystem that you can use. Almost all of the hard work is done by kernfs: fwvarfs provides a pretty thin layer on top of that to make backends a simple as possible. Behaviour and the API is documented in Documentation/filesystems/fwvarfs.txt To demonstrate the concept, a fully functional memory-based backend is provided, and a read-only but userspace-compatible EFI backend. For OPAL secure variables, I have taken Claudio's commit, tweaked it to apply to linux-next, replaced all the EFI support with a generic API, and then written a backend against that. There's a coming version from Claudio that moves the opal calls towards a simple key/value interface rather than (name, vendor) pairs - I haven't waited for that: this is really just to demonstrate that it could be done rather than an attempt to get mergable code. It is also compile tested only as I haven't yet set myself up with a test machine. The patches are a bit rough, and there are a number of outstanding TODOs sprinkled in everywhere. The idea is just to do a proof of concept to inform our discussions: - Is this the sort of approach you'd like (generic vs specific)? - Does the backend API make sense? - Is the use of kernfs the correct decision, or is it potentially too limiting? (e.g. no ability to do case-insensitivity like efivarfs) - Is assuming flat fwvars correct or is there a firmware with a hierarchical structure? Regards, Daniel Claudio Carvalho (1): powerpc/powernv: Add support for OPAL secure variables Daniel Axtens (5): kernfs: add create() and unlink() hooks fwvarfs: a generic firmware variable filesystem fwvarfs: efi backend powerpc/powernv: Remove EFI support for OPAL secure variables fwvarfs: Add opal_secvar backend Documentation/filesystems/fwvarfs.txt| 154 ++ arch/powerpc/include/asm/opal-api.h | 6 +- arch/powerpc/include/asm/opal-secvar.h | 58 arch/powerpc/include/asm/opal.h | 10 + arch/powerpc/platforms/powernv/Kconfig | 8 + arch/powerpc/platforms/powernv/Makefile | 1 + arch/powerpc/platforms/powernv/opal-call.c | 4 + arch/powerpc/platforms/powernv/opal-secvar.c | 121 fs/Kconfig | 1 + fs/Makefile | 1 + fs/fwvarfs/Kconfig | 47 +++ fs/fwvarfs/Makefile | 10 + fs/fwvarfs/efi.c | 177 +++ fs/fwvarfs/fwvarfs.c | 294 +++ fs/fwvarfs/fwvarfs.h | 124 fs/fwvarfs/mem.c | 113 +++ fs/fwvarfs/opal_secvar.c | 218 ++ fs/kernfs/dir.c | 54 include/linux/kernfs.h | 3 + include/uapi/linux/magic.h | 1 + 20 files changed, 1404 insertions(+), 1 deletion(-) create mode 100644 Documentation/filesystems/fwvarfs.txt create mode 100644 arch/powerpc/include/asm/opal-secvar.h create mode 100644 arch/powerpc/platforms/powernv/opal-secvar.c create mode 100644 fs/fwvarfs/Kconfig create mode 100644 fs/fwvarfs/Makefile create mode 100644 fs/fwvarfs/efi.c create mode 100644 fs/fwvarfs/fwvarfs.c create mode 100644 fs/fwvarfs/fwvarfs.h create mode 100644 fs/fwvarfs/mem.c create mode 100644 fs/fwvarfs/opal_secvar.c -- 2.19.1
Re: [PATCH] mm: add account_locked_vm utility function
On 04/05/2019 06:16, Daniel Jordan wrote: > locked_vm accounting is done roughly the same way in five places, so > unify them in a helper. Standardize the debug prints, which vary > slightly. And I rather liked that prints were different and tell precisely which one of three each printk is. I commented below but in general this seems working. Tested-by: Alexey Kardashevskiy > Error codes stay the same, so user-visible behavior does too. > > Signed-off-by: Daniel Jordan > Cc: Alan Tull > Cc: Alexey Kardashevskiy > Cc: Alex Williamson > Cc: Andrew Morton > Cc: Benjamin Herrenschmidt > Cc: Christoph Lameter > Cc: Christophe Leroy > Cc: Davidlohr Bueso > Cc: Jason Gunthorpe > Cc: Mark Rutland > Cc: Michael Ellerman > Cc: Moritz Fischer > Cc: Paul Mackerras > Cc: Steve Sistare > Cc: Wu Hao > Cc: linux...@kvack.org > Cc: k...@vger.kernel.org > Cc: kvm-...@vger.kernel.org > Cc: linuxppc-dev@lists.ozlabs.org > Cc: linux-f...@vger.kernel.org > Cc: linux-ker...@vger.kernel.org > --- > > Based on v5.1-rc7. Tested with the vfio type1 driver. Any feedback > welcome. > > Andrew, this one patch replaces these six from [1]: > > mm-change-locked_vms-type-from-unsigned-long-to-atomic64_t.patch > vfio-type1-drop-mmap_sem-now-that-locked_vm-is-atomic.patch > vfio-spapr_tce-drop-mmap_sem-now-that-locked_vm-is-atomic.patch > fpga-dlf-afu-drop-mmap_sem-now-that-locked_vm-is-atomic.patch > kvm-book3s-drop-mmap_sem-now-that-locked_vm-is-atomic.patch > powerpc-mmu-drop-mmap_sem-now-that-locked_vm-is-atomic.patch > > That series converts locked_vm to an atomic, but on closer inspection causes > at > least one accounting race in mremap, and fixing it just for this type > conversion came with too much ugly in the core mm to justify, especially when > the right long-term fix is making these drivers use pinned_vm instead. > > Christophe's suggestion of cmpxchg[2] does prevent the races he > mentioned for pinned_vm, but others would still remain. In perf_mmap > and the hfi1 driver, pinned_vm is checked against the rlimit racily and > then later increased when the pinned_vm originally read may have gone > stale. Any fixes for that, that I could think of, seem about as good as > what's there now, so I left it. I have a patch that uses cmpxchg with > pinned_vm if others feel strongly that the aforementioned races should > be fixed. > > Daniel > > [1] > https://lore.kernel.org/linux-mm/20190402204158.27582-1-daniel.m.jor...@oracle.com/ > [2] > https://lore.kernel.org/linux-mm/964bd5b0-f1e5-7bf0-5c58-18e75c550...@c-s.fr/ > > arch/powerpc/kvm/book3s_64_vio.c| 44 +++- > arch/powerpc/mm/mmu_context_iommu.c | 41 +++--- > drivers/fpga/dfl-afu-dma-region.c | 53 +++-- > drivers/vfio/vfio_iommu_spapr_tce.c | 52 +--- > drivers/vfio/vfio_iommu_type1.c | 23 - > include/linux/mm.h | 19 +++ > mm/util.c | 48 ++ > 7 files changed, 94 insertions(+), 186 deletions(-) > > diff --git a/arch/powerpc/kvm/book3s_64_vio.c > b/arch/powerpc/kvm/book3s_64_vio.c > index f02b04973710..f7d37fa6003a 100644 > --- a/arch/powerpc/kvm/book3s_64_vio.c > +++ b/arch/powerpc/kvm/book3s_64_vio.c > @@ -30,6 +30,7 @@ > #include > #include > #include > +#include > > #include > #include > @@ -56,43 +57,6 @@ static unsigned long kvmppc_stt_pages(unsigned long > tce_pages) > return tce_pages + ALIGN(stt_bytes, PAGE_SIZE) / PAGE_SIZE; > } > > -static long kvmppc_account_memlimit(unsigned long stt_pages, bool inc) > -{ > - long ret = 0; > - > - if (!current || !current->mm) > - return ret; /* process exited */ > - > - down_write(>mm->mmap_sem); > - > - if (inc) { > - unsigned long locked, lock_limit; > - > - locked = current->mm->locked_vm + stt_pages; > - lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; > - if (locked > lock_limit && !capable(CAP_IPC_LOCK)) > - ret = -ENOMEM; > - else > - current->mm->locked_vm += stt_pages; > - } else { > - if (WARN_ON_ONCE(stt_pages > current->mm->locked_vm)) > - stt_pages = current->mm->locked_vm; > - > - current->mm->locked_vm -= stt_pages; > - } > - > - pr_debug("[%d] RLIMIT_MEMLOCK KVM %c%ld %ld/%ld%s\n", current->pid, > - inc ? '+' : '-', > - stt_pages << PAGE_SHIFT, > - current->mm->locked_vm << PAGE_SHIFT, > - rlimit(RLIMIT_MEMLOCK), > - ret ? " - exceeded" : ""); > - > - up_write(>mm->mmap_sem); > - > - return ret; > -} > - > static void kvm_spapr_tce_iommu_table_free(struct rcu_head *head) > { > struct kvmppc_spapr_tce_iommu_table *stit = container_of(head, > @@
[PATCH 5/5] asm-generic: remove ptrace.h
No one is using this header anymore. Signed-off-by: Christoph Hellwig Acked-by: Arnd Bergmann --- MAINTAINERS| 1 - arch/mips/include/asm/ptrace.h | 5 --- include/asm-generic/ptrace.h | 74 -- 3 files changed, 80 deletions(-) delete mode 100644 include/asm-generic/ptrace.h diff --git a/MAINTAINERS b/MAINTAINERS index 5cfbea4ce575..f3392488ffaf 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -12769,7 +12769,6 @@ F: include/linux/regset.h F: include/linux/tracehook.h F: include/uapi/linux/ptrace.h F: include/uapi/linux/ptrace.h -F: include/asm-generic/ptrace.h F: kernel/ptrace.c F: arch/*/ptrace*.c F: arch/*/*/ptrace*.c diff --git a/arch/mips/include/asm/ptrace.h b/arch/mips/include/asm/ptrace.h index b6578611dddb..1e76774b36dd 100644 --- a/arch/mips/include/asm/ptrace.h +++ b/arch/mips/include/asm/ptrace.h @@ -56,11 +56,6 @@ static inline unsigned long kernel_stack_pointer(struct pt_regs *regs) return regs->regs[31]; } -/* - * Don't use asm-generic/ptrace.h it defines FP accessors that don't make - * sense on MIPS. We rather want an error if they get invoked. - */ - static inline void instruction_pointer_set(struct pt_regs *regs, unsigned long val) { diff --git a/include/asm-generic/ptrace.h b/include/asm-generic/ptrace.h deleted file mode 100644 index 82e674f6b337.. --- a/include/asm-generic/ptrace.h +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Common low level (register) ptrace helpers - * - * Copyright 2004-2011 Analog Devices Inc. - * - * Licensed under the GPL-2 or later. - */ - -#ifndef __ASM_GENERIC_PTRACE_H__ -#define __ASM_GENERIC_PTRACE_H__ - -#ifndef __ASSEMBLY__ - -/* Helpers for working with the instruction pointer */ -#ifndef GET_IP -#define GET_IP(regs) ((regs)->pc) -#endif -#ifndef SET_IP -#define SET_IP(regs, val) (GET_IP(regs) = (val)) -#endif - -static inline unsigned long instruction_pointer(struct pt_regs *regs) -{ - return GET_IP(regs); -} -static inline void instruction_pointer_set(struct pt_regs *regs, - unsigned long val) -{ - SET_IP(regs, val); -} - -#ifndef profile_pc -#define profile_pc(regs) instruction_pointer(regs) -#endif - -/* Helpers for working with the user stack pointer */ -#ifndef GET_USP -#define GET_USP(regs) ((regs)->usp) -#endif -#ifndef SET_USP -#define SET_USP(regs, val) (GET_USP(regs) = (val)) -#endif - -static inline unsigned long user_stack_pointer(struct pt_regs *regs) -{ - return GET_USP(regs); -} -static inline void user_stack_pointer_set(struct pt_regs *regs, - unsigned long val) -{ - SET_USP(regs, val); -} - -/* Helpers for working with the frame pointer */ -#ifndef GET_FP -#define GET_FP(regs) ((regs)->fp) -#endif -#ifndef SET_FP -#define SET_FP(regs, val) (GET_FP(regs) = (val)) -#endif - -static inline unsigned long frame_pointer(struct pt_regs *regs) -{ - return GET_FP(regs); -} -static inline void frame_pointer_set(struct pt_regs *regs, - unsigned long val) -{ - SET_FP(regs, val); -} - -#endif /* __ASSEMBLY__ */ - -#endif -- 2.20.1
[PATCH 4/5] x86: don't use asm-generic/ptrace.h
Doing the indirection through macros for the regs accessors just makes them harder to read, so implement the helpers directly. Note that only the helpers actually used are implemented now. Signed-off-by: Christoph Hellwig Acked-by: Ingo Molnar --- arch/x86/include/asm/ptrace.h | 30 +- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 8a7fc0cca2d1..e22816e865ca 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -98,7 +98,6 @@ struct cpuinfo_x86; struct task_struct; extern unsigned long profile_pc(struct pt_regs *regs); -#define profile_pc profile_pc extern unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs); @@ -175,11 +174,32 @@ static inline unsigned long kernel_stack_pointer(struct pt_regs *regs) } #endif -#define GET_IP(regs) ((regs)->ip) -#define GET_FP(regs) ((regs)->bp) -#define GET_USP(regs) ((regs)->sp) +static inline unsigned long instruction_pointer(struct pt_regs *regs) +{ + return regs->ip; +} + +static inline void instruction_pointer_set(struct pt_regs *regs, + unsigned long val) +{ + regs->ip = val; +} + +static inline unsigned long frame_pointer(struct pt_regs *regs) +{ + return regs->bp; +} -#include +static inline unsigned long user_stack_pointer(struct pt_regs *regs) +{ + return regs->sp; +} + +static inline void user_stack_pointer_set(struct pt_regs *regs, + unsigned long val) +{ + regs->sp = val; +} /* Query offset/name of register from its name/offset */ extern int regs_query_register_offset(const char *name); -- 2.20.1