[PATCH kernel] powerpc/powernv/npu: Remove unused headers and a macro.
The macro and few headers are not used so remove them. Signed-off-by: Alexey Kardashevskiy --- arch/powerpc/platforms/powernv/npu-dma.c | 14 -- 1 file changed, 14 deletions(-) diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index 8006c54..3a5c4ed 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -9,32 +9,18 @@ * License as published by the Free Software Foundation. */ -#include #include #include #include -#include #include #include -#include -#include #include -#include #include -#include -#include -#include -#include -#include -#include #include -#include "powernv.h" #include "pci.h" -#define npu_to_phb(x) container_of(x, struct pnv_phb, npu) - /* * spinlock to protect initialisation of an npu_context for a particular * mm_struct. -- 2.11.0
[PATCH kernel] KVM: PPC: Optimize clearing TCEs for sparse tables
The powernv platform maintains 2 TCE tables for VFIO - a hardware TCE table and a table with userspace addresses. These tables are radix trees, we allocate indirect levels when they are written to. Since the memory allocation is problematic in real mode, we have 2 accessors to the entries: - for virtual mode: it allocates the memory and it is always expected to return non-NULL; - for real mode: it does not allocate and can return NULL. Also, DMA windows can span up to 55 bits of the address space and since we never have this much RAM, such windows are sparse. However currently the SPAPR TCE IOMMU driver walks through all TCEs to unpin DMA memory. Since we maintain a userspace addresses table for VFIO which is a mirror of the hardware table, we can use it to know which parts of the DMA window have not been mapped and skip these so does this patch. The bare metal systems do not have this problem as they use a bypass mode of a PHB which maps RAM directly. This helps a lot with sparse DMA windows, reducing the shutdown time from about 3 minutes per 1 billion TCEs to a few seconds for 32GB sparse guest. Just skipping the last level seems to be good enough. As non-allocating accessor is used now in virtual mode as well, rename it from IOMMU_TABLE_USERSPACE_ENTRY_RM (real mode) to _RO (read only). 
Signed-off-by: Alexey Kardashevskiy --- arch/powerpc/include/asm/iommu.h| 2 +- arch/powerpc/kvm/book3s_64_vio.c| 5 ++--- arch/powerpc/kvm/book3s_64_vio_hv.c | 6 +++--- drivers/vfio/vfio_iommu_spapr_tce.c | 22 -- 4 files changed, 26 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 3d4b88c..35db0cb 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -126,7 +126,7 @@ struct iommu_table { int it_nid; }; -#define IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry) \ +#define IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry) \ ((tbl)->it_ops->useraddrptr((tbl), (entry), false)) #define IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry) \ ((tbl)->it_ops->useraddrptr((tbl), (entry), true)) diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index c0c64d1..62a8d03 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -410,11 +410,10 @@ static long kvmppc_tce_iommu_mapped_dec(struct kvm *kvm, { struct mm_iommu_table_group_mem_t *mem = NULL; const unsigned long pgsize = 1ULL << tbl->it_page_shift; - __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); + __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry); if (!pua) - /* it_userspace allocation might be delayed */ - return H_TOO_HARD; + return H_SUCCESS; mem = mm_iommu_lookup(kvm->mm, be64_to_cpu(*pua), pgsize); if (!mem) diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index 389dac1..583031d 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -212,7 +212,7 @@ static long iommu_tce_xchg_rm(struct mm_struct *mm, struct iommu_table *tbl, if (!ret && ((*direction == DMA_FROM_DEVICE) || (*direction == DMA_BIDIRECTIONAL))) { - __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry); + __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry); /* * kvmppc_rm_tce_iommu_do_map() updates the UA cache after * 
calling this so we still get here a valid UA. @@ -238,7 +238,7 @@ static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm, { struct mm_iommu_table_group_mem_t *mem = NULL; const unsigned long pgsize = 1ULL << tbl->it_page_shift; - __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry); + __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry); if (!pua) /* it_userspace allocation might be delayed */ @@ -302,7 +302,7 @@ static long kvmppc_rm_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl, { long ret; unsigned long hpa = 0; - __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry); + __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry); struct mm_iommu_table_group_mem_t *mem; if (!pua) diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 96721b1..b1a8ab3 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -444,7 +444,7 @@ static void tce_iommu_unuse_page_v2(struct tce_container *container, struct mm_iommu_table_group_mem_t *mem = NULL; int ret; unsigned long hpa = 0; - __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); + __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry); if (!pua) ret
[PATCH kernel] powerpc/powernv/ioda: Allocate indirect TCE levels of cached userspace addresses on demand
The powernv platform maintains 2 TCE tables for VFIO - a hardware TCE table and a table with userspace addresses; the latter is used for marking pages dirty when corresponding TCEs are unmapped from the hardware table. a68bd1267b72 ("powerpc/powernv/ioda: Allocate indirect TCE levels on demand") enabled on-demand allocation of the hardware table, however it missed the other table so it has still been fully allocated at the boot time. This fixes the issue by allocating a single level, just like we do for the hardware table. Fixes: a68bd1267b72 ("powerpc/powernv/ioda: Allocate indirect TCE levels on demand") Signed-off-by: Alexey Kardashevskiy --- arch/powerpc/platforms/powernv/pci-ioda-tce.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c index fe96910..7639b21 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c +++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c @@ -299,7 +299,7 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, if (alloc_userspace_copy) { offset = 0; uas = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift, - levels, tce_table_size, &offset, + tmplevels, tce_table_size, &offset, &total_allocated_uas); if (!uas) goto free_tces_exit; -- 2.11.0
[PATCH kernel] cxl: Remove unused include
The included opal.h gives a wrong idea that CXL makes PPC OPAL calls while it does not so let's remove it. Signed-off-by: Alexey Kardashevskiy --- drivers/misc/cxl/pci.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c index b66d832..8cbcbb7 100644 --- a/drivers/misc/cxl/pci.c +++ b/drivers/misc/cxl/pci.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include -- 2.11.0
[PATCH] selftests/powerpc: Fix Makefiles for headers_install change
Commit b2d35fa5fc80 ("selftests: add headers_install to lib.mk") introduced a requirement that Makefiles more than one level below the selftests directory need to define top_srcdir, but it didn't update any of the powerpc Makefiles. This broke building all the powerpc selftests with eg: make[1]: Entering directory '/src/linux/tools/testing/selftests/powerpc' BUILD_TARGET=/src/linux/tools/testing/selftests/powerpc/alignment; mkdir -p $BUILD_TARGET; make OUTPUT=$BUILD_TARGET -k -C alignment all make[2]: Entering directory '/src/linux/tools/testing/selftests/powerpc/alignment' ../../lib.mk:20: ../../../../scripts/subarch.include: No such file or directory make[2]: *** No rule to make target '../../../../scripts/subarch.include'. make[2]: Failed to remake makefile '../../../../scripts/subarch.include'. Makefile:38: recipe for target 'alignment' failed Fix it by setting top_srcdir in the affected Makefiles. Fixes: b2d35fa5fc80 ("selftests: add headers_install to lib.mk") Signed-off-by: Michael Ellerman --- tools/testing/selftests/powerpc/alignment/Makefile | 1 + tools/testing/selftests/powerpc/benchmarks/Makefile| 1 + tools/testing/selftests/powerpc/cache_shape/Makefile | 1 + tools/testing/selftests/powerpc/copyloops/Makefile | 1 + tools/testing/selftests/powerpc/dscr/Makefile | 1 + tools/testing/selftests/powerpc/math/Makefile | 1 + tools/testing/selftests/powerpc/mm/Makefile| 1 + tools/testing/selftests/powerpc/pmu/Makefile | 1 + tools/testing/selftests/powerpc/pmu/ebb/Makefile | 1 + tools/testing/selftests/powerpc/primitives/Makefile| 1 + tools/testing/selftests/powerpc/ptrace/Makefile| 1 + tools/testing/selftests/powerpc/signal/Makefile| 1 + tools/testing/selftests/powerpc/stringloops/Makefile | 1 + tools/testing/selftests/powerpc/switch_endian/Makefile | 1 + tools/testing/selftests/powerpc/syscalls/Makefile | 1 + tools/testing/selftests/powerpc/tm/Makefile| 1 + tools/testing/selftests/powerpc/vphn/Makefile | 1 + 17 files changed, 17 insertions(+) diff --git 
a/tools/testing/selftests/powerpc/alignment/Makefile b/tools/testing/selftests/powerpc/alignment/Makefile index 93baacab7693..d056486f49de 100644 --- a/tools/testing/selftests/powerpc/alignment/Makefile +++ b/tools/testing/selftests/powerpc/alignment/Makefile @@ -1,5 +1,6 @@ TEST_GEN_PROGS := copy_first_unaligned alignment_handler +top_srcdir = ../../../../.. include ../../lib.mk $(TEST_GEN_PROGS): ../harness.c ../utils.c diff --git a/tools/testing/selftests/powerpc/benchmarks/Makefile b/tools/testing/selftests/powerpc/benchmarks/Makefile index b4d7432a0ecd..d40300a65b42 100644 --- a/tools/testing/selftests/powerpc/benchmarks/Makefile +++ b/tools/testing/selftests/powerpc/benchmarks/Makefile @@ -4,6 +4,7 @@ TEST_GEN_FILES := exec_target CFLAGS += -O2 +top_srcdir = ../../../../.. include ../../lib.mk $(TEST_GEN_PROGS): ../harness.c diff --git a/tools/testing/selftests/powerpc/cache_shape/Makefile b/tools/testing/selftests/powerpc/cache_shape/Makefile index 1be547434a49..ede4d3dae750 100644 --- a/tools/testing/selftests/powerpc/cache_shape/Makefile +++ b/tools/testing/selftests/powerpc/cache_shape/Makefile @@ -5,6 +5,7 @@ all: $(TEST_PROGS) $(TEST_PROGS): ../harness.c ../utils.c +top_srcdir = ../../../../.. include ../../lib.mk clean: diff --git a/tools/testing/selftests/powerpc/copyloops/Makefile b/tools/testing/selftests/powerpc/copyloops/Makefile index 1cf89a34d97c..44574f3818b3 100644 --- a/tools/testing/selftests/powerpc/copyloops/Makefile +++ b/tools/testing/selftests/powerpc/copyloops/Makefile @@ -17,6 +17,7 @@ TEST_GEN_PROGS := copyuser_64_t0 copyuser_64_t1 copyuser_64_t2 \ EXTRA_SOURCES := validate.c ../harness.c stubs.S +top_srcdir = ../../../../.. 
include ../../lib.mk $(OUTPUT)/copyuser_64_t%: copyuser_64.S $(EXTRA_SOURCES) diff --git a/tools/testing/selftests/powerpc/dscr/Makefile b/tools/testing/selftests/powerpc/dscr/Makefile index 55d7db7a616b..5df476364b4d 100644 --- a/tools/testing/selftests/powerpc/dscr/Makefile +++ b/tools/testing/selftests/powerpc/dscr/Makefile @@ -3,6 +3,7 @@ TEST_GEN_PROGS := dscr_default_test dscr_explicit_test dscr_user_test \ dscr_inherit_test dscr_inherit_exec_test dscr_sysfs_test \ dscr_sysfs_thread_test +top_srcdir = ../../../../.. include ../../lib.mk $(OUTPUT)/dscr_default_test: LDLIBS += -lpthread diff --git a/tools/testing/selftests/powerpc/math/Makefile b/tools/testing/selftests/powerpc/math/Makefile index 0dd3a01fdab9..11a10d7a2bbd 100644 --- a/tools/testing/selftests/powerpc/math/Makefile +++ b/tools/testing/selftests/powerpc/math/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 TEST_GEN_PROGS := fpu_syscall fpu_preempt fpu_signal vmx_syscall vmx_preempt vmx_signal vsx_preempt +top_srcdir = ../../../../.. include ../../lib.mk $(TEST_GEN_PROGS): ../harness.
Re: [RFC PATCH 08/11] powerpc/tm: Do not reclaim on ptrace
On Thu, 2018-09-27 at 18:03 -0300, Breno Leitao wrote: > Hi Mikey, > > On 09/18/2018 02:36 AM, Michael Neuling wrote: > > On Wed, 2018-09-12 at 16:40 -0300, Breno Leitao wrote: > > > Make sure that we are not suspended on ptrace and that the registers were > > > already reclaimed. > > > > > > Since the data was already reclaimed, there is nothing to be done here > > > except to restore the SPRs. > > > > > > Signed-off-by: Breno Leitao > > > --- > > > arch/powerpc/kernel/ptrace.c | 10 -- > > > 1 file changed, 4 insertions(+), 6 deletions(-) > > > > > > diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c > > > index 9667666eb18e..cf6ee9154b11 100644 > > > --- a/arch/powerpc/kernel/ptrace.c > > > +++ b/arch/powerpc/kernel/ptrace.c > > > @@ -136,12 +136,10 @@ static void flush_tmregs_to_thread(struct > > > task_struct > > > *tsk) > > > if ((!cpu_has_feature(CPU_FTR_TM)) || (tsk != current)) > > > return; > > > > > > - if (MSR_TM_SUSPENDED(mfmsr())) { > > > - tm_reclaim_current(TM_CAUSE_SIGNAL); > > > - } else { > > > - tm_enable(); > > > - tm_save_sprs(&(tsk->thread)); > > > - } > > > + WARN_ON(MSR_TM_SUSPENDED(mfmsr())); > > > + > > > + tm_enable(); > > > + tm_save_sprs(&(tsk->thread)); > > > > Do we need to check if TM was enabled in the task before saving the TM SPRs? > > > > What happens if TM was lazily off and hence we had someone else's TM SPRs in > > the > > CPU currently? Wouldn't this flush the wrong values to the task_struct? > > > > I think we need to check the processes MSR before doing this. > > Yes, it is a *very* good point, and I think we are vulnerable even before > this patch (in the current kernel). Take a look above, we are calling > tm_save_sprs() if MSR is not TM suspended independently if TM is lazily off. I think you're right, we might already have an issue. There are some paths in here that don't check the userspace msr or any of the lazy tm enable. :( Mikey > It shouldn't be hard to create a test case for it. 
I can try to call > ptrace(PTRACE_GETVRREGS) on a task that sleeps until TM is lazily disabled, > compare if the TM SPR changed in this mean time. (while doing HTM in parallel > to keep HTM SPR changing). Let's see if I can read others task TM SPRs. > > Thank you. >
Re: [RFC PATCH 05/11] powerpc/tm: Function that updates the failure code
On Thu, 2018-09-27 at 17:58 -0300, Breno Leitao wrote: > Hi Mikey, > > On 09/17/2018 10:29 PM, Michael Neuling wrote: > > On Wed, 2018-09-12 at 16:40 -0300, Breno Leitao wrote: > > > Now the transaction reclaims happens very earlier in the trap handler, and > > > it is impossible to know precisely, at that early time, what should be set > > > as the failure cause for some specific cases, as, if the task will be > > > rescheduled, thus, the transaction abort case should be updated from > > > TM_CAUSE_MISC to TM_CAUSE_RESCHED, for example. > > > > Please add comments to where this is used (in EXCEPTION_COMMON macro I > > think) > > that say this might happen. > > Is it OK if I comment it in TM_KERNEL_ENTRY macro, since the failure cause > could be updated independently of the exception being execute, so, every call > to TM_KERNEL_ENTRY can have the cause overwritten. Sure. > I.e. it does not matter if the exception is a systemcall or a page fault, > the failure cause will be updated if there is a process reschedule after the > exception/syscall is handled. > > Thank you >
Re: [RFC PATCH 11/11] selftests/powerpc: Adapt the test
On Thu, 2018-09-27 at 17:57 -0300, Breno Leitao wrote: > Hi Mikey, > > On 09/18/2018 03:36 AM, Michael Neuling wrote: > > On Wed, 2018-09-12 at 16:40 -0300, Breno Leitao wrote: > > > The Documentation/powerpc/transactional_memory.txt says: > > > > > > "Syscalls made from within a suspended transaction are performed as > > > normal > > > and the transaction is not explicitly doomed by the kernel. However, > > > what the kernel does to perform the syscall may result in the > > > transaction > > > being doomed by the hardware." > > > > > > With this new TM mechanism, the syscall will continue to be executed if > > > the > > > syscall happens on a suspended syscall, but, the syscall will *fail* if > > > the > > > transaction is still active during the syscall invocation. > > > > Not sure I get this. This doesn't seem any different to before. > > > > An active (not suspended) transaction *will* result in the syscall failing > > and > > the transaction being doomed. > > > > A syscall in a suspended transaction should succeed and the transaction. > > I understand that a transaction will always be doomed if there is a > reclaim/checkpoint, thus, if the you make a system call inside a suspended > transaction, it will reclaim and recheckpoint, thus, dooming the transaction, > and failing it on the next RFID. So currently a syscall won't explicitly treclaim/trecheckpoint so it won't necessarily be doomed. With your new patches it will be. > So, the syscall executed in suspended mode may succeed, because it will not > be skipped (as in active mode), but the transaction will *always* fail, > because there was a reclaim and recheckpoint. > > > You might need to clean up the language. 
I try to use: > > > >Active == transactional but not suspended (ie MSR[TS] = T) > >Suspended == suspended (ie MSR [TS] = S) > >Doomed == transaction to be rolled back at next opportinity (ie tcheck > > returns doomed) > > > > (note: the kernel MSR_TM_ACTIVE() macro is not consistent with this since > > it's > > MSR[TS] == S or T). > > Ok, I agree with this language as well. I might want to improve the code to > follow the same language all across the code. > > > > On the syscall path, if the transaction is active and not suspended, it > > > will call TM_KERNEL_ENTRY which will reclaim and recheckpoint the > > > transaction, thus, dooming the transaction on userspace return, with > > > failure code TM_CAUSE_SYSCALL. > > > > But the test below is on a suspend transaction? > > Sorry, I meant "suspended transaction" above instead of "transaction is > active and not suspended". > > > > > > This new model will break part of this test, but I understand that that > > > the > > > documentation above didn't guarantee that the syscall would succeed, and > > > it > > > will never succeed anymore now on. > > > > The syscall should pass in suspend (modulo the normal syscall checks). The > > transaction may fail as a result. > > Perfect, and if the transaction fail, the CPU will rollback the changes and > restore the checkpoint registers (replacing the r3 that contains the pid > value), thus, it will be like "getpid" system call didn't execute. No. If we are suspended, then we go back right after the sc. We don't get rolled back till the tresume. Mikey > For this test specifically, it assumes the syscall didn't execute if the > transaction failed. Take a look: > > FUNC_START(getppid_tm_suspended) > tbegin. > beq 1f > li r0, __NR_getppid > tsuspend. > sc > tresume. > tend. > blr > 1: > li r3, -1 > blr > > So, the tests thinks the syscall failed because the transaction aborted. > > Anyway, I can try to improve this test other than removing this test, but I > am not sure how. 
> > Thank you >
Re: [RFC PATCH 10/11] powerpc/tm: Set failure summary
On Thu, 2018-09-27 at 17:52 -0300, Breno Leitao wrote: > Hi Mikey, > > On 09/18/2018 02:50 AM, Michael Neuling wrote: > > On Wed, 2018-09-12 at 16:40 -0300, Breno Leitao wrote: > > > Since the transaction will be doomed with treckpt., the TEXASR[FS] > > > should be set, to reflect that the transaction is a failure. This patch > > > ensures it before recheckpointing, and remove changes from other places > > > that were calling recheckpoint. > > > > TEXASR[FS] should be set by the reclaim. > > Do you mean that the CPU should set TEXASR[FS] when treclaim is called, or, > that the tm_reclaim? > > Looking at the ISA, I didn't see TEXASR[FS] being set automatically when a > reclaim happens, treclaim in ISA talks about "TMRecordFailure(cause)" and that macro sets TEXASR[FS]=1. So yes treclaim always sets TEXASR[FS]=1. > although, I see it needs TEXASR[FS] to be set when executing a > trecheckpoint, otherwise it will cause a TM Bad Thing. Yep > That is why I am forcing TEXASR[FS]=1 to doom the transaction so we can > recheckpoint it, but it seems I understood this wrongly. You shouldn't need to. We do a bug_on() just before the trecheckpoint to make sure it's set, but you shouldn't need to set it (other than the signals case). > > I don't know why you'd need to set this > > explicitly in process.c. The only case is when the user supplies a bad > > signal > > context, but we should check that in the signals code, not process.c > > > > Hence I think this patch is wrong. > > > > Also, according to the architecture, TEXASR[FS] HAS TO BE SET on > > trecheckpoint > > otherwise you'll get a TM Bad Thing. You should say that rather than > > suggesting > > it's because the transaction is doomed. It's illegal to not do it. That's > > why we > > have this check in arch/powerpc/kernel/tm.S. > > When you say "HAS TO BE SET", do you mean that the hardware will set it and > we shouldn't care about this flag? Thus, if I am hitting EMIT_BUG_ENTRY, it > means my TEXASR was messed somehow? 
I'm just saying what you said before about the tm bad thing. We have to make sure it's set before we do the trecheckpoint otherwise we end up in the TM bad thing. The treclaim should handle this for us (but again the signals case need to be checked). Mikey
Re: [RFC PATCH 09/11] powerpc/tm: Do not restore default DSCR
On Thu, 2018-09-27 at 17:51 -0300, Breno Leitao wrote: > Hi Mikey, > > On 09/18/2018 02:41 AM, Michael Neuling wrote: > > On Wed, 2018-09-12 at 16:40 -0300, Breno Leitao wrote: > > > In the previous TM code, trecheckpoint was being executed in the middle of > > > an exception, thus, DSCR was being restored to default kernel DSCR value > > > after trecheckpoint was done. > > > > > > With this current patchset, trecheckpoint is executed just before getting > > > to userspace, at ret_from_except_lite, for example. Thus, we do not need > > > to > > > set default kernel DSCR value anymore, as we are leaving kernel space. It > > > is OK to keep the checkpointed DSCR value into the live SPR, mainly > > > because > > > the transaction is doomed and it will fail soon (after RFID), > > > > What if we are going back to a suspended transaction? It will remain live > > until > > userspace does a tresume > > Hmm, I understand that once we get in kernel space, and call > treclaim/trecheckpoint, the transaction will be doomed and it will abort and > rollback when we leave kernel space. I.e., if we can treclaim/trecheckpointn > in kernel space, the transaction will *always* abort just after RFID, so, a > possible tresume will never be executed. Is my understanding wrong? Your understanding is wrong. We don't roll back until MSR[TS] = T. There are two cases for the RFID. 1) if you RFID back to userspace that is transactional (ie MSR[TS] = T), then it will immediately rollback as soon as the RFID happens since we are going directly to T. 2) If you RFID back to userspace that is suspended (ie MSR[TS] = S), then it won't roll back until userspace does a tresume. It doesn't rollback until we go MSR[TS] = S -> T). > > > > > so, > > > continuing with the pre-checkpointed DSCR value is what seems correct. > > > > Reading this description suggests this patch isn't really needed. Right? 
> > Maybe the description is not clear, but I understand this patch is required, > otherwise we will leave userspace with a default DSCR value. > > By the way, do you know if there is a change in DSCR inside a transaction, > will it be reverted if the transaction is aborted? Yes it will be reverted. We even have a selftest for it in tools/testing/selftests/powerpc/tm/tm-resched-dscr.c There are a bunch of checkpointed SPRs. From the arch: Checkpointed registers: The set of registers that are saved to the “checkpoint area” when a transaction is initiated, and restored upon transaction failure, is a subset of the architected register state, consisting of the General Purpose Registers, Floating-Point Regis- ters, Vector Registers, Vector-Scalar Registers, and the following Special Registers and fields: CR fields other than CR0, XER, LR, CTR, FPSCR, AMR, PPR, VRSAVE, VSCR, DSCR, and TAR. Mikey
Re: [PATCH v6] selftests: add headers_install to lib.mk
[ + linuxppc-dev ] Anders Roxell writes: > If the kernel headers aren't installed we can't build all the tests. > Add a new make target rule 'khdr' in the file lib.mk to generate the > kernel headers and that gets include for every test-dir Makefile that > includes lib.mk If the testdir in turn have its own sub-dirs the > top_srcdir needs to be set to the linux-rootdir to be able to generate > the kernel headers. > > Signed-off-by: Anders Roxell > Reviewed-by: Fathi Boudra > --- > > I sent this (v5) a month ago and wondered if it got lost. Resending > unchanged. > > Cheers, > Anders > > Makefile | 14 +- > scripts/subarch.include| 13 + > tools/testing/selftests/android/Makefile | 2 +- > tools/testing/selftests/android/ion/Makefile | 2 ++ > tools/testing/selftests/futex/functional/Makefile | 1 + > tools/testing/selftests/gpio/Makefile | 7 ++- > tools/testing/selftests/kvm/Makefile | 7 ++- > tools/testing/selftests/lib.mk | 12 > tools/testing/selftests/net/Makefile | 1 + > .../selftests/networking/timestamping/Makefile | 1 + > tools/testing/selftests/vm/Makefile| 4 > 11 files changed, 36 insertions(+), 28 deletions(-) > create mode 100644 scripts/subarch.include This broke all the powerpc selftests :( Why did it go in at rc5? cheers
[PATCH] powerpc/numa: Skip onlining a offline node in kdump path
With Commit 2ea626306810 ("powerpc/topology: Get topology for shared processors at boot"), kdump kernel on shared lpar may crash. The necessary conditions are - Shared Lpar with atleast 2 nodes having memory and CPUs. - Memory requirement for kdump kernel must be met by the first N-1 nodes where there are atleast N nodes with memory and CPUs. Example numactl of such a machine. numactl -H available: 5 nodes (0,2,5-7) node 0 cpus: node 0 size: 0 MB node 0 free: 0 MB node 2 cpus: node 2 size: 255 MB node 2 free: 189 MB node 5 cpus: 24 25 26 27 28 29 30 31 node 5 size: 4095 MB node 5 free: 4024 MB node 6 cpus: 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23 node 6 size: 6353 MB node 6 free: 5998 MB node 7 cpus: 8 9 10 11 12 13 14 15 32 33 34 35 36 37 38 39 node 7 size: 7640 MB node 7 free: 7164 MB node distances: node 0 2 5 6 7 0: 10 40 40 40 40 2: 40 10 40 40 40 5: 40 40 10 40 40 6: 40 40 40 10 20 7: 40 40 40 20 10 Steps to reproduce. 1. Load / start kdump service. 2. Trigger a kdump (for example : echo c > /proc/sysrq-trigger) When booting a kdump kernel with 2048M kexec: Starting switchover sequence. 
I'm in purgatory Using 1TB segments hash-mmu: Initializing hash mmu with SLB Linux version 4.19.0-rc5-master+ (srikar@linux-xxu6) (gcc version 4.8.5 (SUSE Linux)) #1 SMP Thu Sep 27 19:45:00 IST 2018 Found initrd at 0xc9e7:0xcae554b4 Using pSeries machine description - ppc64_pft_size= 0x1e phys_mem_size = 0x8800 dcache_bsize = 0x80 icache_bsize = 0x80 cpu_features = 0x00ff8f5d91a7 possible= 0xfbffcf5fb1a7 always = 0x006f8b5c91a1 cpu_user_features = 0xdc0065c2 0xef00 mmu_features = 0x7c006001 firmware_features = 0x0007c45bfc57 htab_hash_mask= 0x7f physical_start= 0x800 - numa: NODE_DATA [mem 0x87d5e300-0x87d67fff] numa: NODE_DATA(0) on node 6 numa: NODE_DATA [mem 0x87d54600-0x87d5e2ff] Top of RAM: 0x8800, Total RAM: 0x8800 Memory hole size: 0MB Zone ranges: DMA [mem 0x-0x87ff] DMA32empty Normal empty Movable zone start for each node Early memory node ranges node 6: [mem 0x-0x87ff] Could not find start_pfn for node 0 Initmem setup node 0 [mem 0x-0x] On node 0 totalpages: 0 Initmem setup node 6 [mem 0x-0x87ff] On node 6 totalpages: 34816 Unable to handle kernel paging request for data at address 0x0060 Faulting instruction address: 0xc8703a54 Oops: Kernel access of bad area, sig: 11 [#1] LE SMP NR_CPUS=2048 NUMA pSeries Modules linked in: CPU: 11 PID: 1 Comm: swapper/11 Not tainted 4.19.0-rc5-master+ #1 NIP: c8703a54 LR: c8703a38 CTR: REGS: cb673440 TRAP: 0380 Not tainted (4.19.0-rc5-master+) MSR: 82009033 CR: 24022022 XER: 2002 CFAR: c86fc238 IRQMASK: 0 GPR00: c8703a38 cb6736c0 c9281900 GPR04: f001 cb660080 GPR08: 0220 GPR12: 2200 c9e51400 0008 GPR16: c8c152e8 c8c152a8 GPR20: c9422fd8 c9412fd8 c9426040 0008 GPR24: c9168bc8 c9168c78 GPR28: cb126410 c916a0b8 cb126400 NIP [c8703a54] bus_add_device+0x84/0x1e0 LR [c8703a38] bus_add_device+0x68/0x1e0 Call Trace: [cb6736c0] [c8703a38] bus_add_device+0x68/0x1e0 (unreliable) [cb673740] [c8700194] device_add+0x454/0x7c0 [cb673800] [c872e660] __register_one_node+0xb0/0x240 [cb673860] [c839a6bc] __try_online_node+0x12c/0x180 
[cb673900] [c839b978] try_online_node+0x58/0x90 [cb673930] [c80846d8] find_and_online_cpu_nid+0x158/0x190 [cb673a10] [c80848a0] numa_update_cpu_topology+0x190/0x580 [cb673c00] [c8d3f2e4] smp_cpus_done+0x94/0x108 [cb673c70] [c8d5c00c] smp_init+0x174/0x19c [cb673d00] [c8d346b8] kernel_init_freeable+0x1e0/0x450 [cb673dc0] [c80102e8] kernel_init+0x28/0x160 [cb673e30] [c800b65c] ret_from_kernel_thread+0x5c/0x80 Instruction dump: 6000 6000 e89e0020 7fe3fb78 4bff87d5 6000 7c7d1b79 4082008c e8bf0050 e93e0098 3b9f0010 2fa5 38630018 419e0114 7f84e378 ---[ end trace 593577668c2daa65 ]--- However a regular kernel with 4096M (2048 gets reserved for crash kernel) boots properly. Unlike regular kernels, which mark all available nodes as online, kdump kernel only marks just enough nodes a
Re: [PATCH] powerpc: wire up memtest
Christophe Leroy writes: > Add call to early_memtest() so that kernel compiled with > CONFIG_MEMTEST really perform memtest at startup when requested > via 'memtest' boot parameter. > > Signed-off-by: Christophe Leroy > --- > arch/powerpc/kernel/setup-common.c | 3 +++ > 1 file changed, 3 insertions(+) > > diff --git a/arch/powerpc/kernel/setup-common.c > b/arch/powerpc/kernel/setup-common.c > index 93fa0c99681e..904b728eb20d 100644 > --- a/arch/powerpc/kernel/setup-common.c > +++ b/arch/powerpc/kernel/setup-common.c > @@ -33,6 +33,7 @@ > #include > #include > #include > +#include > #include > #include > #include > @@ -917,6 +918,8 @@ void __init setup_arch(char **cmdline_p) > /* Parse memory topology */ > mem_topology_setup(); > > + early_memtest(min_low_pfn << PAGE_SHIFT, max_low_pfn << PAGE_SHIFT); On a ppc64le VM this boils down to early_memtest(0, 0) for me. I think it's too early, we don't set up max_low_pfn until initmem_init(). If I move it after initmem_init() then it does something more useful: early_memtest: # of tests: 17 0x01450580 - 0x01450800 pattern 4c494e5558726c7a 0x01450c00 - 0x0360 pattern 4c494e5558726c7a 0x047c - 0x2fff pattern 4c494e5558726c7a 0x3000 - 0x3ff24000 pattern 4c494e5558726c7a 0x3fff4000 - 0x3fff4c00 pattern 4c494e5558726c7a 0x3fff5000 - 0x3fff5300 pattern 4c494e5558726c7a 0x3fff5c00 - 0x3fff5f00 pattern 4c494e5558726c7a 0x3fff6800 - 0x3fff6b00 pattern 4c494e5558726c7a 0x3fff7400 - 0x3fff7700 pattern 4c494e5558726c7a 0x3fff8000 - 0x3fff8300 pattern 4c494e5558726c7a 0x3fff8c00 - 0x3fff8f00 pattern 4c494e5558726c7a 0x3fff9800 - 0x3fff9b00 pattern 4c494e5558726c7a 0x3fffa400 - 0x3fffa700 pattern 4c494e5558726c7a 0x3fffb000 - 0x3fffb300 pattern 4c494e5558726c7a 0x3fffbc00 - 0x3fffbf00 pattern 4c494e5558726c7a 0x3fffc800 - 0x3fffcb00 pattern 4c494e5558726c7a 0x3fffd400 - 0x3fffd700 pattern 4c494e5558726c7a 0x3fffe000 - 0x3fffe100 pattern 4c494e5558726c7a 0x4000 - 0xffc1 pattern 4c494e5558726c7a 0xfffa - 0xfffa5b00 pattern 
4c494e5558726c7a 0x0001 - 0x0001ffbe pattern 4c494e5558726c7a 0x0001fff6 - 0x0001fff61b00 pattern 4c494e5558726c7a 0x0001fffec000 - 0x0001fffec4b8 pattern 4c494e5558726c7a 0x0001fffec524 - 0x0001fffec528 pattern 4c494e5558726c7a cheers
Re: [PATCH v4] powerpc/64s: reimplement book3s idle code in C
On Wed, 26 Sep 2018 19:39:14 +0530 Akshay Adiga wrote: > On Fri, Sep 14, 2018 at 11:52:40AM +1000, Nicholas Piggin wrote: > > + > > + /* > > +* On POWER9, SRR1 bits do not match exactly as expected. > > +* SRR1_WS_GPRLOSS (10b) can also result in SPR loss, so > > +* always test PSSCR if there is any state loss. > > +*/ > > + if (likely((psscr & PSSCR_RL_MASK) < pnv_first_hv_loss_level)) { > > Shouldn't we check PLS field to see if the cpu/core woke up from hv loss ? > > Currently, a cpu requested stop4 (RL=4) and exited from a shallower state > (PLS=2), SPR's are unecessarily restored. > > We can do something like : > > #define PSSCR_PLS_SHIFT 60 > if (likely((psscr & PSSCR_PLS) >> PSSCR_PLS_SHIFT) < pnv_first_hv_loss_level) Ah, that corresponds with the following existing code? /* * POWER ISA 3. Use PSSCR to determine if we * are waking up from deep idle state */ LOAD_REG_ADDRBASE(r5,pnv_first_deep_stop_state) ld r4,ADDROFF(pnv_first_deep_stop_state)(r5) /* * 0-3 bits correspond to Power-Saving Level Status * which indicates the idle state we are waking up from */ mfspr r5, SPRN_PSSCR rldicl r5,r5,4,60 li r0, 0 /* clear requested_psscr to say we're awake */ std r0, PACA_REQ_PSSCR(r13) cmpdcr4,r5,r4 bge cr4,pnv_wakeup_tb_loss /* returns to caller */ Yes I didn't get that right, good catch. Thanks, Nick
Re: [PATCH 3/5] dma-direct: refine dma_direct_alloc zone selection
On Thu, 2018-09-27 at 15:49 +0200, Christoph Hellwig wrote: > On Thu, Sep 27, 2018 at 11:45:15AM +1000, Benjamin Herrenschmidt wrote: > > I'm not sure this is entirely right. > > > > Let's say the mask is 30 bits. You will return GFP_DMA32, which will > > fail if you allocate something above 1G (which is legit for > > ZONE_DMA32). > > And then we will try GFP_DMA further down in the function: > > if (IS_ENABLED(CONFIG_ZONE_DMA) && > dev->coherent_dma_mask < DMA_BIT_MASK(32) && > !(gfp & GFP_DMA)) { > gfp = (gfp & ~GFP_DMA32) | GFP_DMA; > goto again; > } > > This is and old optimization from x86, because chances are high that > GFP_DMA32 will give you suitable memory for the infamous 31-bit > dma mask devices (at least at boot time) and thus we don't have > to deplete the tiny ZONE_DMA pool. I see, it's rather confusing :-) Wouldn't it be better to check against top of 32-bit memory instead here too ? Cheers, Ben.
[PATCH 0/3] powerpc/powernv/npu: Improve ATSD invalidation overhead
When ATS is used in a process, all CPU TLB invalidates in that process also trigger ATSD invalidates via mmu_notifiers. This additional overhead is noticeable in applications which do heavy memory allocation or page migration among nodes, particularly to and from GPUs. This patch set reduces that overhead by rearranging how the ATSDs are issued and by using size-based ATSD invalidates. Mark Hairgrove (3): powerpc/powernv/npu: Reduce eieio usage when issuing ATSD invalidates powerpc/powernv/npu: Use size-based ATSD invalidates powerpc/powernv/npu: Remove atsd_threshold debugfs setting arch/powerpc/platforms/powernv/npu-dma.c | 177 ++--- 1 files changed, 85 insertions(+), 92 deletions(-) -- 1.7.2.5
[PATCH 1/3] powerpc/powernv/npu: Reduce eieio usage when issuing ATSD invalidates
There are two types of ATSDs issued to the NPU: invalidates targeting a specific virtual address and invalidates targeting the whole address space. In both cases prior to this change, the sequence was: for each NPU - Write the target address to the XTS_ATSD_AVA register - EIEIO - Write the launch value to issue the ATSD First, a target address is not required when invalidating the whole address space, so that write and the EIEIO have been removed. The AP (size) field in the launch is not needed either. Second, for per-address invalidates the above sequence is inefficient in the common case of multiple NPUs because an EIEIO is issued per NPU. This unnecessarily forces the launches of later ATSDs to be ordered with the launches of earlier ones. The new sequence only issues a single EIEIO: for each NPU - Write the target address to the XTS_ATSD_AVA register EIEIO for each NPU - Write the launch value to issue the ATSD Performance results were gathered using a microbenchmark which creates a 1G allocation then uses mprotect with PROT_NONE to trigger invalidates in strides across the allocation. With only a single NPU active (one GPU) the difference is in the noise for both types of invalidates (+/-1%). 
With two NPUs active (on a 6-GPU system) the effect is more noticeable: mprotect rate (GB/s) Stride Before After Speedup 64K 5.96.5 10% 1M 31.2 33.4 7% 2M 36.3 38.7 7% 4M322.6 356.7 11% Signed-off-by: Mark Hairgrove --- arch/powerpc/platforms/powernv/npu-dma.c | 99 ++--- 1 files changed, 48 insertions(+), 51 deletions(-) diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index 8006c54..c8f438a 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -454,79 +454,76 @@ static void put_mmio_atsd_reg(struct npu *npu, int reg) } /* MMIO ATSD register offsets */ -#define XTS_ATSD_AVA 1 -#define XTS_ATSD_STAT 2 +#define XTS_ATSD_LAUNCH 0 +#define XTS_ATSD_AVA1 +#define XTS_ATSD_STAT 2 -static void mmio_launch_invalidate(struct mmio_atsd_reg *mmio_atsd_reg, - unsigned long launch, unsigned long va) +static unsigned long get_atsd_launch_val(unsigned long pid, unsigned long psize, + bool flush) { - struct npu *npu = mmio_atsd_reg->npu; - int reg = mmio_atsd_reg->reg; + unsigned long launch = 0; - __raw_writeq_be(va, npu->mmio_atsd_regs[reg] + XTS_ATSD_AVA); - eieio(); - __raw_writeq_be(launch, npu->mmio_atsd_regs[reg]); + if (psize == MMU_PAGE_COUNT) { + /* IS set to invalidate entire matching PID */ + launch |= PPC_BIT(12); + } else { + /* AP set to invalidate region of psize */ + launch |= (u64)mmu_get_ap(psize) << PPC_BITLSHIFT(17); + } + + /* PRS set to process-scoped */ + launch |= PPC_BIT(13); + + /* PID */ + launch |= pid << PPC_BITLSHIFT(38); + + /* No flush */ + launch |= !flush << PPC_BITLSHIFT(39); + + return launch; } -static void mmio_invalidate_pid(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS], - unsigned long pid, bool flush) +static void mmio_atsd_regs_write(struct mmio_atsd_reg + mmio_atsd_reg[NV_MAX_NPUS], unsigned long offset, + unsigned long val) { - int i; - unsigned long launch; + struct npu *npu; + int i, reg; for (i = 0; i <= max_npu2_index; i++) { - if 
(mmio_atsd_reg[i].reg < 0) + reg = mmio_atsd_reg[i].reg; + if (reg < 0) continue; - /* IS set to invalidate matching PID */ - launch = PPC_BIT(12); - - /* PRS set to process-scoped */ - launch |= PPC_BIT(13); - - /* AP */ - launch |= (u64) - mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17); - - /* PID */ - launch |= pid << PPC_BITLSHIFT(38); + npu = mmio_atsd_reg[i].npu; + __raw_writeq_be(val, npu->mmio_atsd_regs[reg] + offset); + } +} - /* No flush */ - launch |= !flush << PPC_BITLSHIFT(39); +static void mmio_invalidate_pid(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS], + unsigned long pid, bool flush) +{ + unsigned long launch = get_atsd_launch_val(pid, MMU_PAGE_COUNT, flush); - /* Invalidating the entire process doesn't use a va */ - mmio_launch_invalidate(&mmio_atsd_reg[i], launch, 0); - } + /* Invalidating the entire process doesn't use a va */ + mmio_
[PATCH 2/3] powerpc/powernv/npu: Use size-based ATSD invalidates
Prior to this change only two types of ATSDs were issued to the NPU: invalidates targeting a single page and invalidates targeting the whole address space. The crossover point happened at the configurable atsd_threshold which defaulted to 2M. Invalidates that size or smaller would issue per-page invalidates for the whole range. The NPU supports more invalidation sizes however: 64K, 2M, 1G, and all. These invalidates target addresses aligned to their size. 2M is a common invalidation size for GPU-enabled applications because that is a GPU page size, so reducing the number of invalidates by 32x in that case is a clear improvement. ATSD latency is high in general so now we always issue a single invalidate rather than multiple. This will over-invalidate in some cases, but for any invalidation size over 2M it matches or improves the prior behavior. There's also an improvement for single-page invalidates since the prior version issued two invalidates for that case instead of one. To show the benefit here are some performance numbers from a microbenchmark which creates a 1G allocation then uses mprotect with PROT_NONE to trigger invalidates in strides across the allocation. One NPU (1 GPU): mprotect rate (GB/s) Stride Before After Speedup 64K 5.35.6 5% 1M 39.3 57.4 46% 2M 49.7 82.6 66% 4M286.6 285.7 0% Two NPUs (6 GPUs): mprotect rate (GB/s) Stride Before After Speedup 64K 6.57.4 13% 1M 33.4 67.9 103% 2M 38.7 93.1 141% 4M356.7 354.6 -1% Anything over 2M is roughly the same as before since both cases issue a single ATSD. 
Signed-off-by: Mark Hairgrove --- arch/powerpc/platforms/powernv/npu-dma.c | 71 +- 1 files changed, 40 insertions(+), 31 deletions(-) diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index c8f438a..e471a1a 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -509,15 +509,14 @@ static void mmio_invalidate_pid(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS], mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_LAUNCH, launch); } -static void mmio_invalidate_va(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS], - unsigned long va, unsigned long pid, bool flush) +static void mmio_invalidate_range(struct mmio_atsd_reg + mmio_atsd_reg[NV_MAX_NPUS], unsigned long pid, + unsigned long start, unsigned long psize, bool flush) { - unsigned long launch; - - launch = get_atsd_launch_val(pid, mmu_virtual_psize, flush); + unsigned long launch = get_atsd_launch_val(pid, psize, flush); /* Write all VAs first */ - mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_AVA, va); + mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_AVA, start); /* Issue one barrier for all address writes */ eieio(); @@ -608,15 +607,38 @@ static void release_atsd_reg(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS]) } } +#define PAGE_64K (64UL * 1024) +#define PAGE_2M (2UL * 1024 * 1024) +#define PAGE_1G (1UL * 1024 * 1024 * 1024) + /* - * Invalidate either a single address or an entire PID depending on - * the value of va. 
+ * Invalidate a virtual address range */ -static void mmio_invalidate(struct npu_context *npu_context, int va, - unsigned long address, bool flush) +static void mmio_invalidate(struct npu_context *npu_context, + unsigned long start, unsigned long size, bool flush) { struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS]; unsigned long pid = npu_context->mm->context.id; + unsigned long atsd_start = 0; + unsigned long end = start + size - 1; + int atsd_psize = MMU_PAGE_COUNT; + + /* +* Convert the input range into one of the supported sizes. If the range +* doesn't fit, use the next larger supported size. Invalidation latency +* is high, so over-invalidation is preferred to issuing multiple +* invalidates. +*/ + if (size == PAGE_64K) { + atsd_start = start; + atsd_psize = MMU_PAGE_64K; + } else if (ALIGN_DOWN(start, PAGE_2M) == ALIGN_DOWN(end, PAGE_2M)) { + atsd_start = ALIGN_DOWN(start, PAGE_2M); + atsd_psize = MMU_PAGE_2M; + } else if (ALIGN_DOWN(start, PAGE_1G) == ALIGN_DOWN(end, PAGE_1G)) { + atsd_start = ALIGN_DOWN(start, PAGE_1G); + atsd_psize = MMU_PAGE_1G; + } if (npu_context->nmmu_flush) /* @@ -631,10 +653,12 @@ static void mmio_invalidate(struct npu_context *npu_context, int va, * an invalidate. */ acquire_atsd_reg(npu_context, mmio_atsd_reg);
[PATCH 3/3] powerpc/powernv/npu: Remove atsd_threshold debugfs setting
This threshold is no longer used now that all invalidates issue a single ATSD to each active NPU. Signed-off-by: Mark Hairgrove --- arch/powerpc/platforms/powernv/npu-dma.c | 13 - 1 files changed, 0 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index e471a1a..426df1b 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -42,14 +42,6 @@ static DEFINE_SPINLOCK(npu_context_lock); /* - * When an address shootdown range exceeds this threshold we invalidate the - * entire TLB on the GPU for the given PID rather than each specific address in - * the range. - */ -static uint64_t atsd_threshold = 2 * 1024 * 1024; -static struct dentry *atsd_threshold_dentry; - -/* * Other types of TCE cache invalidation are not functional in the * hardware. */ @@ -968,11 +960,6 @@ int pnv_npu2_init(struct pnv_phb *phb) static int npu_index; uint64_t rc = 0; - if (!atsd_threshold_dentry) { - atsd_threshold_dentry = debugfs_create_x64("atsd_threshold", - 0600, powerpc_debugfs_root, &atsd_threshold); - } - phb->npu.nmmu_flush = of_property_read_bool(phb->hose->dn, "ibm,nmmu-flush"); for_each_child_of_node(phb->hose->dn, dn) { -- 1.7.2.5
[PATCH 1/5] dma-mapping: make the get_required_mask method available unconditionally
This save some duplication for ia64, and makes the interface more general. In the long run we want each dma_map_ops instance to fill this out, but this will take a little more prep work. Signed-off-by: Christoph Hellwig Acked-by: Benjamin Herrenschmidt [powerpc bits] --- arch/ia64/include/asm/dma-mapping.h | 2 -- arch/ia64/include/asm/machvec.h | 7 --- arch/ia64/include/asm/machvec_init.h | 1 - arch/ia64/include/asm/machvec_sn2.h | 2 -- arch/ia64/pci/pci.c | 26 -- arch/ia64/sn/pci/pci_dma.c | 4 ++-- drivers/base/platform.c | 13 +++-- drivers/pci/controller/vmd.c | 4 include/linux/dma-mapping.h | 2 -- 9 files changed, 13 insertions(+), 48 deletions(-) diff --git a/arch/ia64/include/asm/dma-mapping.h b/arch/ia64/include/asm/dma-mapping.h index 76e4d6632d68..522745ae67bb 100644 --- a/arch/ia64/include/asm/dma-mapping.h +++ b/arch/ia64/include/asm/dma-mapping.h @@ -10,8 +10,6 @@ #include #include -#define ARCH_HAS_DMA_GET_REQUIRED_MASK - extern const struct dma_map_ops *dma_ops; extern struct ia64_machine_vector ia64_mv; extern void set_iommu_machvec(void); diff --git a/arch/ia64/include/asm/machvec.h b/arch/ia64/include/asm/machvec.h index 267f4f170191..5133739966bc 100644 --- a/arch/ia64/include/asm/machvec.h +++ b/arch/ia64/include/asm/machvec.h @@ -44,7 +44,6 @@ typedef void ia64_mv_kernel_launch_event_t(void); /* DMA-mapping interface: */ typedef void ia64_mv_dma_init (void); -typedef u64 ia64_mv_dma_get_required_mask (struct device *); typedef const struct dma_map_ops *ia64_mv_dma_get_ops(struct device *); /* @@ -127,7 +126,6 @@ extern void machvec_tlb_migrate_finish (struct mm_struct *); # define platform_global_tlb_purgeia64_mv.global_tlb_purge # define platform_tlb_migrate_finish ia64_mv.tlb_migrate_finish # define platform_dma_initia64_mv.dma_init -# define platform_dma_get_required_mask ia64_mv.dma_get_required_mask # define platform_dma_get_ops ia64_mv.dma_get_ops # define platform_irq_to_vector ia64_mv.irq_to_vector # define platform_local_vector_to_irq 
ia64_mv.local_vector_to_irq @@ -171,7 +169,6 @@ struct ia64_machine_vector { ia64_mv_global_tlb_purge_t *global_tlb_purge; ia64_mv_tlb_migrate_finish_t *tlb_migrate_finish; ia64_mv_dma_init *dma_init; - ia64_mv_dma_get_required_mask *dma_get_required_mask; ia64_mv_dma_get_ops *dma_get_ops; ia64_mv_irq_to_vector *irq_to_vector; ia64_mv_local_vector_to_irq *local_vector_to_irq; @@ -211,7 +208,6 @@ struct ia64_machine_vector { platform_global_tlb_purge, \ platform_tlb_migrate_finish,\ platform_dma_init, \ - platform_dma_get_required_mask, \ platform_dma_get_ops, \ platform_irq_to_vector, \ platform_local_vector_to_irq, \ @@ -286,9 +282,6 @@ extern const struct dma_map_ops *dma_get_ops(struct device *); #ifndef platform_dma_get_ops # define platform_dma_get_ops dma_get_ops #endif -#ifndef platform_dma_get_required_mask -# define platform_dma_get_required_mask ia64_dma_get_required_mask -#endif #ifndef platform_irq_to_vector # define platform_irq_to_vector__ia64_irq_to_vector #endif diff --git a/arch/ia64/include/asm/machvec_init.h b/arch/ia64/include/asm/machvec_init.h index 2b32fd06b7c6..2aafb69a3787 100644 --- a/arch/ia64/include/asm/machvec_init.h +++ b/arch/ia64/include/asm/machvec_init.h @@ -4,7 +4,6 @@ extern ia64_mv_send_ipi_t ia64_send_ipi; extern ia64_mv_global_tlb_purge_t ia64_global_tlb_purge; -extern ia64_mv_dma_get_required_mask ia64_dma_get_required_mask; extern ia64_mv_irq_to_vector __ia64_irq_to_vector; extern ia64_mv_local_vector_to_irq __ia64_local_vector_to_irq; extern ia64_mv_pci_get_legacy_mem_t ia64_pci_get_legacy_mem; diff --git a/arch/ia64/include/asm/machvec_sn2.h b/arch/ia64/include/asm/machvec_sn2.h index ece9fa85be88..b5153d300289 100644 --- a/arch/ia64/include/asm/machvec_sn2.h +++ b/arch/ia64/include/asm/machvec_sn2.h @@ -55,7 +55,6 @@ extern ia64_mv_readb_t __sn_readb_relaxed; extern ia64_mv_readw_t __sn_readw_relaxed; extern ia64_mv_readl_t __sn_readl_relaxed; extern ia64_mv_readq_t __sn_readq_relaxed; -extern 
ia64_mv_dma_get_required_mask sn_dma_get_required_mask; extern ia64_mv_dma_initsn_dma_init; extern ia64_mv_migrate_t sn_migrate; extern ia64_mv_kernel_launch_event_t sn_kernel_launch_event; @@ -100,7 +99,6 @@ extern ia64_mv_pci_fixup_bus_t sn_pci_fixup_bus; #define platform_pci_get_legacy_memsn_pci_get_legacy_mem #define platform_pci_legacy_read sn_pci_legacy_read #define platform_pci_legacy_write sn_pci_legacy_write -#define platform_dma_get_required_mask sn_dma_get_required_mask #define platform_dm
dma mask related fixups (including full bus_dma_mask support) v2
Hi all, the dma_get_required_mask dma API implementation has always been a little odd, in that we by default don't wire it up struct dma_map_ops, but instead hard code a default implementation. powerpc and ia64 override this default and either call a method or otherwise duplicate the default. This series always enabled the method and just falls back to the previous default implementation when it is not available, as well as fixing up a few bits in the default implementations. This already allows removing the ia64 override of the implementation, and will also allow to remove the powerpc one together with a few additional cleanups in the powerpc code, but those will be sent separately with other powerpc DMA API patches. Last but not least the method will allow us to return a more sensible value for typical iommu dma_ops eventually, but that is left to another series as well. Additionally the dma-direct code has been a bit sloppy in when it was using phys_to_dma in a few places, so this gets fixed up as well as actually respecting the bus_dma_mask everywhere instead of just rejecting dma mask that don't fit into it. Alltogether this should be all core dma-direct changes required to move powerpc over to the generic code. Changes since v2: - clean up a mask check - add various acks
[PATCH 5/5] dma-direct: always allow dma mask <= physical memory size
This way an architecture with less than 4G of RAM can support dma_mask smaller than 32-bit without a ZONE_DMA. Apparently that is a common case on powerpc. Signed-off-by: Christoph Hellwig Reviewed-by: Robin Murphy --- kernel/dma/direct.c | 28 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 60c433b880e0..170bd322a94a 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -284,21 +284,25 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, return nents; } +/* + * Because 32-bit DMA masks are so common we expect every architecture to be + * able to satisfy them - either by not supporting more physical memory, or by + * providing a ZONE_DMA32. If neither is the case, the architecture needs to + * use an IOMMU instead of the direct mapping. + */ int dma_direct_supported(struct device *dev, u64 mask) { -#ifdef CONFIG_ZONE_DMA - if (mask < phys_to_dma(dev, DMA_BIT_MASK(ARCH_ZONE_DMA_BITS))) - return 0; -#else - /* -* Because 32-bit DMA masks are so common we expect every architecture -* to be able to satisfy them - either by not supporting more physical -* memory, or by providing a ZONE_DMA32. If neither is the case, the -* architecture needs to use an IOMMU instead of the direct mapping. -*/ - if (mask < phys_to_dma(dev, DMA_BIT_MASK(32))) + u64 min_mask; + + if (IS_ENABLED(CONFIG_ZONE_DMA)) + min_mask = DMA_BIT_MASK(ARCH_ZONE_DMA_BITS); + else + min_mask = DMA_BIT_MASK(32); + + min_mask = min_t(u64, min_mask, (max_pfn - 1) << PAGE_SHIFT); + + if (mask >= phys_to_dma(dev, min_mask)) return 0; -#endif return 1; } -- 2.19.0
[PATCH 4/5] dma-direct: implement complete bus_dma_mask handling
Instead of rejecting devices with a too small bus_dma_mask we can handle by taking the bus dma_mask into account for allocations and bounce buffering decisions. Signed-off-by: Christoph Hellwig --- include/linux/dma-direct.h | 3 ++- kernel/dma/direct.c| 21 +++-- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index b79496d8c75b..fbca184ff5a0 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -27,7 +27,8 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) if (!dev->dma_mask) return false; - return addr + size - 1 <= *dev->dma_mask; + return addr + size - 1 <= + min_not_zero(*dev->dma_mask, dev->bus_dma_mask); } #endif /* !CONFIG_ARCH_HAS_PHYS_TO_DMA */ diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index e78548397a92..60c433b880e0 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -44,10 +44,11 @@ check_addr(struct device *dev, dma_addr_t dma_addr, size_t size, return false; } - if (*dev->dma_mask >= DMA_BIT_MASK(32)) { + if (*dev->dma_mask >= DMA_BIT_MASK(32) || dev->bus_dma_mask) { dev_err(dev, - "%s: overflow %pad+%zu of device mask %llx\n", - caller, &dma_addr, size, *dev->dma_mask); + "%s: overflow %pad+%zu of device mask %llx bus mask %llx\n", + caller, &dma_addr, size, + *dev->dma_mask, dev->bus_dma_mask); } return false; } @@ -66,12 +67,18 @@ u64 dma_direct_get_required_mask(struct device *dev) { u64 max_dma = phys_to_dma_direct(dev, (max_pfn - 1) << PAGE_SHIFT); + if (dev->bus_dma_mask && dev->bus_dma_mask < max_dma) + max_dma = dev->bus_dma_mask; + return (1ULL << (fls64(max_dma) - 1)) * 2 - 1; } static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, u64 *phys_mask) { + if (dev->bus_dma_mask && dev->bus_dma_mask < dma_mask) + dma_mask = dev->bus_dma_mask; + if (force_dma_unencrypted()) *phys_mask = __dma_to_phys(dev, dma_mask); else @@ -88,7 +95,7 @@ static gfp_t 
__dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) { return phys_to_dma_direct(dev, phys) + size - 1 <= - dev->coherent_dma_mask; + min_not_zero(dev->coherent_dma_mask, dev->bus_dma_mask); } void *dma_direct_alloc_pages(struct device *dev, size_t size, @@ -292,12 +299,6 @@ int dma_direct_supported(struct device *dev, u64 mask) if (mask < phys_to_dma(dev, DMA_BIT_MASK(32))) return 0; #endif - /* -* Upstream PCI/PCIe bridges or SoC interconnects may not carry -* as many DMA address bits as the device itself supports. -*/ - if (dev->bus_dma_mask && mask > dev->bus_dma_mask) - return 0; return 1; } -- 2.19.0
[PATCH 3/5] dma-direct: refine dma_direct_alloc zone selection
We need to take the DMA offset and encryption bit into account when selecting a zone. User the opportunity to factor out the zone selection into a helper for reuse. Signed-off-by: Christoph Hellwig Reviewed-by: Robin Murphy --- kernel/dma/direct.c | 31 +-- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index f32b33cfa331..e78548397a92 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -69,6 +69,22 @@ u64 dma_direct_get_required_mask(struct device *dev) return (1ULL << (fls64(max_dma) - 1)) * 2 - 1; } +static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, + u64 *phys_mask) +{ + if (force_dma_unencrypted()) + *phys_mask = __dma_to_phys(dev, dma_mask); + else + *phys_mask = dma_to_phys(dev, dma_mask); + + /* GFP_DMA32 and GFP_DMA are no ops without the corresponding zones: */ + if (*phys_mask <= DMA_BIT_MASK(ARCH_ZONE_DMA_BITS)) + return GFP_DMA; + if (*phys_mask <= DMA_BIT_MASK(32)) + return GFP_DMA32; + return 0; +} + static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) { return phys_to_dma_direct(dev, phys) + size - 1 <= @@ -81,17 +97,13 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; int page_order = get_order(size); struct page *page = NULL; + u64 phys_mask; void *ret; /* we always manually zero the memory once we are done: */ gfp &= ~__GFP_ZERO; - - /* GFP_DMA32 and GFP_DMA are no ops without the corresponding zones: */ - if (dev->coherent_dma_mask <= DMA_BIT_MASK(ARCH_ZONE_DMA_BITS)) - gfp |= GFP_DMA; - if (dev->coherent_dma_mask <= DMA_BIT_MASK(32) && !(gfp & GFP_DMA)) - gfp |= GFP_DMA32; - + gfp |= __dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, + &phys_mask); again: /* CMA can be used only in the context which permits sleeping */ if (gfpflags_allow_blocking(gfp)) { @@ -110,15 +122,14 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, page = NULL; if 
(IS_ENABLED(CONFIG_ZONE_DMA32) && - dev->coherent_dma_mask < DMA_BIT_MASK(64) && + phys_mask < DMA_BIT_MASK(64) && !(gfp & (GFP_DMA32 | GFP_DMA))) { gfp |= GFP_DMA32; goto again; } if (IS_ENABLED(CONFIG_ZONE_DMA) && - dev->coherent_dma_mask < DMA_BIT_MASK(32) && - !(gfp & GFP_DMA)) { + phys_mask < DMA_BIT_MASK(32) && !(gfp & GFP_DMA)) { gfp = (gfp & ~GFP_DMA32) | GFP_DMA; goto again; } -- 2.19.0
[PATCH 2/5] dma-direct: add an explicit dma_direct_get_required_mask
This is somewhat modelled after the powerpc version, and differs from the legacy fallback in use fls64 instead of pointlessly splitting up the address into low and high dwords and in that it takes (__)phys_to_dma into account. Signed-off-by: Christoph Hellwig Acked-by: Benjamin Herrenschmidt Reviewed-by: Robin Murphy --- include/linux/dma-direct.h | 1 + kernel/dma/direct.c| 22 +++--- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index 86a59ba5a7f3..b79496d8c75b 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -55,6 +55,7 @@ static inline void dma_mark_clean(void *addr, size_t size) } #endif /* CONFIG_ARCH_HAS_DMA_MARK_CLEAN */ +u64 dma_direct_get_required_mask(struct device *dev); void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs); void dma_direct_free(struct device *dev, size_t size, void *cpu_addr, diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index c954f0a6dc62..f32b33cfa331 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -4,6 +4,7 @@ * * DMA operations that map physical memory directly without using an IOMMU. */ +#include /* for max_pfn */ #include #include #include @@ -53,11 +54,25 @@ check_addr(struct device *dev, dma_addr_t dma_addr, size_t size, return true; } +static inline dma_addr_t phys_to_dma_direct(struct device *dev, + phys_addr_t phys) +{ + if (force_dma_unencrypted()) + return __phys_to_dma(dev, phys); + return phys_to_dma(dev, phys); +} + +u64 dma_direct_get_required_mask(struct device *dev) +{ + u64 max_dma = phys_to_dma_direct(dev, (max_pfn - 1) << PAGE_SHIFT); + + return (1ULL << (fls64(max_dma) - 1)) * 2 - 1; +} + static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) { - dma_addr_t addr = force_dma_unencrypted() ? 
- __phys_to_dma(dev, phys) : phys_to_dma(dev, phys); - return addr + size - 1 <= dev->coherent_dma_mask; + return phys_to_dma_direct(dev, phys) + size - 1 <= + dev->coherent_dma_mask; } void *dma_direct_alloc_pages(struct device *dev, size_t size, @@ -296,6 +311,7 @@ const struct dma_map_ops dma_direct_ops = { .unmap_page = dma_direct_unmap_page, .unmap_sg = dma_direct_unmap_sg, #endif + .get_required_mask = dma_direct_get_required_mask, .dma_supported = dma_direct_supported, .mapping_error = dma_direct_mapping_error, .cache_sync = arch_dma_cache_sync, -- 2.19.0
drivers binding to device node with multiple compatible strings
Hi Rob and Grant, Various device tree specs are recommending to include all the potential compatible strings in the device node, with the order from most specific to most general. But it looks like the Linux kernel doesn't provide a way to bind the device to the most specific driver, however, the first registered compatible driver will be bound. As more and more generic drivers are added to the Linux kernel, they are competing with the more specific vendor drivers and cause problems when both are built into the kernel. I'm wondering if there is a generic solution (or one planned) to make the most specific driver bound to the device. Or do we have to disable the more general driver or remove the more general compatible string from the device tree? Regards, Leo
Re: [RFC PATCH 08/11] powerpc/tm: Do not reclaim on ptrace
Hi Mikey, On 09/18/2018 02:36 AM, Michael Neuling wrote: > On Wed, 2018-09-12 at 16:40 -0300, Breno Leitao wrote: >> Make sure that we are not suspended on ptrace and that the registers were >> already reclaimed. >> >> Since the data was already reclaimed, there is nothing to be done here >> except to restore the SPRs. >> >> Signed-off-by: Breno Leitao >> --- >> arch/powerpc/kernel/ptrace.c | 10 -- >> 1 file changed, 4 insertions(+), 6 deletions(-) >> >> diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c >> index 9667666eb18e..cf6ee9154b11 100644 >> --- a/arch/powerpc/kernel/ptrace.c >> +++ b/arch/powerpc/kernel/ptrace.c >> @@ -136,12 +136,10 @@ static void flush_tmregs_to_thread(struct task_struct >> *tsk) >> if ((!cpu_has_feature(CPU_FTR_TM)) || (tsk != current)) >> return; >> >> -if (MSR_TM_SUSPENDED(mfmsr())) { >> -tm_reclaim_current(TM_CAUSE_SIGNAL); >> -} else { >> -tm_enable(); >> -tm_save_sprs(&(tsk->thread)); >> -} >> +WARN_ON(MSR_TM_SUSPENDED(mfmsr())); >> + >> +tm_enable(); >> +tm_save_sprs(&(tsk->thread)); > > Do we need to check if TM was enabled in the task before saving the TM SPRs? > > What happens if TM was lazily off and hence we had someone else's TM SPRs in > the > CPU currently? Wouldn't this flush the wrong values to the task_struct? > > I think we need to check the processes MSR before doing this. Yes, it is a *very* good point, and I think we are vulnerable even before this patch (in the current kernel). Take a look above, we are calling tm_save_sprs() if MSR is not TM suspended independently if TM is lazily off. It shouldn't be hard to create a test case for it. I can try to call ptrace(PTRACE_GETVRREGS) on a task that sleeps until TM is lazily disabled, compare if the TM SPR changed in this mean time. (while doing HTM in parallel to keep HTM SPR changing). Let's see if I can read others task TM SPRs. Thank you.
Re: [RFC PATCH 05/11] powerpc/tm: Function that updates the failure code
Hi Mikey, On 09/17/2018 10:29 PM, Michael Neuling wrote: > On Wed, 2018-09-12 at 16:40 -0300, Breno Leitao wrote: >> Now the transaction reclaims happens very earlier in the trap handler, and >> it is impossible to know precisely, at that early time, what should be set >> as the failure cause for some specific cases, as, if the task will be >> rescheduled, thus, the transaction abort case should be updated from >> TM_CAUSE_MISC to TM_CAUSE_RESCHED, for example. > Please add comments to where this is used (in EXCEPTION_COMMON macro I think) > that say this might happen. Is it OK if I comment it in the TM_KERNEL_ENTRY macro, since the failure cause could be updated independently of the exception being executed, so every call to TM_KERNEL_ENTRY can have the cause overwritten? I.e. it does not matter if the exception is a system call or a page fault, the failure cause will be updated if there is a process reschedule after the exception/syscall is handled. Thank you
Re: [RFC PATCH 11/11] selftests/powerpc: Adapt the test
Hi Mikey, On 09/18/2018 03:36 AM, Michael Neuling wrote: > On Wed, 2018-09-12 at 16:40 -0300, Breno Leitao wrote: >> The Documentation/powerpc/transactional_memory.txt says: >> >> "Syscalls made from within a suspended transaction are performed as normal >> and the transaction is not explicitly doomed by the kernel. However, >> what the kernel does to perform the syscall may result in the transaction >> being doomed by the hardware." >> >> With this new TM mechanism, the syscall will continue to be executed if the >> syscall happens on a suspended syscall, but, the syscall will *fail* if the >> transaction is still active during the syscall invocation. > > Not sure I get this. This doesn't seem any different to before. > > An active (not suspended) transaction *will* result in the syscall failing and > the transaction being doomed. > > A syscall in a suspended transaction should succeed and the transaction. I understand that a transaction will always be doomed if there is a reclaim/checkpoint, thus, if the you make a system call inside a suspended transaction, it will reclaim and recheckpoint, thus, dooming the transaction, and failing it on the next RFID. So, the syscall executed in suspended mode may succeed, because it will not be skipped (as in active mode), but the transaction will *always* fail, because there was a reclaim and recheckpoint. > You might need to clean up the language. I try to use: > >Active == transactional but not suspended (ie MSR[TS] = T) >Suspended == suspended (ie MSR [TS] = S) >Doomed == transaction to be rolled back at next opportinity (ie tcheck > returns doomed) > > (note: the kernel MSR_TM_ACTIVE() macro is not consistent with this since it's > MSR[TS] == S or T). Ok, I agree with this language as well. I might want to improve the code to follow the same language all across the code. 
>> On the syscall path, if the transaction is active and not suspended, it >> will call TM_KERNEL_ENTRY which will reclaim and recheckpoint the >> transaction, thus, dooming the transaction on userspace return, with >> failure code TM_CAUSE_SYSCALL. > > But the test below is on a suspend transaction? Sorry, I meant "suspended transaction" above instead of "transaction is active and not suspended". > >> This new model will break part of this test, but I understand that that the >> documentation above didn't guarantee that the syscall would succeed, and it >> will never succeed anymore now on. > > The syscall should pass in suspend (modulo the normal syscall checks). The > transaction may fail as a result. Perfect, and if the transaction fail, the CPU will rollback the changes and restore the checkpoint registers (replacing the r3 that contains the pid value), thus, it will be like "getpid" system call didn't execute. For this test specifically, it assumes the syscall didn't execute if the transaction failed. Take a look: FUNC_START(getppid_tm_suspended) tbegin. beq 1f li r0, __NR_getppid tsuspend. sc tresume. tend. blr 1: li r3, -1 blr So, the tests thinks the syscall failed because the transaction aborted. Anyway, I can try to improve this test other than removing this test, but I am not sure how. Thank you
Re: [RFC PATCH 10/11] powerpc/tm: Set failure summary
Hi Mikey, On 09/18/2018 02:50 AM, Michael Neuling wrote: > On Wed, 2018-09-12 at 16:40 -0300, Breno Leitao wrote: >> Since the transaction will be doomed with treckpt., the TEXASR[FS] >> should be set, to reflect that the transaction is a failure. This patch >> ensures it before recheckpointing, and remove changes from other places >> that were calling recheckpoint. > > TEXASR[FS] should be set by the reclaim. Do you mean that the CPU should set TEXASR[FS] when treclaim is called, or, that the tm_reclaim? Looking at the ISA, I didn't see TEXASR[FS] being set automatically when a reclaim happens, although, I see it needs TEXASR[FS] to be set when executing a trecheckpoint, otherwise it will cause a TM Bad Thing. That is why I am forcing TEXASR[FS]=1 to doom the transaction so we can recheckpoint it, but it seems I understood this wrongly. > I don't know why you'd need to set this > explicitly in process.c. The only case is when the user supplies a bad signal > context, but we should check that in the signals code, not process.c > > Hence I think this patch is wrong. > > Also, according to the architecture, TEXASR[FS] HAS TO BE SET on trecheckpoint > otherwise you'll get a TM Bad Thing. You should say that rather than > suggesting > it's because the transaction is doomed. It's ilqlegal to not do it. That's > why we > have this check in arch/powerpc/kernel/tm.S. When you say "HAS TO BE SET", do you mean that the hardware will set it and we shouldn't care about this flag? Thus, if I am hitting EMIT_BUG_ENTRY, it means my TEXASR was messed somehow? Thank you
Re: [RFC PATCH 09/11] powerpc/tm: Do not restore default DSCR
Hi Mikey, On 09/18/2018 02:41 AM, Michael Neuling wrote: > On Wed, 2018-09-12 at 16:40 -0300, Breno Leitao wrote: >> In the previous TM code, trecheckpoint was being executed in the middle of >> an exception, thus, DSCR was being restored to default kernel DSCR value >> after trecheckpoint was done. >> >> With this current patchset, trecheckpoint is executed just before getting >> to userspace, at ret_from_except_lite, for example. Thus, we do not need to >> set default kernel DSCR value anymore, as we are leaving kernel space. It >> is OK to keep the checkpointed DSCR value into the live SPR, mainly because >> the transaction is doomed and it will fail soon (after RFID), > > What if we are going back to a suspended transaction? It will remain live > until > userspace does a tresume Hmm, I understand that once we get in kernel space, and call treclaim/trecheckpoint, the transaction will be doomed and it will abort and rollback when we leave kernel space. I.e., if we can treclaim/trecheckpointn in kernel space, the transaction will *always* abort just after RFID, so, a possible tresume will never be executed. Is my understanding wrong? > >> so, >> continuing with the pre-checkpointed DSCR value is what seems correct. > > Reading this description suggests this patch isn't really needed. Right? Maybe the description is not clear, but I understand this patch is required, otherwise we will leave userspace with a default DSCR value. By the way, do you know if there is a change in DSCR inside a transaction, will it be reverted if the transaction is aborted? Thank you
[PATCH 8/8] powerpc: convert to generic builtin command line
From: Daniel Walker This updates the powerpc code to use the CONFIG_GENERIC_CMDLINE option. [maksym.kok...@globallogic.com: add strlcat to prom_init_check.sh whitelist] Cc: Daniel Walker Cc: Daniel Walker Signed-off-by: Daniel Walker Signed-off-by: Maksym Kokhan --- arch/powerpc/Kconfig | 23 +-- arch/powerpc/kernel/prom.c | 4 arch/powerpc/kernel/prom_init.c| 8 arch/powerpc/kernel/prom_init_check.sh | 2 +- 4 files changed, 10 insertions(+), 27 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index a806692..f87906a 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -172,6 +172,7 @@ config PPC select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER select GENERIC_TIME_VSYSCALL + select GENERIC_CMDLINE select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_KGDB @@ -787,28 +788,6 @@ config PPC_DENORMALISATION Add support for handling denormalisation of single precision values. Useful for bare metal only. If unsure say Y here. -config CMDLINE_BOOL - bool "Default bootloader kernel arguments" - -config CMDLINE - string "Initial kernel command string" - depends on CMDLINE_BOOL - default "console=ttyS0,9600 console=tty0 root=/dev/sda2" - help - On some platforms, there is currently no way for the boot loader to - pass arguments to the kernel. For these platforms, you can supply - some command-line options at build time by entering them here. In - most cases you will need to specify the root device here. - -config CMDLINE_FORCE - bool "Always use the default kernel command string" - depends on CMDLINE_BOOL - help - Always use the default kernel command string, even if the boot - loader passes other arguments to the kernel. - This is useful if you cannot or don't want to change the - command-line options your boot loader passes to the kernel. 
- config EXTRA_TARGETS string "Additional default image types" help diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index c4d7078..f8c6e63 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -716,6 +717,9 @@ void __init early_init_devtree(void *params) */ of_scan_flat_dt(early_init_dt_scan_chosen_ppc, boot_command_line); + /* append and prepend any arguments built into the kernel. */ + cmdline_add_builtin(boot_command_line, NULL, COMMAND_LINE_SIZE); + /* Scan memory nodes and rebuild MEMBLOCKs */ of_scan_flat_dt(early_init_dt_scan_root, NULL); of_scan_flat_dt(early_init_dt_scan_memory_ppc, NULL); diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index 9b38a2e..e9214c6 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -634,11 +635,10 @@ static void __init early_cmdline_parse(void) p = prom_cmd_line; if ((long)prom.chosen > 0) l = prom_getprop(prom.chosen, "bootargs", p, COMMAND_LINE_SIZE-1); -#ifdef CONFIG_CMDLINE + if (l <= 0 || p[0] == '\0') /* dbl check */ - strlcpy(prom_cmd_line, - CONFIG_CMDLINE, sizeof(prom_cmd_line)); -#endif /* CONFIG_CMDLINE */ + cmdline_add_builtin(prom_cmd_line, NULL, sizeof(prom_cmd_line)); + prom_printf("command line: %s\n", prom_cmd_line); #ifdef CONFIG_PPC64 diff --git a/arch/powerpc/kernel/prom_init_check.sh b/arch/powerpc/kernel/prom_init_check.sh index acb6b92..b1d7ca1 100644 --- a/arch/powerpc/kernel/prom_init_check.sh +++ b/arch/powerpc/kernel/prom_init_check.sh @@ -18,7 +18,7 @@ WHITELIST="add_reloc_offset __bss_start __bss_stop copy_and_flush _end enter_prom memcpy memset reloc_offset __secondary_hold -__secondary_hold_acknowledge __secondary_hold_spinloop __start +__secondary_hold_acknowledge __secondary_hold_spinloop __start strlcat strcmp strcpy strlcpy strlen 
strncmp strstr kstrtobool logo_linux_clut224 reloc_got2 kernstart_addr memstart_addr linux_banner _stext __prom_init_toc_start __prom_init_toc_end btext_setup_display TOC." -- 2.7.4
[PATCH 1/8] add generic builtin command line
From: Daniel Walker This code allows architectures to use a generic builtin command line. The state of the builtin command line options across architecture is diverse. On x86 and mips they have pretty much the same code and the code prepends the builtin command line onto the boot loader provided one. On powerpc there is only a builtin override and nothing else. The code in this commit unifies the mips and x86 code into a generic header file under the CONFIG_GENERIC_CMDLINE option. When this option is enabled the architecture can call the cmdline_add_builtin() to add the builtin command line. [maksym.kok...@globallogic.com: fix cmdline_add_builtin() macro] Cc: Daniel Walker Cc: Daniel Walker Signed-off-by: Daniel Walker Signed-off-by: Maksym Kokhan --- include/linux/cmdline.h | 70 + init/Kconfig| 68 +++ 2 files changed, 138 insertions(+) create mode 100644 include/linux/cmdline.h diff --git a/include/linux/cmdline.h b/include/linux/cmdline.h new file mode 100644 index 000..75ef278 --- /dev/null +++ b/include/linux/cmdline.h @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_CMDLINE_H +#define _LINUX_CMDLINE_H + +/* + * + * Copyright (C) 2015. Cisco Systems, Inc. + * + * Generic Append/Prepend cmdline support. + */ + +#if defined(CONFIG_GENERIC_CMDLINE) && defined(CONFIG_CMDLINE_BOOL) + +#ifndef CONFIG_CMDLINE_OVERRIDE +/* + * This function will append or prepend a builtin command line to the command + * line provided by the bootloader. Kconfig options can be used to alter + * the behavior of this builtin command line. + * @dest: The destination of the final appended/prepended string + * @src: The starting string or NULL if there isn't one. + * @tmp: temporary space used for prepending + * @length: the maximum length of the strings above. 
+ */ +static inline void +_cmdline_add_builtin(char *dest, char *src, char *tmp, unsigned long length) +{ + if (src != dest && src != NULL) { + strlcpy(dest, " ", length); + strlcat(dest, src, length); + } + + strlcat(dest, " ", length); + + if (sizeof(CONFIG_CMDLINE_APPEND) > 1) + strlcat(dest, CONFIG_CMDLINE_APPEND, length); + + if (sizeof(CONFIG_CMDLINE_PREPEND) > 1) { + strlcpy(tmp, CONFIG_CMDLINE_PREPEND, length); + strlcat(tmp, " ", length); + strlcat(tmp, dest, length); + strlcpy(dest, tmp, length); + } +} + +#define cmdline_add_builtin(dest, src, length) \ +{ \ + if (sizeof(CONFIG_CMDLINE_PREPEND) > 1) { \ + static char cmdline_tmp_space[length] __initdata; \ + _cmdline_add_builtin(dest, src, cmdline_tmp_space, length); \ + } else {\ + _cmdline_add_builtin(dest, src, NULL, length); \ + } \ +} +#else +#define cmdline_add_builtin(dest, src, length)\ +{ \ + strlcpy(dest, CONFIG_CMDLINE_PREPEND " " CONFIG_CMDLINE_APPEND,\ + length); \ +} +#endif /* !CONFIG_CMDLINE_OVERRIDE */ + +#else +#define cmdline_add_builtin(dest, src, length) { \ + if (src != NULL) \ + strlcpy(dest, src, length);\ +} +#endif /* CONFIG_GENERIC_CMDLINE */ + + +#endif /* _LINUX_CMDLINE_H */ diff --git a/init/Kconfig b/init/Kconfig index 1e234e2..e5aa676 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1751,6 +1751,74 @@ config PROFILING config TRACEPOINTS bool +config GENERIC_CMDLINE + bool + +if GENERIC_CMDLINE + +config CMDLINE_BOOL + bool "Built-in kernel command line" + help + Allow for specifying boot arguments to the kernel at + build time. On some systems (e.g. embedded ones), it is + necessary or convenient to provide some or all of the + kernel boot arguments with the kernel itself (that is, + to not rely on the boot loader to provide them.) + + To compile command line arguments into the kernel, + set this option to 'Y', then fill in the + the boot arguments in CONFIG_CMDLINE. + + Systems with fully functional boot loaders (i.e. non-embedded) + should leave this option set to 'N'. 
+ +config CMDLINE_APPEND + string "Built-in kernel command string append" + depends on CMDLINE_BOOL + default "" + help + Enter arguments here that should be compiled into the kernel + image and
[PATCH 0/8] add generic builtin command line
There were series of patches [1] for 4.3.0-rc3, that allowed architectures to use a generic builtin command line. I have rebased these patches on kernel 4.19.0-rc4. Things, modified in comparison with original patches: * There was some bug for mips, in the case when CONFIG_CMDLINE_PREPEND and CONFIG_CMDLINE_APPEND are empty and CMDLINE_OVERRIDE is not set, command line from bootloader was ignored, so I fixed it, modifying patch "add generic builtin command line". * Implemented new patch to resolve conflict with new kernel, which modify EFI stub code. Unfortunately, I don't have capability to test this modification on real arm board with EFI. * Removed new realisation of mips builtin command line, which was created after 4.3.0-rc3. * Kernel 4.3.0-rc3 with original patches could not be compiled for powerpc due to prom_init.c checking by prom_init_check.sh. So I added strlcat (which is used by cmdline_add_builtin macro) to prom_init_check.sh whitelist. Patches have been tested in QEMU for x86, arm (little-endian), arm64 (little-endian), mips (little-endian, 32-bit) and powerpc (big-endian, 64-bit), everything works perfectly. Also it was tested on linux-next (next-20180924 tag) for all listed above architectures. 
[1] : https://lore.kernel.org/patchwork/patch/604992/ Daniel Walker (7): add generic builtin command line drivers: of: ifdef out cmdline section x86: convert to generic builtin command line arm: convert to generic builtin command line arm64: convert to generic builtin command line mips: convert to generic builtin command line powerpc: convert to generic builtin command line Maksym Kokhan (1): efi: modify EFI stub code for arm/arm64 arch/arm/Kconfig| 38 +- arch/arm/kernel/atags_parse.c | 14 ++- arch/arm/kernel/devtree.c | 2 + arch/arm64/Kconfig | 17 +--- arch/arm64/kernel/setup.c | 3 ++ arch/mips/Kconfig | 24 +-- arch/mips/Kconfig.debug | 47 -- arch/mips/kernel/setup.c| 41 ++- arch/powerpc/Kconfig| 23 +-- arch/powerpc/kernel/prom.c | 4 ++ arch/powerpc/kernel/prom_init.c | 8 ++-- arch/powerpc/kernel/prom_init_check.sh | 2 +- arch/x86/Kconfig| 44 + arch/x86/kernel/setup.c | 19 ++--- drivers/firmware/efi/libstub/arm-stub.c | 10 ++--- drivers/of/fdt.c| 2 +- include/linux/cmdline.h | 70 + init/Kconfig| 68 18 files changed, 173 insertions(+), 263 deletions(-) create mode 100644 include/linux/cmdline.h -- 2.7.4
Re: [RFC PATCH 06/11] powerpc/tm: Refactor the __switch_to_tm code
hi Mikey On 09/18/2018 01:04 AM, Michael Neuling wrote: >> On top of that, both tm_reclaim_task() and tm_recheckpoint_new_task() >> functions are not used anymore, removing them. > > What about tm_reclaim_current(). This is being used in places like signals > which I would have thought we could avoid with this series In fact tm_reclaim_current() is the default function to reclaim. It is the function that is being called by TM_KERNEL_ENTRY, other than that, it should never be called on the sane path. >> + * If we got here with an active transaction, then, it was >> + * aborted by TM_KERNEL_ENTRY and the fix the failure case >> + * needs to be fixed, so, indepedently how we arrived here, the >> + * new TM abort case will be TM_CAUSE_RESCHED now. > > What does "fix the failure case needs to be fixed" mean? > > also s/indepedently/independently/ In fact, I rewrote this paragraph. I try to say that the "TEXASR Failure code" needs to be updated/fixed, but it became that messy and crazy paragraph. :-( > >> + */ >> +if (MSR_TM_ACTIVE(prev->thread.regs->msr)) { >> +/* >> + * If there was an IRQ during trecheckpoint, it will >> + * cause an IRQ to be replayed. This replayed IRQ can >> + * invoke SCHEDULE_USER, thus, we arrive here with a TM >> + * active transaction. > > I don't think this can happen. trecheckpoint (and treclaim) are called with > IRQs > hard off (since they change r1). There will be an IRQ check and replay later. An IRQ being replayed can cause a process to be reschedule and arrive here after a trecheckpoint. > I think something else is going on here. I think this code and comment needs > to > go but I assume it's here because you are seeing something. Right, and it was my major concern about this whole review. The tm_recheckpoint() was being called with IRQ hard off, as you said, but the IRQ is being re-enabled later, after "trecheckpoint", when it calls local_irq_restore(). 
Since the IRQ was re-enabled, there is a mechanism to check for IRQs that were pending and execute them (after recheckpoint - hence with MSR[TS] active). Some of these pending IRQ might cause a task reschedule, bringing us to __switch_to with MSR[TS] active. I also checked that there were cases where a pending IRQ was waiting to be replayed even before tm_recheckpoint() is called. I suspect we were reaching tm_recheckpoint soft-disabled. On the v2 patchset, I am following your suggestion, which is calling restore_tm_state() much later (at the beginning of fast_exception_return), after the _TIF_USER_WORK_MASK section, and after the IRQ replay section also. So, after this point, there is no way to rollback the exit to userspace after trecheckpoint. Therefore, if there is a recheckpoint, a rfid will always come directly. In order to do so, I need to remove _TIF_RESTORE_TM as part of _TIF_USER_WORK_MASK and handle _TIF_RESTORE_TM individually later. This seems to solve this problem, and I can remove this reclaim in the middle of __switch_to_tm(). Thank you
Re: [RFC PATCH 01/11] powerpc/tm: Reclaim transaction on kernel entry
Hi Mikey, On 09/17/2018 10:31 PM, Michael Neuling wrote: > On Wed, 2018-09-12 at 16:40 -0300, Breno Leitao wrote: >> This patch creates a macro that will be invoked on all entrance to the >> kernel, so, in kernel space the transaction will be completely reclaimed >> and not suspended anymore. > > There are still some calls to tm_reclaim_current() in process.c. Should these > probably go now, right? Yes, we shouldn't not call tm_reclaim_current() from anywhere else other than TM_KERNEL_ENTRY anymore. That said, I think we can still have some references for tm_reclaim_current() in some specific checkpoints. Something as: if (WARN_ON(MSR_TM_SUSPENDED(mfmsr( tm_reclaim_current(0); I initially wrote something as BUG_ON(MSR_TM_SUSPENDED(mfmsr()) but scripts/checkpatch.pl suggested me writing a WARN_ON() and create a recovery pass, which seems fair. Anyway, if you think it is not a good strategy, I can get rid of them at v2. Thank you!
Re: [RFC PATCH 00/11] New TM Model
Hi Mikey, First of all, thanks for you detailed review. I really appreciate your comments here. On 09/17/2018 02:25 AM, Michael Neuling wrote: > On Wed, 2018-09-12 at 16:40 -0300, Breno Leitao wrote: >> This patchset for the hardware transactional memory (TM) subsystem aims to >> avoid spending a lot of time on TM suspended mode in kernel space. It >> basically >> changes where the reclaim/recheckpoint will be executed. >> >> Once a CPU enters in transactional state it uses a footprint area to track >> down the load/stores performed in transaction so it can be verified later >> to decide if a conflict happened due to some change done in that state. If >> a transaction is active in userspace and there is an exception that takes >> the CPU to the kernel space the CPU moves the transaction to suspended >> state but does not discard the footprint area. > > In this description, you should differente between memory and register > (GPR/VSX/SPR) footprints. Right, reading the ISA, I understand that footprint is a term for memory only and it represents the modified memory that was stored during a transactional state (that after tbegin). For registers, the ISA talks about checkpointed registers, which is the register state *before* a transaction starts. I.e, for register it is the previous state, and for memory, it is the current/live state. That said, if the transactional is aborted, the memory footprint is discarded and the checkpointed registers replaces the live registers. > In suspend, the CPU can disregard the memory footprint at any point, but it > has> to keep the register footprint. Yes! Anyway, I was just trying to describe how the hardware works, it is not related to the kernel at the paragraph above, but I will make sure I will re-write it better. 
>> This patchset aims to reclaim the checkpointed footprint as soon as the >> kernel is invoked, in the beginning of the exception handlers, thus freeing >> room to other CPUs enter in suspended mode, avoiding too many CPUs in >> suspended >> state that can cause the CPUs to stall. The same mechanism is done on kernel >> exit, doing a recheckpoint as late as possible (which will reload the >> checkpointed state into CPU's room) at the exception return. > > OK, but we are still potentially in suspend in userspace, so that doesn't help > us on the lockup issue. > > We need fake suspend in userspace to prevent lockups. Correct. I will make sure I document it. This patchset is the very first step to start creating a work around for the hardware limitations. >> The way to achieve this goal is creating a macro (TM_KERNEL_ENTRY) which >> will check if userspace was in an active transaction just after getting >> into kernel and reclaiming if that's the case. Thus all exception handlers >> will call this macro as soon as possible. >> >> All exceptions should reclaim (if necessary) at this stage and only >> recheckpoint if the task is tagged as TIF_RESTORE_TM (i.e. was in >> transactional state before being interrupted), which will be done at >> ret_from_except_lite(). >> >> Ideally all reclaims will happen at the exception entrance, however during >> the recheckpoint process another exception can hit the CPU which might >> cause the current thread to be rescheduled, thus there is another reclaim >> point to be considered at __switch_to(). > > Can we do the recheckpoint() later so that it's when we have interrupts off > and > can't be rescheduled? Yes! After thinking on it for a long time, this is definitely what should be done. I will send a v2 with this change (and others being discussed here) Thank you, Breno
Re: [PATCH v3 1/6] dt-bindings: arm64: add compatible for LX2160A
On Mon, 24 Sep 2018 05:38:56 +0530, Vabhav Sharma wrote: > Add compatible for LX2160A SoC,QDS and RDB board > > Signed-off-by: Vabhav Sharma > --- > Documentation/devicetree/bindings/arm/fsl.txt | 12 > 1 file changed, 12 insertions(+) > Reviewed-by: Rob Herring
Re: [PATCH 1/2] soc: fsl: qbman: qman_portal: defer probing when qman is not available
On Wed, Sep 26, 2018 at 1:15 PM Li Yang wrote: > > On Wed, Sep 26, 2018 at 4:28 AM Alexandre Belloni > wrote: > > > > On 25/09/2018 21:45:56+0200, Olof Johansson wrote: > > > Hi, > > > > > > > > > On Thu, Aug 23, 2018 at 11:36 PM Alexandre Belloni > > > wrote: > > > > > > > > If the qman driver (qman_ccsr) doesn't probe or fail to probe before > > > > qman_portal, qm_ccsr_start will be either NULL or a stale pointer to an > > > > unmapped page. > > > > > > > > This leads to a crash when probing qman_portal as the init_pcfg > > > > function > > > > calls qman_liodn_fixup that tries to read qman registers. > > > > > > > > Assume that qman didn't probe when the pool mask is 0. > > > > > > > > Signed-off-by: Alexandre Belloni > > > > --- > > > > drivers/soc/fsl/qbman/qman_portal.c | 2 ++ > > > > 1 file changed, 2 insertions(+) > > > > > > > > diff --git a/drivers/soc/fsl/qbman/qman_portal.c > > > > b/drivers/soc/fsl/qbman/qman_portal.c > > > > index a120002b630e..4fc80d2c8feb 100644 > > > > --- a/drivers/soc/fsl/qbman/qman_portal.c > > > > +++ b/drivers/soc/fsl/qbman/qman_portal.c > > > > @@ -277,6 +277,8 @@ static int qman_portal_probe(struct platform_device > > > > *pdev) > > > > } > > > > > > > > pcfg->pools = qm_get_pools_sdqcr(); > > > > + if (pcfg->pools == 0) > > > > + return -EPROBE_DEFER; > > > > > > This is quite late in the probe, after a bunch of resources have been > > > claimed. > > > > > > Note that the ioremaps above this are doing unwinds, and you'll end up > > > doing duplicate ioremaps if you come in and probe again. > > > > > > You should probably unwind those allocations, or move them to devm_* > > > or do this check earlier in the function. > > > > > > > The actual chance of having that happen is quite small (this was coming > > from a non working DT) and I mainly wanted to avoid a crash so the > > platform could still boot. I would think moving to devm_ would be the > > right thing to do. 
> > Even if it is not failing with the upstreamed device trees, it is > still good to harden the driver for possible issues. Moving to devm_ > is definitely a right thing to do. But I also think checking if the > qman is already probed should be the first thing to do before starting > to allocate resources and etc and rolling back later. Probably we can > move the qm_get_pools_sdqcr() to the begining of the probe to > determine if qman is probed as it doesn't seem to depend on any of the > setups done right now. I just find out Laurentiu also included the following patches in his SMMU patch series (although not neccessarily related to SMMU) which also fix the same problem. I think they are more straightforward and can deal with the case that qman failed to probe. So we can take these to fix this problem instead in 4.19. https://patchwork.kernel.org/patch/10616021/ https://patchwork.kernel.org/patch/10616019/ https://patchwork.kernel.org/patch/10615971/ Regards, Leo
Re: [PATCH 0/8] add generic builtin command line
On 09/27/2018 10:05 AM, Ard Biesheuvel wrote: On 27 September 2018 at 18:55, Maksym Kokhan wrote: There were series of patches [1] for 4.3.0-rc3, that allowed architectures to use a generic builtin command line. I have rebased these patches on kernel 4.19.0-rc4. Could you please elaborate on the purpose of this series? Is it simply to align between architectures? Does it solve an actual problem? 1) It removed a lot of code duplication between architecture 2) At Cisco we have issues where our bootloaders having default boot arguments. Some platforms we can't update the boot loader and it's helpful to be able to have boot arguments which are prepended to the bootloader arguments , and some parameters which are appended. These changes allow that. Daniel
Re: [PATCH] powerpc/rtas: Fix a potential race between CPU-Offline & Migration
On 09/27/2018 11:51 AM, Gautham R. Shenoy wrote: > From: "Gautham R. Shenoy" > > Live Partition Migrations require all the present CPUs to execute the > H_JOIN call, and hence rtas_ibm_suspend_me() onlines any offline CPUs > before initiating the migration for this purpose. > > The commit 85a88cabad57 > ("powerpc/pseries: Disable CPU hotplug across migrations") > disables any CPU-hotplug operations once all the offline CPUs are > brought online to prevent any further state change. Once the > CPU-Hotplug operation is disabled, the code assumes that all the CPUs > are online. > > However, there is a minor window in rtas_ibm_suspend_me() between > onlining the offline CPUs and disabling CPU-Hotplug when a concurrent > CPU-offline operations initiated by the userspace can succeed thereby > nullifying the the aformentioned assumption. In this unlikely case > these offlined CPUs will not call H_JOIN, resulting in a system hang. > > Fix this by verifying that all the present CPUs are actually online > after CPU-Hotplug has been disabled, failing which we return from > rtas_ibm_suspend_me() with -EBUSY. Would we also want to havr the ability to re-try onlining all of the cpus before failing the migration? This would involve a bigger code change as the current code to online all CPUs would work in its current form. -Nathan > > Cc: Nathan Fontenot > Cc: Tyrel Datwyler > Suggested-by: Michael Ellerman > Signed-off-by: Gautham R. 
Shenoy > --- > arch/powerpc/kernel/rtas.c | 10 ++ > 1 file changed, 10 insertions(+) > > diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c > index 2c7ed31..27f6fd3 100644 > --- a/arch/powerpc/kernel/rtas.c > +++ b/arch/powerpc/kernel/rtas.c > @@ -982,6 +982,16 @@ int rtas_ibm_suspend_me(u64 handle) > } > > cpu_hotplug_disable(); > + > + /* Check if we raced with a CPU-Offline Operation */ > + if (unlikely(!cpumask_equal(cpu_present_mask, cpu_online_mask))) { > + pr_err("%s: Raced against a concurrent CPU-Offline\n", > +__func__); > + atomic_set(&data.error, -EBUSY); > + cpu_hotplug_enable(); > + goto out; > + } > + > stop_topology_update(); > > /* Call function on all CPUs. One of us will make the >
[PATCH] kdb: print real address of pointers instead of hashed addresses
Since commit ad67b74d2469 ("printk: hash addresses printed with %p"), all pointers printed with %p are printed with hashed addresses instead of real addresses in order to avoid leaking addresses in dmesg and syslog. But this applies to kdb too, with is unfortunate: Entering kdb (current=0x(ptrval), pid 329) due to Keyboard Entry kdb> ps 15 sleeping system daemon (state M) processes suppressed, use 'ps A' to see all. Task Addr Pid Parent [*] cpu State Thread Command 0x(ptrval) 329 328 10 R 0x(ptrval) *sh 0x(ptrval)10 00 S 0x(ptrval) init 0x(ptrval)32 00 D 0x(ptrval) rcu_gp 0x(ptrval)42 00 D 0x(ptrval) rcu_par_gp 0x(ptrval)52 00 D 0x(ptrval) kworker/0:0 0x(ptrval)62 00 D 0x(ptrval) kworker/0:0H 0x(ptrval)72 00 D 0x(ptrval) kworker/u2:0 0x(ptrval)82 00 D 0x(ptrval) mm_percpu_wq 0x(ptrval) 102 00 D 0x(ptrval) rcu_preempt The whole purpose of kdb is to debug, and for debugging real addresses need to be known. In addition, data displayed by kdb doesn't go into dmesg. This patch replaces all %p by %px in kdb in order to display real addresses. Fixes: ad67b74d2469 ("printk: hash addresses printed with %p") Cc: Signed-off-by: Christophe Leroy --- kernel/debug/kdb/kdb_main.c| 14 +++--- kernel/debug/kdb/kdb_support.c | 12 ++-- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 2ddfce8f1e8f..f338d23b112b 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1192,7 +1192,7 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, if (reason == KDB_REASON_DEBUG) { /* special case below */ } else { - kdb_printf("\nEntering kdb (current=0x%p, pid %d) ", + kdb_printf("\nEntering kdb (current=0x%px, pid %d) ", kdb_current, kdb_current ? 
kdb_current->pid : 0); #if defined(CONFIG_SMP) kdb_printf("on processor %d ", raw_smp_processor_id()); @@ -1208,7 +1208,7 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, */ switch (db_result) { case KDB_DB_BPT: - kdb_printf("\nEntering kdb (0x%p, pid %d) ", + kdb_printf("\nEntering kdb (0x%px, pid %d) ", kdb_current, kdb_current->pid); #if defined(CONFIG_SMP) kdb_printf("on processor %d ", raw_smp_processor_id()); @@ -2048,7 +2048,7 @@ static int kdb_lsmod(int argc, const char **argv) if (mod->state == MODULE_STATE_UNFORMED) continue; - kdb_printf("%-20s%8u 0x%p ", mod->name, + kdb_printf("%-20s%8u 0x%px ", mod->name, mod->core_layout.size, (void *)mod); #ifdef CONFIG_MODULE_UNLOAD kdb_printf("%4d ", module_refcount(mod)); @@ -2059,7 +2059,7 @@ static int kdb_lsmod(int argc, const char **argv) kdb_printf(" (Loading)"); else kdb_printf(" (Live)"); - kdb_printf(" 0x%p", mod->core_layout.base); + kdb_printf(" 0x%px", mod->core_layout.base); #ifdef CONFIG_MODULE_UNLOAD { @@ -2341,7 +2341,7 @@ void kdb_ps1(const struct task_struct *p) return; cpu = kdb_process_cpu(p); - kdb_printf("0x%p %8d %8d %d %4d %c 0x%p %c%s\n", + kdb_printf("0x%px %8d %8d %d %4d %c 0x%px %c%s\n", (void *)p, p->pid, p->parent->pid, kdb_task_has_cpu(p), kdb_process_cpu(p), kdb_task_state_char(p), @@ -2354,7 +2354,7 @@ void kdb_ps1(const struct task_struct *p) } else { if (KDB_TSK(cpu) != p) kdb_printf(" Error: does not match running " - "process table (0x%p)\n", KDB_TSK(cpu)); + "process table (0x%px)\n", KDB_TSK(cpu)); } } } @@ -2692,7 +2692,7 @@ int kdb_register_flags(char *cmd, for_each_kdbcmd(kp, i) { if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) { kdb_printf("Duplicate kdb command registered: " - "%s, func %p help %s\n", cmd, func, help); + "%s, func %px help %s\n", cmd, func, help); return 1; } } diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index 990b3cc526c8..987eb73284d2 100644 --- a/kernel/debug/kdb/kdb_support.c +++ 
b/kernel/d
[PATCH v2] kdb: use correct pointer when 'btc' calls 'btt'
On a powerpc 8xx, 'btc' fails as follows: Entering kdb (current=0x(ptrval), pid 282) due to Keyboard Entry kdb> btc btc: cpu status: Currently on cpu 0 Available cpus: 0 kdb_getarea: Bad address 0x0 when booting the kernel with 'debug_boot_weak_hash', it fails as well Entering kdb (current=0xba99ad80, pid 284) due to Keyboard Entry kdb> btc btc: cpu status: Currently on cpu 0 Available cpus: 0 kdb_getarea: Bad address 0xba99ad80 On other platforms, Oopses have been observed too, see https://github.com/linuxppc/linux/issues/139 This is due to btc calling 'btt' with %p pointer as an argument. This patch replaces %p by %px to get the real pointer value as expected by 'btt' Fixes: ad67b74d2469 ("printk: hash addresses printed with %p") Cc: Reviewed-by: Daniel Thompson Signed-off-by: Christophe Leroy --- v2: Added fixes: and Daniel's reviewed-by: in commit's log. No code change. kernel/debug/kdb/kdb_bt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index 6ad4a9fcbd6f..7921ae4fca8d 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -179,14 +179,14 @@ kdb_bt(int argc, const char **argv) kdb_printf("no process for cpu %ld\n", cpu); return 0; } - sprintf(buf, "btt 0x%p\n", KDB_TSK(cpu)); + sprintf(buf, "btt 0x%px\n", KDB_TSK(cpu)); kdb_parse(buf); return 0; } kdb_printf("btc: cpu status: "); kdb_parse("cpu\n"); for_each_online_cpu(cpu) { - sprintf(buf, "btt 0x%p\n", KDB_TSK(cpu)); + sprintf(buf, "btt 0x%px\n", KDB_TSK(cpu)); kdb_parse(buf); touch_nmi_watchdog(); } -- 2.13.3
Re: [PATCH 0/8] add generic builtin command line
On 27 September 2018 at 18:55, Maksym Kokhan wrote: > There were series of patches [1] for 4.3.0-rc3, that allowed > architectures to use a generic builtin command line. I have rebased > these patches on kernel 4.19.0-rc4. > Could you please elaborate on the purpose of this series? Is it simply to align between architectures? Does it solve an actual problem? > Things, modified in comparison with original patches: > * There was some bug for mips, in the case when CONFIG_CMDLINE_PREPEND > and CONFIG_CMDLINE_APPEND are empty and CMDLINE_OVERRIDE is not set, > command line from bootloader was ignored, so I fixed it, modifying > patch "add generic builtin command line". > > * Implemented new patch to resolve conflict with new kernel, which > modify EFI stub code. Unfortunately, I don't have capability to test > this modification on real arm board with EFI. > > * Removed new realisation of mips builtin command line, which was > created after 4.3.0-rc3. > > * Kernel 4.3.0-rc3 with original patches could not be compiled for > powerpc due to prom_init.c checking by prom_init_check.sh. So I added > strlcat (which is used by cmdline_add_builtin macro) to > prom_init_check.sh whitelist. > > Patches have been tested in QEMU for x86, arm (little-endian), arm64 > (little-endian), mips (little-endian, 32-bit) and powerpc > (big-endian, 64-bit), everything works perfectly. Also it was tested > on linux-next (next-20180924 tag) for all listed above architectures. 
> > [1] : https://lore.kernel.org/patchwork/patch/604992/ > > Daniel Walker (7): > add generic builtin command line > drivers: of: ifdef out cmdline section > x86: convert to generic builtin command line > arm: convert to generic builtin command line > arm64: convert to generic builtin command line > mips: convert to generic builtin command line > powerpc: convert to generic builtin command line > > Maksym Kokhan (1): > efi: modify EFI stub code for arm/arm64 > > arch/arm/Kconfig| 38 +- > arch/arm/kernel/atags_parse.c | 14 ++- > arch/arm/kernel/devtree.c | 2 + > arch/arm64/Kconfig | 17 +--- > arch/arm64/kernel/setup.c | 3 ++ > arch/mips/Kconfig | 24 +-- > arch/mips/Kconfig.debug | 47 -- > arch/mips/kernel/setup.c| 41 ++- > arch/powerpc/Kconfig| 23 +-- > arch/powerpc/kernel/prom.c | 4 ++ > arch/powerpc/kernel/prom_init.c | 8 ++-- > arch/powerpc/kernel/prom_init_check.sh | 2 +- > arch/x86/Kconfig| 44 + > arch/x86/kernel/setup.c | 19 ++--- > drivers/firmware/efi/libstub/arm-stub.c | 10 ++--- > drivers/of/fdt.c| 2 +- > include/linux/cmdline.h | 70 > + > init/Kconfig| 68 > 18 files changed, 173 insertions(+), 263 deletions(-) > create mode 100644 include/linux/cmdline.h > > -- > 2.7.4 >
[PATCH] powerpc/rtas: Fix a potential race between CPU-Offline & Migration
From: "Gautham R. Shenoy" Live Partition Migrations require all the present CPUs to execute the H_JOIN call, and hence rtas_ibm_suspend_me() onlines any offline CPUs before initiating the migration for this purpose. The commit 85a88cabad57 ("powerpc/pseries: Disable CPU hotplug across migrations") disables any CPU-hotplug operations once all the offline CPUs are brought online to prevent any further state change. Once the CPU-Hotplug operation is disabled, the code assumes that all the CPUs are online. However, there is a minor window in rtas_ibm_suspend_me() between onlining the offline CPUs and disabling CPU-Hotplug when a concurrent CPU-offline operation initiated by the userspace can succeed thereby nullifying the aforementioned assumption. In this unlikely case these offlined CPUs will not call H_JOIN, resulting in a system hang. Fix this by verifying that all the present CPUs are actually online after CPU-Hotplug has been disabled, failing which we return from rtas_ibm_suspend_me() with -EBUSY. Cc: Nathan Fontenot Cc: Tyrel Datwyler Suggested-by: Michael Ellerman Signed-off-by: Gautham R. Shenoy --- arch/powerpc/kernel/rtas.c | 10 ++ 1 file changed, 10 insertions(+) diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index 2c7ed31..27f6fd3 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -982,6 +982,16 @@ int rtas_ibm_suspend_me(u64 handle) } cpu_hotplug_disable(); + + /* Check if we raced with a CPU-Offline Operation */ + if (unlikely(!cpumask_equal(cpu_present_mask, cpu_online_mask))) { + pr_err("%s: Raced against a concurrent CPU-Offline\n", + __func__); + atomic_set(&data.error, -EBUSY); + cpu_hotplug_enable(); + goto out; + } + stop_topology_update(); /* Call function on all CPUs. One of us will make the -- 1.9.4
Re: [PATCH 4/5] dma-direct: implement complete bus_dma_mask handling
On 27/09/18 17:27, Christoph Hellwig wrote: On Thu, Sep 27, 2018 at 05:14:56PM +0100, Robin Murphy wrote: This just seemed more readable to me than min_not_zero, but if others prefer min_not_zero I can switch. Nah, just checking whether there were any intentionally different assumptions compared to the couple of other places in the patch where min_not_zero() *is* used. If it's purely a style thing then no worries (personally I'd have written it yet another way anyway). I'm curious: how would you have written it? Come to think of it, I actually already have, in iommu-dma: if (dev->bus_dma_mask) mask &= dev->bus_dma_mask; but of course it's not so pretty for those cases where you don't already have the local variable ready to go. Robin.
Re: [PATCH 4/5] dma-direct: implement complete bus_dma_mask handling
On Thu, Sep 27, 2018 at 05:14:56PM +0100, Robin Murphy wrote: >> This just seemed more readable to me than min_not_zero, but if others >> prefer min_not_zero I can switch. > > Nah, just checking whether there were any intentionally different > assumptions compared to the couple of other places in the patch where > min_not_zero() *is* used. If it's purely a style thing then no worries > (personally I'd have written it yet another way anyway). I'm curious: how would you have written it?
Re: [PATCH 4/5] dma-direct: implement complete bus_dma_mask handling
On 27/09/18 16:32, Christoph Hellwig wrote: On Thu, Sep 27, 2018 at 03:58:04PM +0100, Robin Murphy wrote: } #endif /* !CONFIG_ARCH_HAS_PHYS_TO_DMA */ diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 3c404e33d946..64466b7ef67b 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -43,10 +43,11 @@ check_addr(struct device *dev, dma_addr_t dma_addr, size_t size, return false; } -if (*dev->dma_mask >= DMA_BIT_MASK(32)) { + if (*dev->dma_mask >= DMA_BIT_MASK(32) || dev->bus_dma_mask) { Hmm... say *dev->dma_mask is 31 bits and dev->bus_dma_mask is 40 bits due to a global DT property, we'll now scream where we didn't before even though the bus mask is almost certainly irrelevant - is that desirable? This is just the reporting in the error case - we'll only hit this IFF dma_capable already returned false. But if you don't want a message here we can probably drop this hunk. It was only a question of consistency - whatever the reason was to avoid warning for small device masks before, it's not obvious why it wouldn't still apply in the presence of nonzero but larger bus mask. The fact that the current check is there at all implied to me that we're expecting less-capable device to hit this often and thus wanted to avoid being noisy. If that's not the case then it's fine as-is AFAIC. @@ -65,12 +66,18 @@ u64 dma_direct_get_required_mask(struct device *dev) { u64 max_dma = phys_to_dma_direct(dev, (max_pfn - 1) << PAGE_SHIFT); +if (dev->bus_dma_mask && dev->bus_dma_mask < max_dma) + max_dma = dev->bus_dma_mask; Again, I think we could just do another min_not_zero() here. A device wired to address only one single page of RAM isn't a realistic prospect (and we could just flip the -1 and the shift in the max_dma calculation if we *really* wanted to support such things). This just seemed more readable to me than min_not_zero, but if others prefer min_not_zero I can switch. 
Nah, just checking whether there were any intentionally different assumptions compared to the couple of other places in the patch where min_not_zero() *is* used. If it's purely a style thing then no worries (personally I'd have written it yet another way anyway). Robin.
Re: [PATCH 3/5] dma-direct: refine dma_direct_alloc zone selection
On Thu, Sep 27, 2018 at 04:38:31PM +0100, Robin Murphy wrote: > On 27/09/18 16:30, Christoph Hellwig wrote: >> On Thu, Sep 27, 2018 at 03:30:20PM +0100, Robin Murphy wrote: +static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, + u64 *phys_mask) +{ + if (force_dma_unencrypted()) + *phys_mask = __dma_to_phys(dev, dma_mask); + else + *phys_mask = dma_to_phys(dev, dma_mask); >>> >>> Maybe make phys_to_dma_direct() take u64 instead of phys_addr_t so we can >>> reuse it here? >> >> This is a dma_to_phys and not a phys_to_dma. > > Ugh, clearly it's time to stop reviewing patches for today... sorry :( I actually made the same mistake when writing it.. Although I'd really like to see some feedback from you on the arm64 swiotlb series once you had more coffee ;-)
Re: [PATCH 3/5] dma-direct: refine dma_direct_alloc zone selection
On 27/09/18 16:30, Christoph Hellwig wrote: On Thu, Sep 27, 2018 at 03:30:20PM +0100, Robin Murphy wrote: +static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, + u64 *phys_mask) +{ + if (force_dma_unencrypted()) + *phys_mask = __dma_to_phys(dev, dma_mask); + else + *phys_mask = dma_to_phys(dev, dma_mask); Maybe make phys_to_dma_direct() take u64 instead of phys_addr_t so we can reuse it here? This is a dma_to_phys and not a phys_to_dma. Ugh, clearly it's time to stop reviewing patches for today... sorry :( Robin.
Re: [PATCH 2/5] dma-direct: add an explicit dma_direct_get_required_mask
On 27/09/18 16:28, Christoph Hellwig wrote: On Thu, Sep 27, 2018 at 03:12:25PM +0100, Robin Murphy wrote: +u64 dma_direct_get_required_mask(struct device *dev) +{ + u64 max_dma = phys_to_dma_direct(dev, (max_pfn - 1) << PAGE_SHIFT); + + return (1ULL << (fls64(max_dma) - 1)) * 2 - 1; I think that may as well just use __fls64() - it seems reasonable to assume max_dma > 0. Otherwise, Is there any good reason to micro-optimize given that this isn't a fast path? Not at all, I wasn't even thinking in terms of optimisation other than in terms of number of source characters and levels of parentheses. But more importantly I was also being a big idiot because no matter how much I have the fls()/__fls() thing in mind, __fls64() doesn't actually exist. Nitpick rescinded! Robin.
Re: [PATCH 4/5] dma-direct: implement complete bus_dma_mask handling
On Thu, Sep 27, 2018 at 03:58:04PM +0100, Robin Murphy wrote: >> } >> #endif /* !CONFIG_ARCH_HAS_PHYS_TO_DMA */ >> diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c >> index 3c404e33d946..64466b7ef67b 100644 >> --- a/kernel/dma/direct.c >> +++ b/kernel/dma/direct.c >> @@ -43,10 +43,11 @@ check_addr(struct device *dev, dma_addr_t dma_addr, >> size_t size, >> return false; >> } >> - if (*dev->dma_mask >= DMA_BIT_MASK(32)) { >> +if (*dev->dma_mask >= DMA_BIT_MASK(32) || dev->bus_dma_mask) { > > Hmm... say *dev->dma_mask is 31 bits and dev->bus_dma_mask is 40 bits due > to a global DT property, we'll now scream where we didn't before even > though the bus mask is almost certainly irrelevant - is that desirable? This is just the reporting in the error case - we'll only hit this IFF dma_capable already returned false. But if you don't want a message here we can probably drop this hunk. >> @@ -65,12 +66,18 @@ u64 dma_direct_get_required_mask(struct device *dev) >> { >> u64 max_dma = phys_to_dma_direct(dev, (max_pfn - 1) << PAGE_SHIFT); >> + if (dev->bus_dma_mask && dev->bus_dma_mask < max_dma) >> +max_dma = dev->bus_dma_mask; > > Again, I think we could just do another min_not_zero() here. A device wired > to address only one single page of RAM isn't a realistic prospect (and we > could just flip the -1 and the shift in the max_dma calculation if we > *really* wanted to support such things). This just seemed more readable to me than min_not_zero, but if others prefer min_not_zero I can switch.
Re: [PATCH 3/5] dma-direct: refine dma_direct_alloc zone selection
On Thu, Sep 27, 2018 at 03:30:20PM +0100, Robin Murphy wrote: >> +static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 >> dma_mask, >> +u64 *phys_mask) >> +{ >> +if (force_dma_unencrypted()) >> +*phys_mask = __dma_to_phys(dev, dma_mask); >> +else >> +*phys_mask = dma_to_phys(dev, dma_mask); > > Maybe make phys_to_dma_direct() take u64 instead of phys_addr_t so we can > reuse it here? This is a dma_to_phys and not a phys_to_dma.
Re: [PATCH 2/5] dma-direct: add an explicit dma_direct_get_required_mask
On Thu, Sep 27, 2018 at 03:12:25PM +0100, Robin Murphy wrote: >> +u64 dma_direct_get_required_mask(struct device *dev) >> +{ >> +u64 max_dma = phys_to_dma_direct(dev, (max_pfn - 1) << PAGE_SHIFT); >> + >> +return (1ULL << (fls64(max_dma) - 1)) * 2 - 1; > > I think that may as well just use __fls64() - it seems reasonable to assume > max_dma > 0. Otherwise, Is there any good reason to micro-optimize given that this isn't a fast path?
Re: [PATCH 5/5] dma-direct: always allow dma mask <= physical memory size
[ oops, I should have looked at the replies first, now I see Ben already had the same thing to say about #3... ] On 27/09/18 14:49, Christoph Hellwig wrote: On Thu, Sep 27, 2018 at 11:50:14AM +1000, Benjamin Herrenschmidt wrote: -* to be able to satisfy them - either by not supporting more physical -* memory, or by providing a ZONE_DMA32. If neither is the case, the -* architecture needs to use an IOMMU instead of the direct mapping. -*/ - if (mask < phys_to_dma(dev, DMA_BIT_MASK(32))) + u64 min_mask; + + if (IS_ENABLED(CONFIG_ZONE_DMA)) + min_mask = DMA_BIT_MASK(ARCH_ZONE_DMA_BITS); + else + min_mask = min_t(u64, DMA_BIT_MASK(32), +(max_pfn - 1) << PAGE_SHIFT); + + if (mask >= phys_to_dma(dev, min_mask)) return 0; nitpick ... to be completely "correct", I would have written if (IS_ENABLED(CONFIG_ZONE_DMA)) min_mask = DMA_BIT_MASK(ARCH_ZONE_DMA_BITS); else min_mask = DMA_BIT_MASK(32); min_mask = min_t(u64, min_mask, (max_pfn - 1) << PAGE_SHIFT); In "theory" it's also ok to have a mask < ZONE_DMA_BITS as long as it's big enough to fit all memory :-) Yeah, we could do that. FWIW I like it even if just for looking slightly more readable. With that fixup, Reviewed-by: Robin Murphy
Re: [PATCH 4/5] dma-direct: implement complete bus_dma_mask handling
On 20/09/18 19:52, Christoph Hellwig wrote: Instead of rejecting devices with a too small bus_dma_mask we can handle by taking the bus dma_mask into account for allocations and bounce buffering decisions. Signed-off-by: Christoph Hellwig --- include/linux/dma-direct.h | 3 ++- kernel/dma/direct.c| 21 +++-- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index b79496d8c75b..fbca184ff5a0 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -27,7 +27,8 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) if (!dev->dma_mask) return false; - return addr + size - 1 <= *dev->dma_mask; + return addr + size - 1 <= + min_not_zero(*dev->dma_mask, dev->bus_dma_mask); } #endif /* !CONFIG_ARCH_HAS_PHYS_TO_DMA */ diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 3c404e33d946..64466b7ef67b 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -43,10 +43,11 @@ check_addr(struct device *dev, dma_addr_t dma_addr, size_t size, return false; } - if (*dev->dma_mask >= DMA_BIT_MASK(32)) { + if (*dev->dma_mask >= DMA_BIT_MASK(32) || dev->bus_dma_mask) { Hmm... say *dev->dma_mask is 31 bits and dev->bus_dma_mask is 40 bits due to a global DT property, we'll now scream where we didn't before even though the bus mask is almost certainly irrelevant - is that desirable? dev_err(dev, - "%s: overflow %pad+%zu of device mask %llx\n", - caller, &dma_addr, size, *dev->dma_mask); + "%s: overflow %pad+%zu of device mask %llx bus mask %llx\n", + caller, &dma_addr, size, + *dev->dma_mask, dev->bus_dma_mask); } return false; } @@ -65,12 +66,18 @@ u64 dma_direct_get_required_mask(struct device *dev) { u64 max_dma = phys_to_dma_direct(dev, (max_pfn - 1) << PAGE_SHIFT); + if (dev->bus_dma_mask && dev->bus_dma_mask < max_dma) + max_dma = dev->bus_dma_mask; Again, I think we could just do another min_not_zero() here. 
A device wired to address only one single page of RAM isn't a realistic prospect (and we could just flip the -1 and the shift in the max_dma calculation if we *really* wanted to support such things). + return (1ULL << (fls64(max_dma) - 1)) * 2 - 1; } static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, u64 *phys_mask) { + if (dev->bus_dma_mask && dev->bus_dma_mask < dma_mask) + dma_mask = dev->bus_dma_mask; + Similarly, can't we assume dma_mask to be nonzero here too? It feels like we really shouldn't have managed to get this far without one. Robin. if (force_dma_unencrypted()) *phys_mask = __dma_to_phys(dev, dma_mask); else @@ -87,7 +94,7 @@ static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) { return phys_to_dma_direct(dev, phys) + size - 1 <= - dev->coherent_dma_mask; + min_not_zero(dev->coherent_dma_mask, dev->bus_dma_mask); } void *dma_direct_alloc_pages(struct device *dev, size_t size, @@ -291,12 +298,6 @@ int dma_direct_supported(struct device *dev, u64 mask) if (mask < phys_to_dma(dev, DMA_BIT_MASK(32))) return 0; #endif - /* -* Upstream PCI/PCIe bridges or SoC interconnects may not carry -* as many DMA address bits as the device itself supports. -*/ - if (dev->bus_dma_mask && mask > dev->bus_dma_mask) - return 0; return 1; }
Re: [PATCH 3/5] dma-direct: refine dma_direct_alloc zone selection
On 20/09/18 19:52, Christoph Hellwig wrote: We need to take the DMA offset and encryption bit into account when selecting a zone. User the opportunity to factor out the zone selection into a helper for reuse. Signed-off-by: Christoph Hellwig --- kernel/dma/direct.c | 31 +-- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 81b73a5bba54..3c404e33d946 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -68,6 +68,22 @@ u64 dma_direct_get_required_mask(struct device *dev) return (1ULL << (fls64(max_dma) - 1)) * 2 - 1; } +static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask, + u64 *phys_mask) +{ + if (force_dma_unencrypted()) + *phys_mask = __dma_to_phys(dev, dma_mask); + else + *phys_mask = dma_to_phys(dev, dma_mask); Maybe make phys_to_dma_direct() take u64 instead of phys_addr_t so we can reuse it here? + /* GFP_DMA32 and GFP_DMA are no ops without the corresponding zones: */ + if (*phys_mask <= DMA_BIT_MASK(ARCH_ZONE_DMA_BITS)) + return GFP_DMA; If we don't have ZONE_DMA it would in theory be a tiny bit better to fall back to GFP_DMA32 instead of returning 0 here, but I'm not sure it's worth the bother. 
+ if (*phys_mask <= DMA_BIT_MASK(32)) + return GFP_DMA32; + return 0; +} + static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) { return phys_to_dma_direct(dev, phys) + size - 1 <= @@ -80,17 +96,13 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; int page_order = get_order(size); struct page *page = NULL; + u64 phys_mask; void *ret; /* we always manually zero the memory once we are done: */ gfp &= ~__GFP_ZERO; - - /* GFP_DMA32 and GFP_DMA are no ops without the corresponding zones: */ - if (dev->coherent_dma_mask <= DMA_BIT_MASK(ARCH_ZONE_DMA_BITS)) - gfp |= GFP_DMA; - if (dev->coherent_dma_mask <= DMA_BIT_MASK(32) && !(gfp & GFP_DMA)) - gfp |= GFP_DMA32; - + gfp |= __dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, + &phys_mask); again: /* CMA can be used only in the context which permits sleeping */ if (gfpflags_allow_blocking(gfp)) { @@ -109,15 +121,14 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, page = NULL; if (IS_ENABLED(CONFIG_ZONE_DMA32) && - dev->coherent_dma_mask < DMA_BIT_MASK(64) && + phys_mask < DMA_BIT_MASK(64) && !(gfp & (GFP_DMA32 | GFP_DMA))) { gfp |= GFP_DMA32; goto again; Ah right, in that situation we should probably end up here anyway, so that's good enough - definitely not worth any more #ifdeffery above. Nits aside, Reviewed-by: Robin Murphy } if (IS_ENABLED(CONFIG_ZONE_DMA) && - dev->coherent_dma_mask < DMA_BIT_MASK(32) && - !(gfp & GFP_DMA)) { + phys_mask < DMA_BIT_MASK(32) && !(gfp & GFP_DMA)) { gfp = (gfp & ~GFP_DMA32) | GFP_DMA; goto again; }
Re: [PATCH 2/5] dma-direct: add an explicit dma_direct_get_required_mask
On 20/09/18 19:52, Christoph Hellwig wrote: This is somewhat modelled after the powerpc version, and differs from the legacy fallback in use fls64 instead of pointlessly splitting up the address into low and high dwords and in that it takes (__)phys_to_dma into account. Signed-off-by: Christoph Hellwig --- include/linux/dma-direct.h | 1 + kernel/dma/direct.c| 21 ++--- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h index 86a59ba5a7f3..b79496d8c75b 100644 --- a/include/linux/dma-direct.h +++ b/include/linux/dma-direct.h @@ -55,6 +55,7 @@ static inline void dma_mark_clean(void *addr, size_t size) } #endif /* CONFIG_ARCH_HAS_DMA_MARK_CLEAN */ +u64 dma_direct_get_required_mask(struct device *dev); void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs); void dma_direct_free(struct device *dev, size_t size, void *cpu_addr, diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index c954f0a6dc62..81b73a5bba54 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -53,11 +53,25 @@ check_addr(struct device *dev, dma_addr_t dma_addr, size_t size, return true; } +static inline dma_addr_t phys_to_dma_direct(struct device *dev, + phys_addr_t phys) +{ + if (force_dma_unencrypted()) + return __phys_to_dma(dev, phys); + return phys_to_dma(dev, phys); +} + +u64 dma_direct_get_required_mask(struct device *dev) +{ + u64 max_dma = phys_to_dma_direct(dev, (max_pfn - 1) << PAGE_SHIFT); + + return (1ULL << (fls64(max_dma) - 1)) * 2 - 1; I think that may as well just use __fls64() - it seems reasonable to assume max_dma > 0. Otherwise, Reviewed-by: Robin Murphy +} + static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) { - dma_addr_t addr = force_dma_unencrypted() ? 
- __phys_to_dma(dev, phys) : phys_to_dma(dev, phys); - return addr + size - 1 <= dev->coherent_dma_mask; + return phys_to_dma_direct(dev, phys) + size - 1 <= + dev->coherent_dma_mask; } void *dma_direct_alloc_pages(struct device *dev, size_t size, @@ -296,6 +310,7 @@ const struct dma_map_ops dma_direct_ops = { .unmap_page = dma_direct_unmap_page, .unmap_sg = dma_direct_unmap_sg, #endif + .get_required_mask = dma_direct_get_required_mask, .dma_supported = dma_direct_supported, .mapping_error = dma_direct_mapping_error, .cache_sync = arch_dma_cache_sync,
Re: [PATCH 5/5] dma-direct: always allow dma mask <= physical memory size
On Thu, Sep 27, 2018 at 11:50:14AM +1000, Benjamin Herrenschmidt wrote: > > -* to be able to satisfy them - either by not supporting more physical > > -* memory, or by providing a ZONE_DMA32. If neither is the case, the > > -* architecture needs to use an IOMMU instead of the direct mapping. > > -*/ > > - if (mask < phys_to_dma(dev, DMA_BIT_MASK(32))) > > + u64 min_mask; > > + > > + if (IS_ENABLED(CONFIG_ZONE_DMA)) > > + min_mask = DMA_BIT_MASK(ARCH_ZONE_DMA_BITS); > > + else > > + min_mask = min_t(u64, DMA_BIT_MASK(32), > > +(max_pfn - 1) << PAGE_SHIFT); > > + > > + if (mask >= phys_to_dma(dev, min_mask)) > > return 0; > > nitpick ... to be completely "correct", I would have written > > if (IS_ENABLED(CONFIG_ZONE_DMA)) > min_mask = DMA_BIT_MASK(ARCH_ZONE_DMA_BITS); > else > min_mask = DMA_BIT_MASK(32); > > min_mask = min_t(u64, min_mask, (max_pfn - 1) << PAGE_SHIFT); > > In "theory" it's also ok to have a mask < ZONE_DMA_BITS as long as it's > big enough to fit all memory :-) Yeah, we could do that.
Re: [PATCH 3/5] dma-direct: refine dma_direct_alloc zone selection
On Thu, Sep 27, 2018 at 11:45:15AM +1000, Benjamin Herrenschmidt wrote: > I'm not sure this is entirely right. > > Let's say the mask is 30 bits. You will return GFP_DMA32, which will > fail if you allocate something above 1G (which is legit for > ZONE_DMA32). And then we will try GFP_DMA further down in the function: if (IS_ENABLED(CONFIG_ZONE_DMA) && dev->coherent_dma_mask < DMA_BIT_MASK(32) && !(gfp & GFP_DMA)) { gfp = (gfp & ~GFP_DMA32) | GFP_DMA; goto again; } This is and old optimization from x86, because chances are high that GFP_DMA32 will give you suitable memory for the infamous 31-bit dma mask devices (at least at boot time) and thus we don't have to deplete the tiny ZONE_DMA pool.
Re: [PATCH v3 1/9] powerpc: build .dtb files in dts directory
On Mon, Sep 10, 2018 at 10:04 AM Rob Herring wrote: > > Align powerpc with other architectures which build the dtb files in the > same directory as the dts files. This is also in line with most other > build targets which are located in the same directory as the source. > This move will help enable the 'dtbs' target which builds all the dtbs > regardless of kernel config. > > This transition could break some scripts if they expect dtb files in the > old location. > > Cc: Benjamin Herrenschmidt > Cc: Paul Mackerras > Cc: Michael Ellerman > Cc: linuxppc-dev@lists.ozlabs.org > Signed-off-by: Rob Herring > --- > v3: > - Remove duplicate mpc5200 dtbs from image-y targets. The dtb target already >comes from the cuImage. target. Do the PPC folks have any comments on this? Rob > > arch/powerpc/Makefile | 2 +- > arch/powerpc/boot/Makefile | 55 -- > arch/powerpc/boot/dts/Makefile | 1 + > 3 files changed, 28 insertions(+), 30 deletions(-) > create mode 100644 arch/powerpc/boot/dts/Makefile > > diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile > index 11a1acba164a..53ea887eb34e 100644 > --- a/arch/powerpc/Makefile > +++ b/arch/powerpc/Makefile > @@ -294,7 +294,7 @@ bootwrapper_install: > $(Q)$(MAKE) $(build)=$(boot) $(patsubst %,$(boot)/%,$@) > > %.dtb: scripts > - $(Q)$(MAKE) $(build)=$(boot) $(patsubst %,$(boot)/%,$@) > + $(Q)$(MAKE) $(build)=$(boot)/dts $(patsubst %,$(boot)/dts/%,$@) > > # Used to create 'merged defconfigs' > # To use it $(call) it with the first argument as the base defconfig > diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile > index 0fb96c26136f..bca5c23767df 100644 > --- a/arch/powerpc/boot/Makefile > +++ b/arch/powerpc/boot/Makefile > @@ -304,9 +304,9 @@ image-$(CONFIG_PPC_ADDER875)+= > cuImage.adder875-uboot \ >dtbImage.adder875-redboot > > # Board ports in arch/powerpc/platform/52xx/Kconfig > -image-$(CONFIG_PPC_LITE5200) += cuImage.lite5200 lite5200.dtb > -image-$(CONFIG_PPC_LITE5200) += cuImage.lite5200b lite5200b.dtb > 
-image-$(CONFIG_PPC_MEDIA5200) += cuImage.media5200 media5200.dtb > +image-$(CONFIG_PPC_LITE5200) += cuImage.lite5200 > +image-$(CONFIG_PPC_LITE5200) += cuImage.lite5200b > +image-$(CONFIG_PPC_MEDIA5200) += cuImage.media5200 > > # Board ports in arch/powerpc/platform/82xx/Kconfig > image-$(CONFIG_MPC8272_ADS)+= cuImage.mpc8272ads > @@ -381,11 +381,11 @@ $(addprefix $(obj)/, $(sort $(filter zImage.%, > $(image-y: vmlinux $(wrapperb > $(call if_changed,wrap,$(subst $(obj)/zImage.,,$@)) > > # dtbImage% - a dtbImage is a zImage with an embedded device tree blob > -$(obj)/dtbImage.initrd.%: vmlinux $(wrapperbits) $(obj)/%.dtb FORCE > - $(call if_changed,wrap,$*,,$(obj)/$*.dtb,$(obj)/ramdisk.image.gz) > +$(obj)/dtbImage.initrd.%: vmlinux $(wrapperbits) $(obj)/dts/%.dtb FORCE > + $(call if_changed,wrap,$*,,$(obj)/dts/$*.dtb,$(obj)/ramdisk.image.gz) > > -$(obj)/dtbImage.%: vmlinux $(wrapperbits) $(obj)/%.dtb FORCE > - $(call if_changed,wrap,$*,,$(obj)/$*.dtb) > +$(obj)/dtbImage.%: vmlinux $(wrapperbits) $(obj)/dts/%.dtb FORCE > + $(call if_changed,wrap,$*,,$(obj)/dts/$*.dtb) > > # This cannot be in the root of $(src) as the zImage rule always adds a > $(obj) > # prefix > @@ -395,36 +395,33 @@ $(obj)/vmlinux.strip: vmlinux > $(obj)/uImage: vmlinux $(wrapperbits) FORCE > $(call if_changed,wrap,uboot) > > -$(obj)/uImage.initrd.%: vmlinux $(obj)/%.dtb $(wrapperbits) FORCE > - $(call > if_changed,wrap,uboot-$*,,$(obj)/$*.dtb,$(obj)/ramdisk.image.gz) > +$(obj)/uImage.initrd.%: vmlinux $(obj)/dts/%.dtb $(wrapperbits) FORCE > + $(call > if_changed,wrap,uboot-$*,,$(obj)/dts/$*.dtb,$(obj)/ramdisk.image.gz) > > -$(obj)/uImage.%: vmlinux $(obj)/%.dtb $(wrapperbits) FORCE > - $(call if_changed,wrap,uboot-$*,,$(obj)/$*.dtb) > +$(obj)/uImage.%: vmlinux $(obj)/dts/%.dtb $(wrapperbits) FORCE > + $(call if_changed,wrap,uboot-$*,,$(obj)/dts/$*.dtb) > > -$(obj)/cuImage.initrd.%: vmlinux $(obj)/%.dtb $(wrapperbits) FORCE > - $(call > 
if_changed,wrap,cuboot-$*,,$(obj)/$*.dtb,$(obj)/ramdisk.image.gz) > +$(obj)/cuImage.initrd.%: vmlinux $(obj)/dts/%.dtb $(wrapperbits) FORCE > + $(call > if_changed,wrap,cuboot-$*,,$(obj)/dts/$*.dtb,$(obj)/ramdisk.image.gz) > > -$(obj)/cuImage.%: vmlinux $(obj)/%.dtb $(wrapperbits) FORCE > - $(call if_changed,wrap,cuboot-$*,,$(obj)/$*.dtb) > +$(obj)/cuImage.%: vmlinux $(obj)/dts/%.dtb $(wrapperbits) FORCE > + $(call if_changed,wrap,cuboot-$*,,$(obj)/dts/$*.dtb) > > -$(obj)/simpleImage.initrd.%: vmlinux $(obj)/%.dtb $(wrapperbits) FORCE > - $(call > if_changed,wrap,simpleboot-$*,,$(obj)/$*.dtb,$(obj)/ramdisk.image.gz) > +$(obj)/simpleImage.initrd.%: vmlinux $(obj)/dts/%.dtb $(wrapperbits) FORCE > + $(call > if_chang
Re: [PATCH -next] PCI: hotplug: Use kmemdup rather than duplicating its implementation in pnv_php_add_devtree()
YueHaibing writes: > Use kmemdup rather than duplicating its implementation > > Signed-off-by: YueHaibing > --- > drivers/pci/hotplug/pnv_php.c | 3 +-- > 1 file changed, 1 insertion(+), 2 deletions(-) > > diff --git a/drivers/pci/hotplug/pnv_php.c b/drivers/pci/hotplug/pnv_php.c > index 5070620..ee54f5b 100644 > --- a/drivers/pci/hotplug/pnv_php.c > +++ b/drivers/pci/hotplug/pnv_php.c > @@ -275,14 +275,13 @@ static int pnv_php_add_devtree(struct pnv_php_slot > *php_slot) > goto free_fdt1; > } > > - fdt = kzalloc(fdt_totalsize(fdt1), GFP_KERNEL); > + fdt = kmemdup(fdt1, fdt_totalsize(fdt1), GFP_KERNEL); > if (!fdt) { > ret = -ENOMEM; > goto free_fdt1; > } > > /* Unflatten device tree blob */ > - memcpy(fdt, fdt1, fdt_totalsize(fdt1)); > dt = of_fdt_unflatten_tree(fdt, php_slot->dn, NULL); > if (!dt) { > ret = -EINVAL; Looks correct to me. Acked-by: Michael Ellerman cheers
Re: [PATCH v4 1/2] powerpc/32: add stack protector support
Le 27/09/2018 à 09:45, Segher Boessenkool a écrit : On Thu, Sep 27, 2018 at 08:20:00AM +0200, Christophe LEROY wrote: Le 26/09/2018 à 21:16, Segher Boessenkool a écrit : On Wed, Sep 26, 2018 at 11:40:38AM +, Christophe Leroy wrote: +static __always_inline void boot_init_stack_canary(void) +{ + unsigned long canary; + + /* Try to get a semi random initial value. */ + get_random_bytes(&canary, sizeof(canary)); + canary ^= mftb(); + canary ^= LINUX_VERSION_CODE; + + current->stack_canary = canary; +} I still think you should wait until there is entropy available. You haven't answered my questions about that (or I didn't see them): what does the kernel do in other similar cases? Looks great otherwise! What do you mean by 'other similar cases' ? All arches have similar boot_init_stack_canary(). Yes, those, and other things that want entropy early. Functions add_latent_entropy() and add_device_randomness() are there for that it seems, although it is not real entropy. x86 uses rdtsc() which is equivalent to our mftb(). Most arches xor it with LINUX_VERSION_CODE. The issue is that it is called very early in start_kernel(), however they try to set some entropy anyway: boot_cpu_init(); page_address_init(); pr_notice("%s", linux_banner); setup_arch(&command_line); /* * Set up the the initial canary and entropy after arch * and after adding latent and command line entropy. */ add_latent_entropy(); add_device_randomness(command_line, strlen(command_line)); boot_init_stack_canary(); Apparently, it is too early for calling wait_for_random_bytes(), see below. Hrm. Too early to call wait_event_interruptible? From there it went into schedule(), which blew up. Well you say we have only one context at this point, so that is not too surprising then :-) However this is the canary for initial startup only. Only idle() still uses this canary once the system is running. A new canary is set for any new forked task. Ah, that makes things a lot better! 
Do those new tasks get a canary from something with sufficient entropy though? For the kernel threads that are started early, probably not. For the ones started a bit later, and for user processes, I believe they have better entropy. Anyway, all this is handled by the kernel core and is out of control of individual arches, as it's done in kernel/fork.c in function dup_task_struct(). However this function is declared as static __latent_entropy struct task_struct *copy_process(). This __latent_entropy attribute must help in a way. Maybe should the idle canary be updated later once there is more entropy That is tricky to do, but sure, if you can, that should help. ? Today there is a new call to boot_init_stack_canary() in cpu_startup_entry(), but it is enclosed inside #ifdef CONFIG_X86. It needs to know the details of how ssp works on each platform. Well, that could be for another patch in the future. That's probably feasible on x86 and PPC because they both use TLS guard, but not for other arches where the guard is global at the moment. So we'll have to do it carefully. I agree with you that we may not for the time being get all the expected security against hackers out of it due to that little entropy, but my main concern at the time being is to deter developer's bugs clobbering the stack, and for that any non trivial canary should make it, shouldn't it ? Christophe
Re: [PATCH] kdb: use correct pointer when 'btc' calls 'btt'
Le 27/09/2018 à 13:09, Michael Ellerman a écrit : Christophe LEROY writes: Le 26/09/2018 à 13:11, Daniel Thompson a écrit : On 16/09/2018 20:06, Daniel Thompson wrote: On Fri, Sep 14, 2018 at 12:35:44PM +, Christophe Leroy wrote: On a powerpc 8xx, 'btc' fails as follows: Entering kdb (current=0x(ptrval), pid 282) due to Keyboard Entry ... Signed-off-by: Christophe Leroy Cc: # 4.15+ Would a Fixes: be better here? Fixes: ad67b74d2469d9b82 ("printk: hash addresses printed with %p") Christophe, When you add the Fixes: could you also add my Reviewed-by: Daniel Thompson Ok, thanks for the review, but do I have to do anything really ? The Fixes: and now your Reviewed-by: appear automatically in patchwork (https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=65715), so I believe they'll be automatically included when Jason or someone else takes the patch, no ? patchwork won't add the Fixes tag from the reply, it needs to be in the original mail. See: https://github.com/getpatchwork/patchwork/issues/151 Ok, so it accounts it and adds a '1' in the F column in the patches list, but won't take it into account. Then I'll send a v2 with revised commit text. Christophe
Re: [PATCH] kdb: use correct pointer when 'btc' calls 'btt'
Christophe LEROY writes: > Le 26/09/2018 à 13:11, Daniel Thompson a écrit : >> On 16/09/2018 20:06, Daniel Thompson wrote: >>> On Fri, Sep 14, 2018 at 12:35:44PM +, Christophe Leroy wrote: On a powerpc 8xx, 'btc' fails as follows: Entering kdb (current=0x(ptrval), pid 282) due to Keyboard Entry ... Signed-off-by: Christophe Leroy Cc: # 4.15+ >>> >>> Would a Fixes: be better here? >>> Fixes: ad67b74d2469d9b82 ("printk: hash addresses printed with %p") >> >> Christophe, When you add the Fixes: could you also add my >> >> Reviewed-by: Daniel Thompson > > Ok, thanks for the review, but do I have to do anything really ? > > The Fixes: and now your Reviewed-by: appear automatically in patchwork > (https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=65715), > so I believe they'll be automatically included when Jason or someone > else takes the patch, no ? patchwork won't add the Fixes tag from the reply, it needs to be in the original mail. See: https://github.com/getpatchwork/patchwork/issues/151 cheers
[PATCH v3 6/6] memory-hotplug.txt: Add some details about locking internals
Let's document the magic a bit, especially why device_hotplug_lock is required when adding/removing memory and how it all play together with requests to online/offline memory from user space. Cc: Jonathan Corbet Cc: Michal Hocko Cc: Andrew Morton Reviewed-by: Pavel Tatashin Reviewed-by: Rashmica Gupta Signed-off-by: David Hildenbrand --- Documentation/memory-hotplug.txt | 42 +++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt index 7f49ebf3ddb2..ce4faa5530fa 100644 --- a/Documentation/memory-hotplug.txt +++ b/Documentation/memory-hotplug.txt @@ -3,7 +3,7 @@ Memory Hotplug == :Created: Jul 28 2007 -:Updated: Add description of notifier of memory hotplug: Oct 11 2007 +:Updated: Add some details about locking internals:Aug 20 2018 This document is about memory hotplug including how-to-use and current status. Because Memory Hotplug is still under development, contents of this text will @@ -495,6 +495,46 @@ further processing of the notification queue. NOTIFY_STOP stops further processing of the notification queue. + +Locking Internals += + +When adding/removing memory that uses memory block devices (i.e. ordinary RAM), +the device_hotplug_lock should be held to: + +- synchronize against online/offline requests (e.g. via sysfs). This way, memory + block devices can only be accessed (.online/.state attributes) by user + space once memory has been fully added. And when removing memory, we + know nobody is in critical sections. +- synchronize against CPU hotplug and similar (e.g. 
relevant for ACPI and PPC) + +Especially, there is a possible lock inversion that is avoided using +device_hotplug_lock when adding memory and user space tries to online that +memory faster than expected: + +- device_online() will first take the device_lock(), followed by + mem_hotplug_lock +- add_memory_resource() will first take the mem_hotplug_lock, followed by + the device_lock() (while creating the devices, during bus_add_device()). + +As the device is visible to user space before taking the device_lock(), this +can result in a lock inversion. + +onlining/offlining of memory should be done via device_online()/ +device_offline() - to make sure it is properly synchronized to actions +via sysfs. Holding device_hotplug_lock is advised (to e.g. protect online_type) + +When adding/removing/onlining/offlining memory or adding/removing +heterogeneous/device memory, we should always hold the mem_hotplug_lock in +write mode to serialise memory hotplug (e.g. access to global/zone +variables). + +In addition, mem_hotplug_lock (in contrast to device_hotplug_lock) in read +mode allows for a quite efficient get_online_mems/put_online_mems +implementation, so code accessing memory can protect from that memory +vanishing. + + Future Work === -- 2.17.1
[PATCH v3 5/6] powerpc/powernv: hold device_hotplug_lock when calling memtrace_offline_pages()
Let's perform all checking + offlining + removing under device_hotplug_lock, so nobody can mess with these devices via sysfs concurrently. Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Rashmica Gupta Cc: Balbir Singh Cc: Michael Neuling Reviewed-by: Pavel Tatashin Reviewed-by: Rashmica Gupta Acked-by: Balbir Singh Signed-off-by: David Hildenbrand --- arch/powerpc/platforms/powernv/memtrace.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c index fdd48f1a39f7..84d038ed3882 100644 --- a/arch/powerpc/platforms/powernv/memtrace.c +++ b/arch/powerpc/platforms/powernv/memtrace.c @@ -70,6 +70,7 @@ static int change_memblock_state(struct memory_block *mem, void *arg) return 0; } +/* called with device_hotplug_lock held */ static bool memtrace_offline_pages(u32 nid, u64 start_pfn, u64 nr_pages) { u64 end_pfn = start_pfn + nr_pages - 1; @@ -110,6 +111,7 @@ static u64 memtrace_alloc_node(u32 nid, u64 size) /* Trace memory needs to be aligned to the size */ end_pfn = round_down(end_pfn - nr_pages, nr_pages); + lock_device_hotplug(); for (base_pfn = end_pfn; base_pfn > start_pfn; base_pfn -= nr_pages) { if (memtrace_offline_pages(nid, base_pfn, nr_pages) == true) { /* @@ -118,7 +120,6 @@ static u64 memtrace_alloc_node(u32 nid, u64 size) * we never try to remove memory that spans two iomem * resources. */ - lock_device_hotplug(); end_pfn = base_pfn + nr_pages; for (pfn = base_pfn; pfn < end_pfn; pfn += bytes>> PAGE_SHIFT) { __remove_memory(nid, pfn << PAGE_SHIFT, bytes); @@ -127,6 +128,7 @@ static u64 memtrace_alloc_node(u32 nid, u64 size) return base_pfn << PAGE_SHIFT; } } + unlock_device_hotplug(); return 0; } -- 2.17.1
[PATCH v3 4/6] powerpc/powernv: hold device_hotplug_lock when calling device_online()
device_online() should be called with device_hotplug_lock() held. Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Rashmica Gupta Cc: Balbir Singh Cc: Michael Neuling Reviewed-by: Pavel Tatashin Reviewed-by: Rashmica Gupta Signed-off-by: David Hildenbrand --- arch/powerpc/platforms/powernv/memtrace.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c index 773623f6bfb1..fdd48f1a39f7 100644 --- a/arch/powerpc/platforms/powernv/memtrace.c +++ b/arch/powerpc/platforms/powernv/memtrace.c @@ -242,9 +242,11 @@ static int memtrace_online(void) * we need to online the memory ourselves. */ if (!memhp_auto_online) { + lock_device_hotplug(); walk_memory_range(PFN_DOWN(ent->start), PFN_UP(ent->start + ent->size - 1), NULL, online_mem_block); + unlock_device_hotplug(); } /* -- 2.17.1
[PATCH v3 3/6] mm/memory_hotplug: fix online/offline_pages called w.o. mem_hotplug_lock
There seem to be some problems as a result of 30467e0b3be ("mm, hotplug: fix concurrent memory hot-add deadlock"), which tried to fix a possible lock inversion reported and discussed in [1] due to the two locks a) device_lock() b) mem_hotplug_lock While add_memory() first takes b), followed by a) during bus_probe_device(), onlining of memory from user space first took a), followed by b), exposing a possible deadlock. In [1], it was decided to not make use of device_hotplug_lock, but rather to enforce a locking order. The problems I spotted related to this: 1. Memory block device attributes: While .state first calls mem_hotplug_begin() and then calls device_online() - which takes device_lock() - .online no longer calls mem_hotplug_begin(), so effectively calls online_pages() without mem_hotplug_lock. 2. device_online() should be called under device_hotplug_lock, however onlining memory during add_memory() does not take care of that. In addition, I think there is also something wrong about the locking in 3. arch/powerpc/platforms/powernv/memtrace.c calls offline_pages() without locks. This was introduced after 30467e0b3be. And skimming over the code, I assume it could need some more care in regards to locking (e.g. device_online() called without device_hotplug_lock). This will be addressed in the following patches. Now that we hold the device_hotplug_lock when - adding memory (e.g. via add_memory()/add_memory_resource()) - removing memory (e.g. via remove_memory()) - device_online()/device_offline() We can move mem_hotplug_lock usage back into online_pages()/offline_pages(). Why is mem_hotplug_lock still needed? Essentially to make get_online_mems()/put_online_mems() be very fast (relying on device_hotplug_lock would be very slow), and to serialize against addition of memory that does not create memory block devices (hmm).
[1] http://driverdev.linuxdriverproject.org/pipermail/ driverdev-devel/ 2015-February/065324.html This patch is partly based on a patch by Vitaly Kuznetsov. Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: "Rafael J. Wysocki" Cc: Len Brown Cc: Greg Kroah-Hartman Cc: "K. Y. Srinivasan" Cc: Haiyang Zhang Cc: Stephen Hemminger Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Boris Ostrovsky Cc: Juergen Gross Cc: Rashmica Gupta Cc: Michael Neuling Cc: Balbir Singh Cc: Kate Stewart Cc: Thomas Gleixner Cc: Philippe Ombredanne Cc: Andrew Morton Cc: Michal Hocko Cc: Pavel Tatashin Cc: Vlastimil Babka Cc: Dan Williams Cc: Oscar Salvador Cc: YASUAKI ISHIMATSU Cc: Mathieu Malaterre Reviewed-by: Pavel Tatashin Reviewed-by: Rashmica Gupta Signed-off-by: David Hildenbrand --- drivers/base/memory.c | 13 + mm/memory_hotplug.c | 28 2 files changed, 21 insertions(+), 20 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 40cac122ec73..0e5985682642 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -228,7 +228,6 @@ static bool pages_correctly_probed(unsigned long start_pfn) /* * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is * OK to have direct references to sparsemem variables in here. - * Must already be protected by mem_hotplug_begin(). */ static int memory_block_action(unsigned long phys_index, unsigned long action, int online_type) @@ -294,7 +293,6 @@ static int memory_subsys_online(struct device *dev) if (mem->online_type < 0) mem->online_type = MMOP_ONLINE_KEEP; - /* Already under protection of mem_hotplug_begin() */ ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE); /* clear online_type */ @@ -341,19 +339,11 @@ store_mem_state(struct device *dev, goto err; } - /* -* Memory hotplug needs to hold mem_hotplug_begin() for probe to find -* the correct memory block to online before doing device_online(dev), -* which will take dev->mutex. 
Take the lock early to prevent an -* inversion, memory_subsys_online() callbacks will be implemented by -* assuming it's already protected. -*/ - mem_hotplug_begin(); - switch (online_type) { case MMOP_ONLINE_KERNEL: case MMOP_ONLINE_MOVABLE: case MMOP_ONLINE_KEEP: + /* mem->online_type is protected by device_hotplug_lock */ mem->online_type = online_type; ret = device_online(&mem->dev); break; @@ -364,7 +354,6 @@ store_mem_state(struct device *dev, ret = -EINVAL; /* should never happen */ } - mem_hotplug_done(); err: unlock_device_hotplug(); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index affb03e0dfef..d4c7e42e46f3 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -860,7 +860,6 @@ static struct zone * __meminit move_pfn_range(int online_type, int ni
[PATCH v3 2/6] mm/memory_hotplug: make add_memory() take the device_hotplug_lock
add_memory() currently does not take the device_hotplug_lock, however is already called under the lock from arch/powerpc/platforms/pseries/hotplug-memory.c drivers/acpi/acpi_memhotplug.c to synchronize against CPU hot-remove and similar. In general, we should hold the device_hotplug_lock when adding memory to synchronize against online/offline request (e.g. from user space) - which already resulted in lock inversions due to device_lock() and mem_hotplug_lock - see 30467e0b3be ("mm, hotplug: fix concurrent memory hot-add deadlock"). add_memory()/add_memory_resource() will create memory block devices, so this really feels like the right thing to do. Holding the device_hotplug_lock makes sure that a memory block device can really only be accessed (e.g. via .online/.state) from user space, once the memory has been fully added to the system. The lock is not held yet in drivers/xen/balloon.c arch/powerpc/platforms/powernv/memtrace.c drivers/s390/char/sclp_cmd.c drivers/hv/hv_balloon.c So, let's either use the locked variants or take the lock. Don't export add_memory_resource(), as it once was exported to be used by XEN, which is never built as a module. If somebody requires it, we also have to export a locked variant (as device_hotplug_lock is never exported). Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: "Rafael J. Wysocki" Cc: Len Brown Cc: Greg Kroah-Hartman Cc: Boris Ostrovsky Cc: Juergen Gross Cc: Nathan Fontenot Cc: John Allen Cc: Andrew Morton Cc: Michal Hocko Cc: Dan Williams Cc: Joonsoo Kim Cc: Vlastimil Babka Cc: Oscar Salvador Cc: Mathieu Malaterre Cc: Pavel Tatashin Cc: YASUAKI ISHIMATSU Reviewed-by: Pavel Tatashin Reviewed-by: Rafael J. 
Wysocki Reviewed-by: Rashmica Gupta Signed-off-by: David Hildenbrand --- .../platforms/pseries/hotplug-memory.c| 2 +- drivers/acpi/acpi_memhotplug.c| 2 +- drivers/base/memory.c | 9 ++-- drivers/xen/balloon.c | 3 +++ include/linux/memory_hotplug.h| 1 + mm/memory_hotplug.c | 22 --- 6 files changed, 32 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index dd0264c43f3e..d26a771d985e 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -673,7 +673,7 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb) nid = memory_add_physaddr_to_nid(lmb->base_addr); /* Add the memory */ - rc = add_memory(nid, lmb->base_addr, block_sz); + rc = __add_memory(nid, lmb->base_addr, block_sz); if (rc) { invalidate_lmb_associativity_index(lmb); return rc; diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c index 811148415993..8fe0960ea572 100644 --- a/drivers/acpi/acpi_memhotplug.c +++ b/drivers/acpi/acpi_memhotplug.c @@ -228,7 +228,7 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device) if (node < 0) node = memory_add_physaddr_to_nid(info->start_addr); - result = add_memory(node, info->start_addr, info->length); + result = __add_memory(node, info->start_addr, info->length); /* * If the memory block has been used by the kernel, add_memory() diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 817320c7c4c1..40cac122ec73 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -519,15 +519,20 @@ memory_probe_store(struct device *dev, struct device_attribute *attr, if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1)) return -EINVAL; + ret = lock_device_hotplug_sysfs(); + if (ret) + goto out; + nid = memory_add_physaddr_to_nid(phys_addr); - ret = add_memory(nid, phys_addr, -MIN_MEMORY_BLOCK_SIZE * sections_per_block); + ret = __add_memory(nid, phys_addr, + 
MIN_MEMORY_BLOCK_SIZE * sections_per_block); if (ret) goto out; ret = count; out: + unlock_device_hotplug(); return ret; } diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index a3f5cbfcd4a1..fdfc64f5acea 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -395,7 +395,10 @@ static enum bp_state reserve_additional_memory(void) * callers drop the mutex before trying again. */ mutex_unlock(&balloon_mutex); + /* add_memory_resource() requires the device_hotplug lock */ + lock_device_hotplug(); rc = add_memory_resource(nid, resource, memhp_auto_online); + unlock_device_hotplug(); mutex_lock(&balloon_mutex); if (rc) { diff --git a/include/linux/mem
[PATCH v3 1/6] mm/memory_hotplug: make remove_memory() take the device_hotplug_lock
remove_memory() is exported right now but requires the device_hotplug_lock, which is not exported. So let's provide a variant that takes the lock and only export that one. The lock is already held in arch/powerpc/platforms/pseries/hotplug-memory.c drivers/acpi/acpi_memhotplug.c arch/powerpc/platforms/powernv/memtrace.c Apart from that, there are not other users in the tree. Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: "Rafael J. Wysocki" Cc: Len Brown Cc: Rashmica Gupta Cc: Michael Neuling Cc: Balbir Singh Cc: Nathan Fontenot Cc: John Allen Cc: Andrew Morton Cc: Michal Hocko Cc: Dan Williams Cc: Joonsoo Kim Cc: Vlastimil Babka Cc: Pavel Tatashin Cc: Greg Kroah-Hartman Cc: Oscar Salvador Cc: YASUAKI ISHIMATSU Cc: Mathieu Malaterre Reviewed-by: Pavel Tatashin Reviewed-by: Rafael J. Wysocki Reviewed-by: Rashmica Gupta Signed-off-by: David Hildenbrand --- arch/powerpc/platforms/powernv/memtrace.c | 2 +- arch/powerpc/platforms/pseries/hotplug-memory.c | 6 +++--- drivers/acpi/acpi_memhotplug.c | 2 +- include/linux/memory_hotplug.h | 3 ++- mm/memory_hotplug.c | 9 - 5 files changed, 15 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c index a29fdf8a2e56..773623f6bfb1 100644 --- a/arch/powerpc/platforms/powernv/memtrace.c +++ b/arch/powerpc/platforms/powernv/memtrace.c @@ -121,7 +121,7 @@ static u64 memtrace_alloc_node(u32 nid, u64 size) lock_device_hotplug(); end_pfn = base_pfn + nr_pages; for (pfn = base_pfn; pfn < end_pfn; pfn += bytes>> PAGE_SHIFT) { - remove_memory(nid, pfn << PAGE_SHIFT, bytes); + __remove_memory(nid, pfn << PAGE_SHIFT, bytes); } unlock_device_hotplug(); return base_pfn << PAGE_SHIFT; diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index 9a15d39995e5..dd0264c43f3e 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -305,7 
+305,7 @@ static int pseries_remove_memblock(unsigned long base, unsigned int memblock_siz nid = memory_add_physaddr_to_nid(base); for (i = 0; i < sections_per_block; i++) { - remove_memory(nid, base, MIN_MEMORY_BLOCK_SIZE); + __remove_memory(nid, base, MIN_MEMORY_BLOCK_SIZE); base += MIN_MEMORY_BLOCK_SIZE; } @@ -394,7 +394,7 @@ static int dlpar_remove_lmb(struct drmem_lmb *lmb) block_sz = pseries_memory_block_size(); nid = memory_add_physaddr_to_nid(lmb->base_addr); - remove_memory(nid, lmb->base_addr, block_sz); + __remove_memory(nid, lmb->base_addr, block_sz); /* Update memory regions for memory remove */ memblock_remove(lmb->base_addr, block_sz); @@ -681,7 +681,7 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb) rc = dlpar_online_lmb(lmb); if (rc) { - remove_memory(nid, lmb->base_addr, block_sz); + __remove_memory(nid, lmb->base_addr, block_sz); invalidate_lmb_associativity_index(lmb); } else { lmb->flags |= DRCONF_MEM_ASSIGNED; diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c index 6b0d3ef7309c..811148415993 100644 --- a/drivers/acpi/acpi_memhotplug.c +++ b/drivers/acpi/acpi_memhotplug.c @@ -282,7 +282,7 @@ static void acpi_memory_remove_memory(struct acpi_memory_device *mem_device) nid = memory_add_physaddr_to_nid(info->start_addr); acpi_unbind_memory_blocks(info); - remove_memory(nid, info->start_addr, info->length); + __remove_memory(nid, info->start_addr, info->length); list_del(&info->list); kfree(info); } diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 34a28227068d..1f096852f479 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -301,6 +301,7 @@ extern bool is_mem_section_removable(unsigned long pfn, unsigned long nr_pages); extern void try_offline_node(int nid); extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); extern void remove_memory(int nid, u64 start, u64 size); +extern void __remove_memory(int nid, u64 start, u64 size); #else 
static inline bool is_mem_section_removable(unsigned long pfn, @@ -317,6 +318,7 @@ static inline int offline_pages(unsigned long start_pfn, unsigned long nr_pages) } static inline void remove_memory(int nid, u64 start, u64 size) {} +static inline void __remove_memory(int nid, u64 start, u64 size) {} #endif /* CONFIG_MEMORY_HOT
[PATCH v3 0/6] mm: online/offline_pages called w.o. mem_hotplug_lock
@Andrew, Only patch #5 changed (see change notes below). Thanks! Reading through the code and studying how mem_hotplug_lock is to be used, I noticed that there are two places where we can end up calling device_online()/device_offline() - online_pages()/offline_pages() without the mem_hotplug_lock. And there are other places where we call device_online()/device_offline() without the device_hotplug_lock. While e.g. echo "online" > /sys/devices/system/memory/memory9/state is fine, e.g. echo 1 > /sys/devices/system/memory/memory9/online will not take the mem_hotplug_lock. However, it does take the device_lock() and device_hotplug_lock. E.g. via memory_probe_store(), we can end up calling add_memory()->online_pages() without the device_hotplug_lock. So we can have concurrent callers in online_pages(). We e.g. touch in online_pages() basically unprotected zone->present_pages then. Looks like there is a longer history to that (see Patch #2 for details), and fixing it to work the way it was intended is not really possible. We would e.g. have to take the mem_hotplug_lock in device/base/core.c, which sounds wrong. Summary: We had a lock inversion on mem_hotplug_lock and device_lock(). More details can be found in patch 3 and patch 6. I propose the general rules (documentation added in patch 6): 1. add_memory/add_memory_resource() must only be called with device_hotplug_lock. 2. remove_memory() must only be called with device_hotplug_lock. This is already documented and holds for all callers. 3. device_online()/device_offline() must only be called with device_hotplug_lock. This is already documented and true for now in core code. Other callers (related to memory hotplug) have to be fixed up. 4. mem_hotplug_lock is taken inside of add_memory/remove_memory/ online_pages/offline_pages. To me, this looks way cleaner than what we have right now (and easier to verify). And looking at the documentation of remove_memory, using lock_device_hotplug also for add_memory() feels natural. 
v2 -> v3: - Take device_hotplug_lock outside of loop in patch #5 - Added Ack to patch #5 v1 -> v2: - Upstream changes in powerpc/powernv code required modifications to patch #1, #4 and #5. - Minor patch description changes. - Added more locking details in patch #6. - Added rb's RFCv2 -> v1: - Dropped an unnecessary _ref from remove_memory() in patch #1 - Minor patch description fixes. - Added rb's RFC -> RFCv2: - Don't export device_hotplug_lock, provide proper remove_memory/add_memory wrappers. - Split up the patches a bit. - Try to improve powernv memtrace locking - Add some documentation for locking that matches my knowledge David Hildenbrand (6): mm/memory_hotplug: make remove_memory() take the device_hotplug_lock mm/memory_hotplug: make add_memory() take the device_hotplug_lock mm/memory_hotplug: fix online/offline_pages called w.o. mem_hotplug_lock powerpc/powernv: hold device_hotplug_lock when calling device_online() powerpc/powernv: hold device_hotplug_lock when calling memtrace_offline_pages() memory-hotplug.txt: Add some details about locking internals Documentation/memory-hotplug.txt | 42 - arch/powerpc/platforms/powernv/memtrace.c | 8 ++- .../platforms/pseries/hotplug-memory.c| 8 +-- drivers/acpi/acpi_memhotplug.c| 4 +- drivers/base/memory.c | 22 +++ drivers/xen/balloon.c | 3 + include/linux/memory_hotplug.h| 4 +- mm/memory_hotplug.c | 59 +++ 8 files changed, 114 insertions(+), 36 deletions(-) -- 2.17.1
[PATCH 2/2] powerpc/pseries: Fix how we iterate over the DTL entries
When CONFIG_VIRT_CPU_ACCOUNTING_NATIVE is not set, we look up dtl_idx in the lppaca to determine the number of entries in the buffer. Since lppaca is in big endian, we need to do an endian conversion before using this in our calculation to determine the number of entries in the buffer. Without this, we do not iterate over the existing entries in the DTL buffer properly. Fixes: 7c105b63bd98 ("powerpc: Add CONFIG_CPU_LITTLE_ENDIAN kernel config option.") Signed-off-by: Naveen N. Rao --- arch/powerpc/platforms/pseries/dtl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/pseries/dtl.c b/arch/powerpc/platforms/pseries/dtl.c index c762689e0eb3..ef6595153642 100644 --- a/arch/powerpc/platforms/pseries/dtl.c +++ b/arch/powerpc/platforms/pseries/dtl.c @@ -184,7 +184,7 @@ static void dtl_stop(struct dtl *dtl) static u64 dtl_current_index(struct dtl *dtl) { - return lppaca_of(dtl->cpu).dtl_idx; + return be64_to_cpu(lppaca_of(dtl->cpu).dtl_idx); } #endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ -- 2.19.0
[PATCH 1/2] powerpc/pseries: Fix DTL buffer registration
When CONFIG_VIRT_CPU_ACCOUNTING_NATIVE is not set, we register the DTL buffer for a cpu when the associated file under powerpc/dtl in debugfs is opened. When doing so, we need to set the size of the buffer being registered in the second u32 word of the buffer. This needs to be in big endian, but we are not doing the conversion resulting in the below error showing up in dmesg: dtl_start: DTL registration for cpu 0 (hw 0) failed with -4 Fix this in the obvious manner. Fixes: 7c105b63bd98 ("powerpc: Add CONFIG_CPU_LITTLE_ENDIAN kernel config option.") Signed-off-by: Naveen N. Rao --- arch/powerpc/platforms/pseries/dtl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/pseries/dtl.c b/arch/powerpc/platforms/pseries/dtl.c index 18014cdeb590..c762689e0eb3 100644 --- a/arch/powerpc/platforms/pseries/dtl.c +++ b/arch/powerpc/platforms/pseries/dtl.c @@ -149,7 +149,7 @@ static int dtl_start(struct dtl *dtl) /* Register our dtl buffer with the hypervisor. The HV expects the * buffer size to be passed in the second word of the buffer */ - ((u32 *)dtl->buf)[1] = DISPATCH_LOG_BYTES; + ((u32 *)dtl->buf)[1] = cpu_to_be32(DISPATCH_LOG_BYTES); hwcpu = get_hard_smp_processor_id(dtl->cpu); addr = __pa(dtl->buf); -- 2.19.0
[PATCH 0/2] A few fixes to DTL buffer access over debugfs
Two trivial fixes to DTL buffer access over debugfs when CONFIG_VIRT_CPU_ACCOUNTING_NATIVE is not set. - Naveen Naveen N. Rao (2): powerpc/pseries: Fix DTL buffer registration powerpc/pseries: Fix how we iterate over the DTL entries arch/powerpc/platforms/pseries/dtl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) -- 2.19.0
Re: [PATCH v4 1/2] powerpc/32: add stack protector support
On Thu, Sep 27, 2018 at 08:20:00AM +0200, Christophe LEROY wrote: > Le 26/09/2018 à 21:16, Segher Boessenkool a écrit : > >On Wed, Sep 26, 2018 at 11:40:38AM +, Christophe Leroy wrote: > >>+static __always_inline void boot_init_stack_canary(void) > >>+{ > >>+ unsigned long canary; > >>+ > >>+ /* Try to get a semi random initial value. */ > >>+ get_random_bytes(&canary, sizeof(canary)); > >>+ canary ^= mftb(); > >>+ canary ^= LINUX_VERSION_CODE; > >>+ > >>+ current->stack_canary = canary; > >>+} > > > >I still think you should wait until there is entropy available. You > >haven't answered my questions about that (or I didn't see them): what > >does the kernel do in other similar cases? > > > >Looks great otherwise! > > What do you mean by 'other similar cases' ? All arches have similar > boot_init_stack_canary(). Yes, those, and other things that want entropy early. > x86 uses rdtsc() which is equivalent to our > mftb(). Most arches xor it with LINUX_VERSION_CODE. > > The issue is that it is called very early in start_kernel(), however > they try to set some entropy anyway: > > boot_cpu_init(); > page_address_init(); > pr_notice("%s", linux_banner); > setup_arch(&command_line); > /* >* Set up the the initial canary and entropy after arch >* and after adding latent and command line entropy. >*/ > add_latent_entropy(); > add_device_randomness(command_line, strlen(command_line)); > boot_init_stack_canary(); > > Apparently, it is too early for calling wait_for_random_bytes(), see below. Hrm. Too early to call wait_event_interruptible? From there it went into schedule(), which blew up. Well you say we have only one context at this point, so that is not too surprising then :-) > However this is the canary for initial startup only. Only idle() still > uses this canary once the system is running. A new canary is set for any > new forked task. Ah, that makes things a lot better! Do those new tasks get a canary from something with sufficient entropy though? 
> Maybe should the idle canary be updated later once there is more entropy That is tricky to do, but sure, if you can, that should help. > ? Today there is a new call to boot_init_stack_canary() in > cpu_startup_entry(), but it is enclosed inside #ifdef CONFIG_X86. It needs to know the details of how ssp works on each platform. Segher
[PATCH v5 2/2] powerpc/64: add stack protector support
On PPC64, as register r13 points to the paca_struct at all time, this patch adds a copy of the canary there, which is copied at task_switch. That new canary is then used by using the following GCC options: -mstack-protector-guard=tls -mstack-protector-guard-reg=r13 -mstack-protector-guard-offset=offsetof(struct paca_struct, canary)) Signed-off-by: Christophe Leroy --- arch/powerpc/Kconfig | 2 +- arch/powerpc/Makefile | 8 arch/powerpc/include/asm/paca.h | 3 +++ arch/powerpc/include/asm/stackprotector.h | 4 arch/powerpc/kernel/asm-offsets.c | 3 +++ arch/powerpc/kernel/entry_64.S| 4 6 files changed, 23 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 3bcb05929931..602eea723624 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -180,7 +180,7 @@ config PPC select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_TRACEHOOK select HAVE_CBPF_JITif !PPC64 - select HAVE_STACKPROTECTOR if $(cc-option,-mstack-protector-guard=tls) && PPC32 + select HAVE_STACKPROTECTOR if $(cc-option,-mstack-protector-guard=tls) select HAVE_CONTEXT_TRACKINGif PPC64 select HAVE_DEBUG_KMEMLEAK select HAVE_DEBUG_STACKOVERFLOW diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index 45b8eb4d8fe7..81552c7b46eb 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -113,7 +113,11 @@ KBUILD_ARFLAGS += --target=elf$(BITS)-$(GNUTARGET) endif cflags-$(CONFIG_STACKPROTECTOR)+= -mstack-protector-guard=tls +ifdef CONFIG_PPC64 +cflags-$(CONFIG_STACKPROTECTOR)+= -mstack-protector-guard-reg=r13 +else cflags-$(CONFIG_STACKPROTECTOR)+= -mstack-protector-guard-reg=r2 +endif LDFLAGS_vmlinux-y := -Bstatic LDFLAGS_vmlinux-$(CONFIG_RELOCATABLE) := -pie @@ -411,8 +415,12 @@ ifdef CONFIG_STACKPROTECTOR prepare: stack_protector_prepare stack_protector_prepare: prepare0 +ifdef CONFIG_PPC64 + $(eval KBUILD_CFLAGS += -mstack-protector-guard-offset=$(shell awk '{if ($$2 == "PACA_CANARY") print $$3;}' include/generated/asm-offsets.h)) +else $(eval 
KBUILD_CFLAGS += -mstack-protector-guard-offset=$(shell awk '{if ($$2 == "TASK_CANARY") print $$3;}' include/generated/asm-offsets.h)) endif +endif # Use the file '.tmp_gas_check' for binutils tests, as gas won't output # to stdout and these checks are run even on install targets. diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 6d6b3706232c..98d883e58945 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -246,6 +246,9 @@ struct paca_struct { struct slb_entry *mce_faulty_slbs; u16 slb_save_cache_ptr; #endif /* CONFIG_PPC_BOOK3S_64 */ +#ifdef CONFIG_STACKPROTECTOR + unsigned long canary; +#endif } cacheline_aligned; extern struct paca_struct **paca_ptrs; diff --git a/arch/powerpc/include/asm/stackprotector.h b/arch/powerpc/include/asm/stackprotector.h index d05d969c98c2..1c8460e23583 100644 --- a/arch/powerpc/include/asm/stackprotector.h +++ b/arch/powerpc/include/asm/stackprotector.h @@ -11,6 +11,7 @@ #include #include #include +#include /* * Initialize the stackprotector canary value. 
@@ -29,6 +30,9 @@ static __always_inline void boot_init_stack_canary(void) canary &= CANARY_MASK; current->stack_canary = canary; +#ifdef CONFIG_PPC64 + get_paca()->canary = canary; +#endif } #endif /* _ASM_STACKPROTECTOR_H */ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index a992f7a53cf3..773dee55b3f6 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -81,6 +81,9 @@ int main(void) OFFSET(MM, task_struct, mm); #ifdef CONFIG_STACKPROTECTOR OFFSET(TASK_CANARY, task_struct, stack_canary); +#ifdef CONFIG_PPC64 + OFFSET(PACA_CANARY, paca_struct, canary); +#endif #endif OFFSET(MMCONTEXTID, mm_struct, context.id); #ifdef CONFIG_PPC64 diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 77a888bfcb53..573fa879d785 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -624,6 +624,10 @@ _GLOBAL(_switch) addir6,r4,-THREAD /* Convert THREAD to 'current' */ std r6,PACACURRENT(r13) /* Set new 'current' */ +#if defined(CONFIG_STACKPROTECTOR) + ld r6, TASK_CANARY(r6) + std r6, PACA_CANARY(r13) +#endif ld r8,KSP(r4) /* new stack pointer */ #ifdef CONFIG_PPC_BOOK3S_64 -- 2.13.3
[PATCH v5 1/2] powerpc/32: add stack protector support
This functionality was tentatively added in the past (commit 6533b7c16ee5 ("powerpc: Initial stack protector (-fstack-protector) support")) but had to be reverted (commit f2574030b0e3 ("powerpc: Revert the initial stack protector support") because of GCC implementing it differently whether it had been built with libc support or not. Now, GCC offers the possibility to manually set the stack-protector mode (global or tls) regardless of libc support. This time, the patch selects HAVE_STACKPROTECTOR only if -mstack-protector-guard=tls is supported by GCC. On PPC32, as register r2 points to current task_struct at all time, the stack_canary located inside task_struct can be used directly by using the following GCC options: -mstack-protector-guard=tls -mstack-protector-guard-reg=r2 -mstack-protector-guard-offset=offsetof(struct task_struct, stack_canary)) The protector is disabled for prom_init and bootx_init as it is too early to handle it properly. $ echo CORRUPT_STACK > /sys/kernel/debug/provoke-crash/DIRECT [ 134.943666] Kernel panic - not syncing: stack-protector: Kernel stack is corrupted in: lkdtm_CORRUPT_STACK+0x64/0x64 [ 134.943666] [ 134.955414] CPU: 0 PID: 283 Comm: sh Not tainted 4.18.0-s3k-dev-12143-ga3272be41209 #835 [ 134.963380] Call Trace: [ 134.965860] [c6615d60] [c001f76c] panic+0x118/0x260 (unreliable) [ 134.971775] [c6615dc0] [c001f654] panic+0x0/0x260 [ 134.976435] [c6615dd0] [c032c368] lkdtm_CORRUPT_STACK_STRONG+0x0/0x64 [ 134.982769] [c6615e00] [] 0x Signed-off-by: Christophe Leroy --- v5: Using get_random_canary() and masking canary with CANARY_MASK v4: disable stack protector in bootx_init v3: the offset is now defined by a rule in the Makefile. 
No need anymore to take stack_canary out of the randomised area of task_struct arch/powerpc/Kconfig | 1 + arch/powerpc/Makefile | 10 + arch/powerpc/include/asm/stackprotector.h | 34 +++ arch/powerpc/kernel/Makefile | 2 ++ arch/powerpc/kernel/asm-offsets.c | 3 +++ arch/powerpc/platforms/powermac/Makefile | 1 + 6 files changed, 51 insertions(+) create mode 100644 arch/powerpc/include/asm/stackprotector.h diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index a80669209155..3bcb05929931 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -180,6 +180,7 @@ config PPC select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_TRACEHOOK select HAVE_CBPF_JITif !PPC64 + select HAVE_STACKPROTECTOR if $(cc-option,-mstack-protector-guard=tls) && PPC32 select HAVE_CONTEXT_TRACKINGif PPC64 select HAVE_DEBUG_KMEMLEAK select HAVE_DEBUG_STACKOVERFLOW diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index 07d9dce7eda6..45b8eb4d8fe7 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -112,6 +112,9 @@ KBUILD_LDFLAGS += -m elf$(BITS)$(LDEMULATION) KBUILD_ARFLAGS += --target=elf$(BITS)-$(GNUTARGET) endif +cflags-$(CONFIG_STACKPROTECTOR)+= -mstack-protector-guard=tls +cflags-$(CONFIG_STACKPROTECTOR)+= -mstack-protector-guard-reg=r2 + LDFLAGS_vmlinux-y := -Bstatic LDFLAGS_vmlinux-$(CONFIG_RELOCATABLE) := -pie LDFLAGS_vmlinux:= $(LDFLAGS_vmlinux-y) @@ -404,6 +407,13 @@ archclean: archprepare: checkbin +ifdef CONFIG_STACKPROTECTOR +prepare: stack_protector_prepare + +stack_protector_prepare: prepare0 + $(eval KBUILD_CFLAGS += -mstack-protector-guard-offset=$(shell awk '{if ($$2 == "TASK_CANARY") print $$3;}' include/generated/asm-offsets.h)) +endif + # Use the file '.tmp_gas_check' for binutils tests, as gas won't output # to stdout and these checks are run even on install targets. 
TOUT := .tmp_gas_check diff --git a/arch/powerpc/include/asm/stackprotector.h b/arch/powerpc/include/asm/stackprotector.h new file mode 100644 index ..d05d969c98c2 --- /dev/null +++ b/arch/powerpc/include/asm/stackprotector.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * GCC stack protector support. + * + */ + +#ifndef _ASM_STACKPROTECTOR_H +#define _ASM_STACKPROTECTOR_H + +#include +#include +#include +#include + +/* + * Initialize the stackprotector canary value. + * + * NOTE: this must only be called from functions that never return, + * and it must always be inlined. + */ +static __always_inline void boot_init_stack_canary(void) +{ + unsigned long canary; + + /* Try to get a semi random initial value. */ + canary = get_random_canary(); + canary ^= mftb(); + canary ^= LINUX_VERSION_CODE; + canary &= CANARY_MASK; + + current->stack_canary = canary; +} + +#endif /* _ASM_STACKPROTECTOR_H */ diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 1e64cfe22a83..85ffa488dfb5 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -20,6 +2