[PATCH kernel] powerpc/powernv/npu: Remove unused headers and a macro.

2018-09-27 Thread Alexey Kardashevskiy
The macro and a few headers are not used, so remove them.

Signed-off-by: Alexey Kardashevskiy 
---
 arch/powerpc/platforms/powernv/npu-dma.c | 14 --
 1 file changed, 14 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/npu-dma.c 
b/arch/powerpc/platforms/powernv/npu-dma.c
index 8006c54..3a5c4ed 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -9,32 +9,18 @@
  * License as published by the Free Software Foundation.
  */
 
-#include 
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
-#include 
-#include 
 
 #include 
-#include 
 #include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
 #include 
 
-#include "powernv.h"
 #include "pci.h"
 
-#define npu_to_phb(x) container_of(x, struct pnv_phb, npu)
-
 /*
  * spinlock to protect initialisation of an npu_context for a particular
  * mm_struct.
-- 
2.11.0



[PATCH kernel] KVM: PPC: Optimize clearing TCEs for sparse tables

2018-09-27 Thread Alexey Kardashevskiy
The powernv platform maintains 2 TCE tables for VFIO - a hardware TCE
table and a table with userspace addresses. These tables are radix trees;
we allocate indirect levels when they are written to. Since
the memory allocation is problematic in real mode, we have 2 accessors
to the entries:
- for virtual mode: it allocates the memory and it is always expected
to return non-NULL;
- for real mode: it does not allocate and can return NULL.

Also, DMA windows can span up to 55 bits of the address space and since
we never have this much RAM, such windows are sparse. However currently
the SPAPR TCE IOMMU driver walks through all TCEs to unpin DMA memory.

Since we maintain a table of userspace addresses for VFIO which mirrors
the hardware table, we can use it to know which parts of the DMA window
have not been mapped and skip them; that is what this patch does.

Bare metal systems do not have this problem as they use a PHB bypass mode
which maps RAM directly.

This helps a lot with sparse DMA windows, reducing the shutdown time from
about 3 minutes per 1 billion TCEs to a few seconds for a 32GB sparse guest.
Just skipping the last level seems to be good enough.

As the non-allocating accessor is now used in virtual mode as well, rename it
from IOMMU_TABLE_USERSPACE_ENTRY_RM (real mode) to _RO (read-only).
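
For illustration, a minimal sketch of the skip logic described above, using
the renamed accessor (the loop shape and helper names here are assumptions
for the example, not a copy of the final diff):

	unsigned long entry;

	for (entry = 0; entry < tbl->it_size; ++entry) {
		__be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);

		/* NULL means this indirect level was never allocated,
		 * so nothing in this block can be pinned: skip to the
		 * start of the next last-level block. */
		if (!pua) {
			entry |= tbl->it_level_size - 1;
			continue;
		}

		tce_iommu_unuse_page_v2(container, tbl, entry);
	}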

Signed-off-by: Alexey Kardashevskiy 
---
 arch/powerpc/include/asm/iommu.h|  2 +-
 arch/powerpc/kvm/book3s_64_vio.c|  5 ++---
 arch/powerpc/kvm/book3s_64_vio_hv.c |  6 +++---
 drivers/vfio/vfio_iommu_spapr_tce.c | 22 --
 4 files changed, 26 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 3d4b88c..35db0cb 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -126,7 +126,7 @@ struct iommu_table {
int it_nid;
 };
 
-#define IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry) \
+#define IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry) \
((tbl)->it_ops->useraddrptr((tbl), (entry), false))
 #define IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry) \
((tbl)->it_ops->useraddrptr((tbl), (entry), true))
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index c0c64d1..62a8d03 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -410,11 +410,10 @@ static long kvmppc_tce_iommu_mapped_dec(struct kvm *kvm,
 {
struct mm_iommu_table_group_mem_t *mem = NULL;
const unsigned long pgsize = 1ULL << tbl->it_page_shift;
-   __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);
+   __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);
 
if (!pua)
-   /* it_userspace allocation might be delayed */
-   return H_TOO_HARD;
+   return H_SUCCESS;
 
mem = mm_iommu_lookup(kvm->mm, be64_to_cpu(*pua), pgsize);
if (!mem)
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c 
b/arch/powerpc/kvm/book3s_64_vio_hv.c
index 389dac1..583031d 100644
--- a/arch/powerpc/kvm/book3s_64_vio_hv.c
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -212,7 +212,7 @@ static long iommu_tce_xchg_rm(struct mm_struct *mm, struct 
iommu_table *tbl,
 
if (!ret && ((*direction == DMA_FROM_DEVICE) ||
(*direction == DMA_BIDIRECTIONAL))) {
-   __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry);
+   __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);
/*
 * kvmppc_rm_tce_iommu_do_map() updates the UA cache after
 * calling this so we still get here a valid UA.
@@ -238,7 +238,7 @@ static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm,
 {
struct mm_iommu_table_group_mem_t *mem = NULL;
const unsigned long pgsize = 1ULL << tbl->it_page_shift;
-   __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry);
+   __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);
 
if (!pua)
/* it_userspace allocation might be delayed */
@@ -302,7 +302,7 @@ static long kvmppc_rm_tce_iommu_do_map(struct kvm *kvm, 
struct iommu_table *tbl,
 {
long ret;
unsigned long hpa = 0;
-   __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry);
+   __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);
struct mm_iommu_table_group_mem_t *mem;
 
if (!pua)
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c 
b/drivers/vfio/vfio_iommu_spapr_tce.c
index 96721b1..b1a8ab3 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -444,7 +444,7 @@ static void tce_iommu_unuse_page_v2(struct tce_container 
*container,
struct mm_iommu_table_group_mem_t *mem = NULL;
int ret;
unsigned long hpa = 0;
-   __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);
+   __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);
 
if (!pua)
ret

[PATCH kernel] powerpc/powernv/ioda: Allocate indirect TCE levels of cached userspace addresses on demand

2018-09-27 Thread Alexey Kardashevskiy
The powernv platform maintains 2 TCE tables for VFIO - a hardware TCE
table and a table with userspace addresses; the latter is used for
marking pages dirty when corresponding TCEs are unmapped from
the hardware table.

a68bd1267b72 ("powerpc/powernv/ioda: Allocate indirect TCE levels
on demand") enabled on-demand allocation of the hardware table,
however it missed the other table, which was still being fully allocated
at boot time. This fixes the issue by allocating a single level,
just like we do for the hardware table.

Fixes: a68bd1267b72 ("powerpc/powernv/ioda: Allocate indirect TCE levels on 
demand")
Signed-off-by: Alexey Kardashevskiy 
---
 arch/powerpc/platforms/powernv/pci-ioda-tce.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c 
b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
index fe96910..7639b21 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
@@ -299,7 +299,7 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 
bus_offset,
if (alloc_userspace_copy) {
offset = 0;
uas = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift,
-   levels, tce_table_size, &offset,
+   tmplevels, tce_table_size, &offset,
&total_allocated_uas);
if (!uas)
goto free_tces_exit;
-- 
2.11.0



[PATCH kernel] cxl: Remove unused include

2018-09-27 Thread Alexey Kardashevskiy
The included opal.h gives the wrong idea that CXL makes PPC OPAL calls
while it does not, so let's remove it.

Signed-off-by: Alexey Kardashevskiy 
---
 drivers/misc/cxl/pci.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c
index b66d832..8cbcbb7 100644
--- a/drivers/misc/cxl/pci.c
+++ b/drivers/misc/cxl/pci.c
@@ -17,7 +17,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
-- 
2.11.0



[PATCH] selftests/powerpc: Fix Makefiles for headers_install change

2018-09-27 Thread Michael Ellerman
Commit b2d35fa5fc80 ("selftests: add headers_install to lib.mk")
introduced a requirement that Makefiles more than one level below the
selftests directory need to define top_srcdir, but it didn't update
any of the powerpc Makefiles.

This broke building all the powerpc selftests with eg:

  make[1]: Entering directory '/src/linux/tools/testing/selftests/powerpc'
  BUILD_TARGET=/src/linux/tools/testing/selftests/powerpc/alignment; mkdir -p 
$BUILD_TARGET; make OUTPUT=$BUILD_TARGET -k -C alignment all
  make[2]: Entering directory 
'/src/linux/tools/testing/selftests/powerpc/alignment'
  ../../lib.mk:20: ../../../../scripts/subarch.include: No such file or 
directory
  make[2]: *** No rule to make target '../../../../scripts/subarch.include'.
  make[2]: Failed to remake makefile '../../../../scripts/subarch.include'.
  Makefile:38: recipe for target 'alignment' failed

Fix it by setting top_srcdir in the affected Makefiles.

Fixes: b2d35fa5fc80 ("selftests: add headers_install to lib.mk")
Signed-off-by: Michael Ellerman 
---
 tools/testing/selftests/powerpc/alignment/Makefile | 1 +
 tools/testing/selftests/powerpc/benchmarks/Makefile| 1 +
 tools/testing/selftests/powerpc/cache_shape/Makefile   | 1 +
 tools/testing/selftests/powerpc/copyloops/Makefile | 1 +
 tools/testing/selftests/powerpc/dscr/Makefile  | 1 +
 tools/testing/selftests/powerpc/math/Makefile  | 1 +
 tools/testing/selftests/powerpc/mm/Makefile| 1 +
 tools/testing/selftests/powerpc/pmu/Makefile   | 1 +
 tools/testing/selftests/powerpc/pmu/ebb/Makefile   | 1 +
 tools/testing/selftests/powerpc/primitives/Makefile| 1 +
 tools/testing/selftests/powerpc/ptrace/Makefile| 1 +
 tools/testing/selftests/powerpc/signal/Makefile| 1 +
 tools/testing/selftests/powerpc/stringloops/Makefile   | 1 +
 tools/testing/selftests/powerpc/switch_endian/Makefile | 1 +
 tools/testing/selftests/powerpc/syscalls/Makefile  | 1 +
 tools/testing/selftests/powerpc/tm/Makefile| 1 +
 tools/testing/selftests/powerpc/vphn/Makefile  | 1 +
 17 files changed, 17 insertions(+)

diff --git a/tools/testing/selftests/powerpc/alignment/Makefile 
b/tools/testing/selftests/powerpc/alignment/Makefile
index 93baacab7693..d056486f49de 100644
--- a/tools/testing/selftests/powerpc/alignment/Makefile
+++ b/tools/testing/selftests/powerpc/alignment/Makefile
@@ -1,5 +1,6 @@
 TEST_GEN_PROGS := copy_first_unaligned alignment_handler
 
+top_srcdir = ../../../../..
 include ../../lib.mk
 
 $(TEST_GEN_PROGS): ../harness.c ../utils.c
diff --git a/tools/testing/selftests/powerpc/benchmarks/Makefile 
b/tools/testing/selftests/powerpc/benchmarks/Makefile
index b4d7432a0ecd..d40300a65b42 100644
--- a/tools/testing/selftests/powerpc/benchmarks/Makefile
+++ b/tools/testing/selftests/powerpc/benchmarks/Makefile
@@ -4,6 +4,7 @@ TEST_GEN_FILES := exec_target
 
 CFLAGS += -O2
 
+top_srcdir = ../../../../..
 include ../../lib.mk
 
 $(TEST_GEN_PROGS): ../harness.c
diff --git a/tools/testing/selftests/powerpc/cache_shape/Makefile 
b/tools/testing/selftests/powerpc/cache_shape/Makefile
index 1be547434a49..ede4d3dae750 100644
--- a/tools/testing/selftests/powerpc/cache_shape/Makefile
+++ b/tools/testing/selftests/powerpc/cache_shape/Makefile
@@ -5,6 +5,7 @@ all: $(TEST_PROGS)
 
 $(TEST_PROGS): ../harness.c ../utils.c
 
+top_srcdir = ../../../../..
 include ../../lib.mk
 
 clean:
diff --git a/tools/testing/selftests/powerpc/copyloops/Makefile 
b/tools/testing/selftests/powerpc/copyloops/Makefile
index 1cf89a34d97c..44574f3818b3 100644
--- a/tools/testing/selftests/powerpc/copyloops/Makefile
+++ b/tools/testing/selftests/powerpc/copyloops/Makefile
@@ -17,6 +17,7 @@ TEST_GEN_PROGS := copyuser_64_t0 copyuser_64_t1 
copyuser_64_t2 \
 
 EXTRA_SOURCES := validate.c ../harness.c stubs.S
 
+top_srcdir = ../../../../..
 include ../../lib.mk
 
 $(OUTPUT)/copyuser_64_t%:  copyuser_64.S $(EXTRA_SOURCES)
diff --git a/tools/testing/selftests/powerpc/dscr/Makefile 
b/tools/testing/selftests/powerpc/dscr/Makefile
index 55d7db7a616b..5df476364b4d 100644
--- a/tools/testing/selftests/powerpc/dscr/Makefile
+++ b/tools/testing/selftests/powerpc/dscr/Makefile
@@ -3,6 +3,7 @@ TEST_GEN_PROGS := dscr_default_test dscr_explicit_test 
dscr_user_test   \
  dscr_inherit_test dscr_inherit_exec_test dscr_sysfs_test  \
  dscr_sysfs_thread_test
 
+top_srcdir = ../../../../..
 include ../../lib.mk
 
 $(OUTPUT)/dscr_default_test: LDLIBS += -lpthread
diff --git a/tools/testing/selftests/powerpc/math/Makefile 
b/tools/testing/selftests/powerpc/math/Makefile
index 0dd3a01fdab9..11a10d7a2bbd 100644
--- a/tools/testing/selftests/powerpc/math/Makefile
+++ b/tools/testing/selftests/powerpc/math/Makefile
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 TEST_GEN_PROGS := fpu_syscall fpu_preempt fpu_signal vmx_syscall vmx_preempt 
vmx_signal vsx_preempt
 
+top_srcdir = ../../../../..
 include ../../lib.mk
 
 $(TEST_GEN_PROGS): ../harness.

Re: [RFC PATCH 08/11] powerpc/tm: Do not reclaim on ptrace

2018-09-27 Thread Michael Neuling
On Thu, 2018-09-27 at 18:03 -0300, Breno Leitao wrote:
> Hi Mikey,
> 
> On 09/18/2018 02:36 AM, Michael Neuling wrote:
> > On Wed, 2018-09-12 at 16:40 -0300, Breno Leitao wrote:
> > > Make sure that we are not suspended on ptrace and that the registers were
> > > already reclaimed.
> > > 
> > > Since the data was already reclaimed, there is nothing to be done here
> > > except to restore the SPRs.
> > > 
> > > Signed-off-by: Breno Leitao 
> > > ---
> > >  arch/powerpc/kernel/ptrace.c | 10 --
> > >  1 file changed, 4 insertions(+), 6 deletions(-)
> > > 
> > > diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c
> > > index 9667666eb18e..cf6ee9154b11 100644
> > > --- a/arch/powerpc/kernel/ptrace.c
> > > +++ b/arch/powerpc/kernel/ptrace.c
> > > @@ -136,12 +136,10 @@ static void flush_tmregs_to_thread(struct
> > > task_struct
> > > *tsk)
> > >   if ((!cpu_has_feature(CPU_FTR_TM)) || (tsk != current))
> > >   return;
> > >  
> > > - if (MSR_TM_SUSPENDED(mfmsr())) {
> > > - tm_reclaim_current(TM_CAUSE_SIGNAL);
> > > - } else {
> > > - tm_enable();
> > > - tm_save_sprs(&(tsk->thread));
> > > - }
> > > + WARN_ON(MSR_TM_SUSPENDED(mfmsr()));
> > > +
> > > + tm_enable();
> > > + tm_save_sprs(&(tsk->thread));
> > 
> > Do we need to check if TM was enabled in the task before saving the TM SPRs?
> > 
> > What happens if TM was lazily off and hence we had someone else's TM SPRs in
> > the
> > CPU currently?  Wouldn't this flush the wrong values to the task_struct?
> > 
> > I think we need to check the process's MSR before doing this.
> 
> Yes, it is a *very* good point, and I think we are vulnerable even before
> this patch (in the current kernel). Take a look above, we are calling
> tm_save_sprs() if MSR is not TM suspended, independently of whether TM is lazily off.

I think you're right, we might already have an issue.  There are some paths in
here that don't check the userspace msr or any of the lazy tm enable. :(

Mikey


> It shouldn't be hard to create a test case for it. I can try to call
> ptrace(PTRACE_GETVRREGS) on a task that sleeps until TM is lazily disabled,
> compare if the TM SPR changed in this mean time. (while doing HTM in parallel
> to keep HTM SPR changing). Let's see if I can read others task TM SPRs.
> 
> Thank you.
> 
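
A minimal sketch of the guard Mikey is suggesting, as it might look in
flush_tmregs_to_thread() (hypothetical placement; MSR_TM is the task's
TM-enable bit):

	/* Only save the TM SPRs if this task actually has TM enabled;
	 * with lazy TM the CPU may hold another task's TM SPRs, which
	 * must not be flushed into this task_struct. */
	if (!(tsk->thread.regs->msr & MSR_TM))
		return;

	tm_enable();
	tm_save_sprs(&tsk->thread);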


Re: [RFC PATCH 05/11] powerpc/tm: Function that updates the failure code

2018-09-27 Thread Michael Neuling
On Thu, 2018-09-27 at 17:58 -0300, Breno Leitao wrote:
> Hi Mikey,
> 
> On 09/17/2018 10:29 PM, Michael Neuling wrote:
> > On Wed, 2018-09-12 at 16:40 -0300, Breno Leitao wrote:
> > > Now the transaction reclaim happens much earlier, in the trap handler, and
> > > it is impossible to know precisely, at that early time, what should be set
> > > as the failure cause for some specific cases; for example, if the task is
> > > later rescheduled, the transaction abort cause should be updated from
> > > TM_CAUSE_MISC to TM_CAUSE_RESCHED.
> > 
> > Please add comments to where this is used (in EXCEPTION_COMMON macro I
> > think)
> > that say this might happen.
> 
> Is it OK if I comment it in the TM_KERNEL_ENTRY macro? Since the failure
> cause can be updated independently of which exception is executed, every
> call to TM_KERNEL_ENTRY can have the cause overwritten.

Sure.


> I.e. it does not matter if the exception is a system call or a page fault,
> the failure cause will be updated if there is a process reschedule after
> the exception/syscall is handled.
> 
> Thank you
> 
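
A sketch of the kind of comment being agreed on for the TM_KERNEL_ENTRY
macro (the wording is illustrative, not taken from the patch):

	/*
	 * The failure cause recorded at reclaim time is provisional:
	 * no matter which exception or syscall entered the kernel, if
	 * the task is rescheduled before returning to userspace the
	 * cause is overwritten, e.g. from TM_CAUSE_MISC to
	 * TM_CAUSE_RESCHED.
	 */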


Re: [RFC PATCH 11/11] selftests/powerpc: Adapt the test

2018-09-27 Thread Michael Neuling
On Thu, 2018-09-27 at 17:57 -0300, Breno Leitao wrote:
> Hi Mikey,
> 
> On 09/18/2018 03:36 AM, Michael Neuling wrote:
> > On Wed, 2018-09-12 at 16:40 -0300, Breno Leitao wrote:
> > > The Documentation/powerpc/transactional_memory.txt says:
> > > 
> > >  "Syscalls made from within a suspended transaction are performed as
> > > normal
> > >   and the transaction is not explicitly doomed by the kernel.  However,
> > >   what the kernel does to perform the syscall may result in the
> > > transaction
> > >   being doomed by the hardware."
> > > 
> > > With this new TM mechanism, the syscall will continue to be executed if
> > > the syscall happens in a suspended transaction, but the syscall will
> > > *fail* if the transaction is still active during the syscall invocation.
> > 
> > Not sure I get this. This doesn't seem any different to before.
> > 
> > An active (not suspended) transaction *will* result in the syscall
> > failing and the transaction being doomed.
> > 
> > A syscall in a suspended transaction should succeed and the transaction.
> 
> I understand that a transaction will always be doomed if there is a
> reclaim/checkpoint, thus, if you make a system call inside a suspended
> transaction, it will reclaim and recheckpoint, thus, dooming the transaction,
> and failing it on the next RFID.

So currently a syscall won't explicitly treclaim/trecheckpoint so it won't
necessarily be doomed.

With your new patches it will be.

> So, the syscall executed in suspended mode may succeed, because it will not
> be skipped (as in active mode), but the transaction will *always* fail,
> because there was a reclaim and recheckpoint.
> 
> > You might need to clean up the language. I try to use:
> > 
> >Active == transactional but not suspended (ie MSR[TS] = T)
> >Suspended == suspended (ie MSR [TS] = S)
> >Doomed == transaction to be rolled back at next opportinity (ie tcheck
> > returns doomed)
> > 
> > (note: the kernel MSR_TM_ACTIVE() macro is not consistent with this since
> > it's
> > MSR[TS] == S or T).
> 
> Ok, I agree with this language as well. I might want to improve the code to
> follow the same language all across the code.
> 
> > > On the syscall path, if the transaction is active and not suspended, it
> > > will call TM_KERNEL_ENTRY which will reclaim and recheckpoint the
> > > transaction, thus, dooming the transaction on userspace return, with
> > > failure code TM_CAUSE_SYSCALL.
> > 
> > But the test below is on a suspend transaction?
> 
> Sorry, I meant "suspended transaction" above instead of "transaction is
> active and not suspended".
> 
> > 
> > > This new model will break part of this test, but I understand that the
> > > documentation above didn't guarantee that the syscall would succeed, and
> > > it will never succeed from now on.
> > 
> > The syscall should pass in suspend (modulo the normal syscall checks). The
> > transaction may fail as a result.
> 
> Perfect, and if the transaction fails, the CPU will roll back the changes
> and restore the checkpointed registers (replacing the r3 that contains the
> pid value); thus, it will be as if the "getppid" system call didn't execute.

No.  If we are suspended, then we go back right after the sc. We don't get
rolled back till the tresume.

Mikey

> For this test specifically, it assumes the syscall didn't execute if the
> transaction failed. Take a look:
> 
>   FUNC_START(getppid_tm_suspended)
>   tbegin.
>   beq 1f
>   li  r0, __NR_getppid
>   tsuspend.
>   sc
>   tresume.
>   tend.
>   blr
>   1:
>   li  r3, -1
>   blr
> 
> So, the test thinks the syscall failed because the transaction aborted.
> 
> Anyway, I can try to improve this test other than removing this test, but I
> am not sure how.
> 
> Thank you
> 


Re: [RFC PATCH 10/11] powerpc/tm: Set failure summary

2018-09-27 Thread Michael Neuling
On Thu, 2018-09-27 at 17:52 -0300, Breno Leitao wrote:
> Hi Mikey,
> 
> On 09/18/2018 02:50 AM, Michael Neuling wrote:
> > On Wed, 2018-09-12 at 16:40 -0300, Breno Leitao wrote:
> > > Since the transaction will be doomed with trecheckpoint, the TEXASR[FS]
> > > should be set, to reflect that the transaction is a failure. This patch
> > > ensures it before recheckpointing, and removes changes from other places
> > > that were calling recheckpoint.
> > 
> > TEXASR[FS] should be set by the reclaim. 
> 
> Do you mean that the CPU should set TEXASR[FS] when treclaim is called, or
> that tm_reclaim() should?
> 
> Looking at the ISA, I didn't see TEXASR[FS] being set automatically when a
> reclaim happens, 

treclaim in the ISA talks about "TMRecordFailure(cause)" and that macro sets
TEXASR[FS]=1. 

So yes treclaim always sets TEXASR[FS]=1.

> although, I see it needs TEXASR[FS] to be set when executing a 
> trecheckpoint, otherwise it will cause a TM Bad Thing.

Yep

> That is why I am forcing TEXASR[FS]=1 to doom the transaction so we can
> recheckpoint it, but it seems I understood this wrongly.

You shouldn't need to.  We do a bug_on() just before the trecheckpoint to make
sure it's set, but you shouldn't need to set it (other than the signals case).

> > I don't know why you'd need to set this
> > explicitly in process.c. The only case is when the user supplies a bad
> > signal
> > context, but we should check that in the signals code, not process.c
> > 
> > Hence I think this patch is wrong.
> > 
> > Also, according to the architecture, TEXASR[FS] HAS TO BE SET on
> > trecheckpoint otherwise you'll get a TM Bad Thing. You should say that
> > rather than suggesting it's because the transaction is doomed. It's
> > illegal to not do it. That's why we have this check in
> > arch/powerpc/kernel/tm.S.
> 
> When you say "HAS TO BE SET", do you mean that the hardware will set it and
> we shouldn't care about this flag? Thus, if I am hitting EMIT_BUG_ENTRY, it
> means my TEXASR was messed somehow?

I'm just saying what you said before about the tm bad thing.  We have to make
sure it's set before we do the trecheckpoint otherwise we end up in the TM bad
thing. The treclaim should handle this for us (but again the signals case
needs to be checked).

Mikey
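
A minimal sketch of the invariant under discussion (illustrative; the
kernel's actual sanity check lives in arch/powerpc/kernel/tm.S):

	/* TEXASR[FS] must be set before trecheckpoint, otherwise the
	 * CPU raises a TM Bad Thing. treclaim always sets it, so by
	 * this point only a corrupt user-supplied signal context can
	 * have cleared it. */
	WARN_ON(!(mfspr(SPRN_TEXASR) & TEXASR_FS));
	tm_recheckpoint(&tsk->thread);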


Re: [RFC PATCH 09/11] powerpc/tm: Do not restore default DSCR

2018-09-27 Thread Michael Neuling
On Thu, 2018-09-27 at 17:51 -0300, Breno Leitao wrote:
> Hi Mikey,
> 
> On 09/18/2018 02:41 AM, Michael Neuling wrote:
> > On Wed, 2018-09-12 at 16:40 -0300, Breno Leitao wrote:
> > > In the previous TM code, trecheckpoint was being executed in the middle of
> > > an exception, thus, DSCR was being restored to default kernel DSCR value
> > > after trecheckpoint was done.
> > > 
> > > With this current patchset, trecheckpoint is executed just before getting
> > > to userspace, at ret_from_except_lite, for example. Thus, we do not
> > > need to set the default kernel DSCR value anymore, as we are leaving
> > > kernel space. It is OK to keep the checkpointed DSCR value in the live
> > > SPR, mainly because the transaction is doomed and it will fail soon
> > > (after RFID), 
> > 
> > What if we are going back to a suspended transaction? It will remain
> > live until userspace does a tresume.
> 
> Hmm, I understand that once we get in kernel space, and call
> treclaim/trecheckpoint, the transaction will be doomed and it will abort and
> roll back when we leave kernel space. I.e., if we can treclaim/trecheckpoint
> in kernel space, the transaction will *always* abort just after RFID, so, a
> possible tresume will never be executed. Is my understanding wrong?

Your understanding is wrong. We don't roll back until MSR[TS] = T.

There are two cases for the RFID.

1) if you RFID back to userspace that is transactional (ie MSR[TS] = T), then it
will immediately rollback as soon as the RFID happens since we are going
directly to T.

2) If you RFID back to userspace that is suspended (ie MSR[TS] = S), then it
won't roll back until userspace does a tresume. It doesn't roll back until
we go MSR[TS] = S -> T.

> > 
> > > so,
> > > continuing with the pre-checkpointed DSCR value is what seems correct.
> > 
> > Reading this description suggests this patch isn't really needed. Right?
> 
> Maybe the description is not clear, but I understand this patch is required,
> otherwise we will return to userspace with the default DSCR value.
> 
> By the way, if there is a change to the DSCR inside a transaction, do you
> know whether it will be reverted if the transaction is aborted?

Yes it will be reverted. We even have a selftest for it in
tools/testing/selftests/powerpc/tm/tm-resched-dscr.c

There are a bunch of checkpointed SPRs. From the arch:
   Checkpointed registers: The set of registers that are
   saved to the “checkpoint area” when a transaction is
   initiated, and restored upon transaction failure, is a
   subset of the architected register state, consisting of
   the General Purpose Registers, Floating-Point Registers,
   Vector Registers, Vector-Scalar Registers, and the
   following Special Registers and fields: CR fields other
   than CR0, XER, LR, CTR, FPSCR, AMR, PPR,
   VRSAVE, VSCR, DSCR, and TAR.

Mikey


Re: [PATCH v6] selftests: add headers_install to lib.mk

2018-09-27 Thread Michael Ellerman
[ + linuxppc-dev ]

Anders Roxell  writes:
> If the kernel headers aren't installed we can't build all the tests.
> Add a new make target rule 'khdr' in the file lib.mk to generate the
> kernel headers; it gets included for every test-dir Makefile that
> includes lib.mk. If the testdir in turn has its own sub-dirs,
> top_srcdir needs to be set to the Linux root dir to be able to generate
> the kernel headers.
>
> Signed-off-by: Anders Roxell 
> Reviewed-by: Fathi Boudra 
> ---
>
> I sent this (v5) a month ago and wondered if it got lost. Resending
> unchanged.
>
> Cheers,
> Anders
>
>  Makefile   | 14 +-
>  scripts/subarch.include| 13 +
>  tools/testing/selftests/android/Makefile   |  2 +-
>  tools/testing/selftests/android/ion/Makefile   |  2 ++
>  tools/testing/selftests/futex/functional/Makefile  |  1 +
>  tools/testing/selftests/gpio/Makefile  |  7 ++-
>  tools/testing/selftests/kvm/Makefile   |  7 ++-
>  tools/testing/selftests/lib.mk | 12 
>  tools/testing/selftests/net/Makefile   |  1 +
>  .../selftests/networking/timestamping/Makefile |  1 +
>  tools/testing/selftests/vm/Makefile|  4 
>  11 files changed, 36 insertions(+), 28 deletions(-)
>  create mode 100644 scripts/subarch.include

This broke all the powerpc selftests :(

Why did it go in at rc5?

cheers


[PATCH] powerpc/numa: Skip onlining a offline node in kdump path

2018-09-27 Thread Srikar Dronamraju
With Commit 2ea626306810 ("powerpc/topology: Get topology for shared
processors at boot"), a kdump kernel on a shared LPAR may crash.

The necessary conditions are:
- A shared LPAR with at least 2 nodes having memory and CPUs.
- The memory requirement for the kdump kernel must be met by the first N-1
  nodes where there are at least N nodes with memory and CPUs.

Example numactl output from such a machine:
 numactl -H
available: 5 nodes (0,2,5-7)
node 0 cpus:
node 0 size: 0 MB
node 0 free: 0 MB
node 2 cpus:
node 2 size: 255 MB
node 2 free: 189 MB
node 5 cpus: 24 25 26 27 28 29 30 31
node 5 size: 4095 MB
node 5 free: 4024 MB
node 6 cpus: 0 1 2 3 4 5 6 7 16 17 18 19 20 21 22 23
node 6 size: 6353 MB
node 6 free: 5998 MB
node 7 cpus: 8 9 10 11 12 13 14 15 32 33 34 35 36 37 38 39
node 7 size: 7640 MB
node 7 free: 7164 MB
node distances:
node   0   2   5   6   7
  0:  10  40  40  40  40
  2:  40  10  40  40  40
  5:  40  40  10  40  40
  6:  40  40  40  10  20
  7:  40  40  40  20  10

Steps to reproduce.
1. Load / start kdump service.
2. Trigger a kdump (for example : echo c > /proc/sysrq-trigger)

When booting a kdump kernel with 2048M:
kexec: Starting switchover sequence.
I'm in purgatory
Using 1TB segments
hash-mmu: Initializing hash mmu with SLB
Linux version 4.19.0-rc5-master+ (srikar@linux-xxu6) (gcc version 4.8.5 (SUSE 
Linux)) #1 SMP Thu Sep 27 19:45:00 IST 2018
Found initrd at 0xc9e7:0xcae554b4
Using pSeries machine description
-
ppc64_pft_size= 0x1e
phys_mem_size = 0x8800
dcache_bsize  = 0x80
icache_bsize  = 0x80
cpu_features  = 0x00ff8f5d91a7
  possible= 0xfbffcf5fb1a7
  always  = 0x006f8b5c91a1
cpu_user_features = 0xdc0065c2 0xef00
mmu_features  = 0x7c006001
firmware_features = 0x0007c45bfc57
htab_hash_mask= 0x7f
physical_start= 0x800
-
numa:   NODE_DATA [mem 0x87d5e300-0x87d67fff]
numa: NODE_DATA(0) on node 6
numa:   NODE_DATA [mem 0x87d54600-0x87d5e2ff]
Top of RAM: 0x8800, Total RAM: 0x8800
Memory hole size: 0MB
Zone ranges:
  DMA  [mem 0x-0x87ff]
  DMA32empty
  Normal   empty
Movable zone start for each node
Early memory node ranges
  node   6: [mem 0x-0x87ff]
Could not find start_pfn for node 0
Initmem setup node 0 [mem 0x-0x]
On node 0 totalpages: 0
Initmem setup node 6 [mem 0x-0x87ff]
On node 6 totalpages: 34816

Unable to handle kernel paging request for data at address 0x0060
Faulting instruction address: 0xc8703a54
Oops: Kernel access of bad area, sig: 11 [#1]
LE SMP NR_CPUS=2048 NUMA pSeries
Modules linked in:
CPU: 11 PID: 1 Comm: swapper/11 Not tainted 4.19.0-rc5-master+ #1
NIP:  c8703a54 LR: c8703a38 CTR: 
REGS: cb673440 TRAP: 0380   Not tainted  (4.19.0-rc5-master+)
MSR:  82009033   CR: 24022022  XER: 2002
CFAR: c86fc238 IRQMASK: 0
GPR00: c8703a38 cb6736c0 c9281900 
GPR04:   f001 cb660080
GPR08:    0220
GPR12: 2200 c9e51400  0008
GPR16:  c8c152e8 c8c152a8 
GPR20: c9422fd8 c9412fd8 c9426040 0008
GPR24:   c9168bc8 c9168c78
GPR28: cb126410  c916a0b8 cb126400
NIP [c8703a54] bus_add_device+0x84/0x1e0
LR [c8703a38] bus_add_device+0x68/0x1e0
Call Trace:
[cb6736c0] [c8703a38] bus_add_device+0x68/0x1e0 (unreliable)
[cb673740] [c8700194] device_add+0x454/0x7c0
[cb673800] [c872e660] __register_one_node+0xb0/0x240
[cb673860] [c839a6bc] __try_online_node+0x12c/0x180
[cb673900] [c839b978] try_online_node+0x58/0x90
[cb673930] [c80846d8] find_and_online_cpu_nid+0x158/0x190
[cb673a10] [c80848a0] numa_update_cpu_topology+0x190/0x580
[cb673c00] [c8d3f2e4] smp_cpus_done+0x94/0x108
[cb673c70] [c8d5c00c] smp_init+0x174/0x19c
[cb673d00] [c8d346b8] kernel_init_freeable+0x1e0/0x450
[cb673dc0] [c80102e8] kernel_init+0x28/0x160
[cb673e30] [c800b65c] ret_from_kernel_thread+0x5c/0x80
Instruction dump:
6000 6000 e89e0020 7fe3fb78 4bff87d5 6000 7c7d1b79 4082008c
e8bf0050 e93e0098 3b9f0010 2fa5  38630018 419e0114 7f84e378
---[ end trace 593577668c2daa65 ]---

However, a regular kernel with 4096M (2048M reserved for the
crash kernel) boots properly.

Unlike regular kernels, which mark all available nodes as online, the kdump
kernel only marks just enough nodes a
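
A hedged sketch of the kind of guard this description implies for
find_and_online_cpu_nid() (hypothetical; the names are assumed from the
backtrace above, not the actual diff):

	/* Before using a node id derived from firmware associativity,
	 * make sure the kdump kernel actually brought that node up;
	 * fall back to an online node rather than onlining (and
	 * crashing on) an uninitialized one. */
	if (new_nid < 0 || !node_possible(new_nid) || !node_online(new_nid))
		new_nid = first_online_node;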

Re: [PATCH] powerpc: wire up memtest

2018-09-27 Thread Michael Ellerman
Christophe Leroy  writes:
> Add call to early_memtest() so that kernel compiled with
> CONFIG_MEMTEST really perform memtest at startup when requested
> via 'memtest' boot parameter.
>
> Signed-off-by: Christophe Leroy 
> ---
>  arch/powerpc/kernel/setup-common.c | 3 +++
>  1 file changed, 3 insertions(+)
>
> diff --git a/arch/powerpc/kernel/setup-common.c 
> b/arch/powerpc/kernel/setup-common.c
> index 93fa0c99681e..904b728eb20d 100644
> --- a/arch/powerpc/kernel/setup-common.c
> +++ b/arch/powerpc/kernel/setup-common.c
> @@ -33,6 +33,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -917,6 +918,8 @@ void __init setup_arch(char **cmdline_p)
>   /* Parse memory topology */
>   mem_topology_setup();
>  
> + early_memtest(min_low_pfn << PAGE_SHIFT, max_low_pfn << PAGE_SHIFT);

On a ppc64le VM this boils down to early_memtest(0, 0) for me.

I think it's too early, we don't set up max_low_pfn until
initmem_init().

If I move it after initmem_init() then it does something more useful:

early_memtest: # of tests: 17
  0x01450580 - 0x01450800 pattern 4c494e5558726c7a
  0x01450c00 - 0x0360 pattern 4c494e5558726c7a
  0x047c - 0x2fff pattern 4c494e5558726c7a
  0x3000 - 0x3ff24000 pattern 4c494e5558726c7a
  0x3fff4000 - 0x3fff4c00 pattern 4c494e5558726c7a
  0x3fff5000 - 0x3fff5300 pattern 4c494e5558726c7a
  0x3fff5c00 - 0x3fff5f00 pattern 4c494e5558726c7a
  0x3fff6800 - 0x3fff6b00 pattern 4c494e5558726c7a
  0x3fff7400 - 0x3fff7700 pattern 4c494e5558726c7a
  0x3fff8000 - 0x3fff8300 pattern 4c494e5558726c7a
  0x3fff8c00 - 0x3fff8f00 pattern 4c494e5558726c7a
  0x3fff9800 - 0x3fff9b00 pattern 4c494e5558726c7a
  0x3fffa400 - 0x3fffa700 pattern 4c494e5558726c7a
  0x3fffb000 - 0x3fffb300 pattern 4c494e5558726c7a
  0x3fffbc00 - 0x3fffbf00 pattern 4c494e5558726c7a
  0x3fffc800 - 0x3fffcb00 pattern 4c494e5558726c7a
  0x3fffd400 - 0x3fffd700 pattern 4c494e5558726c7a
  0x3fffe000 - 0x3fffe100 pattern 4c494e5558726c7a
  0x4000 - 0xffc1 pattern 4c494e5558726c7a
  0xfffa - 0xfffa5b00 pattern 4c494e5558726c7a
  0x0001 - 0x0001ffbe pattern 4c494e5558726c7a
  0x0001fff6 - 0x0001fff61b00 pattern 4c494e5558726c7a
  0x0001fffec000 - 0x0001fffec4b8 pattern 4c494e5558726c7a
  0x0001fffec524 - 0x0001fffec528 pattern 4c494e5558726c7a


cheers


Re: [PATCH v4] powerpc/64s: reimplement book3s idle code in C

2018-09-27 Thread Nicholas Piggin
On Wed, 26 Sep 2018 19:39:14 +0530
Akshay Adiga  wrote:

> On Fri, Sep 14, 2018 at 11:52:40AM +1000, Nicholas Piggin wrote:
> > +
> > +   /*
> > +* On POWER9, SRR1 bits do not match exactly as expected.
> > +* SRR1_WS_GPRLOSS (10b) can also result in SPR loss, so
> > +* always test PSSCR if there is any state loss.
> > +*/
> > +   if (likely((psscr & PSSCR_RL_MASK) < pnv_first_hv_loss_level)) {  
> 
> Shouldn't we check the PLS field to see if the cpu/core woke up from HV loss?
> 
> Currently, if a cpu requested stop4 (RL=4) but exited from a shallower state
> (PLS=2), SPRs are unnecessarily restored.
> 
> We can do something like:
> 
> #define   PSSCR_PLS_SHIFT 60
> if (likely(((psscr & PSSCR_PLS) >> PSSCR_PLS_SHIFT) < pnv_first_hv_loss_level))


Ah, that corresponds with the following existing code?

/*
 * POWER ISA 3. Use PSSCR to determine if we
 * are waking up from deep idle state
 */
LOAD_REG_ADDRBASE(r5,pnv_first_deep_stop_state)
ld  r4,ADDROFF(pnv_first_deep_stop_state)(r5)

/*
 * 0-3 bits correspond to Power-Saving Level Status
 * which indicates the idle state we are waking up from
 */
mfspr   r5, SPRN_PSSCR
rldicl  r5,r5,4,60
li  r0, 0   /* clear requested_psscr to say we're awake */
std r0, PACA_REQ_PSSCR(r13)
cmpdcr4,r5,r4
bge cr4,pnv_wakeup_tb_loss /* returns to caller */

Yes I didn't get that right, good catch.

Thanks,
Nick
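
A hedged sketch of the corrected C check, using the PSSCR_PLS_SHIFT
definition from Akshay's suggestion above (illustrative only):

	unsigned long pls = (psscr & PSSCR_PLS) >> PSSCR_PLS_SHIFT;

	/* Compare the Power-Saving Level Status (the state actually
	 * exited), not the Requested Level, against the first state
	 * that loses HV SPR state. */
	if (likely(pls < pnv_first_hv_loss_level))
		return;	/* no SPRs were lost, nothing to restore */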


Re: [PATCH 3/5] dma-direct: refine dma_direct_alloc zone selection

2018-09-27 Thread Benjamin Herrenschmidt
On Thu, 2018-09-27 at 15:49 +0200, Christoph Hellwig wrote:
> On Thu, Sep 27, 2018 at 11:45:15AM +1000, Benjamin Herrenschmidt wrote:
> > I'm not sure this is entirely right.
> > 
> > Let's say the mask is 30 bits. You will return GFP_DMA32, which will
> > fail if you allocate something above 1G (which is legit for
> > ZONE_DMA32).
> 
> And then we will try GFP_DMA further down in the function:
> 
>   if (IS_ENABLED(CONFIG_ZONE_DMA) &&
>   dev->coherent_dma_mask < DMA_BIT_MASK(32) &&
>   !(gfp & GFP_DMA)) {
>   gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
>   goto again;
>   }
> 
> This is and old optimization from x86, because chances are high that
> GFP_DMA32 will give you suitable memory for the infamous 31-bit
> dma mask devices (at least at boot time) and thus we don't have
> to deplete the tiny ZONE_DMA pool.

I see, it's rather confusing :-) Wouldn't it be better to check against the
top of 32-bit memory here too?


Cheers,
Ben.



[PATCH 0/3] powerpc/powernv/npu: Improve ATSD invalidation overhead

2018-09-27 Thread Mark Hairgrove
When ATS is used in a process, all CPU TLB invalidates in that process
also trigger ATSD invalidates via mmu_notifiers. This additional overhead
is noticeable in applications which do heavy memory allocation or page
migration among nodes, particularly to and from GPUs.

This patch set reduces that overhead by rearranging how the ATSDs are
issued and by using size-based ATSD invalidates.

Mark Hairgrove (3):
  powerpc/powernv/npu: Reduce eieio usage when issuing ATSD invalidates
  powerpc/powernv/npu: Use size-based ATSD invalidates
  powerpc/powernv/npu: Remove atsd_threshold debugfs setting

 arch/powerpc/platforms/powernv/npu-dma.c |  177 ++---
 1 files changed, 85 insertions(+), 92 deletions(-)

-- 
1.7.2.5



[PATCH 1/3] powerpc/powernv/npu: Reduce eieio usage when issuing ATSD invalidates

2018-09-27 Thread Mark Hairgrove
There are two types of ATSDs issued to the NPU: invalidates targeting a
specific virtual address and invalidates targeting the whole address
space. In both cases prior to this change, the sequence was:

for each NPU
- Write the target address to the XTS_ATSD_AVA register
- EIEIO
- Write the launch value to issue the ATSD

First, a target address is not required when invalidating the whole
address space, so that write and the EIEIO have been removed. The AP
(size) field in the launch is not needed either.

Second, for per-address invalidates the above sequence is inefficient in
the common case of multiple NPUs because an EIEIO is issued per NPU. This
unnecessarily forces the launches of later ATSDs to be ordered with the
launches of earlier ones. The new sequence only issues a single EIEIO:

for each NPU
- Write the target address to the XTS_ATSD_AVA register
EIEIO
for each NPU
- Write the launch value to issue the ATSD

Performance results were gathered using a microbenchmark which creates a
1G allocation then uses mprotect with PROT_NONE to trigger invalidates in
strides across the allocation.
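
A hedged sketch of such a microbenchmark (not Mark's actual tool; the
stride and the population strategy are assumptions):

	#include <string.h>
	#include <sys/mman.h>

	int main(void)
	{
		const size_t len = 1UL << 30;	 /* 1G allocation */
		const size_t stride = 2UL << 20; /* 2M per mprotect call */
		char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		if (p == MAP_FAILED)
			return 1;
		memset(p, 0, len);	/* populate so invalidates fire */

		for (size_t off = 0; off < len; off += stride)
			mprotect(p + off, stride, PROT_NONE);
		return 0;
	}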

With only a single NPU active (one GPU) the difference is in the noise for
both types of invalidates (+/-1%).

With two NPUs active (on a 6-GPU system) the effect is more noticeable:

          mprotect rate (GB/s)
Stride    Before   After   Speedup
64K          5.9     6.5       10%
1M          31.2    33.4        7%
2M          36.3    38.7        7%
4M         322.6   356.7       11%

Signed-off-by: Mark Hairgrove 
---
 arch/powerpc/platforms/powernv/npu-dma.c |   99 ++---
 1 files changed, 48 insertions(+), 51 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/npu-dma.c 
b/arch/powerpc/platforms/powernv/npu-dma.c
index 8006c54..c8f438a 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -454,79 +454,76 @@ static void put_mmio_atsd_reg(struct npu *npu, int reg)
 }
 
 /* MMIO ATSD register offsets */
-#define XTS_ATSD_AVA  1
-#define XTS_ATSD_STAT 2
+#define XTS_ATSD_LAUNCH 0
+#define XTS_ATSD_AVA1
+#define XTS_ATSD_STAT   2
 
-static void mmio_launch_invalidate(struct mmio_atsd_reg *mmio_atsd_reg,
-   unsigned long launch, unsigned long va)
+static unsigned long get_atsd_launch_val(unsigned long pid, unsigned long 
psize,
+   bool flush)
 {
-   struct npu *npu = mmio_atsd_reg->npu;
-   int reg = mmio_atsd_reg->reg;
+   unsigned long launch = 0;
 
-   __raw_writeq_be(va, npu->mmio_atsd_regs[reg] + XTS_ATSD_AVA);
-   eieio();
-   __raw_writeq_be(launch, npu->mmio_atsd_regs[reg]);
+   if (psize == MMU_PAGE_COUNT) {
+   /* IS set to invalidate entire matching PID */
+   launch |= PPC_BIT(12);
+   } else {
+   /* AP set to invalidate region of psize */
+   launch |= (u64)mmu_get_ap(psize) << PPC_BITLSHIFT(17);
+   }
+
+   /* PRS set to process-scoped */
+   launch |= PPC_BIT(13);
+
+   /* PID */
+   launch |= pid << PPC_BITLSHIFT(38);
+
+   /* No flush */
+   launch |= !flush << PPC_BITLSHIFT(39);
+
+   return launch;
 }
 
-static void mmio_invalidate_pid(struct mmio_atsd_reg 
mmio_atsd_reg[NV_MAX_NPUS],
-   unsigned long pid, bool flush)
+static void mmio_atsd_regs_write(struct mmio_atsd_reg
+   mmio_atsd_reg[NV_MAX_NPUS], unsigned long offset,
+   unsigned long val)
 {
-   int i;
-   unsigned long launch;
+   struct npu *npu;
+   int i, reg;
 
for (i = 0; i <= max_npu2_index; i++) {
-   if (mmio_atsd_reg[i].reg < 0)
+   reg = mmio_atsd_reg[i].reg;
+   if (reg < 0)
continue;
 
-   /* IS set to invalidate matching PID */
-   launch = PPC_BIT(12);
-
-   /* PRS set to process-scoped */
-   launch |= PPC_BIT(13);
-
-   /* AP */
-   launch |= (u64)
-   mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);
-
-   /* PID */
-   launch |= pid << PPC_BITLSHIFT(38);
+   npu = mmio_atsd_reg[i].npu;
+   __raw_writeq_be(val, npu->mmio_atsd_regs[reg] + offset);
+   }
+}
 
-   /* No flush */
-   launch |= !flush << PPC_BITLSHIFT(39);
+static void mmio_invalidate_pid(struct mmio_atsd_reg 
mmio_atsd_reg[NV_MAX_NPUS],
+   unsigned long pid, bool flush)
+{
+   unsigned long launch = get_atsd_launch_val(pid, MMU_PAGE_COUNT, flush);
 
-   /* Invalidating the entire process doesn't use a va */
-   mmio_launch_invalidate(&mmio_atsd_reg[i], launch, 0);
-   }
+   /* Invalidating the entire process doesn't use a va */
+   mmio_

[PATCH 2/3] powerpc/powernv/npu: Use size-based ATSD invalidates

2018-09-27 Thread Mark Hairgrove
Prior to this change only two types of ATSDs were issued to the NPU:
invalidates targeting a single page and invalidates targeting the whole
address space. The crossover point happened at the configurable
atsd_threshold which defaulted to 2M. Invalidates that size or smaller
would issue per-page invalidates for the whole range.

The NPU supports more invalidation sizes however: 64K, 2M, 1G, and all.
These invalidates target addresses aligned to their size. 2M is a common
invalidation size for GPU-enabled applications because that is a GPU
page size, so reducing the number of invalidates by 32x in that case is a
clear improvement.

ATSD latency is high in general so now we always issue a single invalidate
rather than multiple. This will over-invalidate in some cases, but for any
invalidation size over 2M it matches or improves the prior behavior.
There's also an improvement for single-page invalidates since the prior
version issued two invalidates for that case instead of one.

To show the benefit here are some performance numbers from a
microbenchmark which creates a 1G allocation then uses mprotect with
PROT_NONE to trigger invalidates in strides across the allocation.

One NPU (1 GPU):

          mprotect rate (GB/s)
Stride    Before   After   Speedup
64K          5.3     5.6        5%
1M          39.3    57.4       46%
2M          49.7    82.6       66%
4M         286.6   285.7        0%

Two NPUs (6 GPUs):

          mprotect rate (GB/s)
Stride    Before   After   Speedup
64K          6.5     7.4       13%
1M          33.4    67.9      103%
2M          38.7    93.1      141%
4M         356.7   354.6       -1%

Anything over 2M is roughly the same as before since both cases issue a
single ATSD.

Signed-off-by: Mark Hairgrove 
---
 arch/powerpc/platforms/powernv/npu-dma.c |   71 +-
 1 files changed, 40 insertions(+), 31 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/npu-dma.c 
b/arch/powerpc/platforms/powernv/npu-dma.c
index c8f438a..e471a1a 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -509,15 +509,14 @@ static void mmio_invalidate_pid(struct mmio_atsd_reg 
mmio_atsd_reg[NV_MAX_NPUS],
mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_LAUNCH, launch);
 }
 
-static void mmio_invalidate_va(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS],
-   unsigned long va, unsigned long pid, bool flush)
+static void mmio_invalidate_range(struct mmio_atsd_reg
+   mmio_atsd_reg[NV_MAX_NPUS], unsigned long pid,
+   unsigned long start, unsigned long psize, bool flush)
 {
-   unsigned long launch;
-
-   launch = get_atsd_launch_val(pid, mmu_virtual_psize, flush);
+   unsigned long launch = get_atsd_launch_val(pid, psize, flush);
 
/* Write all VAs first */
-   mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_AVA, va);
+   mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_AVA, start);
 
/* Issue one barrier for all address writes */
eieio();
@@ -608,15 +607,38 @@ static void release_atsd_reg(struct mmio_atsd_reg 
mmio_atsd_reg[NV_MAX_NPUS])
}
 }
 
+#define PAGE_64K (64UL * 1024)
+#define PAGE_2M  (2UL * 1024 * 1024)
+#define PAGE_1G  (1UL * 1024 * 1024 * 1024)
+
 /*
- * Invalidate either a single address or an entire PID depending on
- * the value of va.
+ * Invalidate a virtual address range
  */
-static void mmio_invalidate(struct npu_context *npu_context, int va,
-   unsigned long address, bool flush)
+static void mmio_invalidate(struct npu_context *npu_context,
+   unsigned long start, unsigned long size, bool flush)
 {
struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS];
unsigned long pid = npu_context->mm->context.id;
+   unsigned long atsd_start = 0;
+   unsigned long end = start + size - 1;
+   int atsd_psize = MMU_PAGE_COUNT;
+
+   /*
+* Convert the input range into one of the supported sizes. If the range
+* doesn't fit, use the next larger supported size. Invalidation latency
+* is high, so over-invalidation is preferred to issuing multiple
+* invalidates.
+*/
+   if (size == PAGE_64K) {
+   atsd_start = start;
+   atsd_psize = MMU_PAGE_64K;
+   } else if (ALIGN_DOWN(start, PAGE_2M) == ALIGN_DOWN(end, PAGE_2M)) {
+   atsd_start = ALIGN_DOWN(start, PAGE_2M);
+   atsd_psize = MMU_PAGE_2M;
+   } else if (ALIGN_DOWN(start, PAGE_1G) == ALIGN_DOWN(end, PAGE_1G)) {
+   atsd_start = ALIGN_DOWN(start, PAGE_1G);
+   atsd_psize = MMU_PAGE_1G;
+   }
 
if (npu_context->nmmu_flush)
/*
@@ -631,10 +653,12 @@ static void mmio_invalidate(struct npu_context 
*npu_context, int va,
 * an invalidate.
 */
acquire_atsd_reg(npu_context, mmio_atsd_reg);

[PATCH 3/3] powerpc/powernv/npu: Remove atsd_threshold debugfs setting

2018-09-27 Thread Mark Hairgrove
This threshold is no longer used now that all invalidates issue a single
ATSD to each active NPU.

Signed-off-by: Mark Hairgrove 
---
 arch/powerpc/platforms/powernv/npu-dma.c |   13 -
 1 files changed, 0 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/npu-dma.c 
b/arch/powerpc/platforms/powernv/npu-dma.c
index e471a1a..426df1b 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -42,14 +42,6 @@
 static DEFINE_SPINLOCK(npu_context_lock);
 
 /*
- * When an address shootdown range exceeds this threshold we invalidate the
- * entire TLB on the GPU for the given PID rather than each specific address in
- * the range.
- */
-static uint64_t atsd_threshold = 2 * 1024 * 1024;
-static struct dentry *atsd_threshold_dentry;
-
-/*
  * Other types of TCE cache invalidation are not functional in the
  * hardware.
  */
@@ -968,11 +960,6 @@ int pnv_npu2_init(struct pnv_phb *phb)
static int npu_index;
uint64_t rc = 0;
 
-   if (!atsd_threshold_dentry) {
-   atsd_threshold_dentry = debugfs_create_x64("atsd_threshold",
-  0600, powerpc_debugfs_root, &atsd_threshold);
-   }
-
phb->npu.nmmu_flush =
of_property_read_bool(phb->hose->dn, "ibm,nmmu-flush");
for_each_child_of_node(phb->hose->dn, dn) {
-- 
1.7.2.5



[PATCH 1/5] dma-mapping: make the get_required_mask method available unconditionally

2018-09-27 Thread Christoph Hellwig
This saves some duplication for ia64, and makes the interface more
general.  In the long run we want each dma_map_ops instance to fill this
out, but this will take a little more prep work.

Signed-off-by: Christoph Hellwig 
Acked-by: Benjamin Herrenschmidt  [powerpc bits]
---
 arch/ia64/include/asm/dma-mapping.h  |  2 --
 arch/ia64/include/asm/machvec.h  |  7 ---
 arch/ia64/include/asm/machvec_init.h |  1 -
 arch/ia64/include/asm/machvec_sn2.h  |  2 --
 arch/ia64/pci/pci.c  | 26 --
 arch/ia64/sn/pci/pci_dma.c   |  4 ++--
 drivers/base/platform.c  | 13 +++--
 drivers/pci/controller/vmd.c |  4 
 include/linux/dma-mapping.h  |  2 --
 9 files changed, 13 insertions(+), 48 deletions(-)

diff --git a/arch/ia64/include/asm/dma-mapping.h 
b/arch/ia64/include/asm/dma-mapping.h
index 76e4d6632d68..522745ae67bb 100644
--- a/arch/ia64/include/asm/dma-mapping.h
+++ b/arch/ia64/include/asm/dma-mapping.h
@@ -10,8 +10,6 @@
 #include 
 #include 
 
-#define ARCH_HAS_DMA_GET_REQUIRED_MASK
-
 extern const struct dma_map_ops *dma_ops;
 extern struct ia64_machine_vector ia64_mv;
 extern void set_iommu_machvec(void);
diff --git a/arch/ia64/include/asm/machvec.h b/arch/ia64/include/asm/machvec.h
index 267f4f170191..5133739966bc 100644
--- a/arch/ia64/include/asm/machvec.h
+++ b/arch/ia64/include/asm/machvec.h
@@ -44,7 +44,6 @@ typedef void ia64_mv_kernel_launch_event_t(void);
 
 /* DMA-mapping interface: */
 typedef void ia64_mv_dma_init (void);
-typedef u64 ia64_mv_dma_get_required_mask (struct device *);
 typedef const struct dma_map_ops *ia64_mv_dma_get_ops(struct device *);
 
 /*
@@ -127,7 +126,6 @@ extern void machvec_tlb_migrate_finish (struct mm_struct *);
 #  define platform_global_tlb_purgeia64_mv.global_tlb_purge
 #  define platform_tlb_migrate_finish  ia64_mv.tlb_migrate_finish
 #  define platform_dma_initia64_mv.dma_init
-#  define platform_dma_get_required_mask ia64_mv.dma_get_required_mask
 #  define platform_dma_get_ops ia64_mv.dma_get_ops
 #  define platform_irq_to_vector   ia64_mv.irq_to_vector
 #  define platform_local_vector_to_irq ia64_mv.local_vector_to_irq
@@ -171,7 +169,6 @@ struct ia64_machine_vector {
ia64_mv_global_tlb_purge_t *global_tlb_purge;
ia64_mv_tlb_migrate_finish_t *tlb_migrate_finish;
ia64_mv_dma_init *dma_init;
-   ia64_mv_dma_get_required_mask *dma_get_required_mask;
ia64_mv_dma_get_ops *dma_get_ops;
ia64_mv_irq_to_vector *irq_to_vector;
ia64_mv_local_vector_to_irq *local_vector_to_irq;
@@ -211,7 +208,6 @@ struct ia64_machine_vector {
platform_global_tlb_purge,  \
platform_tlb_migrate_finish,\
platform_dma_init,  \
-   platform_dma_get_required_mask, \
platform_dma_get_ops,   \
platform_irq_to_vector, \
platform_local_vector_to_irq,   \
@@ -286,9 +282,6 @@ extern const struct dma_map_ops *dma_get_ops(struct device 
*);
 #ifndef platform_dma_get_ops
 # define platform_dma_get_ops  dma_get_ops
 #endif
-#ifndef platform_dma_get_required_mask
-# define  platform_dma_get_required_mask   ia64_dma_get_required_mask
-#endif
 #ifndef platform_irq_to_vector
 # define platform_irq_to_vector__ia64_irq_to_vector
 #endif
diff --git a/arch/ia64/include/asm/machvec_init.h 
b/arch/ia64/include/asm/machvec_init.h
index 2b32fd06b7c6..2aafb69a3787 100644
--- a/arch/ia64/include/asm/machvec_init.h
+++ b/arch/ia64/include/asm/machvec_init.h
@@ -4,7 +4,6 @@
 
 extern ia64_mv_send_ipi_t ia64_send_ipi;
 extern ia64_mv_global_tlb_purge_t ia64_global_tlb_purge;
-extern ia64_mv_dma_get_required_mask ia64_dma_get_required_mask;
 extern ia64_mv_irq_to_vector __ia64_irq_to_vector;
 extern ia64_mv_local_vector_to_irq __ia64_local_vector_to_irq;
 extern ia64_mv_pci_get_legacy_mem_t ia64_pci_get_legacy_mem;
diff --git a/arch/ia64/include/asm/machvec_sn2.h 
b/arch/ia64/include/asm/machvec_sn2.h
index ece9fa85be88..b5153d300289 100644
--- a/arch/ia64/include/asm/machvec_sn2.h
+++ b/arch/ia64/include/asm/machvec_sn2.h
@@ -55,7 +55,6 @@ extern ia64_mv_readb_t __sn_readb_relaxed;
 extern ia64_mv_readw_t __sn_readw_relaxed;
 extern ia64_mv_readl_t __sn_readl_relaxed;
 extern ia64_mv_readq_t __sn_readq_relaxed;
-extern ia64_mv_dma_get_required_mask   sn_dma_get_required_mask;
 extern ia64_mv_dma_initsn_dma_init;
 extern ia64_mv_migrate_t   sn_migrate;
 extern ia64_mv_kernel_launch_event_t   sn_kernel_launch_event;
@@ -100,7 +99,6 @@ extern ia64_mv_pci_fixup_bus_t   
sn_pci_fixup_bus;
 #define platform_pci_get_legacy_memsn_pci_get_legacy_mem
 #define platform_pci_legacy_read   sn_pci_legacy_read
 #define platform_pci_legacy_write  sn_pci_legacy_write
-#define platform_dma_get_required_mask sn_dma_get_required_mask
 #define platform_dm

dma mask related fixups (including full bus_dma_mask support) v2

2018-09-27 Thread Christoph Hellwig
Hi all,

the dma_get_required_mask dma API implementation has always been a little
odd, in that by default we don't wire it up in struct dma_map_ops, but
instead hard code a default implementation.  powerpc and ia64 override
this default and either call a method or otherwise duplicate the default.

This series always enables the method and just falls back to the previous
default implementation when it is not available, as well as fixing up
a few bits in the default implementations.  This already allows removing
the ia64 override of the implementation, and will also allow to remove
the powerpc one together with a few additional cleanups in the powerpc
code, but those will be sent separately with other powerpc DMA API
patches.  Last but not least the method will allow us to return a more
sensible value for typical iommu dma_ops eventually, but that is left
to another series as well.

Additionally the dma-direct code has been a bit sloppy about when it was
using phys_to_dma in a few places, so this gets fixed up as well, as does
actually respecting the bus_dma_mask everywhere instead of just
rejecting dma masks that don't fit into it.

Altogether this should be all the core dma-direct changes required to
move powerpc over to the generic code.

Changes since v1:
 - clean up a mask check
 - add various acks


[PATCH 5/5] dma-direct: always allow dma mask <= physical memory size

2018-09-27 Thread Christoph Hellwig
This way an architecture with less than 4G of RAM can support dma_mask
smaller than 32-bit without a ZONE_DMA.  Apparently that is a common
case on powerpc.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Robin Murphy 
---
 kernel/dma/direct.c | 28 
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 60c433b880e0..170bd322a94a 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -284,21 +284,25 @@ int dma_direct_map_sg(struct device *dev, struct 
scatterlist *sgl, int nents,
return nents;
 }
 
+/*
+ * Because 32-bit DMA masks are so common we expect every architecture to be
+ * able to satisfy them - either by not supporting more physical memory, or by
+ * providing a ZONE_DMA32.  If neither is the case, the architecture needs to
+ * use an IOMMU instead of the direct mapping.
+ */
 int dma_direct_supported(struct device *dev, u64 mask)
 {
-#ifdef CONFIG_ZONE_DMA
-   if (mask < phys_to_dma(dev, DMA_BIT_MASK(ARCH_ZONE_DMA_BITS)))
-   return 0;
-#else
-   /*
-* Because 32-bit DMA masks are so common we expect every architecture
-* to be able to satisfy them - either by not supporting more physical
-* memory, or by providing a ZONE_DMA32.  If neither is the case, the
-* architecture needs to use an IOMMU instead of the direct mapping.
-*/
-   if (mask < phys_to_dma(dev, DMA_BIT_MASK(32)))
+   u64 min_mask;
+
+   if (IS_ENABLED(CONFIG_ZONE_DMA))
+   min_mask = DMA_BIT_MASK(ARCH_ZONE_DMA_BITS);
+   else
+   min_mask = DMA_BIT_MASK(32);
+
+   min_mask = min_t(u64, min_mask, (max_pfn - 1) << PAGE_SHIFT);
+
+   if (mask >= phys_to_dma(dev, min_mask))
return 0;
-#endif
return 1;
 }
 
-- 
2.19.0



[PATCH 4/5] dma-direct: implement complete bus_dma_mask handling

2018-09-27 Thread Christoph Hellwig
Instead of rejecting devices with a too small bus_dma_mask, we can handle
them by taking the bus dma_mask into account for allocations and bounce
buffering decisions.

Signed-off-by: Christoph Hellwig 
---
 include/linux/dma-direct.h |  3 ++-
 kernel/dma/direct.c| 21 +++--
 2 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
index b79496d8c75b..fbca184ff5a0 100644
--- a/include/linux/dma-direct.h
+++ b/include/linux/dma-direct.h
@@ -27,7 +27,8 @@ static inline bool dma_capable(struct device *dev, dma_addr_t 
addr, size_t size)
if (!dev->dma_mask)
return false;
 
-   return addr + size - 1 <= *dev->dma_mask;
+   return addr + size - 1 <=
+   min_not_zero(*dev->dma_mask, dev->bus_dma_mask);
 }
 #endif /* !CONFIG_ARCH_HAS_PHYS_TO_DMA */
 
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index e78548397a92..60c433b880e0 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -44,10 +44,11 @@ check_addr(struct device *dev, dma_addr_t dma_addr, size_t 
size,
return false;
}
 
-   if (*dev->dma_mask >= DMA_BIT_MASK(32)) {
+   if (*dev->dma_mask >= DMA_BIT_MASK(32) || dev->bus_dma_mask) {
dev_err(dev,
-   "%s: overflow %pad+%zu of device mask %llx\n",
-   caller, &dma_addr, size, *dev->dma_mask);
+   "%s: overflow %pad+%zu of device mask %llx bus 
mask %llx\n",
+   caller, &dma_addr, size,
+   *dev->dma_mask, dev->bus_dma_mask);
}
return false;
}
@@ -66,12 +67,18 @@ u64 dma_direct_get_required_mask(struct device *dev)
 {
u64 max_dma = phys_to_dma_direct(dev, (max_pfn - 1) << PAGE_SHIFT);
 
+   if (dev->bus_dma_mask && dev->bus_dma_mask < max_dma)
+   max_dma = dev->bus_dma_mask;
+
return (1ULL << (fls64(max_dma) - 1)) * 2 - 1;
 }
 
 static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask,
u64 *phys_mask)
 {
+   if (dev->bus_dma_mask && dev->bus_dma_mask < dma_mask)
+   dma_mask = dev->bus_dma_mask;
+
if (force_dma_unencrypted())
*phys_mask = __dma_to_phys(dev, dma_mask);
else
@@ -88,7 +95,7 @@ static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask,
 static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size)
 {
return phys_to_dma_direct(dev, phys) + size - 1 <=
-   dev->coherent_dma_mask;
+   min_not_zero(dev->coherent_dma_mask, dev->bus_dma_mask);
 }
 
 void *dma_direct_alloc_pages(struct device *dev, size_t size,
@@ -292,12 +299,6 @@ int dma_direct_supported(struct device *dev, u64 mask)
if (mask < phys_to_dma(dev, DMA_BIT_MASK(32)))
return 0;
 #endif
-   /*
-* Upstream PCI/PCIe bridges or SoC interconnects may not carry
-* as many DMA address bits as the device itself supports.
-*/
-   if (dev->bus_dma_mask && mask > dev->bus_dma_mask)
-   return 0;
return 1;
 }
 
-- 
2.19.0



[PATCH 3/5] dma-direct: refine dma_direct_alloc zone selection

2018-09-27 Thread Christoph Hellwig
We need to take the DMA offset and encryption bit into account when
selecting a zone.  Use the opportunity to factor out the zone
selection into a helper for reuse.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Robin Murphy 
---
 kernel/dma/direct.c | 31 +--
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index f32b33cfa331..e78548397a92 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -69,6 +69,22 @@ u64 dma_direct_get_required_mask(struct device *dev)
return (1ULL << (fls64(max_dma) - 1)) * 2 - 1;
 }
 
+static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask,
+   u64 *phys_mask)
+{
+   if (force_dma_unencrypted())
+   *phys_mask = __dma_to_phys(dev, dma_mask);
+   else
+   *phys_mask = dma_to_phys(dev, dma_mask);
+
+   /* GFP_DMA32 and GFP_DMA are no ops without the corresponding zones: */
+   if (*phys_mask <= DMA_BIT_MASK(ARCH_ZONE_DMA_BITS))
+   return GFP_DMA;
+   if (*phys_mask <= DMA_BIT_MASK(32))
+   return GFP_DMA32;
+   return 0;
+}
+
 static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size)
 {
return phys_to_dma_direct(dev, phys) + size - 1 <=
@@ -81,17 +97,13 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size,
unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
int page_order = get_order(size);
struct page *page = NULL;
+   u64 phys_mask;
void *ret;
 
/* we always manually zero the memory once we are done: */
gfp &= ~__GFP_ZERO;
-
-   /* GFP_DMA32 and GFP_DMA are no ops without the corresponding zones: */
-   if (dev->coherent_dma_mask <= DMA_BIT_MASK(ARCH_ZONE_DMA_BITS))
-   gfp |= GFP_DMA;
-   if (dev->coherent_dma_mask <= DMA_BIT_MASK(32) && !(gfp & GFP_DMA))
-   gfp |= GFP_DMA32;
-
+   gfp |= __dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
+   &phys_mask);
 again:
/* CMA can be used only in the context which permits sleeping */
if (gfpflags_allow_blocking(gfp)) {
@@ -110,15 +122,14 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size,
page = NULL;
 
if (IS_ENABLED(CONFIG_ZONE_DMA32) &&
-   dev->coherent_dma_mask < DMA_BIT_MASK(64) &&
+   phys_mask < DMA_BIT_MASK(64) &&
!(gfp & (GFP_DMA32 | GFP_DMA))) {
gfp |= GFP_DMA32;
goto again;
}
 
if (IS_ENABLED(CONFIG_ZONE_DMA) &&
-   dev->coherent_dma_mask < DMA_BIT_MASK(32) &&
-   !(gfp & GFP_DMA)) {
+   phys_mask < DMA_BIT_MASK(32) && !(gfp & GFP_DMA)) {
gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
goto again;
}
-- 
2.19.0



[PATCH 2/5] dma-direct: add an explicit dma_direct_get_required_mask

2018-09-27 Thread Christoph Hellwig
This is somewhat modelled after the powerpc version, and differs from
the legacy fallback in using fls64 instead of pointlessly splitting up the
address into low and high dwords, and in that it takes (__)phys_to_dma
into account.

Signed-off-by: Christoph Hellwig 
Acked-by: Benjamin Herrenschmidt 
Reviewed-by: Robin Murphy 
---
 include/linux/dma-direct.h |  1 +
 kernel/dma/direct.c| 22 +++---
 2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
index 86a59ba5a7f3..b79496d8c75b 100644
--- a/include/linux/dma-direct.h
+++ b/include/linux/dma-direct.h
@@ -55,6 +55,7 @@ static inline void dma_mark_clean(void *addr, size_t size)
 }
 #endif /* CONFIG_ARCH_HAS_DMA_MARK_CLEAN */
 
+u64 dma_direct_get_required_mask(struct device *dev);
 void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
gfp_t gfp, unsigned long attrs);
 void dma_direct_free(struct device *dev, size_t size, void *cpu_addr,
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index c954f0a6dc62..f32b33cfa331 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -4,6 +4,7 @@
  *
  * DMA operations that map physical memory directly without using an IOMMU.
  */
+#include  /* for max_pfn */
 #include 
 #include 
 #include 
@@ -53,11 +54,25 @@ check_addr(struct device *dev, dma_addr_t dma_addr, size_t size,
return true;
 }
 
+static inline dma_addr_t phys_to_dma_direct(struct device *dev,
+   phys_addr_t phys)
+{
+   if (force_dma_unencrypted())
+   return __phys_to_dma(dev, phys);
+   return phys_to_dma(dev, phys);
+}
+
+u64 dma_direct_get_required_mask(struct device *dev)
+{
+   u64 max_dma = phys_to_dma_direct(dev, (max_pfn - 1) << PAGE_SHIFT);
+
+   return (1ULL << (fls64(max_dma) - 1)) * 2 - 1;
+}
+
 static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size)
 {
-   dma_addr_t addr = force_dma_unencrypted() ?
-   __phys_to_dma(dev, phys) : phys_to_dma(dev, phys);
-   return addr + size - 1 <= dev->coherent_dma_mask;
+   return phys_to_dma_direct(dev, phys) + size - 1 <=
+   dev->coherent_dma_mask;
 }
 
 void *dma_direct_alloc_pages(struct device *dev, size_t size,
@@ -296,6 +311,7 @@ const struct dma_map_ops dma_direct_ops = {
.unmap_page = dma_direct_unmap_page,
.unmap_sg   = dma_direct_unmap_sg,
 #endif
+   .get_required_mask  = dma_direct_get_required_mask,
.dma_supported  = dma_direct_supported,
.mapping_error  = dma_direct_mapping_error,
.cache_sync = arch_dma_cache_sync,
-- 
2.19.0



drivers binding to device node with multiple compatible strings

2018-09-27 Thread Li Yang
Hi Rob and Grant,

Various device tree specs recommend including all the potential
compatible strings in the device node, ordered from most specific to
most general.  But it looks like the Linux kernel doesn't provide a way
to bind the device to the most specific driver; instead, the first
registered compatible driver will be bound.

As more and more generic drivers are added to the Linux kernel, they
compete with the more specific vendor drivers and cause problems
when both are built into the kernel.  I'm wondering if there is a
generic solution (existing or planned) to make the most specific driver
bind to the device, or whether we have to disable the more general
driver or remove the more general compatible string from the device
tree.
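
For illustration, a minimal sketch of the situation (all names below are
hypothetical): a node declaring

	compatible = "vendor,foo-v2", "generic,foo";

can be matched by either of the two match tables below, and today the
kernel binds whichever driver happens to register first, not the one
matching the more specific string:

	#include <linux/mod_devicetable.h>
	#include <linux/of.h>

	/* specific vendor driver */
	static const struct of_device_id vendor_foo_of_match[] = {
		{ .compatible = "vendor,foo-v2" },
		{ /* sentinel */ }
	};

	/* generic fallback driver competing for the same node */
	static const struct of_device_id generic_foo_of_match[] = {
		{ .compatible = "generic,foo" },
		{ /* sentinel */ }
	};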

Regards,
Leo


Re: [RFC PATCH 08/11] powerpc/tm: Do not reclaim on ptrace

2018-09-27 Thread Breno Leitao
Hi Mikey,

On 09/18/2018 02:36 AM, Michael Neuling wrote:
> On Wed, 2018-09-12 at 16:40 -0300, Breno Leitao wrote:
>> Make sure that we are not suspended on ptrace and that the registers were
>> already reclaimed.
>>
>> Since the data was already reclaimed, there is nothing to be done here
>> except to restore the SPRs.
>>
>> Signed-off-by: Breno Leitao 
>> ---
>>  arch/powerpc/kernel/ptrace.c | 10 --
>>  1 file changed, 4 insertions(+), 6 deletions(-)
>>
>> diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c
>> index 9667666eb18e..cf6ee9154b11 100644
>> --- a/arch/powerpc/kernel/ptrace.c
>> +++ b/arch/powerpc/kernel/ptrace.c
>> @@ -136,12 +136,10 @@ static void flush_tmregs_to_thread(struct task_struct
>> *tsk)
>>  if ((!cpu_has_feature(CPU_FTR_TM)) || (tsk != current))
>>  return;
>>  
>> -if (MSR_TM_SUSPENDED(mfmsr())) {
>> -tm_reclaim_current(TM_CAUSE_SIGNAL);
>> -} else {
>> -tm_enable();
>> -tm_save_sprs(&(tsk->thread));
>> -}
>> +WARN_ON(MSR_TM_SUSPENDED(mfmsr()));
>> +
>> +tm_enable();
>> +tm_save_sprs(&(tsk->thread));
> 
> Do we need to check if TM was enabled in the task before saving the TM SPRs?
> 
> What happens if TM was lazily off and hence we had someone else's TM SPRs in 
> the
> CPU currently?  Wouldn't this flush the wrong values to the task_struct?
> 
> I think we need to check the processes MSR before doing this.

Yes, it is a *very* good point, and I think we are vulnerable even before
this patch (in the current kernel). Looking above, we are calling
tm_save_sprs() if the MSR is not TM-suspended, independently of whether TM
is lazily off.

It shouldn't be hard to create a test case for it. I can try to call
ptrace(PTRACE_GETVRREGS) on a task that sleeps until TM is lazily disabled,
and compare whether the TM SPRs changed in the meantime (while doing HTM in
parallel to keep the HTM SPRs changing). Let's see if I can read another
task's TM SPRs.

Thank you.


Re: [RFC PATCH 05/11] powerpc/tm: Function that updates the failure code

2018-09-27 Thread Breno Leitao
Hi Mikey,

On 09/17/2018 10:29 PM, Michael Neuling wrote:
> On Wed, 2018-09-12 at 16:40 -0300, Breno Leitao wrote:
>> Now the transaction reclaims happens very earlier in the trap handler, and
>> it is impossible to know precisely, at that early time, what should be set
>> as the failure cause for some specific cases, as, if the task will be
>> rescheduled, thus, the transaction abort case should be updated from
>> TM_CAUSE_MISC to TM_CAUSE_RESCHED, for example.
> 
> Please add comments to where this is used (in EXCEPTION_COMMON macro I think)
> that say this might happen.

Is it OK if I comment on it in the TM_KERNEL_ENTRY macro? The failure cause
can be updated independently of which exception is executed, so every call
to TM_KERNEL_ENTRY can have the cause overwritten.

I.e. it does not matter if the exception is a syscall or a page fault;
the failure cause will be updated if there is a process reschedule after the
exception/syscall is handled.

Thank you


Re: [RFC PATCH 11/11] selftests/powerpc: Adapt the test

2018-09-27 Thread Breno Leitao
Hi Mikey,

On 09/18/2018 03:36 AM, Michael Neuling wrote:
> On Wed, 2018-09-12 at 16:40 -0300, Breno Leitao wrote:
>> The Documentation/powerpc/transactional_memory.txt says:
>>
>>  "Syscalls made from within a suspended transaction are performed as normal
>>   and the transaction is not explicitly doomed by the kernel.  However,
>>   what the kernel does to perform the syscall may result in the transaction
>>   being doomed by the hardware."
>>
>> With this new TM mechanism, the syscall will continue to be executed if the
>> syscall happens on a suspended syscall, but, the syscall will *fail* if the
>> transaction is still active during the syscall invocation.
> 
> Not sure I get this. This doesn't seem any different to before.
> 
> An active (not suspended) transaction *will* result in the syscall failing and
> the transaction being doomed.  
> 
> A syscall in a suspended transaction should succeed and the transaction.

I understand that a transaction will always be doomed if there is a
reclaim/recheckpoint; thus, if you make a system call inside a suspended
transaction, it will reclaim and recheckpoint, dooming the transaction
and failing it on the next RFID.

So, the syscall executed in suspended mode may succeed, because it will not
be skipped (as in active mode), but the transaction will *always* fail,
because there was a reclaim and recheckpoint.

> You might need to clean up the language. I try to use:
> 
>Active == transactional but not suspended (ie MSR[TS] = T)
>Suspended == suspended (ie MSR [TS] = S)
>Doomed == transaction to be rolled back at next opportinity (ie tcheck 
> returns doomed)
> 
> (note: the kernel MSR_TM_ACTIVE() macro is not consistent with this since it's
> MSR[TS] == S or T).

Ok, I agree with this language as well. I will also try to update the code to
use this terminology consistently.

>> On the syscall path, if the transaction is active and not suspended, it
>> will call TM_KERNEL_ENTRY which will reclaim and recheckpoint the
>> transaction, thus, dooming the transaction on userspace return, with
>> failure code TM_CAUSE_SYSCALL.
> 
> But the test below is on a suspend transaction?

Sorry, I meant "suspended transaction" above instead of "transaction is
active and not suspended".

> 
>> This new model will break part of this test, but I understand that that the
>> documentation above didn't guarantee that the syscall would succeed, and it
>> will never succeed anymore now on.
> 
> The syscall should pass in suspend (modulo the normal syscall checks). The
> transaction may fail as a result.

Perfect, and if the transaction fails, the CPU will roll back the changes and
restore the checkpointed registers (replacing the r3 that contains the pid
value); thus, it will be as if the "getppid" system call didn't execute.

For this test specifically, it assumes the syscall didn't execute if the
transaction failed. Take a look:

FUNC_START(getppid_tm_suspended)
tbegin.			/* start a transaction */
beq 1f			/* tbegin. failed: return -1 */
li  r0, __NR_getppid
tsuspend.		/* suspend the transaction around the syscall */
sc
tresume.
tend.
blr			/* success: r3 holds getppid()'s return value */
1:
li  r3, -1
blr

So, the test thinks the syscall failed because the transaction aborted.

Anyway, I can try to improve this test rather than removing it, but I
am not sure how.

Thank you


Re: [RFC PATCH 10/11] powerpc/tm: Set failure summary

2018-09-27 Thread Breno Leitao
Hi Mikey,

On 09/18/2018 02:50 AM, Michael Neuling wrote:
> On Wed, 2018-09-12 at 16:40 -0300, Breno Leitao wrote:
>> Since the transaction will be doomed with treckpt., the TEXASR[FS]
>> should be set, to reflect that the transaction is a failure. This patch
>> ensures it before recheckpointing, and remove changes from other places
>> that were calling recheckpoint.
> 
> TEXASR[FS] should be set by the reclaim. 

Do you mean that the CPU should set TEXASR[FS] when treclaim is called, or
that tm_reclaim() should?

Looking at the ISA, I didn't see TEXASR[FS] being set automatically when a
reclaim happens, although, I see it needs TEXASR[FS] to be set when executing
a trecheckpoint, otherwise it will cause a TM Bad Thing.

That is why I am forcing TEXASR[FS]=1 to doom the transaction so we can
recheckpoint it, but it seems I understood this wrongly.

> I don't know why you'd need to set this
> explicitly in process.c. The only case is when the user supplies a bad signal
> context, but we should check that in the signals code, not process.c
> 
> Hence I think this patch is wrong.
> 
> Also, according to the architecture, TEXASR[FS] HAS TO BE SET on trecheckpoint
> otherwise you'll get a TM Bad Thing. You should say that rather than 
> suggesting
> it's because the transaction is doomed. It's ilqlegal to not do it. That's 
> why we
> have this check in arch/powerpc/kernel/tm.S.

When you say "HAS TO BE SET", do you mean that the hardware will set it and
we shouldn't care about this flag? Thus, if I am hitting EMIT_BUG_ENTRY, it
means my TEXASR was messed up somehow?

Thank you


Re: [RFC PATCH 09/11] powerpc/tm: Do not restore default DSCR

2018-09-27 Thread Breno Leitao
Hi Mikey,

On 09/18/2018 02:41 AM, Michael Neuling wrote:
> On Wed, 2018-09-12 at 16:40 -0300, Breno Leitao wrote:
>> In the previous TM code, trecheckpoint was being executed in the middle of
>> an exception, thus, DSCR was being restored to default kernel DSCR value
>> after trecheckpoint was done.
>>
>> With this current patchset, trecheckpoint is executed just before getting
>> to userspace, at ret_from_except_lite, for example. Thus, we do not need to
>> set default kernel DSCR value anymore, as we are leaving kernel space.  It
>> is OK to keep the checkpointed DSCR value into the live SPR, mainly because
>> the transaction is doomed and it will fail soon (after RFID), 
> 
> What if we are going back to a suspended transaction?  It will remain live 
> until
> userspace does a tresume

Hmm, I understand that once we get into kernel space and call
treclaim/trecheckpoint, the transaction will be doomed and it will abort and
roll back when we leave kernel space. I.e., if we treclaim/trecheckpoint
in kernel space, the transaction will *always* abort just after RFID, so a
possible tresume will never be executed. Is my understanding wrong?

> 
>> so,
>> continuing with the pre-checkpointed DSCR value is what seems correct.
> 
> Reading this description suggests this patch isn't really needed. Right?

Maybe the description is not clear, but I understand this patch is required,
otherwise we will return to userspace with the default kernel DSCR value.

By the way, do you know whether a change to the DSCR inside a transaction
will be reverted if the transaction is aborted?

Thank you


[PATCH 8/8] powerpc: convert to generic builtin command line

2018-09-27 Thread Maksym Kokhan
From: Daniel Walker 

This updates the powerpc code to use the CONFIG_GENERIC_CMDLINE
option.

[maksym.kok...@globallogic.com: add strlcat to prom_init_check.sh
whitelist]
Cc: Daniel Walker 
Cc: Daniel Walker 
Signed-off-by: Daniel Walker 
Signed-off-by: Maksym Kokhan 
---
 arch/powerpc/Kconfig   | 23 +--
 arch/powerpc/kernel/prom.c |  4 
 arch/powerpc/kernel/prom_init.c|  8 
 arch/powerpc/kernel/prom_init_check.sh |  2 +-
 4 files changed, 10 insertions(+), 27 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index a806692..f87906a 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -172,6 +172,7 @@ config PPC
select GENERIC_STRNCPY_FROM_USER
select GENERIC_STRNLEN_USER
select GENERIC_TIME_VSYSCALL
+   select GENERIC_CMDLINE
select HAVE_ARCH_AUDITSYSCALL
select HAVE_ARCH_JUMP_LABEL
select HAVE_ARCH_KGDB
@@ -787,28 +788,6 @@ config PPC_DENORMALISATION
  Add support for handling denormalisation of single precision
  values.  Useful for bare metal only.  If unsure say Y here.
 
-config CMDLINE_BOOL
-   bool "Default bootloader kernel arguments"
-
-config CMDLINE
-   string "Initial kernel command string"
-   depends on CMDLINE_BOOL
-   default "console=ttyS0,9600 console=tty0 root=/dev/sda2"
-   help
- On some platforms, there is currently no way for the boot loader to
- pass arguments to the kernel. For these platforms, you can supply
- some command-line options at build time by entering them here.  In
- most cases you will need to specify the root device here.
-
-config CMDLINE_FORCE
-   bool "Always use the default kernel command string"
-   depends on CMDLINE_BOOL
-   help
- Always use the default kernel command string, even if the boot
- loader passes other arguments to the kernel.
- This is useful if you cannot or don't want to change the
- command-line options your boot loader passes to the kernel.
-
 config EXTRA_TARGETS
string "Additional default image types"
help
diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index c4d7078..f8c6e63 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -34,6 +34,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -716,6 +717,9 @@ void __init early_init_devtree(void *params)
 */
of_scan_flat_dt(early_init_dt_scan_chosen_ppc, boot_command_line);
 
+   /* append and prepend any arguments built into the kernel. */
+   cmdline_add_builtin(boot_command_line, NULL, COMMAND_LINE_SIZE);
+
/* Scan memory nodes and rebuild MEMBLOCKs */
of_scan_flat_dt(early_init_dt_scan_root, NULL);
of_scan_flat_dt(early_init_dt_scan_memory_ppc, NULL);
diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index 9b38a2e..e9214c6 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -30,6 +30,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -634,11 +635,10 @@ static void __init early_cmdline_parse(void)
p = prom_cmd_line;
if ((long)prom.chosen > 0)
	l = prom_getprop(prom.chosen, "bootargs", p, COMMAND_LINE_SIZE-1);
-#ifdef CONFIG_CMDLINE
+
if (l <= 0 || p[0] == '\0') /* dbl check */
-   strlcpy(prom_cmd_line,
-   CONFIG_CMDLINE, sizeof(prom_cmd_line));
-#endif /* CONFIG_CMDLINE */
+   cmdline_add_builtin(prom_cmd_line, NULL, sizeof(prom_cmd_line));
+
prom_printf("command line: %s\n", prom_cmd_line);
 
 #ifdef CONFIG_PPC64
diff --git a/arch/powerpc/kernel/prom_init_check.sh b/arch/powerpc/kernel/prom_init_check.sh
index acb6b92..b1d7ca1 100644
--- a/arch/powerpc/kernel/prom_init_check.sh
+++ b/arch/powerpc/kernel/prom_init_check.sh
@@ -18,7 +18,7 @@
 
 WHITELIST="add_reloc_offset __bss_start __bss_stop copy_and_flush
 _end enter_prom memcpy memset reloc_offset __secondary_hold
-__secondary_hold_acknowledge __secondary_hold_spinloop __start
+__secondary_hold_acknowledge __secondary_hold_spinloop __start strlcat
 strcmp strcpy strlcpy strlen strncmp strstr kstrtobool logo_linux_clut224
 reloc_got2 kernstart_addr memstart_addr linux_banner _stext
 __prom_init_toc_start __prom_init_toc_end btext_setup_display TOC."
-- 
2.7.4



[PATCH 1/8] add generic builtin command line

2018-09-27 Thread Maksym Kokhan
From: Daniel Walker 

This code allows architectures to use a generic builtin command line.
The state of the builtin command line options across architectures is
diverse. On x86 and mips they have pretty much the same code, and the
code prepends the builtin command line onto the bootloader-provided
one. On powerpc there is only a builtin override and nothing else.

The code in this commit unifies the mips and x86 code into a generic
header file under the CONFIG_GENERIC_CMDLINE option. When this
option is enabled the architecture can call the cmdline_add_builtin()
to add the builtin command line.

[maksym.kok...@globallogic.com: fix cmdline_add_builtin() macro]
Cc: Daniel Walker 
Cc: Daniel Walker 
Signed-off-by: Daniel Walker 
Signed-off-by: Maksym Kokhan 
---
 include/linux/cmdline.h | 70 +
 init/Kconfig| 68 +++
 2 files changed, 138 insertions(+)
 create mode 100644 include/linux/cmdline.h

diff --git a/include/linux/cmdline.h b/include/linux/cmdline.h
new file mode 100644
index 000..75ef278
--- /dev/null
+++ b/include/linux/cmdline.h
@@ -0,0 +1,70 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_CMDLINE_H
+#define _LINUX_CMDLINE_H
+
+/*
+ *
+ * Copyright (C) 2015. Cisco Systems, Inc.
+ *
+ * Generic Append/Prepend cmdline support.
+ */
+
+#if defined(CONFIG_GENERIC_CMDLINE) && defined(CONFIG_CMDLINE_BOOL)
+
+#ifndef CONFIG_CMDLINE_OVERRIDE
+/*
+ * This function will append or prepend a builtin command line to the command
+ * line provided by the bootloader. Kconfig options can be used to alter
+ * the behavior of this builtin command line.
+ * @dest: The destination of the final appended/prepended string
+ * @src: The starting string or NULL if there isn't one.
+ * @tmp: temporary space used for prepending
+ * @length: the maximum length of the strings above.
+ */
+static inline void
+_cmdline_add_builtin(char *dest, char *src, char *tmp, unsigned long length)
+{
+   if (src != dest && src != NULL) {
+   strlcpy(dest, " ", length);
+   strlcat(dest, src, length);
+   }
+
+   strlcat(dest, " ", length);
+
+   if (sizeof(CONFIG_CMDLINE_APPEND) > 1)
+   strlcat(dest, CONFIG_CMDLINE_APPEND, length);
+
+   if (sizeof(CONFIG_CMDLINE_PREPEND) > 1) {
+   strlcpy(tmp, CONFIG_CMDLINE_PREPEND, length);
+   strlcat(tmp, " ", length);
+   strlcat(tmp, dest, length);
+   strlcpy(dest, tmp, length);
+   }
+}
+
+#define cmdline_add_builtin(dest, src, length) \
+{  \
+   if (sizeof(CONFIG_CMDLINE_PREPEND) > 1) {   \
+   static char cmdline_tmp_space[length] __initdata;   \
+   _cmdline_add_builtin(dest, src, cmdline_tmp_space, length); \
+   } else {\
+   _cmdline_add_builtin(dest, src, NULL, length);  \
+   }   \
+}
+#else
+#define cmdline_add_builtin(dest, src, length)\
+{ \
+   strlcpy(dest, CONFIG_CMDLINE_PREPEND " " CONFIG_CMDLINE_APPEND,\
+   length);   \
+}
+#endif /* !CONFIG_CMDLINE_OVERRIDE */
+
+#else
+#define cmdline_add_builtin(dest, src, length) { \
+   if (src != NULL)   \
+   strlcpy(dest, src, length);\
+}
+#endif /* CONFIG_GENERIC_CMDLINE */
+
+
+#endif /* _LINUX_CMDLINE_H */
diff --git a/init/Kconfig b/init/Kconfig
index 1e234e2..e5aa676 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1751,6 +1751,74 @@ config PROFILING
 config TRACEPOINTS
bool
 
+config GENERIC_CMDLINE
+   bool
+
+if GENERIC_CMDLINE
+
+config CMDLINE_BOOL
+   bool "Built-in kernel command line"
+   help
+ Allow for specifying boot arguments to the kernel at
+ build time.  On some systems (e.g. embedded ones), it is
+ necessary or convenient to provide some or all of the
+ kernel boot arguments with the kernel itself (that is,
+ to not rely on the boot loader to provide them.)
+
+ To compile command line arguments into the kernel,
+ set this option to 'Y', then fill in the
+ the boot arguments in CONFIG_CMDLINE.
+
+ Systems with fully functional boot loaders (i.e. non-embedded)
+ should leave this option set to 'N'.
+
+config CMDLINE_APPEND
+   string "Built-in kernel command string append"
+   depends on CMDLINE_BOOL
+   default ""
+   help
+ Enter arguments here that should be compiled into the kernel
+ image and 

[PATCH 0/8] add generic builtin command line

2018-09-27 Thread Maksym Kokhan
There was a series of patches [1] for 4.3.0-rc3 that allowed
architectures to use a generic builtin command line. I have rebased
these patches on kernel 4.19.0-rc4.

Things modified in comparison with the original patches:

* There was a bug for mips: in the case when CONFIG_CMDLINE_PREPEND
and CONFIG_CMDLINE_APPEND are empty and CMDLINE_OVERRIDE is not set,
the command line from the bootloader was ignored, so I fixed it by
modifying the patch "add generic builtin command line".

* Implemented new patch to resolve conflict with new kernel, which
modify EFI stub code. Unfortunately, I don't have capability to test
this modification on real arm board with EFI.

* Removed the new implementation of the mips builtin command line, which
was created after 4.3.0-rc3.

* Kernel 4.3.0-rc3 with original patches could not be compiled for
powerpc due to prom_init.c checking by prom_init_check.sh. So I added
strlcat (which is used by cmdline_add_builtin macro) to
prom_init_check.sh whitelist.

Patches have been tested in QEMU for x86, arm (little-endian), arm64
(little-endian), mips (little-endian, 32-bit) and powerpc
(big-endian, 64-bit), everything works perfectly. Also it was tested
on linux-next (next-20180924 tag) for all listed above architectures.

[1] : https://lore.kernel.org/patchwork/patch/604992/

Daniel Walker (7):
  add generic builtin command line
  drivers: of: ifdef out cmdline section
  x86: convert to generic builtin command line
  arm: convert to generic builtin command line
  arm64: convert to generic builtin command line
  mips: convert to generic builtin command line
  powerpc: convert to generic builtin command line

Maksym Kokhan (1):
  efi: modify EFI stub code for arm/arm64

 arch/arm/Kconfig| 38 +-
 arch/arm/kernel/atags_parse.c   | 14 ++-
 arch/arm/kernel/devtree.c   |  2 +
 arch/arm64/Kconfig  | 17 +---
 arch/arm64/kernel/setup.c   |  3 ++
 arch/mips/Kconfig   | 24 +--
 arch/mips/Kconfig.debug | 47 --
 arch/mips/kernel/setup.c| 41 ++-
 arch/powerpc/Kconfig| 23 +--
 arch/powerpc/kernel/prom.c  |  4 ++
 arch/powerpc/kernel/prom_init.c |  8 ++--
 arch/powerpc/kernel/prom_init_check.sh  |  2 +-
 arch/x86/Kconfig| 44 +
 arch/x86/kernel/setup.c | 19 ++---
 drivers/firmware/efi/libstub/arm-stub.c | 10 ++---
 drivers/of/fdt.c|  2 +-
 include/linux/cmdline.h | 70 +
 init/Kconfig| 68 
 18 files changed, 173 insertions(+), 263 deletions(-)
 create mode 100644 include/linux/cmdline.h

-- 
2.7.4



Re: [RFC PATCH 06/11] powerpc/tm: Refactor the __switch_to_tm code

2018-09-27 Thread Breno Leitao
Hi Mikey,

On 09/18/2018 01:04 AM, Michael Neuling wrote:
>> On top of that, both tm_reclaim_task() and tm_recheckpoint_new_task()
>> functions are not used anymore, removing them.
> 
> What about tm_reclaim_current().  This is being used in places like signals
> which I would have thought we could avoid with this series

In fact tm_reclaim_current() is the default function to reclaim. It is the
function that is called by TM_KERNEL_ENTRY; other than that, it should
never be called in the sane path.

>> + * If we got here with an active transaction, then, it was
>> + * aborted by TM_KERNEL_ENTRY and the fix the failure case
>> + * needs to be fixed, so, indepedently how we arrived here, the
>> + * new TM abort case will be TM_CAUSE_RESCHED now.
> 
> What does "fix the failure case needs to be fixed" mean?
> 
> also s/indepedently/independently/

In fact, I rewrote this paragraph. I try to say that the "TEXASR Failure
code" needs to be updated/fixed, but it became that messy and crazy
paragraph. :-(

> 
>> + */
>> +if (MSR_TM_ACTIVE(prev->thread.regs->msr)) {
>> +/*
>> + * If there was an IRQ during trecheckpoint, it will
>> + * cause an IRQ to be replayed. This replayed IRQ can
>> + * invoke SCHEDULE_USER, thus, we arrive here with a TM
>> + * active transaction.
> 
> I don't think this can happen. trecheckpoint (and treclaim) are called with 
> IRQs
> hard off (since they change r1).

There will be an IRQ check and replay later. An IRQ being replayed can cause
a process to be rescheduled and arrive here after a trecheckpoint.

> I think something else is going on here. I think this code and comment needs 
> to
> go but I assume it's here because you are seeing something.

Right, and it was my major concern about this whole review.

The tm_recheckpoint() was being called with IRQ hard off, as you said, but
the IRQ is being re-enabled later, after "trecheckpoint", when it calls
local_irq_restore().

Since the IRQ was re-enabled, there is a mechanism to check for IRQs that
were pending and execute them (after recheckpoint - hence with MSR[TS]
active). Some of these pending IRQs might cause a task reschedule, bringing
us to __switch_to with MSR[TS] active.

I also checked that there were cases where a pending IRQ was waiting to be
replayed even before tm_recheckpoint() is called. I suspect we were reaching
tm_recheckpoint() with IRQs soft-disabled.

On the v2 patchset, I am following your suggestion, which is calling
restore_tm_state() much later (at the beginning of fast_exception_return),
after the _TIF_USER_WORK_MASK section, and after the IRQ replay section also.
So, after this point, there is no way to roll back the exit to userspace after
trecheckpoint. Therefore, if there is a recheckpoint, an rfid will always come
directly after.

In order to do so, I need to remove _TIF_RESTORE_TM as part of
_TIF_USER_WORK_MASK and handle _TIF_RESTORE_TM individually later.

This seems to solve this problem, and I can remove this reclaim in the middle
of __switch_to_tm().

Thank you


Re: [RFC PATCH 01/11] powerpc/tm: Reclaim transaction on kernel entry

2018-09-27 Thread Breno Leitao
Hi Mikey,

On 09/17/2018 10:31 PM, Michael Neuling wrote:
> On Wed, 2018-09-12 at 16:40 -0300, Breno Leitao wrote:
>> This patch creates a macro that will be invoked on all entrance to the
>> kernel, so, in kernel space the transaction will be completely reclaimed
>> and not suspended anymore.
> 
> There are still some calls to tm_reclaim_current() in process.c. Should these
> probably go now, right?

Yes, we shouldn't call tm_reclaim_current() from anywhere other than
TM_KERNEL_ENTRY anymore.

That said, I think we can still have some references for tm_reclaim_current()
in some specific checkpoints. Something as:

	if (WARN_ON(MSR_TM_SUSPENDED(mfmsr())))
		tm_reclaim_current(0);

I initially wrote something like BUG_ON(MSR_TM_SUSPENDED(mfmsr())), but
scripts/checkpatch.pl suggested writing a WARN_ON() and creating a recovery
path, which seems fair.

Anyway, if you think it is not a good strategy, I can get rid of them at v2.

Thank you!


Re: [RFC PATCH 00/11] New TM Model

2018-09-27 Thread Breno Leitao
Hi Mikey,

First of all, thanks for you detailed review. I really appreciate your
comments here.

On 09/17/2018 02:25 AM, Michael Neuling wrote:
> On Wed, 2018-09-12 at 16:40 -0300, Breno Leitao wrote:
>> This patchset for the hardware transactional memory (TM) subsystem aims to
>> avoid spending a lot of time on TM suspended mode in kernel space. It
>> basically
>> changes where the reclaim/recheckpoint will be executed.
>>
>> Once a CPU enters in transactional state it uses a footprint area to track
>> down the load/stores performed in transaction so it can be verified later
>> to decide if a conflict happened due to some change done in that state. If
>> a transaction is active in userspace and there is an exception that takes
>> the CPU to the kernel space the CPU moves the transaction to suspended
>> state but does not discard the footprint area.
> 
> In this description, you should differente between memory and register
> (GPR/VSX/SPR) footprints.

Right, reading the ISA, I understand that footprint is a term for memory
only, and it represents the memory modified by stores during the
transactional state (after tbegin). For registers, the ISA talks about
checkpointed registers, which are the register state *before* a transaction
starts.

I.e., for registers it is the previous state, and for memory it is the
current/live state.

That said, if the transaction is aborted, the memory footprint is discarded
and the checkpointed registers replace the live registers.
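
A tiny userspace sketch of this behaviour, using the GCC powerpc HTM
builtins (illustrative only, not kernel code; the variable and function
names are hypothetical):

	#include <htmintrin.h>

	static volatile long x;

	static void htm_demo(void)
	{
		if (__builtin_tbegin (0)) {
			/* transactional: this store is part of the memory
			 * footprint; the register checkpoint was taken at
			 * tbegin */
			x = 42;
			__builtin_tend (0);
		} else {
			/* abort path: the store to x was discarded and the
			 * registers were restored from the checkpoint */
		}
	}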

> In suspend, the CPU can disregard the memory footprint at any point, but it 
> has> to keep the register footprint.
Yes!

Anyway, I was just trying to describe how the hardware works; the paragraph
above is not about the kernel, but I will make sure I re-write it better.

>> This patchset aims to reclaim the checkpointed footprint as soon as the
>> kernel is invoked, in the beginning of the exception handlers, thus freeing
>> room to other CPUs enter in suspended mode, avoiding too many CPUs in
>> suspended
>> state that can cause the CPUs to stall. The same mechanism is done on kernel
>> exit, doing a recheckpoint as late as possible (which will reload the
>> checkpointed state into CPU's room) at the exception return.
> 
> OK, but we are still potentially in suspend in userspace, so that doesn't help
> us on the lockup issue.
> 
> We need fake suspend in userspace to prevent lockups.

Correct. I will make sure I document it. This patchset is the very first step
to start creating a work around for the hardware limitations.

>> The way to achieve this goal is creating a macro (TM_KERNEL_ENTRY) which
>> will check if userspace was in an active transaction just after getting
>> into kernel and reclaiming if that's the case. Thus all exception handlers
>> will call this macro as soon as possible.
>>
>> All exceptions should reclaim (if necessary) at this stage and only
>> recheckpoint if the task is tagged as TIF_RESTORE_TM (i.e. was in
>> transactional state before being interrupted), which will be done at
>> ret_from_except_lite().
>>
>> Ideally all reclaims will happen at the exception entrance, however during
>> the recheckpoint process another exception can hit the CPU which might
>> cause the current thread to be rescheduled, thus there is another reclaim
>> point to be considered at __switch_to().
> 
> Can we do the recheckpoint() later so that it's when we have interrupts off 
> and
> can't be rescheduled?

Yes!  After thinking on it for a long time, this is definitely what should be
done. I will send a v2 with this change (and others being discussed here)

Thank you,
Breno


Re: [PATCH v3 1/6] dt-bindings: arm64: add compatible for LX2160A

2018-09-27 Thread Rob Herring
On Mon, 24 Sep 2018 05:38:56 +0530, Vabhav Sharma wrote:
> Add compatible for LX2160A SoC,QDS and RDB board
> 
> Signed-off-by: Vabhav Sharma 
> ---
>  Documentation/devicetree/bindings/arm/fsl.txt | 12 
>  1 file changed, 12 insertions(+)
> 

Reviewed-by: Rob Herring 


Re: [PATCH 1/2] soc: fsl: qbman: qman_portal: defer probing when qman is not available

2018-09-27 Thread Li Yang
On Wed, Sep 26, 2018 at 1:15 PM Li Yang  wrote:
>
> On Wed, Sep 26, 2018 at 4:28 AM Alexandre Belloni
>  wrote:
> >
> > On 25/09/2018 21:45:56+0200, Olof Johansson wrote:
> > > Hi,
> > >
> > >
> > > On Thu, Aug 23, 2018 at 11:36 PM Alexandre Belloni
> > >  wrote:
> > > >
> > > > If the qman driver (qman_ccsr) doesn't probe or fail to probe before
> > > > qman_portal, qm_ccsr_start will be either NULL or a stale pointer to an
> > > > unmapped page.
> > > >
> > > > This leads to a crash when probing  qman_portal as the init_pcfg 
> > > > function
> > > > calls qman_liodn_fixup that tries to read qman registers.
> > > >
> > > > Assume that qman didn't probe when the pool mask is 0.
> > > >
> > > > Signed-off-by: Alexandre Belloni 
> > > > ---
> > > >  drivers/soc/fsl/qbman/qman_portal.c | 2 ++
> > > >  1 file changed, 2 insertions(+)
> > > >
> > > > diff --git a/drivers/soc/fsl/qbman/qman_portal.c 
> > > > b/drivers/soc/fsl/qbman/qman_portal.c
> > > > index a120002b630e..4fc80d2c8feb 100644
> > > > --- a/drivers/soc/fsl/qbman/qman_portal.c
> > > > +++ b/drivers/soc/fsl/qbman/qman_portal.c
> > > > @@ -277,6 +277,8 @@ static int qman_portal_probe(struct platform_device 
> > > > *pdev)
> > > > }
> > > >
> > > > pcfg->pools = qm_get_pools_sdqcr();
> > > > +   if (pcfg->pools == 0)
> > > > +   return -EPROBE_DEFER;
> > >
> > > This is quite late in the probe, after a bunch of resources have been 
> > > claimed.
> > >
> > > Note that the ioremaps above this are doing unwinds, and you'll end up
> > > doing duplicate ioremaps if you come in and probe again.
> > >
> > > You should probably unwind those allocations, or move them to devm_*
> > > or do this check earlier in the function.
> > >
> >
> > The actual chance of having that happen is quite small (this was coming
> > from a non working DT) and I mainly wanted to avoid a crash so the
> > platform could still boot. I would think moving to devm_ would be the
> > right thing to do.
>
> Even if it is not failing with the upstreamed device trees, it is
> still good to harden the driver for possible issues.  Moving to devm_
> is definitely a right thing to do.  But I also think checking if the
> qman is already probed should be the first thing to do before starting
> to allocate resources and etc and rolling back later.  Probably we can
> move the qm_get_pools_sdqcr() to the begining of the probe to
> determine if qman is probed as it doesn't seem to depend on any of the
> setups done right now.
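
As a hedged sketch of that reordering (assuming, as suggested above, that
qm_get_pools_sdqcr() is safe to call before any of the other setup; the
struct type of pcfg is an assumption here):

	static int qman_portal_probe(struct platform_device *pdev)
	{
		struct qm_portal_config *pcfg;

		/* Defer before claiming any resources if qman itself
		 * has not successfully probed yet. */
		if (qm_get_pools_sdqcr() == 0)
			return -EPROBE_DEFER;

		/* ... existing allocations and ioremaps follow ... */
	}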

I just found out Laurentiu also included the following patches in his
SMMU patch series (although not necessarily related to SMMU) which
also fix the same problem.  I think they are more straightforward and
can deal with the case where qman failed to probe.  So we can take
these to fix this problem instead in 4.19.

https://patchwork.kernel.org/patch/10616021/
https://patchwork.kernel.org/patch/10616019/
https://patchwork.kernel.org/patch/10615971/

Regards,
Leo


Re: [PATCH 0/8] add generic builtin command line

2018-09-27 Thread Daniel Walker

On 09/27/2018 10:05 AM, Ard Biesheuvel wrote:

On 27 September 2018 at 18:55, Maksym Kokhan
 wrote:

There were series of patches [1] for 4.3.0-rc3, that allowed
architectures to use a generic builtin command line. I have rebased
these patches on kernel 4.19.0-rc4.



Could you please elaborate on the purpose of this series? Is it simply
to align between architectures? Does it solve an actual problem?


1) It removes a lot of code duplication between architectures.

2) At Cisco we have issues where our bootloaders have default boot
arguments. On some platforms we can't update the boot loader, and it's
helpful to be able to have boot arguments which are prepended to the
bootloader arguments, and some parameters which are appended. These
changes allow that.
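
As a hedged illustration of the intended behaviour (option values are
hypothetical; the call matches the powerpc patch in this series):

	/* With CONFIG_CMDLINE_PREPEND="earlycon" and
	 * CONFIG_CMDLINE_APPEND="quiet", and boot_command_line already
	 * holding the bootloader-provided "root=/dev/sda2": */
	cmdline_add_builtin(boot_command_line, NULL, COMMAND_LINE_SIZE);
	/* boot_command_line now reads roughly:
	 *   "earlycon root=/dev/sda2 quiet" */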


Daniel


Re: [PATCH] powerpc/rtas: Fix a potential race between CPU-Offline & Migration

2018-09-27 Thread Nathan Fontenot
On 09/27/2018 11:51 AM, Gautham R. Shenoy wrote:
> From: "Gautham R. Shenoy" 
> 
> Live Partition Migrations require all the present CPUs to execute the
> H_JOIN call, and hence rtas_ibm_suspend_me() onlines any offline CPUs
> before initiating the migration for this purpose.
> 
> The commit 85a88cabad57
> ("powerpc/pseries: Disable CPU hotplug across migrations")
> disables any CPU-hotplug operations once all the offline CPUs are
> brought online to prevent any further state change. Once the
> CPU-Hotplug operation is disabled, the code assumes that all the CPUs
> are online.
> 
> However, there is a minor window in rtas_ibm_suspend_me() between
> onlining the offline CPUs and disabling CPU-Hotplug when a concurrent
> CPU-offline operations initiated by the userspace can succeed thereby
> nullifying the the aformentioned assumption. In this unlikely case
> these offlined CPUs will not call H_JOIN, resulting in a system hang.
> 
> Fix this by verifying that all the present CPUs are actually online
> after CPU-Hotplug has been disabled, failing which we return from
> rtas_ibm_suspend_me() with -EBUSY.

Would we also want to have the ability to re-try onlining all of the cpus
before failing the migration?

This would involve a bigger code change, as the current code to online all
CPUs would not work in its current form.

-Nathan

> 
> Cc: Nathan Fontenot 
> Cc: Tyrel Datwyler 
> Suggested-by: Michael Ellerman 
> Signed-off-by: Gautham R. Shenoy 
> ---
>  arch/powerpc/kernel/rtas.c | 10 ++
>  1 file changed, 10 insertions(+)
> 
> diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
> index 2c7ed31..27f6fd3 100644
> --- a/arch/powerpc/kernel/rtas.c
> +++ b/arch/powerpc/kernel/rtas.c
> @@ -982,6 +982,16 @@ int rtas_ibm_suspend_me(u64 handle)
>   }
> 
>   cpu_hotplug_disable();
> +
> + /* Check if we raced with a CPU-Offline Operation */
> + if (unlikely(!cpumask_equal(cpu_present_mask, cpu_online_mask))) {
> + pr_err("%s: Raced against a concurrent CPU-Offline\n",
> +__func__);
> + atomic_set(&data.error, -EBUSY);
> + cpu_hotplug_enable();
> + goto out;
> + }
> +
>   stop_topology_update();
> 
>   /* Call function on all CPUs.  One of us will make the
> 



[PATCH] kdb: print real address of pointers instead of hashed addresses

2018-09-27 Thread Christophe Leroy
Since commit ad67b74d2469 ("printk: hash addresses printed with %p"),
all pointers printed with %p are printed with hashed addresses
instead of real addresses in order to avoid leaking addresses in
dmesg and syslog. But this applies to kdb too, which is unfortunate:

Entering kdb (current=0x(ptrval), pid 329) due to Keyboard Entry
kdb> ps
15 sleeping system daemon (state M) processes suppressed,
use 'ps A' to see all.
Task Addr   Pid   Parent [*] cpu State Thread Command
0x(ptrval)  329  328  10   R  0x(ptrval) *sh

0x(ptrval)10  00   S  0x(ptrval)  init
0x(ptrval)32  00   D  0x(ptrval)  rcu_gp
0x(ptrval)42  00   D  0x(ptrval)  rcu_par_gp
0x(ptrval)52  00   D  0x(ptrval)  kworker/0:0
0x(ptrval)62  00   D  0x(ptrval)  kworker/0:0H
0x(ptrval)72  00   D  0x(ptrval)  kworker/u2:0
0x(ptrval)82  00   D  0x(ptrval)  mm_percpu_wq
0x(ptrval)   102  00   D  0x(ptrval)  rcu_preempt

The whole purpose of kdb is to debug, and for debugging real addresses
need to be known. In addition, data displayed by kdb doesn't go into
dmesg.

This patch replaces all %p by %px in kdb in order to display real
addresses.
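
For reference, a minimal sketch of the difference outside kdb (ptr is an
arbitrary pointer; the printed values are illustrative only):

	pr_info("hashed: %p\n", ptr);   /* e.g. "hashed: 000000006acbd21f" */
	pr_info("raw:    %px\n", ptr);  /* e.g. "raw:    c000000012345678" */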

Fixes: ad67b74d2469 ("printk: hash addresses printed with %p")
Cc: 
Signed-off-by: Christophe Leroy 
---
 kernel/debug/kdb/kdb_main.c| 14 +++---
 kernel/debug/kdb/kdb_support.c | 12 ++--
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 2ddfce8f1e8f..f338d23b112b 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -1192,7 +1192,7 @@ static int kdb_local(kdb_reason_t reason, int error, 
struct pt_regs *regs,
if (reason == KDB_REASON_DEBUG) {
/* special case below */
} else {
-   kdb_printf("\nEntering kdb (current=0x%p, pid %d) ",
+   kdb_printf("\nEntering kdb (current=0x%px, pid %d) ",
   kdb_current, kdb_current ? kdb_current->pid : 0);
 #if defined(CONFIG_SMP)
kdb_printf("on processor %d ", raw_smp_processor_id());
@@ -1208,7 +1208,7 @@ static int kdb_local(kdb_reason_t reason, int error, 
struct pt_regs *regs,
 */
switch (db_result) {
case KDB_DB_BPT:
-   kdb_printf("\nEntering kdb (0x%p, pid %d) ",
+   kdb_printf("\nEntering kdb (0x%px, pid %d) ",
   kdb_current, kdb_current->pid);
 #if defined(CONFIG_SMP)
kdb_printf("on processor %d ", raw_smp_processor_id());
@@ -2048,7 +2048,7 @@ static int kdb_lsmod(int argc, const char **argv)
if (mod->state == MODULE_STATE_UNFORMED)
continue;
 
-   kdb_printf("%-20s%8u  0x%p ", mod->name,
+   kdb_printf("%-20s%8u  0x%px ", mod->name,
   mod->core_layout.size, (void *)mod);
 #ifdef CONFIG_MODULE_UNLOAD
kdb_printf("%4d ", module_refcount(mod));
@@ -2059,7 +2059,7 @@ static int kdb_lsmod(int argc, const char **argv)
kdb_printf(" (Loading)");
else
kdb_printf(" (Live)");
-   kdb_printf(" 0x%p", mod->core_layout.base);
+   kdb_printf(" 0x%px", mod->core_layout.base);
 
 #ifdef CONFIG_MODULE_UNLOAD
{
@@ -2341,7 +2341,7 @@ void kdb_ps1(const struct task_struct *p)
return;
 
cpu = kdb_process_cpu(p);
-   kdb_printf("0x%p %8d %8d  %d %4d   %c  0x%p %c%s\n",
+   kdb_printf("0x%px %8d %8d  %d %4d   %c  0x%px %c%s\n",
   (void *)p, p->pid, p->parent->pid,
   kdb_task_has_cpu(p), kdb_process_cpu(p),
   kdb_task_state_char(p),
@@ -2354,7 +2354,7 @@ void kdb_ps1(const struct task_struct *p)
} else {
if (KDB_TSK(cpu) != p)
kdb_printf("  Error: does not match running "
-  "process table (0x%p)\n", KDB_TSK(cpu));
+  "process table (0x%px)\n", KDB_TSK(cpu));
}
}
 }
@@ -2692,7 +2692,7 @@ int kdb_register_flags(char *cmd,
for_each_kdbcmd(kp, i) {
if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) {
kdb_printf("Duplicate kdb command registered: "
-   "%s, func %p help %s\n", cmd, func, help);
+   "%s, func %px help %s\n", cmd, func, help);
return 1;
}
}
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index 990b3cc526c8..987eb73284d2 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/d

[PATCH v2] kdb: use correct pointer when 'btc' calls 'btt'

2018-09-27 Thread Christophe Leroy
On a powerpc 8xx, 'btc' fails as follows:

Entering kdb (current=0x(ptrval), pid 282) due to Keyboard Entry
kdb> btc
btc: cpu status: Currently on cpu 0
Available cpus: 0
kdb_getarea: Bad address 0x0

When booting the kernel with 'debug_boot_weak_hash', it fails as well:

Entering kdb (current=0xba99ad80, pid 284) due to Keyboard Entry
kdb> btc
btc: cpu status: Currently on cpu 0
Available cpus: 0
kdb_getarea: Bad address 0xba99ad80

On other platforms, Oopses have been observed too, see
https://github.com/linuxppc/linux/issues/139

This is due to btc calling 'btt' with a %p-hashed pointer as an argument.

This patch replaces %p by %px to get the real pointer value, as
expected by 'btt'.

Fixes: ad67b74d2469 ("printk: hash addresses printed with %p")
Cc: 
Reviewed-by: Daniel Thompson 
Signed-off-by: Christophe Leroy 
---
 v2: Added fixes: and Daniel's reviewed-by: in commit's log. No code change.

 kernel/debug/kdb/kdb_bt.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
index 6ad4a9fcbd6f..7921ae4fca8d 100644
--- a/kernel/debug/kdb/kdb_bt.c
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -179,14 +179,14 @@ kdb_bt(int argc, const char **argv)
kdb_printf("no process for cpu %ld\n", cpu);
return 0;
}
-   sprintf(buf, "btt 0x%p\n", KDB_TSK(cpu));
+   sprintf(buf, "btt 0x%px\n", KDB_TSK(cpu));
kdb_parse(buf);
return 0;
}
kdb_printf("btc: cpu status: ");
kdb_parse("cpu\n");
for_each_online_cpu(cpu) {
-   sprintf(buf, "btt 0x%p\n", KDB_TSK(cpu));
+   sprintf(buf, "btt 0x%px\n", KDB_TSK(cpu));
kdb_parse(buf);
touch_nmi_watchdog();
}
-- 
2.13.3



Re: [PATCH 0/8] add generic builtin command line

2018-09-27 Thread Ard Biesheuvel
On 27 September 2018 at 18:55, Maksym Kokhan
 wrote:
> There were series of patches [1] for 4.3.0-rc3, that allowed
> architectures to use a generic builtin command line. I have rebased
> these patches on kernel 4.19.0-rc4.
>

Could you please elaborate on the purpose of this series? Is it simply
to align between architectures? Does it solve an actual problem?

> Things, modified in comparison with original patches:
> * There was some bug for mips, in the case when CONFIG_CMDLINE_PREPEND
> and CONFIG_CMDLINE_APPEND are empty and CMDLINE_OVERRIDE is not set,
> command line from bootloader was ignored, so I fixed it, modifying
> patch "add generic builtin command line".
>
> * Implemented new patch to resolve conflict with new kernel, which
> modify EFI stub code. Unfortunately, I don't have capability to test
> this modification on real arm board with EFI.
>
> * Removed new realisation of mips builtin command line, which was
> created after 4.3.0-rc3.
>
> * Kernel 4.3.0-rc3 with original patches could not be compiled for
> powerpc due to prom_init.c checking by prom_init_check.sh. So I added
> strlcat (which is used by cmdline_add_builtin macro) to
> prom_init_check.sh whitelist.
>
> Patches have been tested in QEMU for x86, arm (little-endian), arm64
> (little-endian), mips (little-endian, 32-bit) and powerpc
> (big-endian, 64-bit), everything works perfectly. Also it was tested
> on linux-next (next-20180924 tag) for all listed above architectures.
>
> [1] : https://lore.kernel.org/patchwork/patch/604992/
>
> Daniel Walker (7):
>   add generic builtin command line
>   drivers: of: ifdef out cmdline section
>   x86: convert to generic builtin command line
>   arm: convert to generic builtin command line
>   arm64: convert to generic builtin command line
>   mips: convert to generic builtin command line
>   powerpc: convert to generic builtin command line
>
> Maksym Kokhan (1):
>   efi: modify EFI stub code for arm/arm64
>
>  arch/arm/Kconfig| 38 +-
>  arch/arm/kernel/atags_parse.c   | 14 ++-
>  arch/arm/kernel/devtree.c   |  2 +
>  arch/arm64/Kconfig  | 17 +---
>  arch/arm64/kernel/setup.c   |  3 ++
>  arch/mips/Kconfig   | 24 +--
>  arch/mips/Kconfig.debug | 47 --
>  arch/mips/kernel/setup.c| 41 ++-
>  arch/powerpc/Kconfig| 23 +--
>  arch/powerpc/kernel/prom.c  |  4 ++
>  arch/powerpc/kernel/prom_init.c |  8 ++--
>  arch/powerpc/kernel/prom_init_check.sh  |  2 +-
>  arch/x86/Kconfig| 44 +
>  arch/x86/kernel/setup.c | 19 ++---
>  drivers/firmware/efi/libstub/arm-stub.c | 10 ++---
>  drivers/of/fdt.c|  2 +-
>  include/linux/cmdline.h | 70 
> +
>  init/Kconfig| 68 
>  18 files changed, 173 insertions(+), 263 deletions(-)
>  create mode 100644 include/linux/cmdline.h
>
> --
> 2.7.4
>


[PATCH] powerpc/rtas: Fix a potential race between CPU-Offline & Migration

2018-09-27 Thread Gautham R. Shenoy
From: "Gautham R. Shenoy" 

Live Partition Migrations require all the present CPUs to execute the
H_JOIN call, and hence rtas_ibm_suspend_me() onlines any offline CPUs
before initiating the migration for this purpose.

The commit 85a88cabad57
("powerpc/pseries: Disable CPU hotplug across migrations")
disables any CPU-hotplug operations once all the offline CPUs are
brought online to prevent any further state change. Once the
CPU-Hotplug operation is disabled, the code assumes that all the CPUs
are online.

However, there is a minor window in rtas_ibm_suspend_me() between
onlining the offline CPUs and disabling CPU-Hotplug when a concurrent
CPU-offline operation initiated by userspace can succeed, thereby
nullifying the aforementioned assumption. In this unlikely case
these offlined CPUs will not call H_JOIN, resulting in a system hang.

Fix this by verifying that all the present CPUs are actually online
after CPU-Hotplug has been disabled, failing which we return from
rtas_ibm_suspend_me() with -EBUSY.

Cc: Nathan Fontenot 
Cc: Tyrel Datwyler 
Suggested-by: Michael Ellerman 
Signed-off-by: Gautham R. Shenoy 
---
 arch/powerpc/kernel/rtas.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index 2c7ed31..27f6fd3 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -982,6 +982,16 @@ int rtas_ibm_suspend_me(u64 handle)
}
 
cpu_hotplug_disable();
+
+   /* Check if we raced with a CPU-Offline Operation */
+   if (unlikely(!cpumask_equal(cpu_present_mask, cpu_online_mask))) {
+   pr_err("%s: Raced against a concurrent CPU-Offline\n",
+  __func__);
+   atomic_set(&data.error, -EBUSY);
+   cpu_hotplug_enable();
+   goto out;
+   }
+
stop_topology_update();
 
/* Call function on all CPUs.  One of us will make the
-- 
1.9.4



Re: [PATCH 4/5] dma-direct: implement complete bus_dma_mask handling

2018-09-27 Thread Robin Murphy

On 27/09/18 17:27, Christoph Hellwig wrote:

On Thu, Sep 27, 2018 at 05:14:56PM +0100, Robin Murphy wrote:

This just seemed more readable to me than min_not_zero, but if others
prefer min_not_zero I can switch.


Nah, just checking whether there were any intentionally different
assumptions compared to the couple of other places in the patch where
min_not_zero() *is* used. If it's purely a style thing then no worries
(personally I'd have written it yet another way anyway).


I'm curious: how would you have written it?


Come to think of it, I actually already have, in iommu-dma:

if (dev->bus_dma_mask)
mask &= dev->bus_dma_mask;

but of course it's not so pretty for those cases where you don't already 
have the local variable ready to go.
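
(For comparison, a hedged sketch of the equivalent min_not_zero() form,
which gives the same result for the usual 2^n-1 mask values:

	mask = min_not_zero(mask, dev->bus_dma_mask);
)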


Robin.


Re: [PATCH 4/5] dma-direct: implement complete bus_dma_mask handling

2018-09-27 Thread Christoph Hellwig
On Thu, Sep 27, 2018 at 05:14:56PM +0100, Robin Murphy wrote:
>> This just seemed more readable to me than min_not_zero, but if others
>> prefer min_not_zero I can switch.
>
> Nah, just checking whether there were any intentionally different 
> assumptions compared to the couple of other places in the patch where 
> min_not_zero() *is* used. If it's purely a style thing then no worries 
> (personally I'd have written it yet another way anyway).

I'm curious: how would you have written it?


Re: [PATCH 4/5] dma-direct: implement complete bus_dma_mask handling

2018-09-27 Thread Robin Murphy

On 27/09/18 16:32, Christoph Hellwig wrote:

On Thu, Sep 27, 2018 at 03:58:04PM +0100, Robin Murphy wrote:

   }
   #endif /* !CONFIG_ARCH_HAS_PHYS_TO_DMA */
   diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 3c404e33d946..64466b7ef67b 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -43,10 +43,11 @@ check_addr(struct device *dev, dma_addr_t dma_addr, size_t 
size,
return false;
}
   -if (*dev->dma_mask >= DMA_BIT_MASK(32)) {
+   if (*dev->dma_mask >= DMA_BIT_MASK(32) || dev->bus_dma_mask) {


Hmm... say *dev->dma_mask is 31 bits and dev->bus_dma_mask is 40 bits due
to a global DT property, we'll now scream where we didn't before even
though the bus mask is almost certainly irrelevant - is that desirable?


This is just the reporting in the error case - we'll only hit this
IFF dma_capable already returned false.  But if you don't want a message
here we can probably drop this hunk.


It was only a question of consistency - whatever the reason was to avoid 
warning for small device masks before, it's not obvious why it wouldn't 
still apply in the presence of nonzero but larger bus mask. The fact 
that the current check is there at all implied to me that we're 
expecting less-capable devices to hit this often and thus wanted to avoid 
being noisy. If that's not the case then it's fine as-is AFAIC.



@@ -65,12 +66,18 @@ u64 dma_direct_get_required_mask(struct device *dev)
   {
u64 max_dma = phys_to_dma_direct(dev, (max_pfn - 1) << PAGE_SHIFT);
   +if (dev->bus_dma_mask && dev->bus_dma_mask < max_dma)
+   max_dma = dev->bus_dma_mask;


Again, I think we could just do another min_not_zero() here. A device wired
to address only one single page of RAM isn't a realistic prospect (and we
could just flip the -1 and the shift in the max_dma calculation if we
*really* wanted to support such things).


This just seemed more readable to me than min_not_zero, but if others
prefer min_not_zero I can switch.


Nah, just checking whether there were any intentionally different 
assumptions compared to the couple of other places in the patch where 
min_not_zero() *is* used. If it's purely a style thing then no worries 
(personally I'd have written it yet another way anyway).


Robin.


Re: [PATCH 3/5] dma-direct: refine dma_direct_alloc zone selection

2018-09-27 Thread Christoph Hellwig
On Thu, Sep 27, 2018 at 04:38:31PM +0100, Robin Murphy wrote:
> On 27/09/18 16:30, Christoph Hellwig wrote:
>> On Thu, Sep 27, 2018 at 03:30:20PM +0100, Robin Murphy wrote:
+static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask,
+  u64 *phys_mask)
 +{
 +  if (force_dma_unencrypted())
 +  *phys_mask = __dma_to_phys(dev, dma_mask);
 +  else
 +  *phys_mask = dma_to_phys(dev, dma_mask);
>>>
>>> Maybe make phys_to_dma_direct() take u64 instead of phys_addr_t so we can
>>> reuse it here?
>>
>> This is a dma_to_phys and not a phys_to_dma.
>
> Ugh, clearly it's time to stop reviewing patches for today... sorry :(

I actually made the same mistake when writing it..

Although I'd really like to see some feedback from you on the arm64
swiotlb series once you've had more coffee ;-)


Re: [PATCH 3/5] dma-direct: refine dma_direct_alloc zone selection

2018-09-27 Thread Robin Murphy

On 27/09/18 16:30, Christoph Hellwig wrote:

On Thu, Sep 27, 2018 at 03:30:20PM +0100, Robin Murphy wrote:

   +static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64
dma_mask,
+   u64 *phys_mask)
+{
+   if (force_dma_unencrypted())
+   *phys_mask = __dma_to_phys(dev, dma_mask);
+   else
+   *phys_mask = dma_to_phys(dev, dma_mask);


Maybe make phys_to_dma_direct() take u64 instead of phys_addr_t so we can
reuse it here?


This is a dma_to_phys and not a phys_to_dma.


Ugh, clearly it's time to stop reviewing patches for today... sorry :(

Robin.


Re: [PATCH 2/5] dma-direct: add an explicit dma_direct_get_required_mask

2018-09-27 Thread Robin Murphy

On 27/09/18 16:28, Christoph Hellwig wrote:

On Thu, Sep 27, 2018 at 03:12:25PM +0100, Robin Murphy wrote:

+u64 dma_direct_get_required_mask(struct device *dev)
+{
+   u64 max_dma = phys_to_dma_direct(dev, (max_pfn - 1) << PAGE_SHIFT);
+
+   return (1ULL << (fls64(max_dma) - 1)) * 2 - 1;


I think that may as well just use __fls64() - it seems reasonable to assume
max_dma > 0. Otherwise,


Is there any good reason to micro-optimize given that this isn't
a fast path?


Not at all, I wasn't even thinking in terms of optimisation other than 
in terms of number of source characters and levels of parentheses.


But more importantly I was also being a big idiot because no matter how 
much I have the fls()/__fls() thing in mind, __fls64() doesn't actually 
exist. Nitpick rescinded!


Robin.
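
For reference, the expression in question rounds the highest reachable DMA
address up to a full 2^n - 1 mask; a worked example with an arbitrary value:

    u64 max_dma = 0x123456789ULL;   /* DMA address of the last byte of RAM */
    u64 mask = (1ULL << (fls64(max_dma) - 1)) * 2 - 1;
    /* fls64(0x123456789) == 33, so mask == (1ULL << 32) * 2 - 1
     * == 0x1ffffffff, i.e. a 33-bit mask. */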


Re: [PATCH 4/5] dma-direct: implement complete bus_dma_mask handling

2018-09-27 Thread Christoph Hellwig
On Thu, Sep 27, 2018 at 03:58:04PM +0100, Robin Murphy wrote:
>>   }
>>   #endif /* !CONFIG_ARCH_HAS_PHYS_TO_DMA */
>>   diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
>> index 3c404e33d946..64466b7ef67b 100644
>> --- a/kernel/dma/direct.c
>> +++ b/kernel/dma/direct.c
>> @@ -43,10 +43,11 @@ check_addr(struct device *dev, dma_addr_t dma_addr, 
>> size_t size,
>>  return false;
>>  }
>>   -  if (*dev->dma_mask >= DMA_BIT_MASK(32)) {
>> +if (*dev->dma_mask >= DMA_BIT_MASK(32) || dev->bus_dma_mask) {
>
> Hmm... say *dev->dma_mask is 31 bits and dev->bus_dma_mask is 40 bits due 
> to a global DT property, we'll now scream where we didn't before even 
> though the bus mask is almost certainly irrelevant - is that desirable?

This is just the reporting in the error case - we'll only hit this
IFF dma_capable already returned false.  But if you don't want a message
here we can probably drop this hunk.

>> @@ -65,12 +66,18 @@ u64 dma_direct_get_required_mask(struct device *dev)
>>   {
>>  u64 max_dma = phys_to_dma_direct(dev, (max_pfn - 1) << PAGE_SHIFT);
>>   +  if (dev->bus_dma_mask && dev->bus_dma_mask < max_dma)
>> +max_dma = dev->bus_dma_mask;
>
> Again, I think we could just do another min_not_zero() here. A device wired 
> to address only one single page of RAM isn't a realistic prospect (and we 
> could just flip the -1 and the shift in the max_dma calculation if we 
> *really* wanted to support such things).

This just seemed more readable to me than min_not_zero, but if others
prefer min_not_zero I can switch.


Re: [PATCH 3/5] dma-direct: refine dma_direct_alloc zone selection

2018-09-27 Thread Christoph Hellwig
On Thu, Sep 27, 2018 at 03:30:20PM +0100, Robin Murphy wrote:
>>   +static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 
>> dma_mask,
>> +u64 *phys_mask)
>> +{
>> +if (force_dma_unencrypted())
>> +*phys_mask = __dma_to_phys(dev, dma_mask);
>> +else
>> +*phys_mask = dma_to_phys(dev, dma_mask);
>
> Maybe make phys_to_dma_direct() take u64 instead of phys_addr_t so we can 
> reuse it here?

This is a dma_to_phys and not a phys_to_dma.


Re: [PATCH 2/5] dma-direct: add an explicit dma_direct_get_required_mask

2018-09-27 Thread Christoph Hellwig
On Thu, Sep 27, 2018 at 03:12:25PM +0100, Robin Murphy wrote:
>> +u64 dma_direct_get_required_mask(struct device *dev)
>> +{
>> +u64 max_dma = phys_to_dma_direct(dev, (max_pfn - 1) << PAGE_SHIFT);
>> +
>> +return (1ULL << (fls64(max_dma) - 1)) * 2 - 1;
>
> I think that may as well just use __fls64() - it seems reasonable to assume 
> max_dma > 0. Otherwise,

Is there any good reason to micro-optimize given that this isn't
a fast path?


Re: [PATCH 5/5] dma-direct: always allow dma mask <= physical memory size

2018-09-27 Thread Robin Murphy
[ oops, I should have looked at the replies first, now I see Ben already 
had the same thing to say about #3... ]


On 27/09/18 14:49, Christoph Hellwig wrote:

On Thu, Sep 27, 2018 at 11:50:14AM +1000, Benjamin Herrenschmidt wrote:

-* to be able to satisfy them - either by not supporting more physical
-* memory, or by providing a ZONE_DMA32.  If neither is the case, the
-* architecture needs to use an IOMMU instead of the direct mapping.
-*/
-   if (mask < phys_to_dma(dev, DMA_BIT_MASK(32)))
+   u64 min_mask;
+
+   if (IS_ENABLED(CONFIG_ZONE_DMA))
+   min_mask = DMA_BIT_MASK(ARCH_ZONE_DMA_BITS);
+   else
+   min_mask = min_t(u64, DMA_BIT_MASK(32),
+(max_pfn - 1) << PAGE_SHIFT);
+
+   if (mask >= phys_to_dma(dev, min_mask))
return 0;


nitpick ... to be completely "correct", I would have written

if (IS_ENABLED(CONFIG_ZONE_DMA))
min_mask = DMA_BIT_MASK(ARCH_ZONE_DMA_BITS);
else
min_mask = DMA_BIT_MASK(32);

min_mask = min_t(u64, min_mask, (max_pfn - 1) << PAGE_SHIFT);

In "theory" it's also ok to have a mask < ZONE_DMA_BITS as long as it's
big enough to fit all memory :-)


Yeah, we could do that.


FWIW I like it even if just for looking slightly more readable. With 
that fixup,


Reviewed-by: Robin Murphy 
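
To see the effect of that version with hypothetical numbers - 1 GiB of RAM,
4 KiB pages, no ZONE_DMA and a 1:1 phys-to-DMA mapping:

    u64 min_mask = DMA_BIT_MASK(32);    /* 0xffffffff */
    min_mask = min_t(u64, min_mask, (max_pfn - 1) << PAGE_SHIFT);
    /* max_pfn == 0x40000, so min_mask == 0x3ffff000; a 30-bit device
     * mask (0x3fffffff) now passes dma_direct_supported() because it
     * covers all of RAM. */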


Re: [PATCH 4/5] dma-direct: implement complete bus_dma_mask handling

2018-09-27 Thread Robin Murphy

On 20/09/18 19:52, Christoph Hellwig wrote:

Instead of rejecting devices with a too small bus_dma_mask, we can handle
them by taking the bus dma_mask into account for allocations and bounce
buffering decisions.

Signed-off-by: Christoph Hellwig 
---
  include/linux/dma-direct.h |  3 ++-
  kernel/dma/direct.c| 21 +++--
  2 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
index b79496d8c75b..fbca184ff5a0 100644
--- a/include/linux/dma-direct.h
+++ b/include/linux/dma-direct.h
@@ -27,7 +27,8 @@ static inline bool dma_capable(struct device *dev, dma_addr_t 
addr, size_t size)
if (!dev->dma_mask)
return false;
  
-	return addr + size - 1 <= *dev->dma_mask;

+   return addr + size - 1 <=
+   min_not_zero(*dev->dma_mask, dev->bus_dma_mask);
  }
  #endif /* !CONFIG_ARCH_HAS_PHYS_TO_DMA */
  
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c

index 3c404e33d946..64466b7ef67b 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -43,10 +43,11 @@ check_addr(struct device *dev, dma_addr_t dma_addr, size_t 
size,
return false;
}
  
-		if (*dev->dma_mask >= DMA_BIT_MASK(32)) {

+   if (*dev->dma_mask >= DMA_BIT_MASK(32) || dev->bus_dma_mask) {


Hmm... say *dev->dma_mask is 31 bits and dev->bus_dma_mask is 40 bits 
due to a global DT property, we'll now scream where we didn't before 
even though the bus mask is almost certainly irrelevant - is that desirable?



dev_err(dev,
-   "%s: overflow %pad+%zu of device mask %llx\n",
-   caller, &dma_addr, size, *dev->dma_mask);
+   "%s: overflow %pad+%zu of device mask %llx bus mask 
%llx\n",
+   caller, &dma_addr, size,
+   *dev->dma_mask, dev->bus_dma_mask);
}
return false;
}
@@ -65,12 +66,18 @@ u64 dma_direct_get_required_mask(struct device *dev)
  {
u64 max_dma = phys_to_dma_direct(dev, (max_pfn - 1) << PAGE_SHIFT);
  
+	if (dev->bus_dma_mask && dev->bus_dma_mask < max_dma)

+   max_dma = dev->bus_dma_mask;


Again, I think we could just do another min_not_zero() here. A device 
wired to address only one single page of RAM isn't a realistic prospect 
(and we could just flip the -1 and the shift in the max_dma calculation 
if we *really* wanted to support such things).



+
return (1ULL << (fls64(max_dma) - 1)) * 2 - 1;
  }
  
  static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask,

u64 *phys_mask)
  {
+   if (dev->bus_dma_mask && dev->bus_dma_mask < dma_mask)
+   dma_mask = dev->bus_dma_mask;
+


Similarly, can't we assume dma_mask to be nonzero here too? It feels 
like we really shouldn't have managed to get this far without one.


Robin.


if (force_dma_unencrypted())
*phys_mask = __dma_to_phys(dev, dma_mask);
else
@@ -87,7 +94,7 @@ static gfp_t __dma_direct_optimal_gfp_mask(struct device 
*dev, u64 dma_mask,
  static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size)
  {
return phys_to_dma_direct(dev, phys) + size - 1 <=
-   dev->coherent_dma_mask;
+   min_not_zero(dev->coherent_dma_mask, dev->bus_dma_mask);
  }
  
  void *dma_direct_alloc_pages(struct device *dev, size_t size,

@@ -291,12 +298,6 @@ int dma_direct_supported(struct device *dev, u64 mask)
if (mask < phys_to_dma(dev, DMA_BIT_MASK(32)))
return 0;
  #endif
-   /*
-* Upstream PCI/PCIe bridges or SoC interconnects may not carry
-* as many DMA address bits as the device itself supports.
-*/
-   if (dev->bus_dma_mask && mask > dev->bus_dma_mask)
-   return 0;
return 1;
  }
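
To make the new clamping concrete - an illustrative case, not from the
patch: a 64-bit capable device behind an interconnect limited to 40 bits,
e.g. via a DT dma-ranges property:

    *dev->dma_mask    = DMA_BIT_MASK(64);
    dev->bus_dma_mask = DMA_BIT_MASK(40);
    /* dma_capable() now tests addr + size - 1 against
     * min_not_zero(*dev->dma_mask, dev->bus_dma_mask) == DMA_BIT_MASK(40),
     * so transfers above 1 TiB fail dma_capable() and can be bounced,
     * while the device is no longer rejected at dma_supported() time. */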
  



Re: [PATCH 3/5] dma-direct: refine dma_direct_alloc zone selection

2018-09-27 Thread Robin Murphy

On 20/09/18 19:52, Christoph Hellwig wrote:

We need to take the DMA offset and encryption bit into account when
selecting a zone.  Use the opportunity to factor out the zone
selection into a helper for reuse.

Signed-off-by: Christoph Hellwig 
---
  kernel/dma/direct.c | 31 +--
  1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 81b73a5bba54..3c404e33d946 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -68,6 +68,22 @@ u64 dma_direct_get_required_mask(struct device *dev)
return (1ULL << (fls64(max_dma) - 1)) * 2 - 1;
  }
  
+static gfp_t __dma_direct_optimal_gfp_mask(struct device *dev, u64 dma_mask,

+   u64 *phys_mask)
+{
+   if (force_dma_unencrypted())
+   *phys_mask = __dma_to_phys(dev, dma_mask);
+   else
+   *phys_mask = dma_to_phys(dev, dma_mask);


Maybe make phys_to_dma_direct() take u64 instead of phys_addr_t so we 
can reuse it here?



+   /* GFP_DMA32 and GFP_DMA are no ops without the corresponding zones: */
+   if (*phys_mask <= DMA_BIT_MASK(ARCH_ZONE_DMA_BITS))
+   return GFP_DMA;


If we don't have ZONE_DMA it would in theory be a tiny bit better to 
fall back to GFP_DMA32 instead of returning 0 here, but I'm not sure 
it's worth the bother.



+   if (*phys_mask <= DMA_BIT_MASK(32))
+   return GFP_DMA32;
+   return 0;
+}
+
  static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size)
  {
return phys_to_dma_direct(dev, phys) + size - 1 <=
@@ -80,17 +96,13 @@ void *dma_direct_alloc_pages(struct device *dev, size_t 
size,
unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
int page_order = get_order(size);
struct page *page = NULL;
+   u64 phys_mask;
void *ret;
  
  	/* we always manually zero the memory once we are done: */

gfp &= ~__GFP_ZERO;
-
-   /* GFP_DMA32 and GFP_DMA are no ops without the corresponding zones: */
-   if (dev->coherent_dma_mask <= DMA_BIT_MASK(ARCH_ZONE_DMA_BITS))
-   gfp |= GFP_DMA;
-   if (dev->coherent_dma_mask <= DMA_BIT_MASK(32) && !(gfp & GFP_DMA))
-   gfp |= GFP_DMA32;
-
+   gfp |= __dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
+   &phys_mask);
  again:
/* CMA can be used only in the context which permits sleeping */
if (gfpflags_allow_blocking(gfp)) {
@@ -109,15 +121,14 @@ void *dma_direct_alloc_pages(struct device *dev, size_t 
size,
page = NULL;
  
  		if (IS_ENABLED(CONFIG_ZONE_DMA32) &&

-   dev->coherent_dma_mask < DMA_BIT_MASK(64) &&
+   phys_mask < DMA_BIT_MASK(64) &&
!(gfp & (GFP_DMA32 | GFP_DMA))) {
gfp |= GFP_DMA32;
goto again;


Ah right, in that situation we should probably end up here anyway, so 
that's good enough - definitely not worth any more #ifdeffery above. 
Nits aside,


Reviewed-by: Robin Murphy 


}
  
  		if (IS_ENABLED(CONFIG_ZONE_DMA) &&

-   dev->coherent_dma_mask < DMA_BIT_MASK(32) &&
-   !(gfp & GFP_DMA)) {
+   phys_mask < DMA_BIT_MASK(32) && !(gfp & GFP_DMA)) {
gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
goto again;
}
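
A worked pass through the new helper, assuming no DMA offset and the
default ARCH_ZONE_DMA_BITS of 24:

    u64 phys_mask;
    gfp_t gfp = __dma_direct_optimal_gfp_mask(dev, DMA_BIT_MASK(30),
                                              &phys_mask);
    /* phys_mask == 0x3fffffff: above DMA_BIT_MASK(24), so not GFP_DMA,
     * but within DMA_BIT_MASK(32), so the helper returns GFP_DMA32. */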



Re: [PATCH 2/5] dma-direct: add an explicit dma_direct_get_required_mask

2018-09-27 Thread Robin Murphy

On 20/09/18 19:52, Christoph Hellwig wrote:

This is somewhat modelled after the powerpc version, and differs from
the legacy fallback in using fls64 instead of pointlessly splitting the
address into low and high dwords, and in that it takes (__)phys_to_dma
into account.

Signed-off-by: Christoph Hellwig 
---
  include/linux/dma-direct.h |  1 +
  kernel/dma/direct.c| 21 ++---
  2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/include/linux/dma-direct.h b/include/linux/dma-direct.h
index 86a59ba5a7f3..b79496d8c75b 100644
--- a/include/linux/dma-direct.h
+++ b/include/linux/dma-direct.h
@@ -55,6 +55,7 @@ static inline void dma_mark_clean(void *addr, size_t size)
  }
  #endif /* CONFIG_ARCH_HAS_DMA_MARK_CLEAN */
  
+u64 dma_direct_get_required_mask(struct device *dev);

  void *dma_direct_alloc(struct device *dev, size_t size, dma_addr_t 
*dma_handle,
gfp_t gfp, unsigned long attrs);
  void dma_direct_free(struct device *dev, size_t size, void *cpu_addr,
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index c954f0a6dc62..81b73a5bba54 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -53,11 +53,25 @@ check_addr(struct device *dev, dma_addr_t dma_addr, size_t 
size,
return true;
  }
  
+static inline dma_addr_t phys_to_dma_direct(struct device *dev,

+   phys_addr_t phys)
+{
+   if (force_dma_unencrypted())
+   return __phys_to_dma(dev, phys);
+   return phys_to_dma(dev, phys);
+}
+
+u64 dma_direct_get_required_mask(struct device *dev)
+{
+   u64 max_dma = phys_to_dma_direct(dev, (max_pfn - 1) << PAGE_SHIFT);
+
+   return (1ULL << (fls64(max_dma) - 1)) * 2 - 1;


I think that may as well just use __fls64() - it seems reasonable to 
assume max_dma > 0. Otherwise,


Reviewed-by: Robin Murphy 


+}
+
  static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size)
  {
-   dma_addr_t addr = force_dma_unencrypted() ?
-   __phys_to_dma(dev, phys) : phys_to_dma(dev, phys);
-   return addr + size - 1 <= dev->coherent_dma_mask;
+   return phys_to_dma_direct(dev, phys) + size - 1 <=
+   dev->coherent_dma_mask;
  }
  
  void *dma_direct_alloc_pages(struct device *dev, size_t size,

@@ -296,6 +310,7 @@ const struct dma_map_ops dma_direct_ops = {
.unmap_page = dma_direct_unmap_page,
.unmap_sg   = dma_direct_unmap_sg,
  #endif
+   .get_required_mask  = dma_direct_get_required_mask,
.dma_supported  = dma_direct_supported,
.mapping_error  = dma_direct_mapping_error,
.cache_sync = arch_dma_cache_sync,



Re: [PATCH 5/5] dma-direct: always allow dma mask <= physical memory size

2018-09-27 Thread Christoph Hellwig
On Thu, Sep 27, 2018 at 11:50:14AM +1000, Benjamin Herrenschmidt wrote:
> > -* to be able to satisfy them - either by not supporting more physical
> > -* memory, or by providing a ZONE_DMA32.  If neither is the case, the
> > -* architecture needs to use an IOMMU instead of the direct mapping.
> > -*/
> > -   if (mask < phys_to_dma(dev, DMA_BIT_MASK(32)))
> > +   u64 min_mask;
> > +
> > +   if (IS_ENABLED(CONFIG_ZONE_DMA))
> > +   min_mask = DMA_BIT_MASK(ARCH_ZONE_DMA_BITS);
> > +   else
> > +   min_mask = min_t(u64, DMA_BIT_MASK(32),
> > +(max_pfn - 1) << PAGE_SHIFT);
> > +
> > +   if (mask >= phys_to_dma(dev, min_mask))
> > return 0;
> 
> nitpick ... to be completely "correct", I would have written
> 
>   if (IS_ENABLED(CONFIG_ZONE_DMA))
>   min_mask = DMA_BIT_MASK(ARCH_ZONE_DMA_BITS);
>   else
>   min_mask = DMA_BIT_MASK(32);
> 
>   min_mask = min_t(u64, min_mask, (max_pfn - 1) << PAGE_SHIFT);
> 
> In "theory" it's also ok to have a mask < ZONE_DMA_BITS as long as it's
> big enough to fit all memory :-)

Yeah, we could do that.


Re: [PATCH 3/5] dma-direct: refine dma_direct_alloc zone selection

2018-09-27 Thread Christoph Hellwig
On Thu, Sep 27, 2018 at 11:45:15AM +1000, Benjamin Herrenschmidt wrote:
> I'm not sure this is entirely right.
> 
> Let's say the mask is 30 bits. You will return GFP_DMA32, which will
> fail if you allocate something above 1G (which is legit for
> ZONE_DMA32).

And then we will try GFP_DMA further down in the function:

if (IS_ENABLED(CONFIG_ZONE_DMA) &&
dev->coherent_dma_mask < DMA_BIT_MASK(32) &&
!(gfp & GFP_DMA)) {
gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
goto again;
}

This is an old optimization from x86, because chances are high that
GFP_DMA32 will give you suitable memory for the infamous 31-bit
dma mask devices (at least at boot time) and thus we don't have
to deplete the tiny ZONE_DMA pool.
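
Condensed from the function under discussion, the retry shape for such a
30-bit mask is roughly:

    again:
        page = alloc_pages_node(dev_to_node(dev), gfp, page_order);
        if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
            __free_pages(page, page_order);
            page = NULL;
            if (IS_ENABLED(CONFIG_ZONE_DMA) && !(gfp & GFP_DMA)) {
                gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
                goto again;     /* only now touch ZONE_DMA */
            }
        }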


Re: [PATCH v3 1/9] powerpc: build .dtb files in dts directory

2018-09-27 Thread Rob Herring
On Mon, Sep 10, 2018 at 10:04 AM Rob Herring  wrote:
>
> Align powerpc with other architectures which build the dtb files in the
> same directory as the dts files. This is also in line with most other
> build targets which are located in the same directory as the source.
> This move will help enable the 'dtbs' target which builds all the dtbs
> regardless of kernel config.
>
> This transition could break some scripts if they expect dtb files in the
> old location.
>
> Cc: Benjamin Herrenschmidt 
> Cc: Paul Mackerras 
> Cc: Michael Ellerman 
> Cc: linuxppc-dev@lists.ozlabs.org
> Signed-off-by: Rob Herring 
> ---
> v3:
>  - Remove duplicate mpc5200 dtbs from image-y targets. The dtb target already
>comes from the cuImage.% target.

Do the PPC folks have any comments on this?

Rob

>
>  arch/powerpc/Makefile  |  2 +-
>  arch/powerpc/boot/Makefile | 55 --
>  arch/powerpc/boot/dts/Makefile |  1 +
>  3 files changed, 28 insertions(+), 30 deletions(-)
>  create mode 100644 arch/powerpc/boot/dts/Makefile
>
> diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
> index 11a1acba164a..53ea887eb34e 100644
> --- a/arch/powerpc/Makefile
> +++ b/arch/powerpc/Makefile
> @@ -294,7 +294,7 @@ bootwrapper_install:
> $(Q)$(MAKE) $(build)=$(boot) $(patsubst %,$(boot)/%,$@)
>
>  %.dtb: scripts
> -   $(Q)$(MAKE) $(build)=$(boot) $(patsubst %,$(boot)/%,$@)
> +   $(Q)$(MAKE) $(build)=$(boot)/dts $(patsubst %,$(boot)/dts/%,$@)
>
>  # Used to create 'merged defconfigs'
>  # To use it $(call) it with the first argument as the base defconfig
> diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile
> index 0fb96c26136f..bca5c23767df 100644
> --- a/arch/powerpc/boot/Makefile
> +++ b/arch/powerpc/boot/Makefile
> @@ -304,9 +304,9 @@ image-$(CONFIG_PPC_ADDER875)+= 
> cuImage.adder875-uboot \
>dtbImage.adder875-redboot
>
>  # Board ports in arch/powerpc/platform/52xx/Kconfig
> -image-$(CONFIG_PPC_LITE5200)   += cuImage.lite5200 lite5200.dtb
> -image-$(CONFIG_PPC_LITE5200)   += cuImage.lite5200b lite5200b.dtb
> -image-$(CONFIG_PPC_MEDIA5200)  += cuImage.media5200 media5200.dtb
> +image-$(CONFIG_PPC_LITE5200)   += cuImage.lite5200
> +image-$(CONFIG_PPC_LITE5200)   += cuImage.lite5200b
> +image-$(CONFIG_PPC_MEDIA5200)  += cuImage.media5200
>
>  # Board ports in arch/powerpc/platform/82xx/Kconfig
>  image-$(CONFIG_MPC8272_ADS)+= cuImage.mpc8272ads
> @@ -381,11 +381,11 @@ $(addprefix $(obj)/, $(sort $(filter zImage.%, 
> $(image-y: vmlinux $(wrapperb
> $(call if_changed,wrap,$(subst $(obj)/zImage.,,$@))
>
>  # dtbImage% - a dtbImage is a zImage with an embedded device tree blob
> -$(obj)/dtbImage.initrd.%: vmlinux $(wrapperbits) $(obj)/%.dtb FORCE
> -   $(call if_changed,wrap,$*,,$(obj)/$*.dtb,$(obj)/ramdisk.image.gz)
> +$(obj)/dtbImage.initrd.%: vmlinux $(wrapperbits) $(obj)/dts/%.dtb FORCE
> +   $(call if_changed,wrap,$*,,$(obj)/dts/$*.dtb,$(obj)/ramdisk.image.gz)
>
> -$(obj)/dtbImage.%: vmlinux $(wrapperbits) $(obj)/%.dtb FORCE
> -   $(call if_changed,wrap,$*,,$(obj)/$*.dtb)
> +$(obj)/dtbImage.%: vmlinux $(wrapperbits) $(obj)/dts/%.dtb FORCE
> +   $(call if_changed,wrap,$*,,$(obj)/dts/$*.dtb)
>
>  # This cannot be in the root of $(src) as the zImage rule always adds a 
> $(obj)
>  # prefix
> @@ -395,36 +395,33 @@ $(obj)/vmlinux.strip: vmlinux
>  $(obj)/uImage: vmlinux $(wrapperbits) FORCE
> $(call if_changed,wrap,uboot)
>
> -$(obj)/uImage.initrd.%: vmlinux $(obj)/%.dtb $(wrapperbits) FORCE
> -   $(call 
> if_changed,wrap,uboot-$*,,$(obj)/$*.dtb,$(obj)/ramdisk.image.gz)
> +$(obj)/uImage.initrd.%: vmlinux $(obj)/dts/%.dtb $(wrapperbits) FORCE
> +   $(call 
> if_changed,wrap,uboot-$*,,$(obj)/dts/$*.dtb,$(obj)/ramdisk.image.gz)
>
> -$(obj)/uImage.%: vmlinux $(obj)/%.dtb $(wrapperbits) FORCE
> -   $(call if_changed,wrap,uboot-$*,,$(obj)/$*.dtb)
> +$(obj)/uImage.%: vmlinux $(obj)/dts/%.dtb $(wrapperbits) FORCE
> +   $(call if_changed,wrap,uboot-$*,,$(obj)/dts/$*.dtb)
>
> -$(obj)/cuImage.initrd.%: vmlinux $(obj)/%.dtb $(wrapperbits) FORCE
> -   $(call 
> if_changed,wrap,cuboot-$*,,$(obj)/$*.dtb,$(obj)/ramdisk.image.gz)
> +$(obj)/cuImage.initrd.%: vmlinux $(obj)/dts/%.dtb $(wrapperbits) FORCE
> +   $(call 
> if_changed,wrap,cuboot-$*,,$(obj)/dts/$*.dtb,$(obj)/ramdisk.image.gz)
>
> -$(obj)/cuImage.%: vmlinux $(obj)/%.dtb $(wrapperbits) FORCE
> -   $(call if_changed,wrap,cuboot-$*,,$(obj)/$*.dtb)
> +$(obj)/cuImage.%: vmlinux $(obj)/dts/%.dtb $(wrapperbits) FORCE
> +   $(call if_changed,wrap,cuboot-$*,,$(obj)/dts/$*.dtb)
>
> -$(obj)/simpleImage.initrd.%: vmlinux $(obj)/%.dtb $(wrapperbits) FORCE
> -   $(call 
> if_changed,wrap,simpleboot-$*,,$(obj)/$*.dtb,$(obj)/ramdisk.image.gz)
> +$(obj)/simpleImage.initrd.%: vmlinux $(obj)/dts/%.dtb $(wrapperbits) FORCE
> +   $(call 
> if_chang

Re: [PATCH -next] PCI: hotplug: Use kmemdup rather than duplicating its implementation in pnv_php_add_devtree()

2018-09-27 Thread Michael Ellerman
YueHaibing  writes:

> Use kmemdup rather than duplicating its implementation
>
> Signed-off-by: YueHaibing 
> ---
>  drivers/pci/hotplug/pnv_php.c | 3 +--
>  1 file changed, 1 insertion(+), 2 deletions(-)
>
> diff --git a/drivers/pci/hotplug/pnv_php.c b/drivers/pci/hotplug/pnv_php.c
> index 5070620..ee54f5b 100644
> --- a/drivers/pci/hotplug/pnv_php.c
> +++ b/drivers/pci/hotplug/pnv_php.c
> @@ -275,14 +275,13 @@ static int pnv_php_add_devtree(struct pnv_php_slot 
> *php_slot)
>   goto free_fdt1;
>   }
>  
> - fdt = kzalloc(fdt_totalsize(fdt1), GFP_KERNEL);
> + fdt = kmemdup(fdt1, fdt_totalsize(fdt1), GFP_KERNEL);
>   if (!fdt) {
>   ret = -ENOMEM;
>   goto free_fdt1;
>   }
>  
>   /* Unflatten device tree blob */
> - memcpy(fdt, fdt1, fdt_totalsize(fdt1));
>   dt = of_fdt_unflatten_tree(fdt, php_slot->dn, NULL);
>   if (!dt) {
>   ret = -EINVAL;

Looks correct to me.

Acked-by: Michael Ellerman 


cheers


Re: [PATCH v4 1/2] powerpc/32: add stack protector support

2018-09-27 Thread Christophe LEROY




Le 27/09/2018 à 09:45, Segher Boessenkool a écrit :

On Thu, Sep 27, 2018 at 08:20:00AM +0200, Christophe LEROY wrote:

Le 26/09/2018 à 21:16, Segher Boessenkool a écrit :

On Wed, Sep 26, 2018 at 11:40:38AM +, Christophe Leroy wrote:

+static __always_inline void boot_init_stack_canary(void)
+{
+   unsigned long canary;
+
+   /* Try to get a semi random initial value. */
+   get_random_bytes(&canary, sizeof(canary));
+   canary ^= mftb();
+   canary ^= LINUX_VERSION_CODE;
+
+   current->stack_canary = canary;
+}


I still think you should wait until there is entropy available.  You
haven't answered my questions about that (or I didn't see them): what
does the kernel do in other similar cases?

Looks great otherwise!


What do you mean by 'other similar cases' ? All arches have similar
boot_init_stack_canary().


Yes, those, and other things that want entropy early.


Functions add_latent_entropy() and add_device_randomness() are
there for that it seems, although it is not real entropy.




x86 uses rdtsc() which is equivalent to our
mftb(). Most arches xor it with LINUX_VERSION_CODE.

The issue is that it is called very early in start_kernel(), however
they try to set some entropy anyway:

boot_cpu_init();
page_address_init();
pr_notice("%s", linux_banner);
setup_arch(&command_line);
/*
 * Set up the the initial canary and entropy after arch
 * and after adding latent and command line entropy.
 */
add_latent_entropy();
add_device_randomness(command_line, strlen(command_line));
boot_init_stack_canary();

Apparently, it is too early for calling wait_for_random_bytes(), see below.


Hrm.  Too early to call wait_event_interruptible?  From there it went
into schedule(), which blew up.  Well you say we have only one context
at this point, so that is not too surprising then :-)


However this is the canary for initial startup only. Only idle() still
uses this canary once the system is running. A new canary is set for any
new forked task.


Ah, that makes things a lot better!  Do those new tasks get a canary
from something with sufficient entropy though?


For the kernel threads that are started early, probably not. For the 
ones started a bit later, and for user processes, I believe they have 
better entropy. Anyway, all this is handled by the kernel core and is
out of the control of individual arches, as it's done in kernel/fork.c in
dup_task_struct(). However, that function is declared as
static __latent_entropy struct task_struct *copy_process(). This
__latent_entropy attribute must help in a way.





Maybe the idle canary should be updated later once there is more entropy


That is tricky to do, but sure, if you can, that should help.


? Today there is a new call to boot_init_stack_canary() in
cpu_startup_entry(), but it is enclosed inside #ifdef CONFIG_X86.


It needs to know the details of how ssp works on each platform.


Well, that could be for another patch in the future. That's probably 
feasible on x86 and PPC because they both use TLS guard, but not for 
other arches where the guard is global at the moment. So we'll have to 
do it carefully.


I agree with you that we may not, for the time being, get all the
expected security against hackers out of it due to that limited entropy,
but my main concern right now is to deter developers' bugs clobbering
the stack, and for that any non-trivial canary should do, shouldn't it?


Christophe
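
For context on the fork path mentioned above, dup_task_struct() in
kernel/fork.c of kernels of this vintage assigns each new task its own
canary, roughly (paraphrased, not a verbatim quote):

    #ifdef CONFIG_STACKPROTECTOR
        tsk->stack_canary = get_random_canary();
    #endif

so the quality of those canaries tracks whatever entropy the RNG has
accumulated by the time the task is forked.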


Re: [PATCH] kdb: use correct pointer when 'btc' calls 'btt'

2018-09-27 Thread Christophe LEROY




Le 27/09/2018 à 13:09, Michael Ellerman a écrit :

Christophe LEROY  writes:

Le 26/09/2018 à 13:11, Daniel Thompson a écrit :

On 16/09/2018 20:06, Daniel Thompson wrote:

On Fri, Sep 14, 2018 at 12:35:44PM +, Christophe Leroy wrote:

On a powerpc 8xx, 'btc' fails as follows:
Entering kdb (current=0x(ptrval), pid 282) due to Keyboard Entry

...


Signed-off-by: Christophe Leroy 
Cc:  # 4.15+


Would a Fixes: be better here?
Fixes: ad67b74d2469d9b82 ("printk: hash addresses printed with %p")


Christophe, When you add the Fixes: could you also add my

Reviewed-by: Daniel Thompson 


Ok, thanks for the review, but do I have to do anything really ?

The Fixes: and now your Reviewed-by: appear automatically in patchwork
(https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=65715),
so I believe they'll be automatically included when Jason or someone
else takes the patch, no ?


patchwork won't add the Fixes tag from the reply, it needs to be in the
original mail.

See:
   https://github.com/getpatchwork/patchwork/issues/151



Ok, so patchwork counts it and shows a '1' in the F column of the
patches list, but won't actually pick the tag up.


Then I'll send a v2 with revised commit text.

Christophe


Re: [PATCH] kdb: use correct pointer when 'btc' calls 'btt'

2018-09-27 Thread Michael Ellerman
Christophe LEROY  writes:
> Le 26/09/2018 à 13:11, Daniel Thompson a écrit :
>> On 16/09/2018 20:06, Daniel Thompson wrote:
>>> On Fri, Sep 14, 2018 at 12:35:44PM +, Christophe Leroy wrote:
 On a powerpc 8xx, 'btc' fails as follows:
 Entering kdb (current=0x(ptrval), pid 282) due to Keyboard Entry
...

 Signed-off-by: Christophe Leroy 
 Cc:  # 4.15+
>>>
>>> Would a Fixes: be better here?
>>> Fixes: ad67b74d2469d9b82 ("printk: hash addresses printed with %p")
>> 
>> Christophe, When you add the Fixes: could you also add my
>> 
>> Reviewed-by: Daniel Thompson 
>
> Ok, thanks for the review, but do I have to do anything really ?
>
> The Fixes: and now your Reviewed-by: appear automatically in patchwork 
> (https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=65715), 
> so I believe they'll be automatically included when Jason or someone 
> else takes the patch, no ?

patchwork won't add the Fixes tag from the reply, it needs to be in the
original mail.

See:
  https://github.com/getpatchwork/patchwork/issues/151


cheers


[PATCH v3 6/6] memory-hotplug.txt: Add some details about locking internals

2018-09-27 Thread David Hildenbrand
Let's document the magic a bit, especially why device_hotplug_lock is
required when adding/removing memory and how it all plays together with
requests to online/offline memory from user space.

Cc: Jonathan Corbet 
Cc: Michal Hocko 
Cc: Andrew Morton 
Reviewed-by: Pavel Tatashin 
Reviewed-by: Rashmica Gupta 
Signed-off-by: David Hildenbrand 
---
 Documentation/memory-hotplug.txt | 42 +++-
 1 file changed, 41 insertions(+), 1 deletion(-)

diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt
index 7f49ebf3ddb2..ce4faa5530fa 100644
--- a/Documentation/memory-hotplug.txt
+++ b/Documentation/memory-hotplug.txt
@@ -3,7 +3,7 @@ Memory Hotplug
 ==
 
 :Created:  Jul 28 2007
-:Updated: Add description of notifier of memory hotplug:   Oct 11 2007
+:Updated: Add some details about locking internals:Aug 20 2018
 
 This document is about memory hotplug including how-to-use and current status.
 Because Memory Hotplug is still under development, contents of this text will
@@ -495,6 +495,46 @@ further processing of the notification queue.
 
 NOTIFY_STOP stops further processing of the notification queue.
 
+
+Locking Internals
+=
+
+When adding/removing memory that uses memory block devices (i.e. ordinary RAM),
+the device_hotplug_lock should be held to:
+
+- synchronize against online/offline requests (e.g. via sysfs). This way, 
memory
+  block devices can only be accessed (.online/.state attributes) by user
+  space once memory has been fully added. And when removing memory, we
+  know nobody is in critical sections.
+- synchronize against CPU hotplug and similar (e.g. relevant for ACPI and PPC)
+
+Especially, there is a possible lock inversion that is avoided using
+device_hotplug_lock when adding memory and user space tries to online that
+memory faster than expected:
+
+- device_online() will first take the device_lock(), followed by
+  mem_hotplug_lock
+- add_memory_resource() will first take the mem_hotplug_lock, followed by
+  the device_lock() (while creating the devices, during bus_add_device()).
+
+As the device is visible to user space before taking the device_lock(), this
+can result in a lock inversion.
+
+onlining/offlining of memory should be done via device_online()/
+device_offline() - to make sure it is properly synchronized to actions
+via sysfs. Holding device_hotplug_lock is advised (to e.g. protect online_type)
+
+When adding/removing/onlining/offlining memory or adding/removing
+heterogeneous/device memory, we should always hold the mem_hotplug_lock in
+write mode to serialise memory hotplug (e.g. access to global/zone
+variables).
+
+In addition, mem_hotplug_lock (in contrast to device_hotplug_lock) in read
+mode allows for a quite efficient get_online_mems/put_online_mems
+implementation, so code accessing memory can protect from that memory
+vanishing.
+
+
 Future Work
 ===
 
-- 
2.17.1
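
Condensing the documented rules into the expected nesting - a sketch, not
verbatim kernel code:

    lock_device_hotplug();      /* serialize against sysfs online/offline */
    mem_hotplug_begin();        /* take mem_hotplug_lock in write mode */
    /* ... add/remove or online/offline the memory ... */
    mem_hotplug_done();
    unlock_device_hotplug();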



[PATCH v3 5/6] powerpc/powernv: hold device_hotplug_lock when calling memtrace_offline_pages()

2018-09-27 Thread David Hildenbrand
Let's perform all checking + offlining + removing under
device_hotplug_lock, so nobody can mess with these devices via
sysfs concurrently.

Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Michael Ellerman 
Cc: Rashmica Gupta 
Cc: Balbir Singh 
Cc: Michael Neuling 
Reviewed-by: Pavel Tatashin 
Reviewed-by: Rashmica Gupta 
Acked-by: Balbir Singh 
Signed-off-by: David Hildenbrand 
---
 arch/powerpc/platforms/powernv/memtrace.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/memtrace.c 
b/arch/powerpc/platforms/powernv/memtrace.c
index fdd48f1a39f7..84d038ed3882 100644
--- a/arch/powerpc/platforms/powernv/memtrace.c
+++ b/arch/powerpc/platforms/powernv/memtrace.c
@@ -70,6 +70,7 @@ static int change_memblock_state(struct memory_block *mem, 
void *arg)
return 0;
 }
 
+/* called with device_hotplug_lock held */
 static bool memtrace_offline_pages(u32 nid, u64 start_pfn, u64 nr_pages)
 {
u64 end_pfn = start_pfn + nr_pages - 1;
@@ -110,6 +111,7 @@ static u64 memtrace_alloc_node(u32 nid, u64 size)
/* Trace memory needs to be aligned to the size */
end_pfn = round_down(end_pfn - nr_pages, nr_pages);
 
+   lock_device_hotplug();
for (base_pfn = end_pfn; base_pfn > start_pfn; base_pfn -= nr_pages) {
if (memtrace_offline_pages(nid, base_pfn, nr_pages) == true) {
/*
@@ -118,7 +120,6 @@ static u64 memtrace_alloc_node(u32 nid, u64 size)
 * we never try to remove memory that spans two iomem
 * resources.
 */
-   lock_device_hotplug();
end_pfn = base_pfn + nr_pages;
for (pfn = base_pfn; pfn < end_pfn; pfn += bytes>> 
PAGE_SHIFT) {
__remove_memory(nid, pfn << PAGE_SHIFT, bytes);
@@ -127,6 +128,7 @@ static u64 memtrace_alloc_node(u32 nid, u64 size)
return base_pfn << PAGE_SHIFT;
}
}
+   unlock_device_hotplug();
 
return 0;
 }
-- 
2.17.1



[PATCH v3 4/6] powerpc/powernv: hold device_hotplug_lock when calling device_online()

2018-09-27 Thread David Hildenbrand
device_online() should be called with the device_hotplug_lock held.

Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Michael Ellerman 
Cc: Rashmica Gupta 
Cc: Balbir Singh 
Cc: Michael Neuling 
Reviewed-by: Pavel Tatashin 
Reviewed-by: Rashmica Gupta 
Signed-off-by: David Hildenbrand 
---
 arch/powerpc/platforms/powernv/memtrace.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/powerpc/platforms/powernv/memtrace.c 
b/arch/powerpc/platforms/powernv/memtrace.c
index 773623f6bfb1..fdd48f1a39f7 100644
--- a/arch/powerpc/platforms/powernv/memtrace.c
+++ b/arch/powerpc/platforms/powernv/memtrace.c
@@ -242,9 +242,11 @@ static int memtrace_online(void)
 * we need to online the memory ourselves.
 */
if (!memhp_auto_online) {
+   lock_device_hotplug();
walk_memory_range(PFN_DOWN(ent->start),
  PFN_UP(ent->start + ent->size - 1),
  NULL, online_mem_block);
+   unlock_device_hotplug();
}
 
/*
-- 
2.17.1



[PATCH v3 3/6] mm/memory_hotplug: fix online/offline_pages called w.o. mem_hotplug_lock

2018-09-27 Thread David Hildenbrand
There seem to be some problems as a result of 30467e0b3be ("mm, hotplug:
fix concurrent memory hot-add deadlock"), which tried to fix a possible
lock inversion reported and discussed in [1] due to the two locks
a) device_lock()
b) mem_hotplug_lock

While add_memory() first takes b), followed by a) during
bus_probe_device(), onlining of memory from user space first took a),
followed by b), exposing a possible deadlock.

In [1], it was decided not to make use of device_hotplug_lock, but
rather to enforce a locking order.

The problems I spotted related to this:

1. Memory block device attributes: While .state first calls
   mem_hotplug_begin() and then calls device_online() - which takes
   device_lock() - .online no longer calls mem_hotplug_begin(), so it
   effectively calls online_pages() without mem_hotplug_lock.

2. device_online() should be called under device_hotplug_lock, however
   onlining memory during add_memory() does not take care of that.

In addition, I think there is also something wrong about the locking in

3. arch/powerpc/platforms/powernv/memtrace.c calls offline_pages()
   without locks. This was introduced after 30467e0b3be. And skimming over
   the code, I assume it could use some more care with regard to locking
   (e.g. device_online() called without device_hotplug_lock). This will
   be addressed in the following patches.

Now that we hold the device_hotplug_lock when
- adding memory (e.g. via add_memory()/add_memory_resource())
- removing memory (e.g. via remove_memory())
- device_online()/device_offline()

We can move mem_hotplug_lock usage back into
online_pages()/offline_pages().

Why is mem_hotplug_lock still needed? Essentially to keep
get_online_mems()/put_online_mems() very fast (relying on
device_hotplug_lock would be very slow), and to serialize against
addition of memory that does not create memory block devices (HMM).

[1] http://driverdev.linuxdriverproject.org/pipermail/driverdev-devel/2015-February/065324.html

This patch is partly based on a patch by Vitaly Kuznetsov.

Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Michael Ellerman 
Cc: "Rafael J. Wysocki" 
Cc: Len Brown 
Cc: Greg Kroah-Hartman 
Cc: "K. Y. Srinivasan" 
Cc: Haiyang Zhang 
Cc: Stephen Hemminger 
Cc: Martin Schwidefsky 
Cc: Heiko Carstens 
Cc: Boris Ostrovsky 
Cc: Juergen Gross 
Cc: Rashmica Gupta 
Cc: Michael Neuling 
Cc: Balbir Singh 
Cc: Kate Stewart 
Cc: Thomas Gleixner 
Cc: Philippe Ombredanne 
Cc: Andrew Morton 
Cc: Michal Hocko 
Cc: Pavel Tatashin 
Cc: Vlastimil Babka 
Cc: Dan Williams 
Cc: Oscar Salvador 
Cc: YASUAKI ISHIMATSU 
Cc: Mathieu Malaterre 
Reviewed-by: Pavel Tatashin 
Reviewed-by: Rashmica Gupta 
Signed-off-by: David Hildenbrand 
---
 drivers/base/memory.c | 13 +
 mm/memory_hotplug.c   | 28 
 2 files changed, 21 insertions(+), 20 deletions(-)

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 40cac122ec73..0e5985682642 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -228,7 +228,6 @@ static bool pages_correctly_probed(unsigned long start_pfn)
 /*
  * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is
  * OK to have direct references to sparsemem variables in here.
- * Must already be protected by mem_hotplug_begin().
  */
 static int
 memory_block_action(unsigned long phys_index, unsigned long action, int 
online_type)
@@ -294,7 +293,6 @@ static int memory_subsys_online(struct device *dev)
if (mem->online_type < 0)
mem->online_type = MMOP_ONLINE_KEEP;
 
-   /* Already under protection of mem_hotplug_begin() */
ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE);
 
/* clear online_type */
@@ -341,19 +339,11 @@ store_mem_state(struct device *dev,
goto err;
}
 
-   /*
-* Memory hotplug needs to hold mem_hotplug_begin() for probe to find
-* the correct memory block to online before doing device_online(dev),
-* which will take dev->mutex.  Take the lock early to prevent an
-* inversion, memory_subsys_online() callbacks will be implemented by
-* assuming it's already protected.
-*/
-   mem_hotplug_begin();
-
switch (online_type) {
case MMOP_ONLINE_KERNEL:
case MMOP_ONLINE_MOVABLE:
case MMOP_ONLINE_KEEP:
+   /* mem->online_type is protected by device_hotplug_lock */
mem->online_type = online_type;
ret = device_online(&mem->dev);
break;
@@ -364,7 +354,6 @@ store_mem_state(struct device *dev,
ret = -EINVAL; /* should never happen */
}
 
-   mem_hotplug_done();
 err:
unlock_device_hotplug();
 
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index affb03e0dfef..d4c7e42e46f3 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -860,7 +860,6 @@ static struct zone * __meminit move_pfn_range(int 
online_type, int ni

[PATCH v3 2/6] mm/memory_hotplug: make add_memory() take the device_hotplug_lock

2018-09-27 Thread David Hildenbrand
add_memory() currently does not take the device_hotplug_lock; however, it
is already called under the lock from
arch/powerpc/platforms/pseries/hotplug-memory.c
drivers/acpi/acpi_memhotplug.c
to synchronize against CPU hot-remove and similar.

In general, we should hold the device_hotplug_lock when adding memory
to synchronize against online/offline requests (e.g. from user space) -
which already resulted in lock inversions due to device_lock() and
mem_hotplug_lock - see 30467e0b3be ("mm, hotplug: fix concurrent memory
hot-add deadlock"). add_memory()/add_memory_resource() will create memory
block devices, so this really feels like the right thing to do.

Holding the device_hotplug_lock makes sure that a memory block device
can really only be accessed (e.g. via .online/.state) from user space,
once the memory has been fully added to the system.

The lock is not held yet in
drivers/xen/balloon.c
arch/powerpc/platforms/powernv/memtrace.c
drivers/s390/char/sclp_cmd.c
drivers/hv/hv_balloon.c
So, let's either use the locked variants or take the lock.

Don't export add_memory_resource(), as it once was exported to be used
by XEN, which is never built as a module. If somebody requires it, we
also have to export a locked variant (as device_hotplug_lock is never
exported).

Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Michael Ellerman 
Cc: "Rafael J. Wysocki" 
Cc: Len Brown 
Cc: Greg Kroah-Hartman 
Cc: Boris Ostrovsky 
Cc: Juergen Gross 
Cc: Nathan Fontenot 
Cc: John Allen 
Cc: Andrew Morton 
Cc: Michal Hocko 
Cc: Dan Williams 
Cc: Joonsoo Kim 
Cc: Vlastimil Babka 
Cc: Oscar Salvador 
Cc: Mathieu Malaterre 
Cc: Pavel Tatashin 
Cc: YASUAKI ISHIMATSU 
Reviewed-by: Pavel Tatashin 
Reviewed-by: Rafael J. Wysocki 
Reviewed-by: Rashmica Gupta 
Signed-off-by: David Hildenbrand 
---
 .../platforms/pseries/hotplug-memory.c|  2 +-
 drivers/acpi/acpi_memhotplug.c|  2 +-
 drivers/base/memory.c |  9 ++--
 drivers/xen/balloon.c |  3 +++
 include/linux/memory_hotplug.h|  1 +
 mm/memory_hotplug.c   | 22 ---
 6 files changed, 32 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
b/arch/powerpc/platforms/pseries/hotplug-memory.c
index dd0264c43f3e..d26a771d985e 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -673,7 +673,7 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb)
nid = memory_add_physaddr_to_nid(lmb->base_addr);
 
/* Add the memory */
-   rc = add_memory(nid, lmb->base_addr, block_sz);
+   rc = __add_memory(nid, lmb->base_addr, block_sz);
if (rc) {
invalidate_lmb_associativity_index(lmb);
return rc;
diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index 811148415993..8fe0960ea572 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -228,7 +228,7 @@ static int acpi_memory_enable_device(struct 
acpi_memory_device *mem_device)
if (node < 0)
node = memory_add_physaddr_to_nid(info->start_addr);
 
-   result = add_memory(node, info->start_addr, info->length);
+   result = __add_memory(node, info->start_addr, info->length);
 
/*
 * If the memory block has been used by the kernel, add_memory()
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 817320c7c4c1..40cac122ec73 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -519,15 +519,20 @@ memory_probe_store(struct device *dev, struct 
device_attribute *attr,
if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
return -EINVAL;
 
+   ret = lock_device_hotplug_sysfs();
+   if (ret)
+   goto out;
+
nid = memory_add_physaddr_to_nid(phys_addr);
-   ret = add_memory(nid, phys_addr,
-MIN_MEMORY_BLOCK_SIZE * sections_per_block);
+   ret = __add_memory(nid, phys_addr,
+  MIN_MEMORY_BLOCK_SIZE * sections_per_block);
 
if (ret)
goto out;
 
ret = count;
 out:
+   unlock_device_hotplug();
return ret;
 }
 
diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c
index a3f5cbfcd4a1..fdfc64f5acea 100644
--- a/drivers/xen/balloon.c
+++ b/drivers/xen/balloon.c
@@ -395,7 +395,10 @@ static enum bp_state reserve_additional_memory(void)
 * callers drop the mutex before trying again.
 */
mutex_unlock(&balloon_mutex);
+   /* add_memory_resource() requires the device_hotplug lock */
+   lock_device_hotplug();
rc = add_memory_resource(nid, resource, memhp_auto_online);
+   unlock_device_hotplug();
mutex_lock(&balloon_mutex);
 
if (rc) {
diff --git a/include/linux/mem

[PATCH v3 1/6] mm/memory_hotplug: make remove_memory() take the device_hotplug_lock

2018-09-27 Thread David Hildenbrand
remove_memory() is exported right now but requires the
device_hotplug_lock, which is not exported. So let's provide a variant
that takes the lock and only export that one.

The lock is already held in
arch/powerpc/platforms/pseries/hotplug-memory.c
drivers/acpi/acpi_memhotplug.c
arch/powerpc/platforms/powernv/memtrace.c

Apart from that, there are no other users in the tree.

Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Michael Ellerman 
Cc: "Rafael J. Wysocki" 
Cc: Len Brown 
Cc: Rashmica Gupta 
Cc: Michael Neuling 
Cc: Balbir Singh 
Cc: Nathan Fontenot 
Cc: John Allen 
Cc: Andrew Morton 
Cc: Michal Hocko 
Cc: Dan Williams 
Cc: Joonsoo Kim 
Cc: Vlastimil Babka 
Cc: Pavel Tatashin 
Cc: Greg Kroah-Hartman 
Cc: Oscar Salvador 
Cc: YASUAKI ISHIMATSU 
Cc: Mathieu Malaterre 
Reviewed-by: Pavel Tatashin 
Reviewed-by: Rafael J. Wysocki 
Reviewed-by: Rashmica Gupta 
Signed-off-by: David Hildenbrand 
---
 arch/powerpc/platforms/powernv/memtrace.c   | 2 +-
 arch/powerpc/platforms/pseries/hotplug-memory.c | 6 +++---
 drivers/acpi/acpi_memhotplug.c  | 2 +-
 include/linux/memory_hotplug.h  | 3 ++-
 mm/memory_hotplug.c | 9 -
 5 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/memtrace.c 
b/arch/powerpc/platforms/powernv/memtrace.c
index a29fdf8a2e56..773623f6bfb1 100644
--- a/arch/powerpc/platforms/powernv/memtrace.c
+++ b/arch/powerpc/platforms/powernv/memtrace.c
@@ -121,7 +121,7 @@ static u64 memtrace_alloc_node(u32 nid, u64 size)
lock_device_hotplug();
end_pfn = base_pfn + nr_pages;
for (pfn = base_pfn; pfn < end_pfn; pfn += bytes>> 
PAGE_SHIFT) {
-   remove_memory(nid, pfn << PAGE_SHIFT, bytes);
+   __remove_memory(nid, pfn << PAGE_SHIFT, bytes);
}
unlock_device_hotplug();
return base_pfn << PAGE_SHIFT;
diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 9a15d39995e5..dd0264c43f3e 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -305,7 +305,7 @@ static int pseries_remove_memblock(unsigned long base, 
unsigned int memblock_siz
nid = memory_add_physaddr_to_nid(base);
 
for (i = 0; i < sections_per_block; i++) {
-   remove_memory(nid, base, MIN_MEMORY_BLOCK_SIZE);
+   __remove_memory(nid, base, MIN_MEMORY_BLOCK_SIZE);
base += MIN_MEMORY_BLOCK_SIZE;
}
 
@@ -394,7 +394,7 @@ static int dlpar_remove_lmb(struct drmem_lmb *lmb)
block_sz = pseries_memory_block_size();
nid = memory_add_physaddr_to_nid(lmb->base_addr);
 
-   remove_memory(nid, lmb->base_addr, block_sz);
+   __remove_memory(nid, lmb->base_addr, block_sz);
 
/* Update memory regions for memory remove */
memblock_remove(lmb->base_addr, block_sz);
@@ -681,7 +681,7 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb)
 
rc = dlpar_online_lmb(lmb);
if (rc) {
-   remove_memory(nid, lmb->base_addr, block_sz);
+   __remove_memory(nid, lmb->base_addr, block_sz);
invalidate_lmb_associativity_index(lmb);
} else {
lmb->flags |= DRCONF_MEM_ASSIGNED;
diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index 6b0d3ef7309c..811148415993 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -282,7 +282,7 @@ static void acpi_memory_remove_memory(struct 
acpi_memory_device *mem_device)
nid = memory_add_physaddr_to_nid(info->start_addr);
 
acpi_unbind_memory_blocks(info);
-   remove_memory(nid, info->start_addr, info->length);
+   __remove_memory(nid, info->start_addr, info->length);
list_del(&info->list);
kfree(info);
}
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 34a28227068d..1f096852f479 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -301,6 +301,7 @@ extern bool is_mem_section_removable(unsigned long pfn, 
unsigned long nr_pages);
 extern void try_offline_node(int nid);
 extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
 extern void remove_memory(int nid, u64 start, u64 size);
+extern void __remove_memory(int nid, u64 start, u64 size);
 
 #else
 static inline bool is_mem_section_removable(unsigned long pfn,
@@ -317,6 +318,7 @@ static inline int offline_pages(unsigned long start_pfn, 
unsigned long nr_pages)
 }
 
 static inline void remove_memory(int nid, u64 start, u64 size) {}
+static inline void __remove_memory(int nid, u64 start, u64 size) {}
 #endif /* CONFIG_MEMORY_HOT

[PATCH v3 0/6] mm: online/offline_pages called w.o. mem_hotplug_lock

2018-09-27 Thread David Hildenbrand
@Andrew, Only patch #5 changed (see change notes below). Thanks!


Reading through the code and studying how mem_hotplug_lock is to be used,
I noticed that there are two places where we can end up calling
device_online()/device_offline() - online_pages()/offline_pages() without
the mem_hotplug_lock. And there are other places where we call
device_online()/device_offline() without the device_hotplug_lock.

While e.g.
echo "online" > /sys/devices/system/memory/memory9/state
is fine, e.g.
echo 1 > /sys/devices/system/memory/memory9/online
will not take the mem_hotplug_lock. It does, however, take the device_lock()
and device_hotplug_lock.

E.g. via memory_probe_store(), we can end up calling
add_memory()->online_pages() without the device_hotplug_lock. So we can
have concurrent callers in online_pages(), where we then touch, e.g.,
zone->present_pages essentially unprotected.

Looks like there is a longer history to that (see Patch #2 for details),
and fixing it to work the way it was intended is not really possible. We
would e.g. have to take the mem_hotplug_lock in drivers/base/core.c, which
sounds wrong.

Summary: We had a lock inversion on mem_hotplug_lock and device_lock().
More details can be found in patch 3 and patch 6.

I propose the general rules (documentation added in patch 6):

1. add_memory/add_memory_resource() must only be called with
   device_hotplug_lock.
2. remove_memory() must only be called with device_hotplug_lock. This is
   already documented and holds for all callers.
3. device_online()/device_offline() must only be called with
   device_hotplug_lock. This is already documented and true for now in core
   code. Other callers (related to memory hotplug) have to be fixed up.
4. mem_hotplug_lock is taken inside of add_memory/remove_memory/
   online_pages/offline_pages.

To me, this looks way cleaner than what we have right now (and easier to
verify). And looking at the documentation of remove_memory, using
lock_device_hotplug also for add_memory() feels natural.


v2 -> v3:
- Take device_hotplug_lock outside of loop in patch #5
- Added Ack to patch #5

v1 -> v2:
- Upstream changes in powerpc/powernv code required modifications to
  patch #1, #4 and #5.
- Minor patch description changes.
- Added more locking details in patch #6.
- Added rb's

RFCv2 -> v1:
- Dropped an unnecessary _ref from remove_memory() in patch #1
- Minor patch description fixes.
- Added rb's

RFC -> RFCv2:
- Don't export device_hotplug_lock, provide proper remove_memory/add_memory
  wrappers.
- Split up the patches a bit.
- Try to improve powernv memtrace locking
- Add some documentation for locking that matches my knowledge

David Hildenbrand (6):
  mm/memory_hotplug: make remove_memory() take the device_hotplug_lock
  mm/memory_hotplug: make add_memory() take the device_hotplug_lock
  mm/memory_hotplug: fix online/offline_pages called w.o.
mem_hotplug_lock
  powerpc/powernv: hold device_hotplug_lock when calling device_online()
  powerpc/powernv: hold device_hotplug_lock when calling
memtrace_offline_pages()
  memory-hotplug.txt: Add some details about locking internals

 Documentation/memory-hotplug.txt  | 42 -
 arch/powerpc/platforms/powernv/memtrace.c |  8 ++-
 .../platforms/pseries/hotplug-memory.c|  8 +--
 drivers/acpi/acpi_memhotplug.c|  4 +-
 drivers/base/memory.c | 22 +++
 drivers/xen/balloon.c |  3 +
 include/linux/memory_hotplug.h|  4 +-
 mm/memory_hotplug.c   | 59 +++
 8 files changed, 114 insertions(+), 36 deletions(-)

-- 
2.17.1



[PATCH 2/2] powerpc/pseries: Fix how we iterate over the DTL entries

2018-09-27 Thread Naveen N. Rao
When CONFIG_VIRT_CPU_ACCOUNTING_NATIVE is not set, we look up dtl_idx in
the lppaca to determine the number of entries in the buffer. Since
the lppaca is big endian, we need to do an endian conversion before using
this in our calculation to determine the number of entries in the
buffer. Without this, we do not iterate over the existing entries in the
DTL buffer properly.

Fixes: 7c105b63bd98 ("powerpc: Add CONFIG_CPU_LITTLE_ENDIAN kernel config 
option.")
Signed-off-by: Naveen N. Rao 
---
 arch/powerpc/platforms/pseries/dtl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/pseries/dtl.c 
b/arch/powerpc/platforms/pseries/dtl.c
index c762689e0eb3..ef6595153642 100644
--- a/arch/powerpc/platforms/pseries/dtl.c
+++ b/arch/powerpc/platforms/pseries/dtl.c
@@ -184,7 +184,7 @@ static void dtl_stop(struct dtl *dtl)
 
 static u64 dtl_current_index(struct dtl *dtl)
 {
-   return lppaca_of(dtl->cpu).dtl_idx;
+   return be64_to_cpu(lppaca_of(dtl->cpu).dtl_idx);
 }
 #endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
 
-- 
2.19.0



[PATCH 1/2] powerpc/pseries: Fix DTL buffer registration

2018-09-27 Thread Naveen N. Rao
When CONFIG_VIRT_CPU_ACCOUNTING_NATIVE is not set, we register the DTL
buffer for a cpu when the associated file under powerpc/dtl in debugfs
is opened. When doing so, we need to set the size of the buffer being
registered in the second u32 word of the buffer. This needs to be in big
endian, but we are not doing the conversion resulting in the below error
showing up in dmesg:

dtl_start: DTL registration for cpu 0 (hw 0) failed with -4

Fix this in the obvious manner.

Fixes: 7c105b63bd98 ("powerpc: Add CONFIG_CPU_LITTLE_ENDIAN kernel config 
option.")
Signed-off-by: Naveen N. Rao 
---
 arch/powerpc/platforms/pseries/dtl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/pseries/dtl.c 
b/arch/powerpc/platforms/pseries/dtl.c
index 18014cdeb590..c762689e0eb3 100644
--- a/arch/powerpc/platforms/pseries/dtl.c
+++ b/arch/powerpc/platforms/pseries/dtl.c
@@ -149,7 +149,7 @@ static int dtl_start(struct dtl *dtl)
 
/* Register our dtl buffer with the hypervisor. The HV expects the
 * buffer size to be passed in the second word of the buffer */
-   ((u32 *)dtl->buf)[1] = DISPATCH_LOG_BYTES;
+   ((u32 *)dtl->buf)[1] = cpu_to_be32(DISPATCH_LOG_BYTES);
 
hwcpu = get_hard_smp_processor_id(dtl->cpu);
addr = __pa(dtl->buf);
-- 
2.19.0
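
Both fixes are instances of the same rule: structures shared with the
hypervisor, such as the lppaca and the DTL buffer header, are big endian,
so a little-endian kernel must convert explicitly on every access. In
short:

    ((u32 *)dtl->buf)[1] = cpu_to_be32(DISPATCH_LOG_BYTES); /* store BE */
    u64 idx = be64_to_cpu(lppaca_of(dtl->cpu).dtl_idx);     /* load BE  */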



[PATCH 0/2] A few fixes to DTL buffer access over debugfs

2018-09-27 Thread Naveen N. Rao
Two trivial fixes to DTL buffer access over debugfs when 
CONFIG_VIRT_CPU_ACCOUNTING_NATIVE is not set.

- Naveen

Naveen N. Rao (2):
  powerpc/pseries: Fix DTL buffer registration
  powerpc/pseries: Fix how we iterate over the DTL entries

 arch/powerpc/platforms/pseries/dtl.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

-- 
2.19.0



Re: [PATCH v4 1/2] powerpc/32: add stack protector support

2018-09-27 Thread Segher Boessenkool
On Thu, Sep 27, 2018 at 08:20:00AM +0200, Christophe LEROY wrote:
> Le 26/09/2018 à 21:16, Segher Boessenkool a écrit :
> >On Wed, Sep 26, 2018 at 11:40:38AM +, Christophe Leroy wrote:
> >>+static __always_inline void boot_init_stack_canary(void)
> >>+{
> >>+   unsigned long canary;
> >>+
> >>+   /* Try to get a semi random initial value. */
> >>+   get_random_bytes(&canary, sizeof(canary));
> >>+   canary ^= mftb();
> >>+   canary ^= LINUX_VERSION_CODE;
> >>+
> >>+   current->stack_canary = canary;
> >>+}
> >
> >I still think you should wait until there is entropy available.  You
> >haven't answered my questions about that (or I didn't see them): what
> >does the kernel do in other similar cases?
> >
> >Looks great otherwise!
> 
> What do you mean by 'other similar cases' ? All arches have similar 
> boot_init_stack_canary().

Yes, those, and other things that want entropy early.

> x86 uses rdtsc() which is equivalent to our 
> mftb(). Most arches xor it with LINUX_VERSION_CODE.
> 
> The issue is that it is called very early in start_kernel(), however 
> they try to set some entropy anyway:
> 
>   boot_cpu_init();
>   page_address_init();
>   pr_notice("%s", linux_banner);
>   setup_arch(&command_line);
>   /*
>* Set up the the initial canary and entropy after arch
>* and after adding latent and command line entropy.
>*/
>   add_latent_entropy();
>   add_device_randomness(command_line, strlen(command_line));
>   boot_init_stack_canary();
> 
> Apparently, it is too early to call wait_for_random_bytes(); see below.

Hrm.  Too early to call wait_event_interruptible?  From there it went
into schedule(), which blew up.  Well, you say we have only one context
at this point, so that is not too surprising then :-)
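
For reference, wait_for_random_bytes() in drivers/char/random.c is roughly the
shape below, which is why calling it this early lands in schedule() via
wait_event_interruptible():

int wait_for_random_bytes(void)
{
	if (likely(crng_ready()))
		return 0;
	/* sleeps until the CRNG is initialised -- hence the schedule() */
	return wait_event_interruptible(crng_init_wait, crng_ready());
}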

> However, this is the canary for initial startup only. Only idle() still 
> uses this canary once the system is running. A new canary is set for any 
> newly forked task.

Ah, that makes things a lot better!  Do those new tasks get a canary
from something with sufficient entropy though?
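
For reference, fork already handles this generically; roughly, from
kernel/fork.c (dup_task_struct) and include/linux/random.h:

/* kernel/fork.c, in dup_task_struct(): */
#ifdef CONFIG_STACKPROTECTOR
	tsk->stack_canary = get_random_canary();
#endif

/* include/linux/random.h: */
static inline unsigned long get_random_canary(void)
{
	unsigned long val = get_random_long();

	return val & CANARY_MASK;
}

So the entropy of a new task's canary is whatever get_random_long() can
provide at fork time.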

> Maybe the idle canary should be updated later, once there is more entropy?

That is tricky to do, but sure, if you can, that should help.

> Today there is a new call to boot_init_stack_canary() in 
> cpu_startup_entry(), but it is enclosed inside #ifdef CONFIG_X86.

It needs to know the details of how SSP works on each platform.


Segher


[PATCH v5 2/2] powerpc/64: add stack protector support

2018-09-27 Thread Christophe Leroy
On PPC64, as register r13 points to the paca_struct at all times,
this patch adds a copy of the canary there, which is updated on
task switch.
That new canary is then used via the following GCC options (see
the sketch below):
-mstack-protector-guard=tls
-mstack-protector-guard-reg=r13
-mstack-protector-guard-offset=offsetof(struct paca_struct, canary)
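
Conceptually, these options make GCC load the guard from a fixed offset off r13
instead of from the global __stack_chk_guard. The sketch below is a rough C
equivalent of what an instrumented function ends up doing; the real sequence is
compiler-generated, and local_paca is the kernel's existing r13 register
variable:

void instrumented_function(void)
{
	/* prologue: a single "ld rN, canary-offset(r13)" */
	unsigned long guard = local_paca->canary;

	/* ... function body; guard is spilled to the stack frame ... */

	/* epilogue: reload and compare; a mismatch means the frame was smashed */
	if (guard != local_paca->canary)
		__stack_chk_fail();
}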

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/Kconfig  | 2 +-
 arch/powerpc/Makefile | 8 
 arch/powerpc/include/asm/paca.h   | 3 +++
 arch/powerpc/include/asm/stackprotector.h | 4 
 arch/powerpc/kernel/asm-offsets.c | 3 +++
 arch/powerpc/kernel/entry_64.S| 4 
 6 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 3bcb05929931..602eea723624 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -180,7 +180,7 @@ config PPC
select HAVE_ARCH_SECCOMP_FILTER
select HAVE_ARCH_TRACEHOOK
select HAVE_CBPF_JITif !PPC64
-   select HAVE_STACKPROTECTOR  if $(cc-option,-mstack-protector-guard=tls) && PPC32
+   select HAVE_STACKPROTECTOR  if $(cc-option,-mstack-protector-guard=tls)
select HAVE_CONTEXT_TRACKINGif PPC64
select HAVE_DEBUG_KMEMLEAK
select HAVE_DEBUG_STACKOVERFLOW
diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index 45b8eb4d8fe7..81552c7b46eb 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -113,7 +113,11 @@ KBUILD_ARFLAGS += --target=elf$(BITS)-$(GNUTARGET)
 endif
 
 cflags-$(CONFIG_STACKPROTECTOR)+= -mstack-protector-guard=tls
+ifdef CONFIG_PPC64
+cflags-$(CONFIG_STACKPROTECTOR)+= -mstack-protector-guard-reg=r13
+else
 cflags-$(CONFIG_STACKPROTECTOR)+= -mstack-protector-guard-reg=r2
+endif
 
 LDFLAGS_vmlinux-y := -Bstatic
 LDFLAGS_vmlinux-$(CONFIG_RELOCATABLE) := -pie
@@ -411,8 +415,12 @@ ifdef CONFIG_STACKPROTECTOR
 prepare: stack_protector_prepare
 
 stack_protector_prepare: prepare0
+ifdef CONFIG_PPC64
+   $(eval KBUILD_CFLAGS += -mstack-protector-guard-offset=$(shell awk '{if ($$2 == "PACA_CANARY") print $$3;}' include/generated/asm-offsets.h))
+else
	$(eval KBUILD_CFLAGS += -mstack-protector-guard-offset=$(shell awk '{if ($$2 == "TASK_CANARY") print $$3;}' include/generated/asm-offsets.h))
 endif
+endif
 
 # Use the file '.tmp_gas_check' for binutils tests, as gas won't output
 # to stdout and these checks are run even on install targets.
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 6d6b3706232c..98d883e58945 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -246,6 +246,9 @@ struct paca_struct {
struct slb_entry *mce_faulty_slbs;
u16 slb_save_cache_ptr;
 #endif /* CONFIG_PPC_BOOK3S_64 */
+#ifdef CONFIG_STACKPROTECTOR
+   unsigned long canary;
+#endif
 } cacheline_aligned;
 
 extern struct paca_struct **paca_ptrs;
diff --git a/arch/powerpc/include/asm/stackprotector.h b/arch/powerpc/include/asm/stackprotector.h
index d05d969c98c2..1c8460e23583 100644
--- a/arch/powerpc/include/asm/stackprotector.h
+++ b/arch/powerpc/include/asm/stackprotector.h
@@ -11,6 +11,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /*
  * Initialize the stackprotector canary value.
@@ -29,6 +30,9 @@ static __always_inline void boot_init_stack_canary(void)
canary &= CANARY_MASK;
 
current->stack_canary = canary;
+#ifdef CONFIG_PPC64
+   get_paca()->canary = canary;
+#endif
 }
 
 #endif /* _ASM_STACKPROTECTOR_H */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index a992f7a53cf3..773dee55b3f6 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -81,6 +81,9 @@ int main(void)
OFFSET(MM, task_struct, mm);
 #ifdef CONFIG_STACKPROTECTOR
OFFSET(TASK_CANARY, task_struct, stack_canary);
+#ifdef CONFIG_PPC64
+   OFFSET(PACA_CANARY, paca_struct, canary);
+#endif
 #endif
OFFSET(MMCONTEXTID, mm_struct, context.id);
 #ifdef CONFIG_PPC64
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 77a888bfcb53..573fa879d785 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -624,6 +624,10 @@ _GLOBAL(_switch)
 
addi r6,r4,-THREAD   /* Convert THREAD to 'current' */
std r6,PACACURRENT(r13) /* Set new 'current' */
+#if defined(CONFIG_STACKPROTECTOR)
+   ld  r6, TASK_CANARY(r6)
+   std r6, PACA_CANARY(r13)
+#endif
 
ld  r8,KSP(r4)  /* new stack pointer */
 #ifdef CONFIG_PPC_BOOK3S_64
-- 
2.13.3



[PATCH v5 1/2] powerpc/32: add stack protector support

2018-09-27 Thread Christophe Leroy
This functionality was tentatively added in the past
(commit 6533b7c16ee5 ("powerpc: Initial stack protector
(-fstack-protector) support")) but had to be reverted
(commit f2574030b0e3 ("powerpc: Revert the initial stack
protector support")) because GCC implemented it differently
depending on whether it had been built with libc support or not.

Now, GCC offers the possibility to manually set the
stack-protector mode (global or tls) regardless of libc support.

This time, the patch selects HAVE_STACKPROTECTOR only if
-mstack-protector-guard=tls is supported by GCC.

On PPC32, as register r2 points to current task_struct at
all times, the stack_canary located inside task_struct can be
used directly via the following GCC options (see the sketch below):
-mstack-protector-guard=tls
-mstack-protector-guard-reg=r2
-mstack-protector-guard-offset=offsetof(struct task_struct, stack_canary)
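
As a sketch of why this works: ppc32 already keeps current in r2
(arch/powerpc/include/asm/current.h), so the guard read that GCC emits is a
single load relative to r2. The accessor below is illustrative only, not part
of the patch:

register struct task_struct *current asm("r2");	/* existing ppc32 declaration */

static inline unsigned long stack_guard(void)
{
	/* compiles to "lwz rN, stack_canary-offset(r2)" under the options above */
	return current->stack_canary;
}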

The protector is disabled for prom_init and bootx_init as
they run too early for it to be handled properly.

 $ echo CORRUPT_STACK > /sys/kernel/debug/provoke-crash/DIRECT
[  134.943666] Kernel panic - not syncing: stack-protector: Kernel stack is corrupted in: lkdtm_CORRUPT_STACK+0x64/0x64
[  134.943666]
[  134.955414] CPU: 0 PID: 283 Comm: sh Not tainted 4.18.0-s3k-dev-12143-ga3272be41209 #835
[  134.963380] Call Trace:
[  134.965860] [c6615d60] [c001f76c] panic+0x118/0x260 (unreliable)
[  134.971775] [c6615dc0] [c001f654] panic+0x0/0x260
[  134.976435] [c6615dd0] [c032c368] lkdtm_CORRUPT_STACK_STRONG+0x0/0x64
[  134.982769] [c6615e00] [] 0x

Signed-off-by: Christophe Leroy 
---
 v5: Using get_random_canary() and masking canary with CANARY_MASK

 v4: disable stack protector in bootx_init

 v3: the offset is now defined by a rule in the Makefile. No need anymore
 to take stack_canary out of the randomised area of task_struct

 arch/powerpc/Kconfig  |  1 +
 arch/powerpc/Makefile | 10 +
 arch/powerpc/include/asm/stackprotector.h | 34 +++
 arch/powerpc/kernel/Makefile  |  2 ++
 arch/powerpc/kernel/asm-offsets.c |  3 +++
 arch/powerpc/platforms/powermac/Makefile  |  1 +
 6 files changed, 51 insertions(+)
 create mode 100644 arch/powerpc/include/asm/stackprotector.h

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index a80669209155..3bcb05929931 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -180,6 +180,7 @@ config PPC
select HAVE_ARCH_SECCOMP_FILTER
select HAVE_ARCH_TRACEHOOK
select HAVE_CBPF_JITif !PPC64
+   select HAVE_STACKPROTECTOR  if $(cc-option,-mstack-protector-guard=tls) && PPC32
select HAVE_CONTEXT_TRACKINGif PPC64
select HAVE_DEBUG_KMEMLEAK
select HAVE_DEBUG_STACKOVERFLOW
diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index 07d9dce7eda6..45b8eb4d8fe7 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -112,6 +112,9 @@ KBUILD_LDFLAGS  += -m elf$(BITS)$(LDEMULATION)
 KBUILD_ARFLAGS += --target=elf$(BITS)-$(GNUTARGET)
 endif
 
+cflags-$(CONFIG_STACKPROTECTOR)+= -mstack-protector-guard=tls
+cflags-$(CONFIG_STACKPROTECTOR)+= -mstack-protector-guard-reg=r2
+
 LDFLAGS_vmlinux-y := -Bstatic
 LDFLAGS_vmlinux-$(CONFIG_RELOCATABLE) := -pie
 LDFLAGS_vmlinux:= $(LDFLAGS_vmlinux-y)
@@ -404,6 +407,13 @@ archclean:
 
 archprepare: checkbin
 
+ifdef CONFIG_STACKPROTECTOR
+prepare: stack_protector_prepare
+
+stack_protector_prepare: prepare0
+   $(eval KBUILD_CFLAGS += -mstack-protector-guard-offset=$(shell awk '{if ($$2 == "TASK_CANARY") print $$3;}' include/generated/asm-offsets.h))
+endif
+
 # Use the file '.tmp_gas_check' for binutils tests, as gas won't output
 # to stdout and these checks are run even on install targets.
 TOUT   := .tmp_gas_check
diff --git a/arch/powerpc/include/asm/stackprotector.h b/arch/powerpc/include/asm/stackprotector.h
new file mode 100644
index ..d05d969c98c2
--- /dev/null
+++ b/arch/powerpc/include/asm/stackprotector.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * GCC stack protector support.
+ *
+ */
+
+#ifndef _ASM_STACKPROTECTOR_H
+#define _ASM_STACKPROTECTOR_H
+
+#include 
+#include 
+#include 
+#include 
+
+/*
+ * Initialize the stackprotector canary value.
+ *
+ * NOTE: this must only be called from functions that never return,
+ * and it must always be inlined.
+ */
+static __always_inline void boot_init_stack_canary(void)
+{
+   unsigned long canary;
+
+   /* Try to get a semi random initial value. */
+   canary = get_random_canary();
+   canary ^= mftb();
+   canary ^= LINUX_VERSION_CODE;
+   canary &= CANARY_MASK;
+
+   current->stack_canary = canary;
+}
+
+#endif /* _ASM_STACKPROTECTOR_H */
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 1e64cfe22a83..85ffa488dfb5 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -20,6 +2