[PATCH 2/6] powerpc/64s/kuap: kuap_restore missing isync

2020-04-29 Thread Nicholas Piggin
Writing the AMR register is documented to require context synchronizing
operations before and after, for it to take effect as expected. The kuap
restore at interrupt exit time deliberately avoids the isync after the
AMR update because it only needs to take effect after the context
synchronizing rfid that soon follows. Add a comment for this.

The missing isync before the update doesn't have an obvious
justification, and it seems it could theoretically allow a rogue user
access to leak past the AMR update. Add isyncs for these.
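
For reference, a fully synchronized AMR update looks like this (a
minimal sketch using the kernel's mtspr()/isync() helpers; the
amr_write_synchronized() wrapper is hypothetical, not added by this
patch):

	static inline void amr_write_synchronized(unsigned long value)
	{
		isync();		/* CSI before: prior accesses complete under the old AMR */
		mtspr(SPRN_AMR, value);
		isync();		/* CSI after: no later access runs under the stale AMR */
	}

The interrupt exit path keeps the leading isync but may elide the
trailing one, because the rfid that follows is itself context
synchronizing.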

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/book3s/64/kup-radix.h | 11 ++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/book3s/64/kup-radix.h 
b/arch/powerpc/include/asm/book3s/64/kup-radix.h
index 3bcef989a35d..8dc5f292b806 100644
--- a/arch/powerpc/include/asm/book3s/64/kup-radix.h
+++ b/arch/powerpc/include/asm/book3s/64/kup-radix.h
@@ -16,7 +16,9 @@
 #ifdef CONFIG_PPC_KUAP
BEGIN_MMU_FTR_SECTION_NESTED(67)
ld  \gpr, STACK_REGS_KUAP(r1)
+   isync
mtspr   SPRN_AMR, \gpr
+   /* No isync required, see kuap_restore_amr() */
END_MMU_FTR_SECTION_NESTED_IFSET(MMU_FTR_RADIX_KUAP, 67)
 #endif
 .endm
@@ -62,8 +64,15 @@
 
 static inline void kuap_restore_amr(struct pt_regs *regs)
 {
-   if (mmu_has_feature(MMU_FTR_RADIX_KUAP))
+   if (mmu_has_feature(MMU_FTR_RADIX_KUAP)) {
+   isync();
mtspr(SPRN_AMR, regs->kuap);
+   /*
+* No isync required here because we are about to rfi
+* back to previous context before any user accesses
+* would be made, which is a CSI.
+*/
+   }
 }
 
 static inline void kuap_check_amr(void)
-- 
2.23.0



[PATCH 3/6] powerpc/64/kuap: interrupt exit conditionally restore AMR

2020-04-29 Thread Nicholas Piggin
The AMR update is made conditional on AMR actually changing, which
should be the less common case on most workloads (kernel page faults
on uaccess could be frequent, but this change doesn't significantly
slow down that case).
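
The shape of the change at interrupt exit, abridged from the diff
below (names as in the patch):

	unsigned long amr = kuap_get_and_check_amr();	/* entry: sample AMR once */
	/* ... handle the interrupt ... */
	kuap_restore_amr(regs, amr);	/* isync + mtspr only if regs->kuap != amr */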

Signed-off-by: Nicholas Piggin 
---
 .../powerpc/include/asm/book3s/64/kup-radix.h | 36 ++-
 arch/powerpc/kernel/syscall_64.c  | 14 +---
 2 files changed, 37 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/kup-radix.h 
b/arch/powerpc/include/asm/book3s/64/kup-radix.h
index 8dc5f292b806..ec8970958a26 100644
--- a/arch/powerpc/include/asm/book3s/64/kup-radix.h
+++ b/arch/powerpc/include/asm/book3s/64/kup-radix.h
@@ -62,19 +62,32 @@
 #include 
 #include 
 
-static inline void kuap_restore_amr(struct pt_regs *regs)
+static inline void kuap_restore_amr(struct pt_regs *regs, unsigned long amr)
 {
if (mmu_has_feature(MMU_FTR_RADIX_KUAP)) {
-   isync();
-   mtspr(SPRN_AMR, regs->kuap);
-   /*
-* No isync required here because we are about to rfi
-* back to previous context before any user accesses
-* would be made, which is a CSI.
-*/
+   if (unlikely(regs->kuap != amr)) {
+   isync();
+   mtspr(SPRN_AMR, regs->kuap);
+   /*
+* No isync required here because we are about to rfi
+* back to previous context before any user accesses
+* would be made, which is a CSI.
+*/
+   }
}
 }
 
+static inline unsigned long kuap_get_and_check_amr(void)
+{
+   if (mmu_has_feature(MMU_FTR_RADIX_KUAP)) {
+   unsigned long amr = mfspr(SPRN_AMR);
+   if (IS_ENABLED(CONFIG_PPC_KUAP_DEBUG)) /* kuap_check_amr() */
+   WARN_ON_ONCE(amr != AMR_KUAP_BLOCKED);
+   return amr;
+   }
+   return 0;
+}
+
 static inline void kuap_check_amr(void)
 {
if (IS_ENABLED(CONFIG_PPC_KUAP_DEBUG) && 
mmu_has_feature(MMU_FTR_RADIX_KUAP))
@@ -151,13 +164,18 @@ bad_kuap_fault(struct pt_regs *regs, unsigned long 
address, bool is_write)
"Bug: %s fault blocked by AMR!", is_write ? "Write" : 
"Read");
 }
 #else /* CONFIG_PPC_KUAP */
-static inline void kuap_restore_amr(struct pt_regs *regs)
+static inline void kuap_restore_amr(struct pt_regs *regs, unsigned long amr)
 {
 }
 
 static inline void kuap_check_amr(void)
 {
 }
+
+static inline unsigned long kuap_get_and_check_amr(void)
+{
+   return 0;
+}
 #endif /* CONFIG_PPC_KUAP */
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/powerpc/kernel/syscall_64.c b/arch/powerpc/kernel/syscall_64.c
index a37c7717424f..edeab10c6888 100644
--- a/arch/powerpc/kernel/syscall_64.c
+++ b/arch/powerpc/kernel/syscall_64.c
@@ -242,6 +242,10 @@ notrace unsigned long interrupt_exit_user_prepare(struct 
pt_regs *regs, unsigned
BUG_ON(!FULL_REGS(regs));
BUG_ON(regs->softe != IRQS_ENABLED);
 
+   /*
+* We don't need to restore AMR on the way back to userspace for KUAP.
+* AMR can only have been unlocked if we interrupted the kernel.
+*/
kuap_check_amr();
 
local_irq_save(flags);
@@ -313,13 +317,14 @@ notrace unsigned long 
interrupt_exit_kernel_prepare(struct pt_regs *regs, unsign
	unsigned long *ti_flagsp = &current_thread_info()->flags;
unsigned long flags;
unsigned long ret = 0;
+   unsigned long amr;
 
if (IS_ENABLED(CONFIG_PPC_BOOK3S) && unlikely(!(regs->msr & MSR_RI)))
unrecoverable_exception(regs);
BUG_ON(regs->msr & MSR_PR);
BUG_ON(!FULL_REGS(regs));
 
-   kuap_check_amr();
+   amr = kuap_get_and_check_amr();
 
if (unlikely(*ti_flagsp & _TIF_EMULATE_STACK_STORE)) {
clear_bits(_TIF_EMULATE_STACK_STORE, ti_flagsp);
@@ -367,10 +372,11 @@ notrace unsigned long 
interrupt_exit_kernel_prepare(struct pt_regs *regs, unsign
 #endif
 
/*
-* We don't need to restore AMR on the way back to userspace for KUAP.
-* The value of AMR only matters while we're in the kernel.
+* Don't want to mfspr(SPRN_AMR) here, because this comes after
+* mtmsr, which would cause RAW stalls. Hence, we take the AMR value
+* from the check above.
 */
-   kuap_restore_amr(regs);
+   kuap_restore_amr(regs, amr);
 
return ret;
 }
-- 
2.23.0



[PATCH 4/6] powerpc/64s/kuap: restore AMR in system reset exception

2020-04-29 Thread Nicholas Piggin
The system reset interrupt handler locks AMR and exits with
EXCEPTION_RESTORE_REGS without restoring AMR. Similarly to the soft-NMI
handler, it needs to restore.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/exceptions-64s.S | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index 728ccb0f560c..b0ad930cbae5 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -971,6 +971,7 @@ EXC_COMMON_BEGIN(system_reset_common)
ld  r10,SOFTE(r1)
stb r10,PACAIRQSOFTMASK(r13)
 
+   kuap_restore_amr r10
EXCEPTION_RESTORE_REGS
RFI_TO_USER_OR_KERNEL
 
-- 
2.23.0



[PATCH 5/6] powerpc/64s/kuap: restore AMR in fast_interrupt_return

2020-04-29 Thread Nicholas Piggin
Interrupts that use fast_interrupt_return actually do lock AMR, but
until now they have tended to be ones taken from userspace (or on
kernel bugs) in radix mode. With KUAP on hash, segment interrupts are
often taken in the kernel, which quickly breaks due to the missing
restore.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/entry_64.S | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 9a1e5d636dea..b3c9f15089b6 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -472,15 +472,17 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
 #ifdef CONFIG_PPC_BOOK3S
/*
 * If MSR EE/RI was never enabled, IRQs not reconciled, NVGPRs not
-* touched, AMR not set, no exit work created, then this can be used.
+* touched, no exit work created, then this can be used.
 */
.balign IFETCH_ALIGN_BYTES
.globl fast_interrupt_return
 fast_interrupt_return:
 _ASM_NOKPROBE_SYMBOL(fast_interrupt_return)
+   kuap_check_amr r3, r4
ld  r4,_MSR(r1)
andi.   r0,r4,MSR_PR
bne .Lfast_user_interrupt_return
+   kuap_restore_amr r3
andi.   r0,r4,MSR_RI
li  r3,0 /* 0 return value, no EMULATE_STACK_STORE */
bne+.Lfast_kernel_interrupt_return
-- 
2.23.0



[PATCH 6/6] powerpc/64s/kuap: conditionally restore AMR in kuap_restore_amr asm

2020-04-29 Thread Nicholas Piggin
Similar to the C code change, make the AMR restore conditional on
whether the register has changed.
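
For comparison, this is the C logic (from kuap_restore_amr() in the
previous patch) that the asm below now mirrors; sketch only:

	if (unlikely(regs->kuap != amr)) {	/* amr = current SPRN_AMR value */
		isync();
		mtspr(SPRN_AMR, regs->kuap);
		/* trailing isync elided: the rfi that follows is a CSI */
	}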

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/book3s/64/kup-radix.h | 10 +++---
 arch/powerpc/kernel/entry_64.S |  8 
 arch/powerpc/kernel/exceptions-64s.S   |  4 ++--
 3 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/kup-radix.h 
b/arch/powerpc/include/asm/book3s/64/kup-radix.h
index ec8970958a26..e82df54f5681 100644
--- a/arch/powerpc/include/asm/book3s/64/kup-radix.h
+++ b/arch/powerpc/include/asm/book3s/64/kup-radix.h
@@ -12,13 +12,17 @@
 
 #ifdef __ASSEMBLY__
 
-.macro kuap_restore_amr gpr
+.macro kuap_restore_amr gpr1, gpr2
 #ifdef CONFIG_PPC_KUAP
BEGIN_MMU_FTR_SECTION_NESTED(67)
-   ld  \gpr, STACK_REGS_KUAP(r1)
+   mfspr   \gpr1, SPRN_AMR
+   ld  \gpr2, STACK_REGS_KUAP(r1)
+   cmpd\gpr1, \gpr2
+   beq 998f
isync
-   mtspr   SPRN_AMR, \gpr
+   mtspr   SPRN_AMR, \gpr2
/* No isync required, see kuap_restore_amr() */
+998:
END_MMU_FTR_SECTION_NESTED_IFSET(MMU_FTR_RADIX_KUAP, 67)
 #endif
 .endm
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index b3c9f15089b6..9d49338e0c85 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -479,11 +479,11 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
 fast_interrupt_return:
 _ASM_NOKPROBE_SYMBOL(fast_interrupt_return)
kuap_check_amr r3, r4
-   ld  r4,_MSR(r1)
-   andi.   r0,r4,MSR_PR
+   ld  r5,_MSR(r1)
+   andi.   r0,r5,MSR_PR
bne .Lfast_user_interrupt_return
-   kuap_restore_amr r3
-   andi.   r0,r4,MSR_RI
+   kuap_restore_amr r3, r4
+   andi.   r0,r5,MSR_RI
li  r3,0 /* 0 return value, no EMULATE_STACK_STORE */
bne+.Lfast_kernel_interrupt_return
addir3,r1,STACK_FRAME_OVERHEAD
diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index b0ad930cbae5..ef4a90212664 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -971,7 +971,7 @@ EXC_COMMON_BEGIN(system_reset_common)
ld  r10,SOFTE(r1)
stb r10,PACAIRQSOFTMASK(r13)
 
-   kuap_restore_amr r10
+   kuap_restore_amr r9, r10
EXCEPTION_RESTORE_REGS
RFI_TO_USER_OR_KERNEL
 
@@ -2757,7 +2757,7 @@ EXC_COMMON_BEGIN(soft_nmi_common)
ld  r10,SOFTE(r1)
stb r10,PACAIRQSOFTMASK(r13)
 
-   kuap_restore_amr r10
+   kuap_restore_amr r9, r10
EXCEPTION_RESTORE_REGS hsrr=0
RFI_TO_KERNEL
 
-- 
2.23.0



[PATCH] powerpc/64s: Fix early_init_mmu section mismatch

2020-04-29 Thread Nicholas Piggin
Christian reports:

  MODPOST vmlinux.o
  WARNING: modpost: vmlinux.o(.text.unlikely+0x1a0): Section mismatch in
  reference from the function .early_init_mmu() to the function
  .init.text:.radix__early_init_mmu()
  The function .early_init_mmu() references
  the function __init .radix__early_init_mmu().
  This is often because .early_init_mmu lacks a __init
  annotation or the annotation of .radix__early_init_mmu is wrong.

  WARNING: modpost: vmlinux.o(.text.unlikely+0x1ac): Section mismatch in
  reference from the function .early_init_mmu() to the function
  .init.text:.hash__early_init_mmu()
  The function .early_init_mmu() references
  the function __init .hash__early_init_mmu().
  This is often because .early_init_mmu lacks a __init
  annotation or the annotation of .hash__early_init_mmu is wrong.

The compiler is uninlining early_init_mmu and not putting it in an init
section because there is no annotation. Add it.
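
A minimal illustration of the rule, with hypothetical functions (not
from the kernel): any function that references __init code must itself
be __init, because .init.text is discarded after boot:

	static void __init radix_setup(void) { }	/* lives in .init.text, freed after boot */

	static inline void __init early_setup(void)	/* caller must be __init as well */
	{
		radix_setup();
	}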

Reported-by: Christian Zigotzky 
Tested-by: Christian Zigotzky 
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/book3s/64/mmu.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h 
b/arch/powerpc/include/asm/book3s/64/mmu.h
index bb3deb76c951..3ffe5f967483 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu.h
@@ -208,7 +208,7 @@ void hash__early_init_devtree(void);
 void radix__early_init_devtree(void);
 extern void hash__early_init_mmu(void);
 extern void radix__early_init_mmu(void);
-static inline void early_init_mmu(void)
+static inline void __init early_init_mmu(void)
 {
if (radix_enabled())
return radix__early_init_mmu();
-- 
2.23.0



[PATCH 2/2] powerpc/spufs: stop using access_ok

2020-04-29 Thread Jeremy Kerr
From: Christoph Hellwig 

Just use the proper non __-prefixed get/put_user variants where that is
not done yet.
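
The idiom in a nutshell (sketch, assuming a __user buffer as in the
diff): get_user()/put_user() perform the address check internally, so
a separate access_ok() followed by the __-prefixed variant is
redundant:

	u32 v, __user *p = (void __user *)buf;

	if (get_user(v, p))	/* validates the address and copies in one step */
		return -EFAULT;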

Signed-off-by: Christoph Hellwig 
Signed-off-by: Jeremy Kerr 
---
 arch/powerpc/platforms/cell/spufs/file.c | 42 +---
 1 file changed, 8 insertions(+), 34 deletions(-)

diff --git a/arch/powerpc/platforms/cell/spufs/file.c 
b/arch/powerpc/platforms/cell/spufs/file.c
index b4e1ef650b40..cd7d10f27fad 100644
--- a/arch/powerpc/platforms/cell/spufs/file.c
+++ b/arch/powerpc/platforms/cell/spufs/file.c
@@ -590,17 +590,12 @@ static ssize_t spufs_mbox_read(struct file *file, char 
__user *buf,
size_t len, loff_t *pos)
 {
struct spu_context *ctx = file->private_data;
-   u32 mbox_data, __user *udata;
+   u32 mbox_data, __user *udata = (void __user *)buf;
ssize_t count;
 
if (len < 4)
return -EINVAL;
 
-   if (!access_ok(buf, len))
-   return -EFAULT;
-
-   udata = (void __user *)buf;
-
count = spu_acquire(ctx);
if (count)
return count;
@@ -616,7 +611,7 @@ static ssize_t spufs_mbox_read(struct file *file, char 
__user *buf,
 * but still need to return the data we have
 * read successfully so far.
 */
-   ret = __put_user(mbox_data, udata);
+   ret = put_user(mbox_data, udata);
if (ret) {
if (!count)
count = -EFAULT;
@@ -698,17 +693,12 @@ static ssize_t spufs_ibox_read(struct file *file, char 
__user *buf,
size_t len, loff_t *pos)
 {
struct spu_context *ctx = file->private_data;
-   u32 ibox_data, __user *udata;
+   u32 ibox_data, __user *udata = (void __user *)buf;
ssize_t count;
 
if (len < 4)
return -EINVAL;
 
-   if (!access_ok(buf, len))
-   return -EFAULT;
-
-   udata = (void __user *)buf;
-
count = spu_acquire(ctx);
if (count)
goto out;
@@ -727,7 +717,7 @@ static ssize_t spufs_ibox_read(struct file *file, char 
__user *buf,
}
 
/* if we can't write at all, return -EFAULT */
-   count = __put_user(ibox_data, udata);
+   count = put_user(ibox_data, udata);
if (count)
goto out_unlock;
 
@@ -741,7 +731,7 @@ static ssize_t spufs_ibox_read(struct file *file, char 
__user *buf,
 * but still need to return the data we have
 * read successfully so far.
 */
-   ret = __put_user(ibox_data, udata);
+   ret = put_user(ibox_data, udata);
if (ret)
break;
}
@@ -836,17 +826,13 @@ static ssize_t spufs_wbox_write(struct file *file, const 
char __user *buf,
size_t len, loff_t *pos)
 {
struct spu_context *ctx = file->private_data;
-   u32 wbox_data, __user *udata;
+   u32 wbox_data, __user *udata = (void __user *)buf;
ssize_t count;
 
if (len < 4)
return -EINVAL;
 
-   udata = (void __user *)buf;
-   if (!access_ok(buf, len))
-   return -EFAULT;
-
-   if (__get_user(wbox_data, udata))
+   if (get_user(wbox_data, udata))
return -EFAULT;
 
count = spu_acquire(ctx);
@@ -873,7 +859,7 @@ static ssize_t spufs_wbox_write(struct file *file, const 
char __user *buf,
/* write as much as possible */
for (count = 4, udata++; (count + 4) <= len; count += 4, udata++) {
int ret;
-   ret = __get_user(wbox_data, udata);
+   ret = get_user(wbox_data, udata);
if (ret)
break;
 
@@ -1982,9 +1968,6 @@ static ssize_t spufs_mbox_info_read(struct file *file, 
char __user *buf,
u32 stat, data;
int ret;
 
-   if (!access_ok(buf, len))
-   return -EFAULT;
-
ret = spu_acquire_saved(ctx);
if (ret)
return ret;
@@ -2028,9 +2011,6 @@ static ssize_t spufs_ibox_info_read(struct file *file, 
char __user *buf,
u32 stat, data;
int ret;
 
-   if (!access_ok(buf, len))
-   return -EFAULT;
-
ret = spu_acquire_saved(ctx);
if (ret)
return ret;
@@ -2082,9 +2062,6 @@ static ssize_t spufs_wbox_info_read(struct file *file, 
char __user *buf,
u32 data[ARRAY_SIZE(ctx->csa.spu_mailbox_data)];
int ret, count;
 
-   if (!access_ok(buf, len))
-   return -EFAULT;
-
ret = spu_acquire_saved(ctx);
if (ret)
return ret;
@@ -2143,9 +2120,6 @@ static ssize_t spufs_dma_info_read(struct file *file, 
char __user *buf,
struct spu_dma_info info;
int ret;
 
-   if (!access_ok(buf, len))
-   return -EFAULT;
-
ret = spu_acquire_saved(ctx);
if (ret)
	return ret;

[PATCH 1/2] powerpc/spufs: fix copy_to_user while atomic

2020-04-29 Thread Jeremy Kerr
Currently, we may perform a copy_to_user() (through
simple_read_from_buffer()) while holding a context's register_lock,
when accessing the context save area.

This change uses a temporary buffer for the context save area data,
which we then pass to simple_read_from_buffer.

Includes changes from Christoph Hellwig .
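
The pattern applied throughout, abridged from the mbox case below:
snapshot the save-area registers under the spinlock, drop it, and only
then copy to userspace, since copy_to_user() may fault and sleep:

	spin_lock(&ctx->csa.register_lock);
	stat = ctx->csa.prob.mb_stat_R;		/* snapshot while atomic */
	data = ctx->csa.prob.pu_mb_R;
	spin_unlock(&ctx->csa.register_lock);
	spu_release_saved(ctx);

	return simple_read_from_buffer(buf, len, pos, &data, sizeof(data));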

Fixes: bf1ab978be23 ("[POWERPC] coredump: Add SPU elf notes to coredump.")
Signed-off-by: Jeremy Kerr 
Reviewed-by: Arnd Bergmann 
Reviewed-by: Christoph Hellwig 
---
 arch/powerpc/platforms/cell/spufs/file.c | 113 +++
 1 file changed, 75 insertions(+), 38 deletions(-)

diff --git a/arch/powerpc/platforms/cell/spufs/file.c 
b/arch/powerpc/platforms/cell/spufs/file.c
index c0f950a3f4e1..b4e1ef650b40 100644
--- a/arch/powerpc/platforms/cell/spufs/file.c
+++ b/arch/powerpc/platforms/cell/spufs/file.c
@@ -1978,8 +1978,9 @@ static ssize_t __spufs_mbox_info_read(struct spu_context 
*ctx,
 static ssize_t spufs_mbox_info_read(struct file *file, char __user *buf,
   size_t len, loff_t *pos)
 {
-   int ret;
struct spu_context *ctx = file->private_data;
+   u32 stat, data;
+   int ret;
 
if (!access_ok(buf, len))
return -EFAULT;
@@ -1988,11 +1989,16 @@ static ssize_t spufs_mbox_info_read(struct file *file, 
char __user *buf,
if (ret)
return ret;
spin_lock(&ctx->csa.register_lock);
-   ret = __spufs_mbox_info_read(ctx, buf, len, pos);
+   stat = ctx->csa.prob.mb_stat_R;
+   data = ctx->csa.prob.pu_mb_R;
spin_unlock(&ctx->csa.register_lock);
spu_release_saved(ctx);
 
-   return ret;
+   /* EOF if there's no entry in the mbox */
+   if (!(stat & 0xff))
+   return 0;
+
+   return simple_read_from_buffer(buf, len, pos, &data, sizeof(data));
 }
 
 static const struct file_operations spufs_mbox_info_fops = {
@@ -2019,6 +2025,7 @@ static ssize_t spufs_ibox_info_read(struct file *file, 
char __user *buf,
   size_t len, loff_t *pos)
 {
struct spu_context *ctx = file->private_data;
+   u32 stat, data;
int ret;
 
if (!access_ok(buf, len))
@@ -2028,11 +2035,16 @@ static ssize_t spufs_ibox_info_read(struct file *file, 
char __user *buf,
if (ret)
return ret;
spin_lock(&ctx->csa.register_lock);
-   ret = __spufs_ibox_info_read(ctx, buf, len, pos);
+   stat = ctx->csa.prob.mb_stat_R;
+   data = ctx->csa.priv2.puint_mb_R;
spin_unlock(&ctx->csa.register_lock);
spu_release_saved(ctx);
 
-   return ret;
+   /* EOF if there's no entry in the ibox */
+   if (!(stat & 0xff))
+   return 0;
+
+   return simple_read_from_buffer(buf, len, pos, &data, sizeof(data));
 }
 
 static const struct file_operations spufs_ibox_info_fops = {
@@ -2041,6 +2053,11 @@ static const struct file_operations spufs_ibox_info_fops 
= {
.llseek  = generic_file_llseek,
 };
 
+static size_t spufs_wbox_info_cnt(struct spu_context *ctx)
+{
+   return (4 - ((ctx->csa.prob.mb_stat_R & 0x00ff00) >> 8)) * sizeof(u32);
+}
+
 static ssize_t __spufs_wbox_info_read(struct spu_context *ctx,
char __user *buf, size_t len, loff_t *pos)
 {
@@ -2049,7 +2066,7 @@ static ssize_t __spufs_wbox_info_read(struct spu_context 
*ctx,
u32 wbox_stat;
 
wbox_stat = ctx->csa.prob.mb_stat_R;
-   cnt = 4 - ((wbox_stat & 0x00ff00) >> 8);
+   cnt = spufs_wbox_info_cnt(ctx);
for (i = 0; i < cnt; i++) {
data[i] = ctx->csa.spu_mailbox_data[i];
}
@@ -2062,7 +2079,8 @@ static ssize_t spufs_wbox_info_read(struct file *file, 
char __user *buf,
   size_t len, loff_t *pos)
 {
struct spu_context *ctx = file->private_data;
-   int ret;
+   u32 data[ARRAY_SIZE(ctx->csa.spu_mailbox_data)];
+   int ret, count;
 
if (!access_ok(buf, len))
return -EFAULT;
@@ -2071,11 +2089,13 @@ static ssize_t spufs_wbox_info_read(struct file *file, 
char __user *buf,
if (ret)
return ret;
spin_lock(&ctx->csa.register_lock);
-   ret = __spufs_wbox_info_read(ctx, buf, len, pos);
+   count = spufs_wbox_info_cnt(ctx);
+   memcpy(&data, &ctx->csa.spu_mailbox_data, sizeof(data));
spin_unlock(&ctx->csa.register_lock);
spu_release_saved(ctx);
 
-   return ret;
+   return simple_read_from_buffer(buf, len, pos, &data,
+   count * sizeof(u32));
 }
 
 static const struct file_operations spufs_wbox_info_fops = {
@@ -2084,27 +2104,33 @@ static const struct file_operations 
spufs_wbox_info_fops = {
.llseek  = generic_file_llseek,
 };
 
-static ssize_t __spufs_dma_info_read(struct spu_context *ctx,
-   char __user *buf, size_t len, loff_t *pos)
+static void ___spufs_dma_info_read(struct spu_context *ctx,
+ 

Re: [RFC PATCH] powerpc/spufs: fix copy_to_user while atomic

2020-04-29 Thread Arnd Bergmann
On Wed, Apr 29, 2020 at 8:33 AM Jeremy Kerr  wrote:
>
> Hi Christoph,
>
> > And another one that should go on top of this one to address Al's other
> > complaint:
>
> Yeah, I was pondering that one. The access_ok() is kinda redundant, but
> it does avoid forcing a SPU context save on those errors.
>
> However, it's not like we really need to optimise for the case of
> invalid addresses from userspace. So, I'll include this change in the
> submission to Michael's tree. Arnd - let me know if you have any
> objections.

Sounds good. A lot of the access_ok() checks in the kernel are redundant
or wrong; I think it makes a lot of sense to remove these.

   Arnd


[PATCH 2/3] powerpc/pci: unmap legacy INTx interrupts of passthrough IO adapters

2020-04-29 Thread Cédric Le Goater
When a passthrough IO adapter is removed from a pseries machine using
hash MMU and the XIVE interrupt mode, the POWER hypervisor, pHyp,
expects the guest OS to have cleared all page table entries related to
the adapter. If some are still present, the RTAS call which isolates
the PCI slot returns error 9001 "valid outstanding translations" and
the removal of the IO adapter fails.

INTx interrupt numbers need special care because Linux maps the
interrupts automatically in the Linux interrupt number space if they
are presented in the device tree node describing the IO adapter. These
interrupts are not un-mapped automatically and, in the case of a
hot-plug adapter, the PCI hot-plug layer needs to handle the cleanup
to make sure that all the page table entries of the XIVE ESB pages
are cleared.
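
The cleanup itself is a single call on the release path (as in the
diff below):

	/* in pcibios_release_device(), before the EEH teardown: */
	irq_dispose_mapping(dev->irq);	/* unmap the INTx from the Linux IRQ
					 * space, dropping the ESB page tables */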

Cc: "Oliver O'Halloran" 
Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/kernel/pci-hotplug.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/powerpc/kernel/pci-hotplug.c 
b/arch/powerpc/kernel/pci-hotplug.c
index bf83f76563a3..9e9c6befd7ea 100644
--- a/arch/powerpc/kernel/pci-hotplug.c
+++ b/arch/powerpc/kernel/pci-hotplug.c
@@ -57,6 +57,8 @@ void pcibios_release_device(struct pci_dev *dev)
struct pci_controller *phb = pci_bus_to_host(dev->bus);
struct pci_dn *pdn = pci_get_pdn(dev);
 
+   irq_dispose_mapping(dev->irq);
+
eeh_remove_device(dev);
 
if (phb->controller_ops.release_device)
-- 
2.25.4



Re: [PATCH v7 2/5] powerpc/hv-24x7: Add rtas call in hv-24x7 driver to get processor details

2020-04-29 Thread Madhavan Srinivasan




On 3/27/20 12:06 PM, Kajol Jain wrote:

For hv_24x7 socket/chip level events, the specific chip-id to which
the requested data belongs should be added as part of the pmu event.
But system details like the number of sockets and chips per socket are
not exposed.

Patch implements read_sys_info_pseries() to get system parameter
values like the number of sockets and chips per socket. An rtas_call
with the token "PROCESSOR_MODULE_INFO" is used to get these values.


Patch looks good to me.

Reviewed-by: Madhavan Srinivasan 


A subsequent patch exports these values via sysfs.

The patch also makes these parameters default to 1.

Signed-off-by: Kajol Jain 
---
  arch/powerpc/perf/hv-24x7.c  | 72 
  arch/powerpc/platforms/pseries/pseries.h |  3 +
  2 files changed, 75 insertions(+)

diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c
index 48e8f4b17b91..9ae00f29bd21 100644
--- a/arch/powerpc/perf/hv-24x7.c
+++ b/arch/powerpc/perf/hv-24x7.c
@@ -20,6 +20,11 @@
  #include 
  #include 

+#ifdef CONFIG_PPC_RTAS
+#include 
+#include <../../platforms/pseries/pseries.h>
+#endif
+
  #include "hv-24x7.h"
  #include "hv-24x7-catalog.h"
  #include "hv-common.h"
@@ -57,6 +62,69 @@ static bool is_physical_domain(unsigned domain)
}
  }

+#ifdef CONFIG_PPC_RTAS
+#define PROCESSOR_MODULE_INFO   43
+#define PROCESSOR_MAX_LENGTH   (8 * 1024)
+
+static int strbe16toh(const char *buf, int offset)
+{
+   return (buf[offset] << 8) + buf[offset + 1];
+}
+
+static u32 physsockets;/* Physical sockets */
+static u32 physchips;  /* Physical chips */
+
+/*
+ * Function read_sys_info_pseries() make a rtas_call which require
+ * data buffer of size 8K. As standard 'rtas_data_buf' is of size
+ * 4K, we are adding new local buffer 'rtas_local_data_buf'.
+ */
+char rtas_local_data_buf[PROCESSOR_MAX_LENGTH] __cacheline_aligned;
+
+/*
+ * read_sys_info_pseries()
+ * Retrieve the number of sockets and chips per socket details
+ * through the get-system-parameter rtas call.
+ */
+void read_sys_info_pseries(void)
+{
+   int call_status, len, ntypes;
+
+   /*
+* Making system parameter: chips and sockets default to 1.
+*/
+   physsockets = 1;
+   physchips = 1;
+   memset(rtas_local_data_buf, 0, PROCESSOR_MAX_LENGTH);
+   spin_lock(&rtas_data_buf_lock);
+
+   call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
+   NULL,
+   PROCESSOR_MODULE_INFO,
+   __pa(rtas_local_data_buf),
+   PROCESSOR_MAX_LENGTH);
+
+   spin_unlock(&rtas_data_buf_lock);
+
+   if (call_status != 0) {
+   pr_info("%s %s Error calling get-system-parameter (0x%x)\n",
+   __FILE__, __func__, call_status);
+   } else {
+   rtas_local_data_buf[PROCESSOR_MAX_LENGTH - 1] = '\0';
+   len = strbe16toh(rtas_local_data_buf, 0);
+   if (len < 6)
+   return;
+
+   ntypes = strbe16toh(rtas_local_data_buf, 2);
+
+   if (!ntypes)
+   return;
+   physsockets = strbe16toh(rtas_local_data_buf, 4);
+   physchips = strbe16toh(rtas_local_data_buf, 6);
+   }
+}
+#endif /* CONFIG_PPC_RTAS */
+
  /* Domains for which more than one result element are returned for each 
event. */
  static bool domain_needs_aggregation(unsigned int domain)
  {
@@ -1605,6 +1673,10 @@ static int hv_24x7_init(void)
if (r)
return r;

+#ifdef CONFIG_PPC_RTAS
+   read_sys_info_pseries();
+#endif
+
return 0;
  }

diff --git a/arch/powerpc/platforms/pseries/pseries.h 
b/arch/powerpc/platforms/pseries/pseries.h
index 13fa370a87e4..1727559ce304 100644
--- a/arch/powerpc/platforms/pseries/pseries.h
+++ b/arch/powerpc/platforms/pseries/pseries.h
@@ -19,6 +19,9 @@ extern void request_event_sources_irqs(struct device_node *np,
  struct pt_regs;

  extern int pSeries_system_reset_exception(struct pt_regs *regs);
+#ifdef CONFIG_PPC_RTAS
+extern void read_sys_info_pseries(void);
+#endif
  extern int pSeries_machine_check_exception(struct pt_regs *regs);
  extern long pseries_machine_check_realmode(struct pt_regs *regs);





[PATCH 1/3] powerpc/xive: Clear the page tables for the ESB IO mapping

2020-04-29 Thread Cédric Le Goater
Commit 1ca3dec2b2df ("powerpc/xive: Prevent page fault issues in the
machine crash handler") fixed an issue in the FW assisted dump of
machines using hash MMU and the XIVE interrupt mode under the POWER
hypervisor. It forced the mapping of the ESB page of interrupts being
mapped in the Linux IRQ number space to make sure the 'crash kexec'
sequence worked during such an event. But it didn't handle the
un-mapping.

This mapping is now blocking the removal of a passthrough IO adapter
under the POWER hypervisor because it expects the guest OS to have
cleared all page table entries related to the adapter. If some are
still present, the RTAS call which isolates the PCI slot returns error
9001 "valid outstanding translations".

Remove these mappings in the IRQ data cleanup routine.

Under KVM, this cleanup is not required because the ESB pages for the
adapter interrupts are un-mapped from the guest by the hypervisor in
the KVM XIVE native device. This is now redundant but it's harmless.
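
Abridged from the diff below (the trigger page gets the same
treatment): clear the kernel page tables for the ESB mapping before
releasing it, so no translation outlives the device:

	unmap_kernel_range((unsigned long)xd->eoi_mmio, 1u << xd->esb_shift);
	iounmap(xd->eoi_mmio);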

Fixes: 1ca3dec2b2df ("powerpc/xive: Prevent page fault issues in the machine 
crash handler")
Cc: sta...@vger.kernel.org # v5.5+
Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/sysdev/xive/common.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/arch/powerpc/sysdev/xive/common.c 
b/arch/powerpc/sysdev/xive/common.c
index 9603b2830d03..3dbc94cb4380 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -19,6 +19,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -1020,12 +1021,16 @@ EXPORT_SYMBOL_GPL(is_xive_irq);
 void xive_cleanup_irq_data(struct xive_irq_data *xd)
 {
if (xd->eoi_mmio) {
+   unmap_kernel_range((unsigned long)xd->eoi_mmio,
+  1u << xd->esb_shift);
iounmap(xd->eoi_mmio);
if (xd->eoi_mmio == xd->trig_mmio)
xd->trig_mmio = NULL;
xd->eoi_mmio = NULL;
}
if (xd->trig_mmio) {
+   unmap_kernel_range((unsigned long)xd->trig_mmio,
+  1u << xd->esb_shift);
iounmap(xd->trig_mmio);
xd->trig_mmio = NULL;
}
-- 
2.25.4



Re: [PATCH] fixup! signal: factor copy_siginfo_to_external32 from copy_siginfo_to_user32

2020-04-29 Thread Arnd Bergmann
On Wed, Apr 29, 2020 at 8:45 AM Christoph Hellwig  wrote:
>
> On Tue, Apr 28, 2020 at 09:56:26PM +0200, Arnd Bergmann wrote:
> > I think I found a way to improve the x32 handling:
> >
> > This is a simplification over Christoph's "[PATCH 2/7] signal: factor
> > copy_siginfo_to_external32 from copy_siginfo_to_user32", reducing the
> > x32 specifics in the common code to a single #ifdef/#endif check, in
> > order to keep it more readable for everyone else.
> >
> > Christoph, if you like it, please fold into your patch.
>
> What do you think of this version?  This one always overrides
> copy_siginfo_to_user32 for the x86 compat case to keep the churn down,
> and improves the copy_siginfo_to_external32 documentation a bit.

Looks good to me. I preferred checking for X32 explicitly (so we can
find and kill off the #ifdef if we ever remove X32 for good), but there is
little difference in the end.

 Arnd


[PATCH 3/3] powerpc/xive: Do not expose a debugfs file when XIVE is disabled

2020-04-29 Thread Cédric Le Goater
The XIVE interrupt mode can be disabled with the "xive=off" kernel
parameter, in which case there is nothing to present to the user in the
associated /sys/kernel/debug/powerpc/xive file.

Fixes: 930914b7d528 ("powerpc/xive: Add a debugfs file to dump internal XIVE 
state")
Signed-off-by: Cédric Le Goater 
---
 arch/powerpc/sysdev/xive/common.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/sysdev/xive/common.c 
b/arch/powerpc/sysdev/xive/common.c
index 3dbc94cb4380..f591be9f01f4 100644
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -1664,7 +1664,8 @@ DEFINE_SHOW_ATTRIBUTE(xive_core_debug);
 
 int xive_core_debug_init(void)
 {
-   debugfs_create_file("xive", 0400, powerpc_debugfs_root,
-   NULL, &xive_core_debug_fops);
+   if (xive_enabled())
+   debugfs_create_file("xive", 0400, powerpc_debugfs_root,
+   NULL, &xive_core_debug_fops);
return 0;
 }
-- 
2.25.4



Re: [PATCH v6 0/4] powerpc/papr_scm: Add support for reporting nvdimm health

2020-04-29 Thread Aneesh Kumar K.V
Vaibhav Jain  writes:

> The PAPR standard[1][3] provides mechanisms to query the health and
> performance stats of an NVDIMM via various hcalls as described in
> Ref[2].  Until now these stats were never available nor exposed to the
> user-space tools like 'ndctl'. This is partly due to PAPR platform not
> having support for ACPI and NFIT. Hence 'ndctl' is unable to query and
> report the dimm health status and a user had no way to determine the
> current health status of an NVDIMM.
>
> To overcome this limitation, this patch-set updates papr_scm kernel
> module to query and fetch nvdimm health stats using hcalls described
> in Ref[2].  This health and performance stats are then exposed to
> userspace via sysfs and PAPR-nvDimm-Specific-Methods (PDSM) issued by
> libndctl.
>
> These changes coupled with the proposed ndctl changes located at Ref[4]
> should provide a way for the user to retrieve NVDIMM health status
> using ndctl.
>
> Below is a sample output using proposed kernel + ndctl for PAPR NVDIMM
> in a emulation environment:
>
>  # ndctl list -DH
> [
>   {
> "dev":"nmem0",
> "health":{
>   "health_state":"fatal",
>   "shutdown_state":"dirty"
> }
>   }
> ]
>
> Dimm health report output on a pseries guest lpar with vPMEM or HMS
> based nvdimms that are in perfectly healthy conditions:
>
>  # ndctl list -d nmem0 -H
> [
>   {
> "dev":"nmem0",
> "health":{
>   "health_state":"ok",
>   "shutdown_state":"clean"
> }
>   }
> ]
>
> PAPR nvDimm-Specific-Methods(PDSM)
> ==
>
> PDSM requests are issued by vendor specific code in libndctl to
> execute certain operations or fetch information from NVDIMMS. PDSMs
> requests can be sent to papr_scm module via libndctl(userspace) and
> libnvdimm (kernel) using the ND_CMD_CALL ioctl command which can be
> handled in the dimm control function papr_scm_ndctl(). Current
> patchset proposes a single PDSM to retrieve NVDIMM health, defined in
> the newly introduced uapi header named 'papr_scm_pdsm.h'. Support for
> more PDSMs will be added in future.
>
> Structure of the patch-set
> ==
>
> The patchset starts with implementing support for fetching nvdimm health
> information from PHYP and partially exposing it to user-space via a nvdimm
> sysfs flag.
>
> Second & Third patches deal with implementing support for servicing PDSM
> commands in papr_scm module.
>
> Finally the Fourth patch implements support for servicing PDSM
> 'PAPR_SCM_PDSM_HEALTH' that returns the nvdimm health information to
> libndctl.

You can add to the series.

Reviewed-by: Aneesh Kumar K.V 

-aneesh


Re: [RFC PATCH dpss_eth] Don't initialise ports with no PHY

2020-04-29 Thread Christian Zigotzky

Hi Darren,

Thanks a lot for your patch!

I tested it with the RC3 today.

Unfortunately it doesn't compile because a bracket is missing in the 
following line:


+    if (prop && !strncmp(prop, "disabled", 8) {

And a semicolon is missing in the following line:

+        goto _return

I added the bracket and the semicolon and after that it compiled without 
any problems. (New patch attached)


Unfortunately I see more than 2 ethernet ports with the RC3 and your 
patch on my Cyrus P5040. Maybe Skateman has another result on his Cyrus
P5020.


Maybe we have to modify the dtb file.

Thanks,
Christian


On 25 April 2020 at 00:29 am, Darren Stevens wrote:

Since cbb961ca271e ("Use random MAC address when none is given")
Varisys Cyrus P5020 boards have been listing 5 ethernet ports instead of
the 2 the board has. This is because we were preventing the unused ports
from being added by not supplying them a MAC address, which that commit
now supplies.

Prevent them from appearing in the net devices list by checking for a
'status="disabled"' entry during probe and skipping the port if we find
it.
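
For what it's worth, the generic OF helper could express the same
check (an alternative sketch, not what this patch does):

	if (!of_device_is_available(mac_node)) {	/* status present and not "okay"/"ok" */
		err = -ENODEV;
		goto _return;
	}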

Signed-off-by: Darren Stevens 

---

  drivers/net/ethernet/freescale/fman/mac.c | 11 +++
  1 file changed, 11 insertions(+)

diff --git a/drivers/net/ethernet/freescale/fman/mac.c 
b/drivers/net/ethernet/freescale/fman/mac.c
index 43427c5..c9ed411 100644
--- a/drivers/net/ethernet/freescale/fman/mac.c
+++ b/drivers/net/ethernet/freescale/fman/mac.c
@@ -606,6 +606,7 @@ static int mac_probe(struct platform_device *_of_dev)
struct resource  res;
struct mac_priv_s   *priv;
const u8*mac_addr;
+   const char  *prop;
u32  val;
u8  fman_id;
phy_interface_t  phy_if;
@@ -628,6 +629,16 @@ static int mac_probe(struct platform_device *_of_dev)
mac_dev->priv = priv;
priv->dev = dev;
  
+	/* check for disabled devices and skip them, as now a missing
+	 * MAC address will be replaced with a Random one rather than
+	 * disabling the port
+	 */
+   prop = of_get_property(mac_node, "status", NULL);
+   if (prop && !strncmp(prop, "disabled", 8) {
+   err = -ENODEV;
+   goto _return
+   }
+
if (of_device_is_compatible(mac_node, "fsl,fman-dtsec")) {
setup_dtsec(mac_dev);
priv->internal_phy_node = of_parse_phandle(mac_node,


diff --git a/drivers/net/ethernet/freescale/fman/mac.c b/drivers/net/ethernet/freescale/fman/mac.c
index 43427c5..c9ed411 100644
--- a/drivers/net/ethernet/freescale/fman/mac.c
+++ b/drivers/net/ethernet/freescale/fman/mac.c
@@ -606,6 +606,7 @@ static int mac_probe(struct platform_device *_of_dev)
 	struct resource		 res;
 	struct mac_priv_s	*priv;
 	const u8		*mac_addr;
+	const char 		*prop;
 	u32			 val;
 	u8			fman_id;
 	phy_interface_t  phy_if;
@@ -628,6 +629,16 @@ static int mac_probe(struct platform_device *_of_dev)
 	mac_dev->priv = priv;
 	priv->dev = dev;
 
+	/* check for disabled devices and skip them, as now a missing
+	 * MAC address will be replaced with a Random one rather than
+	 * disabling the port
+	 */
+	prop = of_get_property(mac_node, "status", NULL);
+	if (prop && !strncmp(prop, "disabled", 8)) {
+		err = -ENODEV;
+		goto _return;
+	}
+
 	if (of_device_is_compatible(mac_node, "fsl,fman-dtsec")) {
 		setup_dtsec(mac_dev);
 		priv->internal_phy_node = of_parse_phandle(mac_node,


Re: [PATCH v7 1/5] powerpc/perf/hv-24x7: Fix inconsistent output values incase multiple hv-24x7 events run

2020-04-29 Thread Madhavan Srinivasan




On 3/27/20 12:06 PM, Kajol Jain wrote:

Commit 2b206ee6b0df ("powerpc/perf/hv-24x7: Display change in counter
values") added code to print the _change_ in the counter value rather
than the raw value for 24x7 counters. In case of transactions, the
event count is set to 0 at the beginning of the transaction. It also
sets the event's prev_count to the raw value at the time of
initialization. Because the event count is set to 0, we see some weird
behaviour whenever we run multiple 24x7 events at a time.

For example:

command#: ./perf stat -e "{hv_24x7/PM_MCS01_128B_RD_DISP_PORT01,chip=0/,
   hv_24x7/PM_MCS01_128B_RD_DISP_PORT01,chip=1/}"
   -C 0 -I 1000 sleep 100

  1.000121704                         120  hv_24x7/PM_MCS01_128B_RD_DISP_PORT01,chip=0/
  1.000121704                           5  hv_24x7/PM_MCS01_128B_RD_DISP_PORT01,chip=1/
  2.000357733                           8  hv_24x7/PM_MCS01_128B_RD_DISP_PORT01,chip=0/
  2.000357733                          10  hv_24x7/PM_MCS01_128B_RD_DISP_PORT01,chip=1/
  3.000495215  18,446,744,073,709,551,616  hv_24x7/PM_MCS01_128B_RD_DISP_PORT01,chip=0/
  3.000495215  18,446,744,073,709,551,616  hv_24x7/PM_MCS01_128B_RD_DISP_PORT01,chip=1/
  4.000641884                          56  hv_24x7/PM_MCS01_128B_RD_DISP_PORT01,chip=0/
  4.000641884  18,446,744,073,709,551,616  hv_24x7/PM_MCS01_128B_RD_DISP_PORT01,chip=1/
  5.000791887  18,446,744,073,709,551,616  hv_24x7/PM_MCS01_128B_RD_DISP_PORT01,chip=0/

We get these large values when we use -I.

Because event_count is set to 0, in the interval case the overall
event_count does not increase monotonically: the new delta may be
smaller than the previous count. When the intervals are printed, the
subtraction yields a negative value, which shows up as these large
numbers.

This patch removes the part where we set event_count to 0 in
'h_24x7_event_read'. There won't be much impact, as we already set
event->hw.prev_count to the raw value at initialization time in order
to print the change in value.
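
To see where the huge numbers come from, recall roughly how perf
accumulates deltas (a sketch of the generic logic, not code from this
patch):

	delta = new_raw - local64_read(&event->hw.prev_count);
	local64_set(&event->hw.prev_count, new_raw);
	local64_add(delta, &event->count);

	/* perf stat -I prints count(now) - count(previous interval); once
	 * event->count was zeroed at txn start, that difference can go
	 * negative, and the u64 wraps to ~18,446,744,073,709,551,616 (2^64). */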

With this patch
In power9 platform

command#: ./perf stat -e "{hv_24x7/PM_MCS01_128B_RD_DISP_PORT01,chip=0/,
   hv_24x7/PM_MCS01_128B_RD_DISP_PORT01,chip=1/}"
   -C 0 -I 1000 sleep 100

  1.000117685         93  hv_24x7/PM_MCS01_128B_RD_DISP_PORT01,chip=0/
  1.000117685          1  hv_24x7/PM_MCS01_128B_RD_DISP_PORT01,chip=1/
  2.000349331         98  hv_24x7/PM_MCS01_128B_RD_DISP_PORT01,chip=0/
  2.000349331          2  hv_24x7/PM_MCS01_128B_RD_DISP_PORT01,chip=1/
  3.000495900        131  hv_24x7/PM_MCS01_128B_RD_DISP_PORT01,chip=0/
  3.000495900          4  hv_24x7/PM_MCS01_128B_RD_DISP_PORT01,chip=1/
  4.000645920        204  hv_24x7/PM_MCS01_128B_RD_DISP_PORT01,chip=0/
  4.000645920         61  hv_24x7/PM_MCS01_128B_RD_DISP_PORT01,chip=1/
  4.284169997         22  hv_24x7/PM_MCS01_128B_RD_DISP_PORT01,chip=0/

Signed-off-by: Kajol Jain 
Suggested-by: Sukadev Bhattiprolu 


Tested-by: Madhavan Srinivasan 


---
  arch/powerpc/perf/hv-24x7.c | 10 --
  1 file changed, 10 deletions(-)

diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c
index 573e0b309c0c..48e8f4b17b91 100644
--- a/arch/powerpc/perf/hv-24x7.c
+++ b/arch/powerpc/perf/hv-24x7.c
@@ -1400,16 +1400,6 @@ static void h_24x7_event_read(struct perf_event *event)
h24x7hw = &get_cpu_var(hv_24x7_hw);
h24x7hw->events[i] = event;
put_cpu_var(h24x7hw);
-   /*
-* Clear the event count so we can compute the _change_
-* in the 24x7 raw counter value at the end of the txn.
-*
-* Note that we could alternatively read the 24x7 value
-* now and save its value in event->hw.prev_count. But
-* that would require issuing a hcall, which would then
-* defeat the purpose of using the txn interface.
-*/
-   local64_set(&event->count, 0);
}

put_cpu_var(hv_24x7_reqb);




Re: [PATCH v7 3/5] powerpc/hv-24x7: Add sysfs files inside hv-24x7 device to show processor details

2020-04-29 Thread Madhavan Srinivasan




On 3/27/20 12:06 PM, Kajol Jain wrote:

To expose system dependent parameters like the total number of
sockets and the number of chips per socket, the patch adds two sysfs files.
"sockets" and "chips" are added to /sys/devices/hv_24x7/interface/
of the "hv_24x7" pmu.

Signed-off-by: Kajol Jain 
---
  arch/powerpc/perf/hv-24x7.c | 22 ++
  1 file changed, 22 insertions(+)

diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c
index 9ae00f29bd21..a31bd5b88f7a 100644
--- a/arch/powerpc/perf/hv-24x7.c
+++ b/arch/powerpc/perf/hv-24x7.c
@@ -454,6 +454,20 @@ static ssize_t device_show_string(struct device *dev,
return sprintf(buf, "%s\n", (char *)d->var);
  }

+#ifdef CONFIG_PPC_RTAS
+static ssize_t sockets_show(struct device *dev,
+   struct device_attribute *attr, char *buf)
+{
+   return sprintf(buf, "%d\n", physsockets);
+}
+
+static ssize_t chips_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+   return sprintf(buf, "%d\n", physchips);
+}
+#endif
+


The rtas call gives you the cores per chip too. Can we expose that as
well with this patch? I understand the tool-side patchset only uses
the socket/chip information for metrics, but it would be better to
include cores here too.


  static struct attribute *device_str_attr_create_(char *name, char *str)
  {
struct dev_ext_attribute *attr = kzalloc(sizeof(*attr), GFP_KERNEL);
@@ -1100,6 +1114,10 @@ PAGE_0_ATTR(catalog_len, "%lld\n",
(unsigned long long)be32_to_cpu(page_0->length) * 4096);
  static BIN_ATTR_RO(catalog, 0/* real length varies */);
  static DEVICE_ATTR_RO(domains);
+#ifdef CONFIG_PPC_RTAS
+static DEVICE_ATTR_RO(sockets);
+static DEVICE_ATTR_RO(chips);
+#endif

  static struct bin_attribute *if_bin_attrs[] = {
&bin_attr_catalog,
@@ -1110,6 +1128,10 @@ static struct attribute *if_attrs[] = {
&dev_attr_catalog_len.attr,
&dev_attr_catalog_version.attr,
&dev_attr_domains.attr,
+#ifdef CONFIG_PPC_RTAS
+   &dev_attr_sockets.attr,
+   &dev_attr_chips.attr,
+#endif
NULL,
  };





Re: [PATCH v7 5/5] powerpc/hv-24x7: Update post_mobility_fixup() to handle migration

2020-04-29 Thread Madhavan Srinivasan




On 3/27/20 12:06 PM, Kajol Jain wrote:

Function 'read_sys_info_pseries()' is added to get system parameter
values like the number of sockets and chips per socket; it gets these
details via an rtas_call with the "PROCESSOR_MODULE_INFO" token.

In case an LPAR migrates from one system to another, system parameter
details like the number of chips per socket or the number of sockets
might change, so they need to be re-initialized; otherwise these
values correspond to the previous system.
This patch adds a call to 'read_sys_info_pseries()' from
'post_mobility_fixup()' to re-init the physsockets and physchips values.


Changes looks fine to me.
Reviewed-by: Madhavan Srinivasan 


Signed-off-by: Kajol Jain 
---
  arch/powerpc/platforms/pseries/mobility.c | 12 
  1 file changed, 12 insertions(+)

diff --git a/arch/powerpc/platforms/pseries/mobility.c 
b/arch/powerpc/platforms/pseries/mobility.c
index b571285f6c14..226accd6218b 100644
--- a/arch/powerpc/platforms/pseries/mobility.c
+++ b/arch/powerpc/platforms/pseries/mobility.c
@@ -371,6 +371,18 @@ void post_mobility_fixup(void)
/* Possibly switch to a new RFI flush type */
pseries_setup_rfi_flush();

+   /*
+* Incase lpar migrate from one system to another, system
+* parameter details like chips per sockets and number of sockets
+* might change. So, it needs to be re-initialized otherwise these
+* values corresponds to previous system.
+* Here, adding a call to read_sys_info_pseries() declared in
+* platforms/pseries/pseries.h to re-init the physsockets and
+* physchips value.
+*/
+   if (IS_ENABLED(CONFIG_HV_PERF_CTRS) && IS_ENABLED(CONFIG_PPC_RTAS))
+   read_sys_info_pseries();
+
return;
  }





Re: [PATCH] fixup! signal: factor copy_siginfo_to_external32 from copy_siginfo_to_user32

2020-04-29 Thread Christoph Hellwig
On Wed, Apr 29, 2020 at 10:07:11AM +0200, Arnd Bergmann wrote:
> > What do you think of this version?  This one always overrides
> > copy_siginfo_to_user32 for the x86 compat case to keep the churn down,
> > and improves the copy_siginfo_to_external32 documentation a bit.
> 
> Looks good to me. I preferred checking for X32 explicitly (so we can
> find and kill off the #ifdef if we ever remove X32 for good), but there is
> little difference in the end.

Is there any realistic chance we'll get rid of x32?


Re: [RFC PATCH v2 7/7] powerpc/selftest: reuse ppc-opcode macros to avoid redundancy

2020-04-29 Thread Michael Ellerman
Balamuruhan S  writes:
> Avoid redefining macros to encode ppc instructions instead reuse it from
> ppc-opcode.h, Makefile changes are necessary to compile memcmp_64.S with
> __ASSEMBLY__ defined from selftests.
>
> Signed-off-by: Balamuruhan S 
> ---
>  .../selftests/powerpc/stringloops/Makefile| 34 ++
>  .../powerpc/stringloops/asm/asm-const.h   |  1 +
>  .../powerpc/stringloops/asm/ppc-opcode.h  | 36 +--
>  3 files changed, 29 insertions(+), 42 deletions(-)
>  create mode 120000 tools/testing/selftests/powerpc/stringloops/asm/asm-const.h
>  mode change 100644 => 120000 tools/testing/selftests/powerpc/stringloops/asm/ppc-opcode.h
>
> diff --git a/tools/testing/selftests/powerpc/stringloops/Makefile 
> b/tools/testing/selftests/powerpc/stringloops/Makefile
> index 7fc0623d85c3..efe76c5a5b94 100644
> --- a/tools/testing/selftests/powerpc/stringloops/Makefile
> +++ b/tools/testing/selftests/powerpc/stringloops/Makefile
> @@ -1,26 +1,44 @@
>  # SPDX-License-Identifier: GPL-2.0
>  # The loops are all 64-bit code
> -CFLAGS += -I$(CURDIR)
> +GIT_VERSION = $(shell git describe --always --long --dirty || echo "unknown")
> +CFLAGS += -DGIT_VERSION='"$(GIT_VERSION)"' -I$(CURDIR) -I$(CURDIR)/../include
>  
>  EXTRA_SOURCES := ../harness.c
>  
>  build_32bit = $(shell if ($(CC) $(CFLAGS) -m32 -o /dev/null memcmp.c 
> >/dev/null 2>&1) then echo "1"; fi)
>  
> +ifneq ($(build_32bit),1)
>  TEST_GEN_PROGS := memcmp_64 strlen
> +TEST_GEN_FILES := memcmp.o memcmp_64.o memcmp_64
> +MEMCMP := $(OUTPUT)/memcmp.o
> +MEMCMP_64 := $(OUTPUT)/memcmp_64.o
> +HARNESS :=  $(OUTPUT)/../harness.o
> +CFLAGS += -m64 -maltivec
>  
> -$(OUTPUT)/memcmp_64: memcmp.c
> -$(OUTPUT)/memcmp_64: CFLAGS += -m64 -maltivec
> +OVERRIDE_TARGETS := 1
> +include ../../lib.mk
>  
> -ifeq ($(build_32bit),1)
> +$(OUTPUT)/memcmp_64: $(MEMCMP_64) $(MEMCMP) $(HARNESS)
> + $(CC) $(CFLAGS) memcmp.o memcmp_64.o ../harness.o -o memcmp_64
> +
> +$(MEMCMP_64): memcmp_64.S
> + $(CC) $(CFLAGS) -D__ASSEMBLY__ -o memcmp_64.o -c memcmp_64.S
> +
> +$(MEMCMP): memcmp.c
> + $(CC) $(CFLAGS) -o memcmp.o -c memcmp.c
> +
> +$(HARNESS): $(EXTRA_SOURCES)
> + $(CC) $(CFLAGS) -DGIT_VERSION='"$(GIT_VERSION)"' -o ../harness.o -c 
> $(EXTRA_SOURCES)

What are you actually trying to do here? Is it just that you need to
define __ASSEMBLY__ for memcmp_64.S?

What you have breaks the build, it's not respecting $(OUTPUT).

  make[2]: Entering directory 
'/linux/tools/testing/selftests/powerpc/stringloops'
  powerpc64le-linux-gnu-gcc -std=gnu99 -O2 -Wall -Werror 
-DGIT_VERSION='"v5.7-rc2-38-g2d7b142b5a96"' 
-I/linux/tools/testing/selftests/powerpc/include  
-DGIT_VERSION='"v5.7-rc2-38-g2d7b142b5a96"' 
-I/linux/tools/testing/selftests/powerpc/stringloops 
-I/linux/tools/testing/selftests/powerpc/stringloops/../include -m64 -maltivec 
-D__ASSEMBLY__ -o memcmp_64.o -c memcmp_64.S
  Assembler messages:
  Fatal error: can't create memcmp_64.o: Read-only file system
  make[2]: *** [Makefile:25: /output/kselftest/powerpc/stringloops/memcmp_64.o] 
Error 1
  powerpc64le-linux-gnu-gcc -std=gnu99 -O2 -Wall -Werror 
-DGIT_VERSION='"v5.7-rc2-38-g2d7b142b5a96"' 
-I/linux/tools/testing/selftests/powerpc/include  
-DGIT_VERSION='"v5.7-rc2-38-g2d7b142b5a96"' 
-I/linux/tools/testing/selftests/powerpc/stringloops 
-I/linux/tools/testing/selftests/powerpc/stringloops/../include -m64 -maltivec 
-o memcmp.o -c memcmp.c
  Assembler messages:
  Fatal error: can't create memcmp.o: Read-only file system
  make[2]: *** [Makefile:28: /output/kselftest/powerpc/stringloops/memcmp.o] 
Error 1
  powerpc64le-linux-gnu-gcc -std=gnu99 -O2 -Wall -Werror 
-DGIT_VERSION='"v5.7-rc2-38-g2d7b142b5a96"' 
-I/linux/tools/testing/selftests/powerpc/include  
-DGIT_VERSION='"v5.7-rc2-38-g2d7b142b5a96"' 
-I/linux/tools/testing/selftests/powerpc/stringloops 
-I/linux/tools/testing/selftests/powerpc/stringloops/../include -m64 -maltivec 
-DGIT_VERSION='"v5.7-rc2-38-g2d7b142b5a96"' -o ../harness.o -c ../harness.c
  Assembler messages:
  Fatal error: can't create ../harness.o: Read-only file system
  make[2]: *** [Makefile:31: 
/output/kselftest/powerpc/stringloops/../harness.o] Error 1


cheers


Re: [PATCH] powerpc/ps3: Move static keyword to the front of declaration

2020-04-29 Thread Geert Uytterhoeven
On Wed, Apr 29, 2020 at 12:07 PM Xiongfeng Wang
 wrote:
> Move the static keyword to the front of declaration of 'vuart_bus_priv',
> and resolve the following compiler warning that can be seen when
> building with warnings enabled (W=1):
>
> drivers/ps3/ps3-vuart.c:867:1: warning: ‘static’ is not at beginning of 
> declaration [-Wold-style-declaration]
>  } static vuart_bus_priv;
>  ^
>
> Reported-by: Hulk Robot 
> Signed-off-by: Xiongfeng Wang 

Reviewed-by: Geert Uytterhoeven 

Gr{oetje,eeting}s,

Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- ge...@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
-- Linus Torvalds


[PATCH 0/3] powerpc/xive: PCI hotplug fixes under PowerVM

2020-04-29 Thread Cédric Le Goater
Hello,

Here are a couple of fixes for PCI hotplug issues for machines running
under the POWER hypervisor using hash MMU and the XIVE interrupt mode.

Commit 1ca3dec2b2df ("powerpc/xive: Prevent page fault issues in the
machine crash handler") forced the mapping of the XIVE ESB page and
this is now blocking the removal of a passthrough IO adapter because
the PCI isolation fails with "valid outstanding translations". Under
KVM, the ESB pages for the adapter interrupts are un-mapped from the
guest by the hypervisor in the KVM XIVE native device. This is now
redundant but it's harmless.

Last is a fix to disable the XIVE debugfs file when XIVE is disabled.

Thanks,

C.

Cédric Le Goater (3):
  powerpc/xive: Clear the page tables for the ESB IO mapping
  powerpc/pci: unmap legacy INTx interrupts of passthrough IO adapters
  powerpc/xive: Do not expose a debugfs file when XIVE is disabled

 arch/powerpc/kernel/pci-hotplug.c |  2 ++
 arch/powerpc/sysdev/xive/common.c | 10 --
 2 files changed, 10 insertions(+), 2 deletions(-)

-- 
2.25.4



[PATCH] powerpc/ps3: Move static keyword to the front of declaration

2020-04-29 Thread Xiongfeng Wang
Move the static keyword to the front of declaration of 'vuart_bus_priv',
and resolve the following compiler warning that can be seen when
building with warnings enabled (W=1):

drivers/ps3/ps3-vuart.c:867:1: warning: ‘static’ is not at beginning of 
declaration [-Wold-style-declaration]
 } static vuart_bus_priv;
 ^
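
The warning is purely about placement: C permits a storage-class
specifier after the type, but treats it as an obsolescent form
(minimal illustration, hypothetical types):

	struct a { int x; } static bar;	/* legal, but -Wold-style-declaration fires */
	static struct b { int y; } baz;	/* conventional placement, no warning */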

Reported-by: Hulk Robot 
Signed-off-by: Xiongfeng Wang 
---
 drivers/ps3/ps3-vuart.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/ps3/ps3-vuart.c b/drivers/ps3/ps3-vuart.c
index ddaa5ea..8e80e09 100644
--- a/drivers/ps3/ps3-vuart.c
+++ b/drivers/ps3/ps3-vuart.c
@@ -858,13 +858,13 @@ static int ps3_vuart_handle_port_interrupt(struct 
ps3_system_bus_device *dev)
return 0;
 }
 
-struct vuart_bus_priv {
+static struct vuart_bus_priv {
struct ports_bmp *bmp;
unsigned int virq;
struct mutex probe_mutex;
int use_count;
struct ps3_system_bus_device *devices[PORT_COUNT];
-} static vuart_bus_priv;
+} vuart_bus_priv;
 
 /**
  * ps3_vuart_irq_handler - first stage interrupt handler
-- 
1.7.12.4



Re: [PATCH] fixup! signal: factor copy_siginfo_to_external32 from copy_siginfo_to_user32

2020-04-29 Thread Arnd Bergmann
On Wed, Apr 29, 2020 at 11:42 AM Christoph Hellwig  wrote:
>
> On Wed, Apr 29, 2020 at 10:07:11AM +0200, Arnd Bergmann wrote:
> > > What do you think of this version?  This one always overrides
> > > copy_siginfo_to_user32 for the x86 compat case to keep the churn down,
> > > and improves the copy_siginfo_to_external32 documentation a bit.
> >
> > Looks good to me. I preferred checking for X32 explicitly (so we can
> > find and kill off the #ifdef if we ever remove X32 for good), but there is
> > little difference in the end.
>
> Is there any realistic chance we'll get rid of x32?

When we discussed it last year, there were a couple of users that replied
saying they actively use it for a full system, and some others said they run
specific programs built as x32 as it results in much faster (10% to 20%)
execution of the same binaries compared to either i686 or x86_64.

I expect both of these to get less common over time as stuff bitrots
and more of the workloads that benefit most from the higher
performance (cross-compilers, hpc) run out of virtual address space.
Debian popcon numbers are too small to be reliable but they do show
a trend at https://popcon.debian.org/stat/sub-x32.png

I would just ask again every few years, and eventually we'll decide
it's not worth keeping any more. I do expect most 32-bit machines
to stop getting kernel updates before 2030 and we can probably
remove a bunch of architectures including x32 before then, though
at least armv7 users will have to get kernel updates for substantially
longer.

  Arnd


Re: [PATCH v7 2/5] powerpc/hv-24x7: Add rtas call in hv-24x7 driver to get processor details

2020-04-29 Thread Michael Ellerman
Hi Kajol,

Some comments inline ...

Kajol Jain  writes:
> For hv_24x7 socket/chip level events, the specific chip-id to which
> the requested data belongs should be added as part of the pmu event.
> But system details like the number of sockets and chips per socket are
> not exposed.
>
> Patch implements read_sys_info_pseries() to get system parameter
> values like the number of sockets and chips per socket. An rtas_call
> with the token "PROCESSOR_MODULE_INFO" is used to get these values.
>
> A subsequent patch exports these values via sysfs.
>
> The patch also makes these parameters default to 1.
>
> Signed-off-by: Kajol Jain 
> ---
>  arch/powerpc/perf/hv-24x7.c  | 72 
>  arch/powerpc/platforms/pseries/pseries.h |  3 +
>  2 files changed, 75 insertions(+)
>
> diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c
> index 48e8f4b17b91..9ae00f29bd21 100644
> --- a/arch/powerpc/perf/hv-24x7.c
> +++ b/arch/powerpc/perf/hv-24x7.c
> @@ -20,6 +20,11 @@
>  #include 
>  #include 
>  
> +#ifdef CONFIG_PPC_RTAS

This driver can only be build on pseries, and pseries always selects
RTAS. So the ifdef is unncessary.

> +#include 
> +#include <../../platforms/pseries/pseries.h>
> +#endif

That's not really what the platform header is intended for.

You should put the extern in arch/powerpc/include/asm somewhere.

Maybe rtas.h

> @@ -57,6 +62,69 @@ static bool is_physical_domain(unsigned domain)
>   }
>  }
>  
> +#ifdef CONFIG_PPC_RTAS

Not needed.

> +#define PROCESSOR_MODULE_INFO   43

Please document where these come from, presumably LoPAPR somewhere?

> +#define PROCESSOR_MAX_LENGTH (8 * 1024)
> +
> +static int strbe16toh(const char *buf, int offset)
> +{
> + return (buf[offset] << 8) + buf[offset + 1];
> +}

I'm confused by this. "str" implies string, a string is an array of
bytes and has no endian. But then be16 implies it's an array of __be16,
in which case buf should be a __be16 *.

> +
> +static u32   physsockets;/* Physical sockets */
> +static u32   physchips;  /* Physical chips */

No tabs there please.

> +
> +/*
> + * Function read_sys_info_pseries() make a rtas_call which require
> + * data buffer of size 8K. As standard 'rtas_data_buf' is of size
> + * 4K, we are adding new local buffer 'rtas_local_data_buf'.
> + */
> +char rtas_local_data_buf[PROCESSOR_MAX_LENGTH] __cacheline_aligned;

static?

> +/*
> + * read_sys_info_pseries()
> + * Retrieve the number of sockets and chips per socket details
> + * through the get-system-parameter rtas call.
> + */
> +void read_sys_info_pseries(void)
> +{
> + int call_status, len, ntypes;
> +
> + /*
> +  * Making system parameter: chips and sockets default to 1.
> +  */
> + physsockets = 1;
> + physchips = 1;
> + memset(rtas_local_data_buf, 0, PROCESSOR_MAX_LENGTH);
> + spin_lock(&rtas_data_buf_lock);

You're not using the rtas_data_buf, so why are you taking the
rtas_data_buf_lock?

> + call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1,
> + NULL,
> + PROCESSOR_MODULE_INFO,
> + __pa(rtas_local_data_buf),
> + PROCESSOR_MAX_LENGTH);
> +
> + spin_unlock(&rtas_data_buf_lock);
> +
> + if (call_status != 0) {
> + pr_info("%s %s Error calling get-system-parameter (0x%x)\n",
> + __FILE__, __func__, call_status);

pr_err(), don't use __FILE__, this file already uses pr_fmt(). Not sure
__func__ is really necessary either.

return;

Then you can deindent the next block.
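
ie. something like this (untested sketch):

	if (call_status != 0) {
		pr_err("Error calling get-system-parameter (0x%x)\n",
		       call_status);
		return;
	}

	rtas_local_data_buf[PROCESSOR_MAX_LENGTH - 1] = '\0';
	len = strbe16toh(rtas_local_data_buf, 0);
	if (len < 6)
		return;
	/* ... and so on, with one less level of indentation */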

> + } else {
> + rtas_local_data_buf[PROCESSOR_MAX_LENGTH - 1] = '\0';
> + len = strbe16toh(rtas_local_data_buf, 0);

Why isn't the buffer a __be16 array, and then you just use be16_to_cpu() ?
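
ie. something like this (untested sketch; with the buffer declared as a
__be16 array the byte offsets 0/2/4/6 become indices 0/1/2/3):

	static __be16 rtas_local_data_buf[PROCESSOR_MAX_LENGTH / 2] __cacheline_aligned;

	len = be16_to_cpu(rtas_local_data_buf[0]);
	ntypes = be16_to_cpu(rtas_local_data_buf[1]);
	physsockets = be16_to_cpu(rtas_local_data_buf[2]);
	physchips = be16_to_cpu(rtas_local_data_buf[3]);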

> + if (len < 6)
> + return;
> +
> + ntypes = strbe16toh(rtas_local_data_buf, 2);
> +
> + if (!ntypes)
> + return;

What is ntypes?

> + physsockets = strbe16toh(rtas_local_data_buf, 4);
> + physchips = strbe16toh(rtas_local_data_buf, 6);
> + }
> +}
> +#endif /* CONFIG_PPC_RTAS */
> +
>  /* Domains for which more than one result element are returned for each 
> event. */
>  static bool domain_needs_aggregation(unsigned int domain)
>  {
> @@ -1605,6 +1673,10 @@ static int hv_24x7_init(void)
>   if (r)
>   return r;
>  
> +#ifdef CONFIG_PPC_RTAS
> + read_sys_info_pseries();
> +#endif

> +
>   return 0;
>  }
>  
> diff --git a/arch/powerpc/platforms/pseries/pseries.h 
> b/arch/powerpc/platforms/pseries/pseries.h
> index 13fa370a87e4..1727559ce304 100644
> --- a/arch/powerpc/platforms/pseries/pseries.h
> +++ b/arch/powerpc/platforms/pseries/pseries.h
> @@ -19,6 +19,9 @@ extern void request_event_sources_irqs(struct device_node 
> *np,
>  struct pt_regs;
>  
>  extern int pSeries_system_reset_exception(struct pt_regs *r

Re: [PATCH v7 3/5] powerpc/hv-24x7: Add sysfs files inside hv-24x7 device to show processor details

2020-04-29 Thread Michael Ellerman
Kajol Jain  writes:
> To expose the system dependent parameter like total number of
> sockets and numbers of chips per socket, patch adds two sysfs files.
> "sockets" and "chips" are added to /sys/devices/hv_24x7/interface/
> of the "hv_24x7" pmu.
>
> Signed-off-by: Kajol Jain 
> ---
>  arch/powerpc/perf/hv-24x7.c | 22 ++
>  1 file changed, 22 insertions(+)

This should also add documentation under Documentation/ABI.

cheers

> diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c
> index 9ae00f29bd21..a31bd5b88f7a 100644
> --- a/arch/powerpc/perf/hv-24x7.c
> +++ b/arch/powerpc/perf/hv-24x7.c
> @@ -454,6 +454,20 @@ static ssize_t device_show_string(struct device *dev,
>   return sprintf(buf, "%s\n", (char *)d->var);
>  }
>  
> +#ifdef CONFIG_PPC_RTAS
> +static ssize_t sockets_show(struct device *dev,
> + struct device_attribute *attr, char *buf)
> +{
> + return sprintf(buf, "%d\n", physsockets);
> +}
> +
> +static ssize_t chips_show(struct device *dev, struct device_attribute *attr,
> +   char *buf)
> +{
> + return sprintf(buf, "%d\n", physchips);
> +}
> +#endif
> +
>  static struct attribute *device_str_attr_create_(char *name, char *str)
>  {
>   struct dev_ext_attribute *attr = kzalloc(sizeof(*attr), GFP_KERNEL);
> @@ -1100,6 +1114,10 @@ PAGE_0_ATTR(catalog_len, "%lld\n",
>   (unsigned long long)be32_to_cpu(page_0->length) * 4096);
>  static BIN_ATTR_RO(catalog, 0/* real length varies */);
>  static DEVICE_ATTR_RO(domains);
> +#ifdef CONFIG_PPC_RTAS
> +static DEVICE_ATTR_RO(sockets);
> +static DEVICE_ATTR_RO(chips);
> +#endif
>  
>  static struct bin_attribute *if_bin_attrs[] = {
>   &bin_attr_catalog,
> @@ -1110,6 +1128,10 @@ static struct attribute *if_attrs[] = {
>   &dev_attr_catalog_len.attr,
>   &dev_attr_catalog_version.attr,
>   &dev_attr_domains.attr,
> +#ifdef CONFIG_PPC_RTAS
> + &dev_attr_sockets.attr,
> + &dev_attr_chips.attr,
> +#endif
>   NULL,
>  };
>  
> -- 
> 2.18.1


Re: [PATCH v7 3/5] powerpc/hv-24x7: Add sysfs files inside hv-24x7 device to show processor details

2020-04-29 Thread Michael Ellerman
Michael Ellerman  writes:
> Kajol Jain  writes:
>> To expose the system dependent parameter like total number of
>> sockets and numbers of chips per socket, patch adds two sysfs files.
>> "sockets" and "chips" are added to /sys/devices/hv_24x7/interface/
>> of the "hv_24x7" pmu.
>>
>> Signed-off-by: Kajol Jain 
>> ---
>>  arch/powerpc/perf/hv-24x7.c | 22 ++
>>  1 file changed, 22 insertions(+)
>
> This should also add documentation under Documentation/ABI.

Ugh, sorry, you do that in the next patch :}

cheers


Re: [PATCH v7 5/5] powerpc/hv-24x7: Update post_mobility_fixup() to handle migration

2020-04-29 Thread Michael Ellerman
Kajol Jain  writes:
> Function 'read_sys_info_pseries()' is added to get system parameter
> values like number of sockets and chips per socket.
> and it gets these details via rtas_call with token
> "PROCESSOR_MODULE_INFO".
>
> Incase lpar migrate from one system to another, system
> parameter details like chips per sockets or number of sockets might
> change. So, it needs to be re-initialized otherwise, these values
> corresponds to previous system values.
> This patch adds a call to 'read_sys_info_pseries()' from
> 'post-mobility_fixup()' to re-init the physsockets and physchips values.
>
> Signed-off-by: Kajol Jain 
> ---
>  arch/powerpc/platforms/pseries/mobility.c | 12 
>  1 file changed, 12 insertions(+)
>
> diff --git a/arch/powerpc/platforms/pseries/mobility.c 
> b/arch/powerpc/platforms/pseries/mobility.c
> index b571285f6c14..226accd6218b 100644
> --- a/arch/powerpc/platforms/pseries/mobility.c
> +++ b/arch/powerpc/platforms/pseries/mobility.c
> @@ -371,6 +371,18 @@ void post_mobility_fixup(void)
>   /* Possibly switch to a new RFI flush type */
>   pseries_setup_rfi_flush();
>  
> + /*
> +  * Incase lpar migrate from one system to another, system

In case an LPAR migrates

> +  * parameter details like chips per sockets and number of sockets
> +  * might change. So, it needs to be re-initialized otherwise these
 ^   ^
 they need   the
> +  * values corresponds to previous system.
  ^
  will correspond to the

> +  * Here, adding a call to read_sys_info_pseries() declared in

Adding is the wrong tense in a comment. When someone reads the comment
the code has already been added. Past tense would be right, but really
the comment shouldn't say what you did, it should say why.

> +  * platforms/pseries/pseries.h to re-init the physsockets and
> +  * physchips value.

Call read_sys_info_pseries() to reinitialise the values.

> +  */
> + if (IS_ENABLED(CONFIG_HV_PERF_CTRS) && IS_ENABLED(CONFIG_PPC_RTAS))
> + read_sys_info_pseries();

The RTAS check is not needed. pseries always selects RTAS.

You shouldn't need the IS_ENABLED() check here though, do it with an
empty version in the header when CONFIG_HV_PERF_CTRS is not enabled.
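
ie. something like this in the header (sketch, assuming the declaration
ends up in asm/rtas.h as suggested for patch 2):

	#ifdef CONFIG_HV_PERF_CTRS
	void read_sys_info_pseries(void);
	#else
	static inline void read_sys_info_pseries(void) { }
	#endif

Then this call site doesn't need any ifdefs or IS_ENABLED() checks.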

cheers


Re: [PATCH] powerpc/spufs: Add rcu_read_lock() around fcheck()

2020-04-29 Thread Michael Ellerman
Christoph Hellwig  writes:
> On Tue, Apr 28, 2020 at 09:48:11PM +1000, Michael Ellerman wrote:
>> 
>> This comes from fcheck_files() via fcheck().
>> 
>> It's pretty clearly documented that fcheck() must be wrapped with
>> rcu_read_lock(), so fix it.
>
But for this to actually be useful you'd need the rcu read lock until
you are done with the file (or got a reference).

Hmm OK. My reasoning was that we were done with the struct file, because
we return the ctx that's hanging off the inode.

+   ctx = SPUFS_I(file_inode(file))->i_ctx;

But I guess the lifetime of the ctx is not guaranteed if the file goes
away.

It looks like the only long lived reference on the ctx is the one
taken in spufs_new_file() and dropped in spufs_evict_inode().

So if we take a reference to the ctx with the RCU lock held we should be
safe, I think. But I've definitely exhausted my spufs/vfs knowledge at
this point.

Something like below.

cheers


diff --git a/arch/powerpc/platforms/cell/spufs/coredump.c 
b/arch/powerpc/platforms/cell/spufs/coredump.c
index 8b3296b62f65..37c155254cd5 100644
--- a/arch/powerpc/platforms/cell/spufs/coredump.c
+++ b/arch/powerpc/platforms/cell/spufs/coredump.c
@@ -82,13 +82,20 @@ static int match_context(const void *v, struct file *file, 
unsigned fd)
  */
 static struct spu_context *coredump_next_context(int *fd)
 {
+   struct spu_context *ctx;
struct file *file;
int n = iterate_fd(current->files, *fd, match_context, NULL);
if (!n)
return NULL;
*fd = n - 1;
+
+   rcu_read_lock();
file = fcheck(*fd);
-   return SPUFS_I(file_inode(file))->i_ctx;
+   ctx = SPUFS_I(file_inode(file))->i_ctx;
+   get_spu_context(ctx);
+   rcu_read_unlock();
+
+   return ctx;
 }

 int spufs_coredump_extra_notes_size(void)
@@ -99,17 +106,23 @@ int spufs_coredump_extra_notes_size(void)
fd = 0;
while ((ctx = coredump_next_context(&fd)) != NULL) {
rc = spu_acquire_saved(ctx);
-   if (rc)
+   if (rc) {
+   put_spu_context(ctx);
break;
+   }
+
rc = spufs_ctx_note_size(ctx, fd);
spu_release_saved(ctx);
-   if (rc < 0)
+   if (rc < 0) {
+   put_spu_context(ctx);
break;
+   }

size += rc;

/* start searching the next fd next time */
fd++;
+   put_spu_context(ctx);
}

return size;



Re: [PATCH] fixup! signal: factor copy_siginfo_to_external32 from copy_siginfo_to_user32

2020-04-29 Thread Christoph Hellwig
I did another pass at this, reducing the overhead of the x32 magic
in common code down to renaming copy_siginfo_to_user32 to
__copy_siginfo_to_user32 and having a conditional #define to give it
the old name back:
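
For reference, the fallback ends up looking roughly like this (a sketch
of the include/linux/compat.h side):

	/* Generic version keeps the old name unless an architecture
	 * (x86 for the X32 case here) has defined its own override. */
	int __copy_siginfo_to_user32(struct compat_siginfo __user *to,
			const kernel_siginfo_t *from);
	#ifndef copy_siginfo_to_user32
	#define copy_siginfo_to_user32 __copy_siginfo_to_user32
	#endif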

---
>From 45e5263d7c24d854bb446b7e69dc53729ed842bc Mon Sep 17 00:00:00 2001
From: Christoph Hellwig 
Date: Wed, 29 Apr 2020 11:57:10 +0200
Subject: signal: refactor copy_siginfo_to_user32

Factor out a copy_siginfo_to_external32 helper from
copy_siginfo_to_user32 that fills out the compat_siginfo, but does so
on a kernel space data structure.  With that we can let architectures
override copy_siginfo_to_user32 with their own implementations using
copy_siginfo_to_external32.  That allows moving the x32 SIGCHLD purely
to x86 architecture code.

As a nice side effect copy_siginfo_to_external32 also comes in handy
for avoiding a set_fs() call in the coredump code later on.

Contains improvements from Eric W. Biederman 
and Arnd Bergmann .

Signed-off-by: Christoph Hellwig 
---
 arch/x86/ia32/ia32_signal.c   |   2 +-
 arch/x86/include/asm/compat.h |   8 ++-
 arch/x86/kernel/signal.c  |  28 -
 include/linux/compat.h|  11 +++-
 kernel/signal.c   | 106 +-
 5 files changed, 96 insertions(+), 59 deletions(-)

diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index f9d8804144d09..81cf22398cd16 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -350,7 +350,7 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig,
unsafe_put_user(*(__u64 *)set, (__u64 *)&frame->uc.uc_sigmask, Efault);
user_access_end();
 
-   if (__copy_siginfo_to_user32(&frame->info, &ksig->info, false))
+   if (__copy_siginfo_to_user32(&frame->info, &ksig->info))
return -EFAULT;
 
/* Set up registers for signal handler */
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index 52e9f3480f690..d4edf281fff49 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -214,7 +214,11 @@ static inline bool in_compat_syscall(void)
 #endif
 
 struct compat_siginfo;
-int __copy_siginfo_to_user32(struct compat_siginfo __user *to,
-   const kernel_siginfo_t *from, bool x32_ABI);
+
+#ifdef CONFIG_X86_X32_ABI
+int copy_siginfo_to_user32(struct compat_siginfo __user *to,
+   const kernel_siginfo_t *from);
+#define copy_siginfo_to_user32 copy_siginfo_to_user32
+#endif /* CONFIG_X86_X32_ABI */
 
 #endif /* _ASM_X86_COMPAT_H */
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 83b74fb38c8fc..f3df262e370b3 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -37,6 +37,7 @@
 #include 
 
 #ifdef CONFIG_X86_64
+#include 
 #include 
 #include 
 #endif /* CONFIG_X86_64 */
@@ -511,6 +512,31 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig,
 }
 #endif /* CONFIG_X86_32 */
 
+#ifdef CONFIG_X86_X32_ABI
+static int x32_copy_siginfo_to_user(struct compat_siginfo __user *to,
+   const struct kernel_siginfo *from)
+{
+   struct compat_siginfo new;
+
+   copy_siginfo_to_external32(&new, from);
+   if (from->si_signo == SIGCHLD) {
+   new._sifields._sigchld_x32._utime = from->si_utime;
+   new._sifields._sigchld_x32._stime = from->si_stime;
+   }
+   if (copy_to_user(to, &new, sizeof(struct compat_siginfo)))
+   return -EFAULT;
+   return 0;
+}
+
+int copy_siginfo_to_user32(struct compat_siginfo __user *to,
+  const struct kernel_siginfo *from)
+{
+   if (in_x32_syscall())
+   return x32_copy_siginfo_to_user(to, from);
+   return __copy_siginfo_to_user32(to, from);
+}
+#endif /* CONFIG_X86_X32_ABI */
+
 static int x32_setup_rt_frame(struct ksignal *ksig,
  compat_sigset_t *set,
  struct pt_regs *regs)
@@ -543,7 +569,7 @@ static int x32_setup_rt_frame(struct ksignal *ksig,
user_access_end();
 
if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
-   if (__copy_siginfo_to_user32(&frame->info, &ksig->info, true))
+   if (x32_copy_siginfo_to_user(&frame->info, &ksig->info))
return -EFAULT;
}
 
diff --git a/include/linux/compat.h b/include/linux/compat.h
index 0480ba4db5929..e432df9be2e4b 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -402,8 +402,15 @@ long compat_get_bitmap(unsigned long *mask, const 
compat_ulong_t __user *umask,
   unsigned long bitmap_size);
 long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask,
   unsigned long bitmap_size);
-int copy_siginfo_from_user32(kernel_siginfo_t *to, const struct compat_siginfo 
__user *from);
-int copy_siginfo_to_user32(struct compat_siginfo __user *to, const 
kernel_siginfo_t *from);
+void __copy_siginfo_to_external32(struct compat_siginfo *to,
+

Re: [PATCH v4 1/7] KVM: s390: clean up redundant 'kvm_run' parameters

2020-04-29 Thread Vitaly Kuznetsov
Tianjia Zhang  writes:

> In the current kvm version, 'kvm_run' has been included in the 'kvm_vcpu'
> structure. For historical reasons, many kvm-related function parameters
> retain the 'kvm_run' and 'kvm_vcpu' parameters at the same time. This
> patch does a unified cleanup of these remaining redundant parameters.
>
> Signed-off-by: Tianjia Zhang 
> ---
>  arch/s390/kvm/kvm-s390.c | 23 +++
>  1 file changed, 15 insertions(+), 8 deletions(-)
>
> diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
> index e335a7e5ead7..c0d94eaa00d7 100644
> --- a/arch/s390/kvm/kvm-s390.c
> +++ b/arch/s390/kvm/kvm-s390.c
> @@ -4176,8 +4176,9 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
>   return rc;
>  }
>  
> -static void sync_regs_fmt2(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
> +static void sync_regs_fmt2(struct kvm_vcpu *vcpu)
>  {
> + struct kvm_run *kvm_run = vcpu->run;
>   struct runtime_instr_cb *riccb;
>   struct gs_cb *gscb;
>  
> @@ -4243,8 +4244,10 @@ static void sync_regs_fmt2(struct kvm_vcpu *vcpu, 
> struct kvm_run *kvm_run)
>   /* SIE will load etoken directly from SDNX and therefore kvm_run */
>  }
>  
> -static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
> +static void sync_regs(struct kvm_vcpu *vcpu)
>  {
> + struct kvm_run *kvm_run = vcpu->run;
> +
>   if (kvm_run->kvm_dirty_regs & KVM_SYNC_PREFIX)
>   kvm_s390_set_prefix(vcpu, kvm_run->s.regs.prefix);
>   if (kvm_run->kvm_dirty_regs & KVM_SYNC_CRS) {
> @@ -4273,7 +4276,7 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct 
> kvm_run *kvm_run)
>  
>   /* Sync fmt2 only data */
>   if (likely(!kvm_s390_pv_cpu_is_protected(vcpu))) {
> - sync_regs_fmt2(vcpu, kvm_run);
> + sync_regs_fmt2(vcpu);
>   } else {
>   /*
>* In several places we have to modify our internal view to
> @@ -4292,8 +4295,10 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct 
> kvm_run *kvm_run)
>   kvm_run->kvm_dirty_regs = 0;
>  }
>  
> -static void store_regs_fmt2(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
> +static void store_regs_fmt2(struct kvm_vcpu *vcpu)
>  {
> + struct kvm_run *kvm_run = vcpu->run;
> +
>   kvm_run->s.regs.todpr = vcpu->arch.sie_block->todpr;
>   kvm_run->s.regs.pp = vcpu->arch.sie_block->pp;
>   kvm_run->s.regs.gbea = vcpu->arch.sie_block->gbea;
> @@ -4313,8 +4318,10 @@ static void store_regs_fmt2(struct kvm_vcpu *vcpu, 
> struct kvm_run *kvm_run)
>   /* SIE will save etoken directly into SDNX and therefore kvm_run */
>  }
>  
> -static void store_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
> +static void store_regs(struct kvm_vcpu *vcpu)
>  {
> + struct kvm_run *kvm_run = vcpu->run;
> +
>   kvm_run->psw_mask = vcpu->arch.sie_block->gpsw.mask;
>   kvm_run->psw_addr = vcpu->arch.sie_block->gpsw.addr;
>   kvm_run->s.regs.prefix = kvm_s390_get_prefix(vcpu);
> @@ -4333,7 +4340,7 @@ static void store_regs(struct kvm_vcpu *vcpu, struct 
> kvm_run *kvm_run)
>   current->thread.fpu.fpc = vcpu->arch.host_fpregs.fpc;
>   current->thread.fpu.regs = vcpu->arch.host_fpregs.regs;
>   if (likely(!kvm_s390_pv_cpu_is_protected(vcpu)))
> - store_regs_fmt2(vcpu, kvm_run);
> + store_regs_fmt2(vcpu);
>  }
>  
>  int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
> @@ -4371,7 +4378,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
>   goto out;
>   }
>  
> - sync_regs(vcpu, kvm_run);
> + sync_regs(vcpu);
>   enable_cpu_timer_accounting(vcpu);
>  
>   might_fault();
> @@ -4393,7 +4400,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
>   }
>  
>   disable_cpu_timer_accounting(vcpu);
> - store_regs(vcpu, kvm_run);
> + store_regs(vcpu);
>  
>   kvm_sigset_deactivate(vcpu);

Haven't tried to compile this but the change itself looks obviously
correct, so

Reviewed-by: Vitaly Kuznetsov 

-- 
Vitaly



Re: [PATCH v4 2/7] KVM: arm64: clean up redundant 'kvm_run' parameters

2020-04-29 Thread Vitaly Kuznetsov
Tianjia Zhang  writes:

> In the current kvm version, 'kvm_run' has been included in the 'kvm_vcpu'
> structure. For historical reasons, many kvm-related function parameters
> retain the 'kvm_run' and 'kvm_vcpu' parameters at the same time. This
> patch does a unified cleanup of these remaining redundant parameters.
>
> Signed-off-by: Tianjia Zhang 
> ---
>  arch/arm64/include/asm/kvm_coproc.h | 12 +-
>  arch/arm64/include/asm/kvm_host.h   | 11 -
>  arch/arm64/include/asm/kvm_mmu.h|  2 +-
>  arch/arm64/kvm/handle_exit.c| 36 ++---
>  arch/arm64/kvm/sys_regs.c   | 13 +--
>  virt/kvm/arm/arm.c  |  6 ++---
>  virt/kvm/arm/mmio.c | 11 +
>  virt/kvm/arm/mmu.c  |  5 ++--
>  8 files changed, 46 insertions(+), 50 deletions(-)
>
> diff --git a/arch/arm64/include/asm/kvm_coproc.h 
> b/arch/arm64/include/asm/kvm_coproc.h
> index 0185ee8b8b5e..454373704b8a 100644
> --- a/arch/arm64/include/asm/kvm_coproc.h
> +++ b/arch/arm64/include/asm/kvm_coproc.h
> @@ -27,12 +27,12 @@ struct kvm_sys_reg_target_table {
>  void kvm_register_target_sys_reg_table(unsigned int target,
>  struct kvm_sys_reg_target_table *table);
>  
> -int kvm_handle_cp14_load_store(struct kvm_vcpu *vcpu, struct kvm_run *run);
> -int kvm_handle_cp14_32(struct kvm_vcpu *vcpu, struct kvm_run *run);
> -int kvm_handle_cp14_64(struct kvm_vcpu *vcpu, struct kvm_run *run);
> -int kvm_handle_cp15_32(struct kvm_vcpu *vcpu, struct kvm_run *run);
> -int kvm_handle_cp15_64(struct kvm_vcpu *vcpu, struct kvm_run *run);
> -int kvm_handle_sys_reg(struct kvm_vcpu *vcpu, struct kvm_run *run);
> +int kvm_handle_cp14_load_store(struct kvm_vcpu *vcpu);
> +int kvm_handle_cp14_32(struct kvm_vcpu *vcpu);
> +int kvm_handle_cp14_64(struct kvm_vcpu *vcpu);
> +int kvm_handle_cp15_32(struct kvm_vcpu *vcpu);
> +int kvm_handle_cp15_64(struct kvm_vcpu *vcpu);
> +int kvm_handle_sys_reg(struct kvm_vcpu *vcpu);
>  
>  #define kvm_coproc_table_init kvm_sys_reg_table_init
>  void kvm_sys_reg_table_init(void);
> diff --git a/arch/arm64/include/asm/kvm_host.h 
> b/arch/arm64/include/asm/kvm_host.h
> index 32c8a675e5a4..3fab32e4948c 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -481,18 +481,15 @@ u64 __kvm_call_hyp(void *hypfn, ...);
>  void force_vm_exit(const cpumask_t *mask);
>  void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot);
>  
> -int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
> - int exception_index);
> -void handle_exit_early(struct kvm_vcpu *vcpu, struct kvm_run *run,
> -int exception_index);
> +int handle_exit(struct kvm_vcpu *vcpu, int exception_index);
> +void handle_exit_early(struct kvm_vcpu *vcpu, int exception_index);
>  
>  /* MMIO helpers */
>  void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data);
>  unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len);
>  
> -int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run);
> -int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run,
> -  phys_addr_t fault_ipa);
> +int kvm_handle_mmio_return(struct kvm_vcpu *vcpu);
> +int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa);
>  
>  int kvm_perf_init(void);
>  int kvm_perf_teardown(void);
> diff --git a/arch/arm64/include/asm/kvm_mmu.h 
> b/arch/arm64/include/asm/kvm_mmu.h
> index 30b0e8d6b895..2ec7b9bb25d3 100644
> --- a/arch/arm64/include/asm/kvm_mmu.h
> +++ b/arch/arm64/include/asm/kvm_mmu.h
> @@ -159,7 +159,7 @@ void kvm_free_stage2_pgd(struct kvm *kvm);
>  int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
> phys_addr_t pa, unsigned long size, bool writable);
>  
> -int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run);
> +int kvm_handle_guest_abort(struct kvm_vcpu *vcpu);
>  
>  void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu);
>  
> diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c
> index aacfc55de44c..ec3a66642ea5 100644
> --- a/arch/arm64/kvm/handle_exit.c
> +++ b/arch/arm64/kvm/handle_exit.c
> @@ -25,7 +25,7 @@
>  #define CREATE_TRACE_POINTS
>  #include "trace.h"
>  
> -typedef int (*exit_handle_fn)(struct kvm_vcpu *, struct kvm_run *);
> +typedef int (*exit_handle_fn)(struct kvm_vcpu *);
>  
>  static void kvm_handle_guest_serror(struct kvm_vcpu *vcpu, u32 esr)
>  {
> @@ -33,7 +33,7 @@ static void kvm_handle_guest_serror(struct kvm_vcpu *vcpu, 
> u32 esr)
>   kvm_inject_vabt(vcpu);
>  }
>  
> -static int handle_hvc(struct kvm_vcpu *vcpu, struct kvm_run *run)
> +static int handle_hvc(struct kvm_vcpu *vcpu)
>  {
>   int ret;
>  
> @@ -50,7 +50,7 @@ static int handle_hvc(struct kvm_vcpu *vcpu, struct kvm_run 
> *run)
>   return ret;
>  }
>  
> -static int handle_smc(struct kvm_vcpu *vcpu, struct kvm_run *run)
> +static int handle_smc(struct kvm_v

[PATCH v2 00/20] mm: rework free_area_init*() functions

2020-04-29 Thread Mike Rapoport
From: Mike Rapoport 

Hi,

After the discussion [1] about removal of CONFIG_NODES_SPAN_OTHER_NODES and
CONFIG_HAVE_MEMBLOCK_NODE_MAP options, I took it a bit further and updated
the node/zone initialization. 

Since all architectures have memblock, it is possible to use only the newer
version of free_area_init_node() that calculates the zone and node
boundaries based on memblock node mapping and architectural limits on
possible zone PFNs. 

The architectures that still determine zone and hole sizes themselves
can be switched to the generic code, and the old code that took those
zone and hole sizes can simply be removed.
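
To illustrate, an architecture's paging_init() then reduces to roughly
this (sketch):

	unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 };

	max_zone_pfn[ZONE_DMA] = dma_pfn;	/* arch specific limit */
	max_zone_pfn[ZONE_NORMAL] = max_low_pfn;
	free_area_init(max_zone_pfn);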

And, since it all started from the removal of
CONFIG_NODES_SPAN_OTHER_NODES, memmap_init() is now updated to iterate
over memblock regions, so it no longer needs to perform an
early_pfn_to_nid() query for every PFN.
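
Roughly, the idea is (sketch):

	/*
	 * Walk the memblock ranges belonging to the node being
	 * initialized, each of which maps to a single nid, instead of
	 * resolving the node with early_pfn_to_nid() for every PFN.
	 */
	for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL)
		memmap_init_zone(end_pfn - start_pfn, nid, zone, start_pfn,
				 MEMMAP_EARLY, NULL);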

v2 changes:
* move deletion of one of '#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP' from
  patch 2 to patch 3 where it should have been from the beginning
* drop patch that introduced a free_area_init_memoryless_node() wrapper
  for free_area_init_node()
* remove unused next_pfn(), thanks Qian
* drop stale comment in memmap_init_zone(), as per David

--
Sincerely yours,
Mike.

[1] 
https://lore.kernel.org/lkml/1585420282-25630-1-git-send-email-h...@os.amperecomputing.com

Baoquan He (1):
  mm: memmap_init: iterate over memblock regions rather than check each PFN

Mike Rapoport (19):
  mm: memblock: replace dereferences of memblock_region.nid with API calls
  mm: make early_pfn_to_nid() and related definitions close to each other
  mm: remove CONFIG_HAVE_MEMBLOCK_NODE_MAP option
  mm: free_area_init: use maximal zone PFNs rather than zone sizes
  mm: use free_area_init() instead of free_area_init_nodes()
  alpha: simplify detection of memory zone boundaries
  arm: simplify detection of memory zone boundaries
  arm64: simplify detection of memory zone boundaries for UMA configs
  csky: simplify detection of memory zone boundaries
  m68k: mm: simplify detection of memory zone boundaries
  parisc: simplify detection of memory zone boundaries
  sparc32: simplify detection of memory zone boundaries
  unicore32: simplify detection of memory zone boundaries
  xtensa: simplify detection of memory zone boundaries
  mm: remove early_pfn_in_nid() and CONFIG_NODES_SPAN_OTHER_NODES
  mm: free_area_init: allow defining max_zone_pfn in descending order
  mm: clean up free_area_init_node() and its helpers
  mm: simplify find_min_pfn_with_active_regions()
  docs/vm: update memory-models documentation

 .../vm/numa-memblock/arch-support.txt |  34 ---
 Documentation/vm/memory-model.rst |   9 +-
 arch/alpha/mm/init.c  |  16 +-
 arch/alpha/mm/numa.c  |  22 +-
 arch/arc/mm/init.c|  36 +--
 arch/arm/mm/init.c|  66 +
 arch/arm64/Kconfig|   1 -
 arch/arm64/mm/init.c  |  56 +---
 arch/arm64/mm/numa.c  |   9 +-
 arch/c6x/mm/init.c|   8 +-
 arch/csky/kernel/setup.c  |  26 +-
 arch/h8300/mm/init.c  |   6 +-
 arch/hexagon/mm/init.c|   6 +-
 arch/ia64/Kconfig |   1 -
 arch/ia64/mm/contig.c |   2 +-
 arch/ia64/mm/discontig.c  |   2 +-
 arch/m68k/mm/init.c   |   6 +-
 arch/m68k/mm/mcfmmu.c |   9 +-
 arch/m68k/mm/motorola.c   |  15 +-
 arch/m68k/mm/sun3mmu.c|  10 +-
 arch/microblaze/Kconfig   |   1 -
 arch/microblaze/mm/init.c |   2 +-
 arch/mips/Kconfig |   1 -
 arch/mips/loongson64/numa.c   |   2 +-
 arch/mips/mm/init.c   |   2 +-
 arch/mips/sgi-ip27/ip27-memory.c  |   2 +-
 arch/nds32/mm/init.c  |  11 +-
 arch/nios2/mm/init.c  |   8 +-
 arch/openrisc/mm/init.c   |   9 +-
 arch/parisc/mm/init.c |  22 +-
 arch/powerpc/Kconfig  |  10 -
 arch/powerpc/mm/mem.c |   2 +-
 arch/riscv/Kconfig|   1 -
 arch/riscv/mm/init.c  |   2 +-
 arch/s390/Kconfig |   1 -
 arch/s390/mm/init.c   |   2 +-
 arch/sh/Kconfig   |   1 -
 arch/sh/mm/init.c |   2 +-
 arch/sparc/Kconfig|  10 -
 arch/sparc/mm/init_64.c   |   2 +-
 arch/sparc/mm/srmmu.c |  21 +-
 arch/um/kernel/mem.c  |  12 +-
 arch/unicore32/include/asm/memory.h   |   2 +-
 arch/unicore32/include/mach/memory.h  |   6 +-
 a

[PATCH v2 01/20] mm: memblock: replace dereferences of memblock_region.nid with API calls

2020-04-29 Thread Mike Rapoport
From: Mike Rapoport 

There are several places in the code that directly dereference
memblock_region.nid despite this field being defined only when
CONFIG_HAVE_MEMBLOCK_NODE_MAP=y.

Replace these with calls to memblock_get_region_node() to improve code
robustness and to avoid possible breakage when
CONFIG_HAVE_MEMBLOCK_NODE_MAP is removed.

Signed-off-by: Mike Rapoport 
---
 arch/arm64/mm/numa.c | 9 ++---
 arch/x86/mm/numa.c   | 6 --
 mm/memblock.c| 8 +---
 mm/page_alloc.c  | 4 ++--
 4 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c
index 4decf1659700..aafcee3e3f7e 100644
--- a/arch/arm64/mm/numa.c
+++ b/arch/arm64/mm/numa.c
@@ -350,13 +350,16 @@ static int __init numa_register_nodes(void)
struct memblock_region *mblk;
 
/* Check that valid nid is set to memblks */
-   for_each_memblock(memory, mblk)
-   if (mblk->nid == NUMA_NO_NODE || mblk->nid >= MAX_NUMNODES) {
+   for_each_memblock(memory, mblk) {
+   int mblk_nid = memblock_get_region_node(mblk);
+
+   if (mblk_nid == NUMA_NO_NODE || mblk_nid >= MAX_NUMNODES) {
pr_warn("Warning: invalid memblk node %d [mem 
%#010Lx-%#010Lx]\n",
-   mblk->nid, mblk->base,
+   mblk_nid, mblk->base,
mblk->base + mblk->size - 1);
return -EINVAL;
}
+   }
 
/* Finally register nodes. */
for_each_node_mask(nid, numa_nodes_parsed) {
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 59ba008504dc..fe024b2ac796 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -517,8 +517,10 @@ static void __init numa_clear_kernel_node_hotplug(void)
 *   reserve specific pages for Sandy Bridge graphics. ]
 */
for_each_memblock(reserved, mb_region) {
-   if (mb_region->nid != MAX_NUMNODES)
-   node_set(mb_region->nid, reserved_nodemask);
+   int nid = memblock_get_region_node(mb_region);
+
+   if (nid != MAX_NUMNODES)
+   node_set(nid, reserved_nodemask);
}
 
/*
diff --git a/mm/memblock.c b/mm/memblock.c
index c79ba6f9920c..43e2fd3006c1 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1207,13 +1207,15 @@ void __init_memblock __next_mem_pfn_range(int *idx, int 
nid,
 {
struct memblock_type *type = &memblock.memory;
struct memblock_region *r;
+   int r_nid;
 
while (++*idx < type->cnt) {
r = &type->regions[*idx];
+   r_nid = memblock_get_region_node(r);
 
if (PFN_UP(r->base) >= PFN_DOWN(r->base + r->size))
continue;
-   if (nid == MAX_NUMNODES || nid == r->nid)
+   if (nid == MAX_NUMNODES || nid == r_nid)
break;
}
if (*idx >= type->cnt) {
@@ -1226,7 +1228,7 @@ void __init_memblock __next_mem_pfn_range(int *idx, int 
nid,
if (out_end_pfn)
*out_end_pfn = PFN_DOWN(r->base + r->size);
if (out_nid)
-   *out_nid = r->nid;
+   *out_nid = r_nid;
 }
 
 /**
@@ -1810,7 +1812,7 @@ int __init_memblock memblock_search_pfn_nid(unsigned long 
pfn,
*start_pfn = PFN_DOWN(type->regions[mid].base);
*end_pfn = PFN_DOWN(type->regions[mid].base + type->regions[mid].size);
 
-   return type->regions[mid].nid;
+   return memblock_get_region_node(&type->regions[mid]);
 }
 #endif
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 69827d4fa052..0d012eda1694 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -7208,7 +7208,7 @@ static void __init find_zone_movable_pfns_for_nodes(void)
if (!memblock_is_hotpluggable(r))
continue;
 
-   nid = r->nid;
+   nid = memblock_get_region_node(r);
 
usable_startpfn = PFN_DOWN(r->base);
zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
@@ -7229,7 +7229,7 @@ static void __init find_zone_movable_pfns_for_nodes(void)
if (memblock_is_mirror(r))
continue;
 
-   nid = r->nid;
+   nid = memblock_get_region_node(r);
 
usable_startpfn = memblock_region_memory_base_pfn(r);
 
-- 
2.26.1



[PATCH v2 02/20] mm: make early_pfn_to_nid() and related definitions close to each other

2020-04-29 Thread Mike Rapoport
From: Mike Rapoport 

The early_pfn_to_nid() and its helper __early_pfn_to_nid() are spread
around include/linux/mm.h, include/linux/mmzone.h and mm/page_alloc.c.

Drop unused stub for __early_pfn_to_nid() and move its actual generic
implementation close to its users.

Signed-off-by: Mike Rapoport 
---
 include/linux/mm.h |  4 ++--
 include/linux/mmzone.h |  9 
 mm/page_alloc.c| 49 +-
 3 files changed, 27 insertions(+), 35 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5a323422d783..a404026d14d4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2388,9 +2388,9 @@ extern void sparse_memory_present_with_active_regions(int 
nid);
 
 #if !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) && \
 !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID)
-static inline int __early_pfn_to_nid(unsigned long pfn,
-   struct mminit_pfnnid_cache *state)
+static inline int early_pfn_to_nid(unsigned long pfn)
 {
+   BUILD_BUG_ON(IS_ENABLED(CONFIG_NUMA));
return 0;
 }
 #else
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 1b9de7d220fb..7b5b6eba402f 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1078,15 +1078,6 @@ static inline struct zoneref 
*first_zones_zonelist(struct zonelist *zonelist,
 #include 
 #endif
 
-#if !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) && \
-   !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP)
-static inline unsigned long early_pfn_to_nid(unsigned long pfn)
-{
-   BUILD_BUG_ON(IS_ENABLED(CONFIG_NUMA));
-   return 0;
-}
-#endif
-
 #ifdef CONFIG_FLATMEM
 #define pfn_to_nid(pfn)(0)
 #endif
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0d012eda1694..a802ee47e715 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1504,6 +1504,31 @@ void __free_pages_core(struct page *page, unsigned int 
order)
 
 static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
 
+#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
+
+/*
+ * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
+ */
+int __meminit __early_pfn_to_nid(unsigned long pfn,
+   struct mminit_pfnnid_cache *state)
+{
+   unsigned long start_pfn, end_pfn;
+   int nid;
+
+   if (state->last_start <= pfn && pfn < state->last_end)
+   return state->last_nid;
+
+   nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
+   if (nid != NUMA_NO_NODE) {
+   state->last_start = start_pfn;
+   state->last_end = end_pfn;
+   state->last_nid = nid;
+   }
+
+   return nid;
+}
+#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
+
 int __meminit early_pfn_to_nid(unsigned long pfn)
 {
static DEFINE_SPINLOCK(early_pfn_lock);
@@ -6299,30 +6324,6 @@ void __meminit init_currently_empty_zone(struct zone 
*zone,
 }
 
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
-#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
-
-/*
- * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
- */
-int __meminit __early_pfn_to_nid(unsigned long pfn,
-   struct mminit_pfnnid_cache *state)
-{
-   unsigned long start_pfn, end_pfn;
-   int nid;
-
-   if (state->last_start <= pfn && pfn < state->last_end)
-   return state->last_nid;
-
-   nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
-   if (nid != NUMA_NO_NODE) {
-   state->last_start = start_pfn;
-   state->last_end = end_pfn;
-   state->last_nid = nid;
-   }
-
-   return nid;
-}
-#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
 
 /**
  * free_bootmem_with_active_regions - Call memblock_free_early_nid for each 
active range
-- 
2.26.1



[PATCH v2 03/20] mm: remove CONFIG_HAVE_MEMBLOCK_NODE_MAP option

2020-04-29 Thread Mike Rapoport
From: Mike Rapoport 

The CONFIG_HAVE_MEMBLOCK_NODE_MAP option is used to differentiate the
initialization of the node and zone structures between systems that
have a region-to-node mapping in memblock and those that don't.

Currently all the NUMA architectures enable this option, and for
non-NUMA systems we can presume that all the memory belongs to node 0;
therefore the compile-time configuration option is not required.

The remaining few architectures that use DISCONTIGMEM without NUMA are
easily updated to use memblock_add_node() instead of memblock_add() and
thus have proper correspondence of memblock regions to NUMA nodes.

Still, free_area_init_node() must have a backward compatible version
because its semantics with and without CONFIG_HAVE_MEMBLOCK_NODE_MAP
are different. Once all the architectures use the new semantics, the
entire compatibility layer can be dropped.

To avoid adding extra runtime memory to store the node id for
architectures that keep memblock but have only a single node, the node
id field of the memblock_region is guarded by CONFIG_NEED_MULTIPLE_NODES
and the corresponding accessors presume that in those cases it is
always 0.
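
Sketched, the guarded field and its accessor look like this:

	struct memblock_region {
		phys_addr_t base;
		phys_addr_t size;
		enum memblock_flags flags;
	#ifdef CONFIG_NEED_MULTIPLE_NODES
		int nid;
	#endif
	};

	static inline int memblock_get_region_node(const struct memblock_region *r)
	{
	#ifdef CONFIG_NEED_MULTIPLE_NODES
		return r->nid;
	#else
		return 0;
	#endif
	}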

Signed-off-by: Mike Rapoport 
---
 .../vm/numa-memblock/arch-support.txt |  34 --
 arch/alpha/mm/numa.c  |   4 +-
 arch/arm64/Kconfig|   1 -
 arch/ia64/Kconfig |   1 -
 arch/m68k/mm/motorola.c   |   4 +-
 arch/microblaze/Kconfig   |   1 -
 arch/mips/Kconfig |   1 -
 arch/powerpc/Kconfig  |   1 -
 arch/riscv/Kconfig|   1 -
 arch/s390/Kconfig |   1 -
 arch/sh/Kconfig   |   1 -
 arch/sparc/Kconfig|   1 -
 arch/x86/Kconfig  |   1 -
 include/linux/memblock.h  |   8 +-
 include/linux/mm.h|  12 +-
 include/linux/mmzone.h|   2 +-
 mm/Kconfig|   3 -
 mm/memblock.c |  11 +-
 mm/memory_hotplug.c   |   4 -
 mm/page_alloc.c   | 103 ++
 20 files changed, 74 insertions(+), 121 deletions(-)
 delete mode 100644 Documentation/features/vm/numa-memblock/arch-support.txt

diff --git a/Documentation/features/vm/numa-memblock/arch-support.txt 
b/Documentation/features/vm/numa-memblock/arch-support.txt
deleted file mode 100644
index 3004beb0fd71..
--- a/Documentation/features/vm/numa-memblock/arch-support.txt
+++ /dev/null
@@ -1,34 +0,0 @@
-#
-# Feature name:  numa-memblock
-# Kconfig:   HAVE_MEMBLOCK_NODE_MAP
-# description:   arch supports NUMA aware memblocks
-#
----
-| arch |status|
----
-|   alpha: | TODO |
-| arc: |  ..  |
-| arm: |  ..  |
-|   arm64: |  ok  |
-| c6x: |  ..  |
-|csky: |  ..  |
-|   h8300: |  ..  |
-| hexagon: |  ..  |
-|ia64: |  ok  |
-|m68k: |  ..  |
-|  microblaze: |  ok  |
-|mips: |  ok  |
-|   nds32: | TODO |
-|   nios2: |  ..  |
-|openrisc: |  ..  |
-|  parisc: |  ..  |
-| powerpc: |  ok  |
-|   riscv: |  ok  |
-|s390: |  ok  |
-|  sh: |  ok  |
-|   sparc: |  ok  |
-|  um: |  ..  |
-|   unicore32: |  ..  |
-| x86: |  ok  |
-|  xtensa: |  ..  |
----
diff --git a/arch/alpha/mm/numa.c b/arch/alpha/mm/numa.c
index d0b73371e985..a24cd13e71cb 100644
--- a/arch/alpha/mm/numa.c
+++ b/arch/alpha/mm/numa.c
@@ -144,8 +144,8 @@ setup_memory_node(int nid, void *kernel_end)
if (!nid && (node_max_pfn < end_kernel_pfn || node_min_pfn > 
start_kernel_pfn))
panic("kernel loaded out of ram");
 
-   memblock_add(PFN_PHYS(node_min_pfn),
-(node_max_pfn - node_min_pfn) << PAGE_SHIFT);
+   memblock_add_node(PFN_PHYS(node_min_pfn),
+ (node_max_pfn - node_min_pfn) << PAGE_SHIFT, nid);
 
/* Zone start phys-addr must be 2^(MAX_ORDER-1) aligned.
   Note that we round this down, not up - node memory
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 40fb05d96c60..957151013d10 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -156,7 +156,6 @@ config ARM64
select HAVE_GCC_PLUGINS
select HAVE_HW_BREAKPOINT if PERF_EVENTS
select HAVE_IRQ_TIME_ACCOUNTING
-   select HAVE_MEMBLOCK_NODE_MAP if NUMA
select HAVE_NMI
select HAVE_PATA_PLATFORM
select HAVE_PERF_EVENTS
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index bab7cd878464..88b05b5256a9 100644
---

[PATCH v2 04/20] mm: free_area_init: use maximal zone PFNs rather than zone sizes

2020-04-29 Thread Mike Rapoport
From: Mike Rapoport 

Currently, architectures that use free_area_init() to initialize the
memory map and the node and zone structures need to calculate zone and
hole sizes. We can use free_area_init_nodes() instead and let it detect
the zone boundaries, while the architectures only have to supply the
possible limits for the zones.

Signed-off-by: Mike Rapoport 
---
 arch/alpha/mm/init.c| 16 ++--
 arch/c6x/mm/init.c  |  8 +++-
 arch/h8300/mm/init.c|  6 +++---
 arch/hexagon/mm/init.c  |  6 +++---
 arch/m68k/mm/init.c |  6 +++---
 arch/m68k/mm/mcfmmu.c   |  9 +++--
 arch/nds32/mm/init.c| 11 ---
 arch/nios2/mm/init.c|  8 +++-
 arch/openrisc/mm/init.c |  9 +++--
 arch/um/kernel/mem.c| 12 
 include/linux/mm.h  |  2 +-
 mm/page_alloc.c |  5 ++---
 12 files changed, 38 insertions(+), 60 deletions(-)

diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c
index 12e218d3792a..667cd21393b5 100644
--- a/arch/alpha/mm/init.c
+++ b/arch/alpha/mm/init.c
@@ -243,21 +243,17 @@ callback_init(void * kernel_end)
  */
 void __init paging_init(void)
 {
-   unsigned long zones_size[MAX_NR_ZONES] = {0, };
-   unsigned long dma_pfn, high_pfn;
+   unsigned long max_zone_pfn[MAX_NR_ZONES] = {0, };
+   unsigned long dma_pfn;
 
dma_pfn = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
-   high_pfn = max_pfn = max_low_pfn;
+   max_pfn = max_low_pfn;
 
-   if (dma_pfn >= high_pfn)
-   zones_size[ZONE_DMA] = high_pfn;
-   else {
-   zones_size[ZONE_DMA] = dma_pfn;
-   zones_size[ZONE_NORMAL] = high_pfn - dma_pfn;
-   }
+   max_zone_pfn[ZONE_DMA] = dma_pfn;
+   max_zone_pfn[ZONE_NORMAL] = max_pfn;
 
/* Initialize mem_map[].  */
-   free_area_init(zones_size);
+   free_area_init(max_zone_pfn);
 
/* Initialize the kernel's ZERO_PGE. */
memset((void *)ZERO_PGE, 0, PAGE_SIZE);
diff --git a/arch/c6x/mm/init.c b/arch/c6x/mm/init.c
index 9b374393a8f4..a97e51a3e26d 100644
--- a/arch/c6x/mm/init.c
+++ b/arch/c6x/mm/init.c
@@ -33,7 +33,7 @@ EXPORT_SYMBOL(empty_zero_page);
 void __init paging_init(void)
 {
struct pglist_data *pgdat = NODE_DATA(0);
-   unsigned long zones_size[MAX_NR_ZONES] = {0, };
+   unsigned long max_zone_pfn[MAX_NR_ZONES] = {0, };
 
empty_zero_page  = (unsigned long) memblock_alloc(PAGE_SIZE,
  PAGE_SIZE);
@@ -49,11 +49,9 @@ void __init paging_init(void)
/*
 * Define zones
 */
-   zones_size[ZONE_NORMAL] = (memory_end - PAGE_OFFSET) >> PAGE_SHIFT;
-   pgdat->node_zones[ZONE_NORMAL].zone_start_pfn =
-   __pa(PAGE_OFFSET) >> PAGE_SHIFT;
+   max_zone_pfn[ZONE_NORMAL] = memory_end >> PAGE_SHIFT;
 
-   free_area_init(zones_size);
+   free_area_init(max_zone_pfn);
 }
 
 void __init mem_init(void)
diff --git a/arch/h8300/mm/init.c b/arch/h8300/mm/init.c
index 1eab16b1a0bc..27a0020e3771 100644
--- a/arch/h8300/mm/init.c
+++ b/arch/h8300/mm/init.c
@@ -83,10 +83,10 @@ void __init paging_init(void)
 start_mem, end_mem);
 
{
-   unsigned long zones_size[MAX_NR_ZONES] = {0, };
+   unsigned long max_zone_pfn[MAX_NR_ZONES] = {0, };
 
-   zones_size[ZONE_NORMAL] = (end_mem - PAGE_OFFSET) >> PAGE_SHIFT;
-   free_area_init(zones_size);
+   max_zone_pfn[ZONE_NORMAL] = end_mem >> PAGE_SHIFT;
+   free_area_init(max_zone_pfn);
}
 }
 
diff --git a/arch/hexagon/mm/init.c b/arch/hexagon/mm/init.c
index c961773a6fff..f2e6c868e477 100644
--- a/arch/hexagon/mm/init.c
+++ b/arch/hexagon/mm/init.c
@@ -91,7 +91,7 @@ void sync_icache_dcache(pte_t pte)
  */
 void __init paging_init(void)
 {
-   unsigned long zones_sizes[MAX_NR_ZONES] = {0, };
+   unsigned long max_zone_pfn[MAX_NR_ZONES] = {0, };
 
/*
 *  This is not particularly well documented anywhere, but
@@ -101,9 +101,9 @@ void __init paging_init(void)
 *  adjust accordingly.
 */
 
-   zones_sizes[ZONE_NORMAL] = max_low_pfn;
+   max_zone_pfn[ZONE_NORMAL] = max_low_pfn;
 
-   free_area_init(zones_sizes);  /*  sets up the zonelists and mem_map  */
+   free_area_init(max_zone_pfn);  /*  sets up the zonelists and mem_map  */
 
/*
 * Start of high memory area.  Will probably need something more
diff --git a/arch/m68k/mm/init.c b/arch/m68k/mm/init.c
index b88d510d4fe3..6d3147662ff2 100644
--- a/arch/m68k/mm/init.c
+++ b/arch/m68k/mm/init.c
@@ -84,7 +84,7 @@ void __init paging_init(void)
 * page_alloc get different views of the world.
 */
unsigned long end_mem = memory_end & PAGE_MASK;
-   unsigned long zones_size[MAX_NR_ZONES] = { 0, };
+   unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0, };
 
high_memory = (void *) end_mem;
 
@@ -98,8 +98,8 @@ void __

[PATCH v2 05/20] mm: use free_area_init() instead of free_area_init_nodes()

2020-04-29 Thread Mike Rapoport
From: Mike Rapoport 

The free_area_init() has effectively become a wrapper for
free_area_init_nodes() and there is no point in keeping both. Still,
the free_area_init() name is shorter and more general as it does not
imply the necessity to initialize multiple nodes.

Rename free_area_init_nodes() to free_area_init(), update the callers and
drop old version of free_area_init().

Signed-off-by: Mike Rapoport 
---
 arch/arm64/mm/init.c |  2 +-
 arch/ia64/mm/contig.c|  2 +-
 arch/ia64/mm/discontig.c |  2 +-
 arch/microblaze/mm/init.c|  2 +-
 arch/mips/loongson64/numa.c  |  2 +-
 arch/mips/mm/init.c  |  2 +-
 arch/mips/sgi-ip27/ip27-memory.c |  2 +-
 arch/powerpc/mm/mem.c|  2 +-
 arch/riscv/mm/init.c |  2 +-
 arch/s390/mm/init.c  |  2 +-
 arch/sh/mm/init.c|  2 +-
 arch/sparc/mm/init_64.c  |  2 +-
 arch/x86/mm/init.c   |  2 +-
 include/linux/mm.h   |  7 +++
 mm/page_alloc.c  | 10 ++
 15 files changed, 18 insertions(+), 25 deletions(-)

diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index e42727e3568e..a650adb358ee 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -206,7 +206,7 @@ static void __init zone_sizes_init(unsigned long min, 
unsigned long max)
 #endif
max_zone_pfns[ZONE_NORMAL] = max;
 
-   free_area_init_nodes(max_zone_pfns);
+   free_area_init(max_zone_pfns);
 }
 
 #else
diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c
index 5b00dc3898e1..8786fa5c7612 100644
--- a/arch/ia64/mm/contig.c
+++ b/arch/ia64/mm/contig.c
@@ -210,6 +210,6 @@ paging_init (void)
printk("Virtual mem_map starts at 0x%p\n", mem_map);
}
 #endif /* !CONFIG_VIRTUAL_MEM_MAP */
-   free_area_init_nodes(max_zone_pfns);
+   free_area_init(max_zone_pfns);
zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page));
 }
diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
index 4f33f6e7e206..dd8284bcbf16 100644
--- a/arch/ia64/mm/discontig.c
+++ b/arch/ia64/mm/discontig.c
@@ -627,7 +627,7 @@ void __init paging_init(void)
max_zone_pfns[ZONE_DMA32] = max_dma;
 #endif
max_zone_pfns[ZONE_NORMAL] = max_pfn;
-   free_area_init_nodes(max_zone_pfns);
+   free_area_init(max_zone_pfns);
 
zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page));
 }
diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c
index 1ffbfa96b9b8..dcaa53d11339 100644
--- a/arch/microblaze/mm/init.c
+++ b/arch/microblaze/mm/init.c
@@ -112,7 +112,7 @@ static void __init paging_init(void)
 #endif
 
/* We don't have holes in memory map */
-   free_area_init_nodes(zones_size);
+   free_area_init(zones_size);
 }
 
 void __init setup_memory(void)
diff --git a/arch/mips/loongson64/numa.c b/arch/mips/loongson64/numa.c
index 1ae072df4831..901f5be5ee76 100644
--- a/arch/mips/loongson64/numa.c
+++ b/arch/mips/loongson64/numa.c
@@ -247,7 +247,7 @@ void __init paging_init(void)
zones_size[ZONE_DMA32] = MAX_DMA32_PFN;
 #endif
zones_size[ZONE_NORMAL] = max_low_pfn;
-   free_area_init_nodes(zones_size);
+   free_area_init(zones_size);
 }
 
 void __init mem_init(void)
diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c
index 79684000de0e..19719e8b41a5 100644
--- a/arch/mips/mm/init.c
+++ b/arch/mips/mm/init.c
@@ -418,7 +418,7 @@ void __init paging_init(void)
}
 #endif
 
-   free_area_init_nodes(max_zone_pfns);
+   free_area_init(max_zone_pfns);
 }
 
 #ifdef CONFIG_64BIT
diff --git a/arch/mips/sgi-ip27/ip27-memory.c b/arch/mips/sgi-ip27/ip27-memory.c
index a45691e6ab90..1213215ea965 100644
--- a/arch/mips/sgi-ip27/ip27-memory.c
+++ b/arch/mips/sgi-ip27/ip27-memory.c
@@ -419,7 +419,7 @@ void __init paging_init(void)
 
pagetable_init();
zones_size[ZONE_NORMAL] = max_low_pfn;
-   free_area_init_nodes(zones_size);
+   free_area_init(zones_size);
 }
 
 void __init mem_init(void)
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 041ed7cfd341..0fcea21f26b4 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -271,7 +271,7 @@ void __init paging_init(void)
max_zone_pfns[ZONE_HIGHMEM] = max_pfn;
 #endif
 
-   free_area_init_nodes(max_zone_pfns);
+   free_area_init(max_zone_pfns);
 
mark_nonram_nosave();
 }
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index b55be44ff9bd..f2ceab77b8e6 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -39,7 +39,7 @@ static void __init zone_sizes_init(void)
 #endif
max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
 
-   free_area_init_nodes(max_zone_pfns);
+   free_area_init(max_zone_pfns);
 }
 
 static void setup_zero_page(void)
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 87b2d024e75a..b11bcf4da531 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -122,7 +122,7 @@ void __

[PATCH v2 06/20] alpha: simplify detection of memory zone boundaries

2020-04-29 Thread Mike Rapoport
From: Mike Rapoport 

The free_area_init() function only requires the definition of maximal PFN
for each of the supported zones rather than calculation of actual zone sizes
and the sizes of the holes between the zones.

After removal of CONFIG_HAVE_MEMBLOCK_NODE_MAP the free_area_init() is
available to all architectures.

Using this function instead of free_area_init_node() simplifies the zone
detection.

Signed-off-by: Mike Rapoport 
---
 arch/alpha/mm/numa.c | 18 --
 1 file changed, 4 insertions(+), 14 deletions(-)

diff --git a/arch/alpha/mm/numa.c b/arch/alpha/mm/numa.c
index a24cd13e71cb..5ad6087de1d6 100644
--- a/arch/alpha/mm/numa.c
+++ b/arch/alpha/mm/numa.c
@@ -202,8 +202,7 @@ setup_memory(void *kernel_end)
 
 void __init paging_init(void)
 {
-   unsigned intnid;
-   unsigned long   zones_size[MAX_NR_ZONES] = {0, };
+   unsigned long   max_zone_pfn[MAX_NR_ZONES] = {0, };
unsigned long   dma_local_pfn;
 
/*
@@ -215,19 +214,10 @@ void __init paging_init(void)
 */
dma_local_pfn = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
 
-   for_each_online_node(nid) {
-   unsigned long start_pfn = NODE_DATA(nid)->node_start_pfn;
-   unsigned long end_pfn = start_pfn + 
NODE_DATA(nid)->node_present_pages;
+   max_zone_pfn[ZONE_DMA] = dma_local_pfn;
+   max_zone_pfn[ZONE_NORMAL] = max_pfn;
 
-   if (dma_local_pfn >= end_pfn - start_pfn)
-   zones_size[ZONE_DMA] = end_pfn - start_pfn;
-   else {
-   zones_size[ZONE_DMA] = dma_local_pfn;
-   zones_size[ZONE_NORMAL] = (end_pfn - start_pfn) - 
dma_local_pfn;
-   }
-   node_set_state(nid, N_NORMAL_MEMORY);
-   free_area_init_node(nid, zones_size, start_pfn, NULL);
-   }
+   free_area_init(max_zone_pfn);
 
/* Initialize the kernel's ZERO_PGE. */
memset((void *)ZERO_PGE, 0, PAGE_SIZE);
-- 
2.26.1



[PATCH v2 07/20] arm: simplify detection of memory zone boundaries

2020-04-29 Thread Mike Rapoport
From: Mike Rapoport 

The free_area_init() function only requires the definition of maximal PFN
for each of the supported zones rather than calculation of actual zone sizes
and the sizes of the holes between the zones.

After removal of CONFIG_HAVE_MEMBLOCK_NODE_MAP the free_area_init() is
available to all architectures.

Using this function instead of free_area_init_node() simplifies the zone
detection.

Signed-off-by: Mike Rapoport 
---
 arch/arm/mm/init.c | 66 +-
 1 file changed, 7 insertions(+), 59 deletions(-)

diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index 054be44d1cdb..4e43455fab84 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -92,18 +92,6 @@ EXPORT_SYMBOL(arm_dma_zone_size);
  */
 phys_addr_t arm_dma_limit;
 unsigned long arm_dma_pfn_limit;
-
-static void __init arm_adjust_dma_zone(unsigned long *size, unsigned long 
*hole,
-   unsigned long dma_size)
-{
-   if (size[0] <= dma_size)
-   return;
-
-   size[ZONE_NORMAL] = size[0] - dma_size;
-   size[ZONE_DMA] = dma_size;
-   hole[ZONE_NORMAL] = hole[0];
-   hole[ZONE_DMA] = 0;
-}
 #endif
 
 void __init setup_dma_zone(const struct machine_desc *mdesc)
@@ -121,56 +109,16 @@ void __init setup_dma_zone(const struct machine_desc 
*mdesc)
 static void __init zone_sizes_init(unsigned long min, unsigned long max_low,
unsigned long max_high)
 {
-   unsigned long zone_size[MAX_NR_ZONES], zhole_size[MAX_NR_ZONES];
-   struct memblock_region *reg;
-
-   /*
-* initialise the zones.
-*/
-   memset(zone_size, 0, sizeof(zone_size));
+   unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 };
 
-   /*
-* The memory size has already been determined.  If we need
-* to do anything fancy with the allocation of this memory
-* to the zones, now is the time to do it.
-*/
-   zone_size[0] = max_low - min;
-#ifdef CONFIG_HIGHMEM
-   zone_size[ZONE_HIGHMEM] = max_high - max_low;
+#ifdef CONFIG_ZONE_DMA
+   max_zone_pfn[ZONE_DMA] = min(arm_dma_pfn_limit, max_low);
 #endif
-
-   /*
-* Calculate the size of the holes.
-*  holes = node_size - sum(bank_sizes)
-*/
-   memcpy(zhole_size, zone_size, sizeof(zhole_size));
-   for_each_memblock(memory, reg) {
-   unsigned long start = memblock_region_memory_base_pfn(reg);
-   unsigned long end = memblock_region_memory_end_pfn(reg);
-
-   if (start < max_low) {
-   unsigned long low_end = min(end, max_low);
-   zhole_size[0] -= low_end - start;
-   }
+   max_zone_pfn[ZONE_NORMAL] = max_low;
 #ifdef CONFIG_HIGHMEM
-   if (end > max_low) {
-   unsigned long high_start = max(start, max_low);
-   zhole_size[ZONE_HIGHMEM] -= end - high_start;
-   }
+   max_zone_pfn[ZONE_HIGHMEM] = max_high;
 #endif
-   }
-
-#ifdef CONFIG_ZONE_DMA
-   /*
-* Adjust the sizes according to any special requirements for
-* this machine type.
-*/
-   if (arm_dma_zone_size)
-   arm_adjust_dma_zone(zone_size, zhole_size,
-   arm_dma_zone_size >> PAGE_SHIFT);
-#endif
-
-   free_area_init_node(0, zone_size, min, zhole_size);
+   free_area_init(max_zone_pfn);
 }
 
 #ifdef CONFIG_HAVE_ARCH_PFN_VALID
@@ -306,7 +254,7 @@ void __init bootmem_init(void)
sparse_init();
 
/*
-* Now free the memory - free_area_init_node needs
+* Now free the memory - free_area_init needs
 * the sparse mem_map arrays initialized by sparse_init()
 * for memmap_init_zone(), otherwise all PFNs are invalid.
 */
-- 
2.26.1



[PATCH v2 08/20] arm64: simplify detection of memory zone boundaries for UMA configs

2020-04-29 Thread Mike Rapoport
From: Mike Rapoport 

The free_area_init() function only requires the definition of maximal PFN
for each of the supported zones rather than calculation of actual zone sizes
and the sizes of the holes between the zones.

After removal of CONFIG_HAVE_MEMBLOCK_NODE_MAP the free_area_init() is
available to all architectures.

Using this function instead of free_area_init_node() simplifies the zone
detection.

Signed-off-by: Mike Rapoport 
---
 arch/arm64/mm/init.c | 54 
 1 file changed, 54 deletions(-)

diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index a650adb358ee..d54ad2250dce 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -192,8 +192,6 @@ static phys_addr_t __init max_zone_phys(unsigned int 
zone_bits)
return min(offset + (1ULL << zone_bits), memblock_end_of_DRAM());
 }
 
-#ifdef CONFIG_NUMA
-
 static void __init zone_sizes_init(unsigned long min, unsigned long max)
 {
unsigned long max_zone_pfns[MAX_NR_ZONES]  = {0};
@@ -209,58 +207,6 @@ static void __init zone_sizes_init(unsigned long min, 
unsigned long max)
free_area_init(max_zone_pfns);
 }
 
-#else
-
-static void __init zone_sizes_init(unsigned long min, unsigned long max)
-{
-   struct memblock_region *reg;
-   unsigned long zone_size[MAX_NR_ZONES], zhole_size[MAX_NR_ZONES];
-   unsigned long __maybe_unused max_dma, max_dma32;
-
-   memset(zone_size, 0, sizeof(zone_size));
-
-   max_dma = max_dma32 = min;
-#ifdef CONFIG_ZONE_DMA
-   max_dma = max_dma32 = PFN_DOWN(arm64_dma_phys_limit);
-   zone_size[ZONE_DMA] = max_dma - min;
-#endif
-#ifdef CONFIG_ZONE_DMA32
-   max_dma32 = PFN_DOWN(arm64_dma32_phys_limit);
-   zone_size[ZONE_DMA32] = max_dma32 - max_dma;
-#endif
-   zone_size[ZONE_NORMAL] = max - max_dma32;
-
-   memcpy(zhole_size, zone_size, sizeof(zhole_size));
-
-   for_each_memblock(memory, reg) {
-   unsigned long start = memblock_region_memory_base_pfn(reg);
-   unsigned long end = memblock_region_memory_end_pfn(reg);
-
-#ifdef CONFIG_ZONE_DMA
-   if (start >= min && start < max_dma) {
-   unsigned long dma_end = min(end, max_dma);
-   zhole_size[ZONE_DMA] -= dma_end - start;
-   start = dma_end;
-   }
-#endif
-#ifdef CONFIG_ZONE_DMA32
-   if (start >= max_dma && start < max_dma32) {
-   unsigned long dma32_end = min(end, max_dma32);
-   zhole_size[ZONE_DMA32] -= dma32_end - start;
-   start = dma32_end;
-   }
-#endif
-   if (start >= max_dma32 && start < max) {
-   unsigned long normal_end = min(end, max);
-   zhole_size[ZONE_NORMAL] -= normal_end - start;
-   }
-   }
-
-   free_area_init_node(0, zone_size, min, zhole_size);
-}
-
-#endif /* CONFIG_NUMA */
-
 int pfn_valid(unsigned long pfn)
 {
phys_addr_t addr = pfn << PAGE_SHIFT;
-- 
2.26.1



[PATCH v2 09/20] csky: simplify detection of memory zone boundaries

2020-04-29 Thread Mike Rapoport
From: Mike Rapoport 

The free_area_init() function only requires the definition of maximal PFN
for each of the supported zones rather than calculation of actual zone sizes
and the sizes of the holes between the zones.

After removal of CONFIG_HAVE_MEMBLOCK_NODE_MAP the free_area_init() is
available to all architectures.

Using this function instead of free_area_init_node() simplifies the zone
detection.

Signed-off-by: Mike Rapoport 
---
 arch/csky/kernel/setup.c | 26 +++---
 1 file changed, 11 insertions(+), 15 deletions(-)

diff --git a/arch/csky/kernel/setup.c b/arch/csky/kernel/setup.c
index 819a9a7bf786..0481f4e34538 100644
--- a/arch/csky/kernel/setup.c
+++ b/arch/csky/kernel/setup.c
@@ -26,7 +26,9 @@ struct screen_info screen_info = {
 
 static void __init csky_memblock_init(void)
 {
-   unsigned long zone_size[MAX_NR_ZONES];
+   unsigned long lowmem_size = PFN_DOWN(LOWMEM_LIMIT - PHYS_OFFSET_OFFSET);
+   unsigned long sseg_size = PFN_DOWN(SSEG_SIZE - PHYS_OFFSET_OFFSET);
+   unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 };
signed long size;
 
memblock_reserve(__pa(_stext), _end - _stext);
@@ -36,28 +38,22 @@ static void __init csky_memblock_init(void)
 
memblock_dump_all();
 
-   memset(zone_size, 0, sizeof(zone_size));
-
min_low_pfn = PFN_UP(memblock_start_of_DRAM());
max_low_pfn = max_pfn = PFN_DOWN(memblock_end_of_DRAM());
 
size = max_pfn - min_low_pfn;
 
-   if (size <= PFN_DOWN(SSEG_SIZE - PHYS_OFFSET_OFFSET))
-   zone_size[ZONE_NORMAL] = size;
-   else if (size < PFN_DOWN(LOWMEM_LIMIT - PHYS_OFFSET_OFFSET)) {
-   zone_size[ZONE_NORMAL] =
-   PFN_DOWN(SSEG_SIZE - PHYS_OFFSET_OFFSET);
-   max_low_pfn = min_low_pfn + zone_size[ZONE_NORMAL];
-   } else {
-   zone_size[ZONE_NORMAL] =
-   PFN_DOWN(LOWMEM_LIMIT - PHYS_OFFSET_OFFSET);
-   max_low_pfn = min_low_pfn + zone_size[ZONE_NORMAL];
+   if (size >= lowmem_size) {
+   max_low_pfn = min_low_pfn + lowmem_size;
write_mmu_msa1(read_mmu_msa0() + SSEG_SIZE);
+   } else if (size > sseg_size) {
+   max_low_pfn = min_low_pfn + sseg_size;
}
 
+   max_zone_pfn[ZONE_NORMAL] = max_low_pfn;
+
 #ifdef CONFIG_HIGHMEM
-   zone_size[ZONE_HIGHMEM] = max_pfn - max_low_pfn;
+   max_zone_pfn[ZONE_HIGHMEM] = max_pfn;
 
highstart_pfn = max_low_pfn;
highend_pfn   = max_pfn;
@@ -66,7 +62,7 @@ static void __init csky_memblock_init(void)
 
dma_contiguous_reserve(0);
 
-   free_area_init_node(0, zone_size, min_low_pfn, NULL);
+   free_area_init(max_zone_pfn);
 }
 
 void __init setup_arch(char **cmdline_p)
-- 
2.26.1



[PATCH v2 10/20] m68k: mm: simplify detection of memory zone boundaries

2020-04-29 Thread Mike Rapoport
From: Mike Rapoport 

The free_area_init() function only requires the definition of the maximal
PFN for each of the supported zones rather than the calculation of actual
zone sizes and the sizes of the holes between the zones.

After the removal of CONFIG_HAVE_MEMBLOCK_NODE_MAP, free_area_init() is
available to all architectures.

Using this function instead of free_area_init_node() simplifies zone
detection.

Signed-off-by: Mike Rapoport 
---
 arch/m68k/mm/motorola.c | 11 +--
 arch/m68k/mm/sun3mmu.c  | 10 +++---
 2 files changed, 8 insertions(+), 13 deletions(-)

diff --git a/arch/m68k/mm/motorola.c b/arch/m68k/mm/motorola.c
index 84ab5963cabb..904c2a663977 100644
--- a/arch/m68k/mm/motorola.c
+++ b/arch/m68k/mm/motorola.c
@@ -365,7 +365,7 @@ static void __init map_node(int node)
  */
 void __init paging_init(void)
 {
-   unsigned long zones_size[MAX_NR_ZONES] = { 0, };
+   unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0, };
unsigned long min_addr, max_addr;
unsigned long addr;
int i;
@@ -448,11 +448,10 @@ void __init paging_init(void)
 #ifdef DEBUG
printk ("before free_area_init\n");
 #endif
-   for (i = 0; i < m68k_num_memory; i++) {
-   zones_size[ZONE_DMA] = m68k_memory[i].size >> PAGE_SHIFT;
-   free_area_init_node(i, zones_size,
-   m68k_memory[i].addr >> PAGE_SHIFT, NULL);
+   for (i = 0; i < m68k_num_memory; i++)
if (node_present_pages(i))
node_set_state(i, N_NORMAL_MEMORY);
-   }
+
+   max_zone_pfn[ZONE_DMA] = memblock_end_of_DRAM();
+   free_area_init(max_zone_pfn);
 }
diff --git a/arch/m68k/mm/sun3mmu.c b/arch/m68k/mm/sun3mmu.c
index eca1c46bb90a..5d8d956d9329 100644
--- a/arch/m68k/mm/sun3mmu.c
+++ b/arch/m68k/mm/sun3mmu.c
@@ -42,7 +42,7 @@ void __init paging_init(void)
unsigned long address;
unsigned long next_pgtable;
unsigned long bootmem_end;
-   unsigned long zones_size[MAX_NR_ZONES] = { 0, };
+   unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0, };
unsigned long size;
 
empty_zero_page = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
@@ -89,14 +89,10 @@ void __init paging_init(void)
current->mm = NULL;
 
	/* memory sizing is a hack stolen from motorola.c..  hope it works for us */
-	zones_size[ZONE_DMA] = ((unsigned long)high_memory - PAGE_OFFSET) >> PAGE_SHIFT;
+   max_zone_pfn[ZONE_DMA] = ((unsigned long)high_memory) >> PAGE_SHIFT;
 
	/* I really wish I knew why the following change made things better...  -- Sam */
-/* free_area_init(zones_size); */
-   free_area_init_node(0, zones_size,
-   (__pa(PAGE_OFFSET) >> PAGE_SHIFT) + 1, NULL);
+   free_area_init(max_zone_pfn);
 
 
 }
-
-
-- 
2.26.1



[PATCH v2 11/20] parisc: simplify detection of memory zone boundaries

2020-04-29 Thread Mike Rapoport
From: Mike Rapoport 

The free_area_init() function only requires the definition of the maximal
PFN for each of the supported zones rather than the calculation of actual
zone sizes and the sizes of the holes between the zones.

After the removal of CONFIG_HAVE_MEMBLOCK_NODE_MAP, free_area_init() is
available to all architectures.

Using this function instead of free_area_init_node() simplifies zone
detection.

Signed-off-by: Mike Rapoport 
---
 arch/parisc/mm/init.c | 22 +++---
 1 file changed, 3 insertions(+), 19 deletions(-)

diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c
index 5224fb38d766..02d2fdb85dcc 100644
--- a/arch/parisc/mm/init.c
+++ b/arch/parisc/mm/init.c
@@ -675,27 +675,11 @@ static void __init gateway_init(void)
 
 static void __init parisc_bootmem_free(void)
 {
-   unsigned long zones_size[MAX_NR_ZONES] = { 0, };
-   unsigned long holes_size[MAX_NR_ZONES] = { 0, };
-   unsigned long mem_start_pfn = ~0UL, mem_end_pfn = 0, mem_size_pfn = 0;
-   int i;
-
-   for (i = 0; i < npmem_ranges; i++) {
-   unsigned long start = pmem_ranges[i].start_pfn;
-   unsigned long size = pmem_ranges[i].pages;
-   unsigned long end = start + size;
-
-   if (mem_start_pfn > start)
-   mem_start_pfn = start;
-   if (mem_end_pfn < end)
-   mem_end_pfn = end;
-   mem_size_pfn += size;
-   }
+   unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0, };
 
-   zones_size[0] = mem_end_pfn - mem_start_pfn;
-   holes_size[0] = zones_size[0] - mem_size_pfn;
+   max_zone_pfn[0] = memblock_end_of_DRAM();
 
-   free_area_init_node(0, zones_size, mem_start_pfn, holes_size);
+   free_area_init(max_zone_pfn);
 }
 
 void __init paging_init(void)
-- 
2.26.1



[PATCH v2 12/20] sparc32: simplify detection of memory zone boundaries

2020-04-29 Thread Mike Rapoport
From: Mike Rapoport 

The free_area_init() function only requires the definition of the maximal
PFN for each of the supported zones rather than the calculation of actual
zone sizes and the sizes of the holes between the zones.

After the removal of CONFIG_HAVE_MEMBLOCK_NODE_MAP, free_area_init() is
available to all architectures.

Using this function instead of free_area_init_node() simplifies zone
detection.

Signed-off-by: Mike Rapoport 
---
 arch/sparc/mm/srmmu.c | 21 +
 1 file changed, 5 insertions(+), 16 deletions(-)

diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c
index b7c94de70cca..cc071dd7d8da 100644
--- a/arch/sparc/mm/srmmu.c
+++ b/arch/sparc/mm/srmmu.c
@@ -1008,24 +1008,13 @@ void __init srmmu_paging_init(void)
kmap_init();
 
{
-   unsigned long zones_size[MAX_NR_ZONES];
-   unsigned long zholes_size[MAX_NR_ZONES];
-   unsigned long npages;
-   int znum;
+   unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 };
 
-   for (znum = 0; znum < MAX_NR_ZONES; znum++)
-   zones_size[znum] = zholes_size[znum] = 0;
+   max_zone_pfn[ZONE_DMA] = max_low_pfn;
+   max_zone_pfn[ZONE_NORMAL] = max_low_pfn;
+   max_zone_pfn[ZONE_HIGHMEM] = highend_pfn;
 
-   npages = max_low_pfn - pfn_base;
-
-   zones_size[ZONE_DMA] = npages;
-   zholes_size[ZONE_DMA] = npages - pages_avail;
-
-   npages = highend_pfn - max_low_pfn;
-   zones_size[ZONE_HIGHMEM] = npages;
-   zholes_size[ZONE_HIGHMEM] = npages - calc_highpages();
-
-   free_area_init_node(0, zones_size, pfn_base, zholes_size);
+   free_area_init(max_zone_pfn);
}
 }
 
-- 
2.26.1



[PATCH v2 13/20] unicore32: simplify detection of memory zone boundaries

2020-04-29 Thread Mike Rapoport
From: Mike Rapoport 

The free_area_init() function only requires the definition of the maximal
PFN for each of the supported zones rather than the calculation of actual
zone sizes and the sizes of the holes between the zones.

After the removal of CONFIG_HAVE_MEMBLOCK_NODE_MAP, free_area_init() is
available to all architectures.

Using this function instead of free_area_init_node() simplifies zone
detection.

Signed-off-by: Mike Rapoport 
---
 arch/unicore32/include/asm/memory.h  |  2 +-
 arch/unicore32/include/mach/memory.h |  6 ++--
 arch/unicore32/kernel/pci.c  | 14 ++---
 arch/unicore32/mm/init.c | 43 ++--
 4 files changed, 15 insertions(+), 50 deletions(-)

diff --git a/arch/unicore32/include/asm/memory.h 
b/arch/unicore32/include/asm/memory.h
index 23c93105f98f..66285178dd9b 100644
--- a/arch/unicore32/include/asm/memory.h
+++ b/arch/unicore32/include/asm/memory.h
@@ -60,7 +60,7 @@
 #ifndef __ASSEMBLY__
 
 #ifndef arch_adjust_zones
-#define arch_adjust_zones(size, holes) do { } while (0)
+#define arch_adjust_zones(max_zone_pfn) do { } while (0)
 #endif
 
 /*
diff --git a/arch/unicore32/include/mach/memory.h 
b/arch/unicore32/include/mach/memory.h
index 2b527cedd03d..b4e6035cb9a3 100644
--- a/arch/unicore32/include/mach/memory.h
+++ b/arch/unicore32/include/mach/memory.h
@@ -25,10 +25,10 @@
 
 #if !defined(__ASSEMBLY__) && defined(CONFIG_PCI)
 
-void puv3_pci_adjust_zones(unsigned long *size, unsigned long *holes);
+void puv3_pci_adjust_zones(unsigned long *max_zone_pfn);
 
-#define arch_adjust_zones(size, holes) \
-   puv3_pci_adjust_zones(size, holes)
+#define arch_adjust_zones(max_zone_pfn) \
+   puv3_pci_adjust_zones(max_zone_pfn)
 
 #endif
 
diff --git a/arch/unicore32/kernel/pci.c b/arch/unicore32/kernel/pci.c
index efa04a94dcdb..0d098aa05b47 100644
--- a/arch/unicore32/kernel/pci.c
+++ b/arch/unicore32/kernel/pci.c
@@ -133,21 +133,11 @@ static int pci_puv3_map_irq(const struct pci_dev *dev, u8 
slot, u8 pin)
  * This is really ugly and we need a better way of specifying
  * DMA-capable regions of memory.
  */
-void __init puv3_pci_adjust_zones(unsigned long *zone_size,
-   unsigned long *zhole_size)
+void __init puv3_pci_adjust_zones(unsigned long *max_zone_pfn)
 {
unsigned int sz = SZ_128M >> PAGE_SHIFT;
 
-   /*
-* Only adjust if > 128M on current system
-*/
-   if (zone_size[0] <= sz)
-   return;
-
-   zone_size[1] = zone_size[0] - sz;
-   zone_size[0] = sz;
-   zhole_size[1] = zhole_size[0];
-   zhole_size[0] = 0;
+   max_zone_pfn[ZONE_DMA] = sz;
 }
 
 /*
diff --git a/arch/unicore32/mm/init.c b/arch/unicore32/mm/init.c
index 6cf010fadc7a..52425d383cea 100644
--- a/arch/unicore32/mm/init.c
+++ b/arch/unicore32/mm/init.c
@@ -61,46 +61,21 @@ static void __init find_limits(unsigned long *min, unsigned 
long *max_low,
}
 }
 
-static void __init uc32_bootmem_free(unsigned long min, unsigned long max_low,
-   unsigned long max_high)
+static void __init uc32_bootmem_free(unsigned long max_low)
 {
-   unsigned long zone_size[MAX_NR_ZONES], zhole_size[MAX_NR_ZONES];
-   struct memblock_region *reg;
+   unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 };
 
-   /*
-* initialise the zones.
-*/
-   memset(zone_size, 0, sizeof(zone_size));
-
-   /*
-* The memory size has already been determined.  If we need
-* to do anything fancy with the allocation of this memory
-* to the zones, now is the time to do it.
-*/
-   zone_size[0] = max_low - min;
-
-   /*
-* Calculate the size of the holes.
-*  holes = node_size - sum(bank_sizes)
-*/
-   memcpy(zhole_size, zone_size, sizeof(zhole_size));
-   for_each_memblock(memory, reg) {
-   unsigned long start = memblock_region_memory_base_pfn(reg);
-   unsigned long end = memblock_region_memory_end_pfn(reg);
-
-   if (start < max_low) {
-   unsigned long low_end = min(end, max_low);
-   zhole_size[0] -= low_end - start;
-   }
-   }
+   max_zone_pfn[ZONE_DMA] = max_low;
+   max_zone_pfn[ZONE_NORMAL] = max_low;
 
/*
 * Adjust the sizes according to any special requirements for
 * this machine type.
+* This might lower ZONE_DMA limit.
 */
-   arch_adjust_zones(zone_size, zhole_size);
+   arch_adjust_zones(max_zone_pfn);
 
-   free_area_init_node(0, zone_size, min, zhole_size);
+   free_area_init(max_zone_pfn);
 }
 
 int pfn_valid(unsigned long pfn)
@@ -176,11 +151,11 @@ void __init bootmem_init(void)
sparse_init();
 
/*
-* Now free the memory - free_area_init_node needs
+* Now free the memory - free_area_init needs
 * the sparse mem_map arrays initialized by sparse_init()
 * for memmap_init_zone(), otherwise all PFNs are invalid.
   

[PATCH v2 14/20] xtensa: simplify detection of memory zone boundaries

2020-04-29 Thread Mike Rapoport
From: Mike Rapoport 

The free_area_init() function only requires the definition of the maximal
PFN for each of the supported zones rather than the calculation of actual
zone sizes and the sizes of the holes between the zones.

After the removal of CONFIG_HAVE_MEMBLOCK_NODE_MAP, free_area_init() is
available to all architectures.

Using this function instead of free_area_init_node() simplifies zone
detection.

Signed-off-by: Mike Rapoport 
---
 arch/xtensa/mm/init.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c
index 19c625e6d81f..a05b306cf371 100644
--- a/arch/xtensa/mm/init.c
+++ b/arch/xtensa/mm/init.c
@@ -70,13 +70,13 @@ void __init bootmem_init(void)
 void __init zones_init(void)
 {
/* All pages are DMA-able, so we put them all in the DMA zone. */
-   unsigned long zones_size[MAX_NR_ZONES] = {
-   [ZONE_NORMAL] = max_low_pfn - ARCH_PFN_OFFSET,
+   unsigned long max_zone_pfn[MAX_NR_ZONES] = {
+   [ZONE_NORMAL] = max_low_pfn,
 #ifdef CONFIG_HIGHMEM
-   [ZONE_HIGHMEM] = max_pfn - max_low_pfn,
+   [ZONE_HIGHMEM] = max_pfn,
 #endif
};
-   free_area_init_node(0, zones_size, ARCH_PFN_OFFSET, NULL);
+   free_area_init(max_zone_pfn);
 }
 
 #ifdef CONFIG_HIGHMEM
-- 
2.26.1



[PATCH v2 15/20] mm: memmap_init: iterate over memblock regions rather than check each PFN

2020-04-29 Thread Mike Rapoport
From: Baoquan He 

When called during boot, the memmap_init_zone() function checks if each PFN
is valid and actually belongs to the node being initialized using
early_pfn_valid() and early_pfn_in_nid().

Each such check may cost up to O(log(n)), where n is the number of memory
banks, so for large amounts of memory the overall time spent in
early_pfn*() becomes substantial.

Since the information is already present in memblock, we can iterate over
memblock memory regions in memmap_init() and only call memmap_init_zone()
for PFN ranges that are known to be valid and in the appropriate node.
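
In a reduced form (a sketch only; the full version is in the diff below),
the boot-time loop changes from vetting every PFN individually to walking
the memblock regions once:

	for (pfn = start_pfn; pfn < end_pfn; ) {
		/* before: every PFN is checked, each check costing up
		 * to O(log(n)) in the number of memory banks */
		if (!early_pfn_valid(pfn)) {
			pfn = next_pfn(pfn);
			continue;
		}
		if (!early_pfn_in_nid(pfn, nid)) {
			pfn++;
			continue;
		}
		/* ... initialize struct page for pfn, then pfn++ ... */
	}

	/* after: only clamped, known-valid ranges reach memmap_init_zone() */
	for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
		start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
		end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
		if (end_pfn > start_pfn)
			memmap_init_zone(end_pfn - start_pfn, nid, zone,
					 start_pfn, MEMMAP_EARLY, NULL);
	}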

Signed-off-by: Baoquan He 
Signed-off-by: Mike Rapoport 
---
 mm/page_alloc.c | 47 ---
 1 file changed, 16 insertions(+), 31 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7f6a3081edb8..8d112defaead 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5939,23 +5939,6 @@ overlap_memmap_init(unsigned long zone, unsigned long 
*pfn)
return false;
 }
 
-#ifdef CONFIG_SPARSEMEM
-/* Skip PFNs that belong to non-present sections */
-static inline __meminit unsigned long next_pfn(unsigned long pfn)
-{
-   const unsigned long section_nr = pfn_to_section_nr(++pfn);
-
-   if (present_section_nr(section_nr))
-   return pfn;
-   return section_nr_to_pfn(next_present_section_nr(section_nr));
-}
-#else
-static inline __meminit unsigned long next_pfn(unsigned long pfn)
-{
-   return pfn++;
-}
-#endif
-
 /*
  * Initially all pages are reserved - free ones are freed
  * up by memblock_free_all() once the early boot process is
@@ -5990,19 +5973,7 @@ void __meminit memmap_init_zone(unsigned long size, int 
nid, unsigned long zone,
 #endif
 
for (pfn = start_pfn; pfn < end_pfn; ) {
-   /*
-* There can be holes in boot-time mem_map[]s handed to this
-* function.  They do not exist on hotplugged memory.
-*/
if (context == MEMMAP_EARLY) {
-   if (!early_pfn_valid(pfn)) {
-   pfn = next_pfn(pfn);
-   continue;
-   }
-   if (!early_pfn_in_nid(pfn, nid)) {
-   pfn++;
-   continue;
-   }
if (overlap_memmap_init(zone, &pfn))
continue;
if (defer_init(nid, pfn, end_pfn))
@@ -6118,9 +6089,23 @@ static void __meminit zone_init_free_lists(struct zone 
*zone)
 }
 
 void __meminit __weak memmap_init(unsigned long size, int nid,
- unsigned long zone, unsigned long start_pfn)
+ unsigned long zone,
+ unsigned long range_start_pfn)
 {
-   memmap_init_zone(size, nid, zone, start_pfn, MEMMAP_EARLY, NULL);
+   unsigned long start_pfn, end_pfn;
+   unsigned long range_end_pfn = range_start_pfn + size;
+   int i;
+
+   for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
+   start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
+   end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
+
+   if (end_pfn > start_pfn) {
+   size = end_pfn - start_pfn;
+   memmap_init_zone(size, nid, zone, start_pfn,
+MEMMAP_EARLY, NULL);
+   }
+   }
 }
 
 static int zone_batchsize(struct zone *zone)
-- 
2.26.1



[PATCH v2 16/20] mm: remove early_pfn_in_nid() and CONFIG_NODES_SPAN_OTHER_NODES

2020-04-29 Thread Mike Rapoport
From: Mike Rapoport 

The commit f47ac088c406 ("mm: memmap_init: iterate over memblock regions
rather than check each PFN") made early_pfn_in_nid() obsolete, and since
CONFIG_NODES_SPAN_OTHER_NODES is only used to pick a stub or a real
implementation of early_pfn_in_nid(), it is also not needed anymore.

Remove both early_pfn_in_nid() and CONFIG_NODES_SPAN_OTHER_NODES.

Co-developed-by: Hoan Tran 
Signed-off-by: Hoan Tran 
Signed-off-by: Mike Rapoport 
---
 arch/powerpc/Kconfig |  9 -
 arch/sparc/Kconfig   |  9 -
 arch/x86/Kconfig |  9 -
 mm/page_alloc.c  | 20 
 4 files changed, 47 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 5f86b22b7d2c..74f316deeae1 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -685,15 +685,6 @@ config ARCH_MEMORY_PROBE
def_bool y
depends on MEMORY_HOTPLUG
 
-# Some NUMA nodes have memory ranges that span
-# other nodes.  Even though a pfn is valid and
-# between a node's start and end pfns, it may not
-# reside on that node.  See memmap_init_zone()
-# for details.
-config NODES_SPAN_OTHER_NODES
-   def_bool y
-   depends on NEED_MULTIPLE_NODES
-
 config STDBINUTILS
bool "Using standard binutils settings"
depends on 44x
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 795206b7b552..0e4f3891b904 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -286,15 +286,6 @@ config NODES_SHIFT
  Specify the maximum number of NUMA Nodes available on the target
  system.  Increases memory reserved to accommodate various tables.
 
-# Some NUMA nodes have memory ranges that span
-# other nodes.  Even though a pfn is valid and
-# between a node's start and end pfns, it may not
-# reside on that node.  See memmap_init_zone()
-# for details.
-config NODES_SPAN_OTHER_NODES
-   def_bool y
-   depends on NEED_MULTIPLE_NODES
-
 config ARCH_SPARSEMEM_ENABLE
def_bool y if SPARC64
select SPARSEMEM_VMEMMAP_ENABLE
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index f8bf218a169c..1ec2a5e2fef6 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1581,15 +1581,6 @@ config X86_64_ACPI_NUMA
---help---
  Enable ACPI SRAT based node topology detection.
 
-# Some NUMA nodes have memory ranges that span
-# other nodes.  Even though a pfn is valid and
-# between a node's start and end pfns, it may not
-# reside on that node.  See memmap_init_zone()
-# for details.
-config NODES_SPAN_OTHER_NODES
-   def_bool y
-   depends on X86_64_ACPI_NUMA
-
 config NUMA_EMU
bool "NUMA emulation"
depends on NUMA
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8d112defaead..d35ca0996a09 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1541,26 +1541,6 @@ int __meminit early_pfn_to_nid(unsigned long pfn)
 }
 #endif /* CONFIG_NEED_MULTIPLE_NODES */
 
-#ifdef CONFIG_NODES_SPAN_OTHER_NODES
-/* Only safe to use early in boot when initialisation is single-threaded */
-static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
-{
-   int nid;
-
-   nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
-   if (nid >= 0 && nid != node)
-   return false;
-   return true;
-}
-
-#else
-static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
-{
-   return true;
-}
-#endif
-
-
 void __init memblock_free_pages(struct page *page, unsigned long pfn,
unsigned int order)
 {
-- 
2.26.1



Re: New powerpc vdso calling convention

2020-04-29 Thread Adhemerval Zanella



On 28/04/2020 23:39, Nicholas Piggin wrote:
> Excerpts from Adhemerval Zanella's message of April 27, 2020 11:09 pm:
>>
>>
>> On 26/04/2020 00:41, Nicholas Piggin wrote:
>>> Excerpts from Rich Felker's message of April 26, 2020 9:11 am:
 On Sun, Apr 26, 2020 at 08:58:19AM +1000, Nicholas Piggin wrote:
> Excerpts from Christophe Leroy's message of April 25, 2020 10:20 pm:
>>
>>
>> Le 25/04/2020 à 12:56, Nicholas Piggin a écrit :
>>> Excerpts from Christophe Leroy's message of April 25, 2020 5:47 pm:


 Le 25/04/2020 à 07:22, Nicholas Piggin a écrit :
> As noted in the 'scv' thread, powerpc's vdso calling convention does 
> not
> match the C ELF ABI calling convention (or the proposed scv 
> convention).
> I think we could implement a new ABI by basically duplicating function
> entry points with different names.

 I think doing this is a real good idea.

 I've been working at porting powerpc VDSO to the GENERIC C VDSO, and 
 the
 main pitfall has been that our vdso calling convention is not 
 compatible
 with the C calling convention, so we have to go through an ASM entry/exit.

 See 
 https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=171469

 We should kill this error flag return through CR[SO] and get it the
 "modern" way like other architectectures implementing the C VDSO: 
 return
 0 when successfull, return -err when failed.
>>>
>>> Agreed.
>>>
> The ELF v2 ABI convention would suit it well, because the caller 
> already
> requires the function address for ctr, so having it in r12 will
> eliminate the need for address calculation, which suits the vdso data
> page access.
>
> Is there a need for ELF v1 specific calls as well, or could those 
> just be
> deprecated and remain on existing functions or required to use the ELF
> v2 calls using asm wrappers?

 What's ELF v1 and ELF v2 ? Is ELF v1 what PPC32 uses ? If so, I'd say
 yes, it would be good to have it to avoid going through ASM in the 
 middle.
>>>
>>> I'm not sure about PPC32. On PPC64, ELFv2 functions must be called with
>>> their address in r12 if called at their global entry point. ELFv1 functions
>>> have a function descriptor with call address and TOC in it; the caller has
>>> to load the TOC if it's global.
>>>
>>> The vdso doesn't have TOC, it has one global address (the vdso data
>>> page) which it loads by calculating its own address.
>>>
>>> The kernel doesn't change the vdso based on whether it's called by a v1
>>> or v2 userspace (it doesn't really know itself and would have to export
>>> different functions). glibc has a hack to create something:
>>>
>>> # define VDSO_IFUNC_RET(value)   \
>>>({ \
>>>  static Elf64_FuncDesc vdso_opd = { .fd_toc = ~0x0 }; \
>>>  vdso_opd.fd_func = (Elf64_Addr)value;\
>>>  &vdso_opd;   \
>>>})
>>>
>>> If we could make something which links more like any other dso with
>>> ELFv1, that would be good. Otherwise I think v2 is preferable so it
>>> doesn't have to calculate its own address.
>>
>> I see the following in glibc. So looks like PPC32 is like PPC64 elfv1. 
>> By the way, they are talking about something not completely finished in 
>> the kernel. Can we finish it ?
>
> Possibly can. It seems like a good idea to fix all loose ends if we are 
> going to add new versions. Will have to check with the toolchain people 
> to make sure we're doing the right thing.

 "ELFv1" and "ELFv2" are PPC64-specific names for the old and new
 version of the ELF psABI for PPC64. They have nothing at all to do
 with PPC32 which is a completely different ABI from either.
>>>
>>> Right, I'm just talking about those comments -- it seems like the kernel 
>>> vdso should contain an .opd section with function descriptors in it for
>>> elfv1 calls, rather than the hack it has now of creating one in the 
>>> caller's .data section.
>>>
>>> But all that function descriptor code is gated by
>>>
>>> #if (defined(__PPC64__) || defined(__powerpc64__)) && _CALL_ELF != 2
>>>
>>> So it seems PPC32 does not use function descriptors but a direct pointer 
>>> to the entry point like PPC64 with ELFv2.
>>
>> Yes, this hack is only for ELFv1.  The missing OPD has not been an issue
>> for glibc because it has been using inline assembly to emulate the
>> function calls since initial vDSO support (INTERNAL_VSYSCALL_CALL_TYPE).
>> It just has become an issue when I added an ifunc optimization to
>> gettimeofday so it c

[PATCH v2 17/20] mm: free_area_init: allow defining max_zone_pfn in descending order

2020-04-29 Thread Mike Rapoport
From: Mike Rapoport 

Some architectures (e.g. ARC) have the ZONE_HIGHMEM zone below
ZONE_NORMAL. Allowing free_area_init() to parse the max_zone_pfn array
even when it is sorted in descending order allows using free_area_init()
on such architectures.

Add top -> down traversal of the max_zone_pfn array in free_area_init()
and use the latter in ARC node/zone initialization.
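
For illustration (the PFN values are invented), the array an ARC kernel
hands to free_area_init() with highmem physically below lowmem looks like:

	unsigned long max_zone_pfn[MAX_NR_ZONES] = {
		[ZONE_NORMAL]  = 0xa0000,	/* lowmem ends here */
		[ZONE_HIGHMEM] = 0x60000,	/* highmem sits below lowmem */
	};

	/* max_zone_pfn[0] > max_zone_pfn[1] triggers the new
	 * top -> down traversal added below */
	free_area_init(max_zone_pfn);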

Signed-off-by: Mike Rapoport 
---
 arch/arc/mm/init.c | 36 +++-
 mm/page_alloc.c| 24 +++-
 2 files changed, 26 insertions(+), 34 deletions(-)

diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c
index 0920c969c466..41eb9be1653c 100644
--- a/arch/arc/mm/init.c
+++ b/arch/arc/mm/init.c
@@ -63,11 +63,13 @@ void __init early_init_dt_add_memory_arch(u64 base, u64 
size)
 
low_mem_sz = size;
in_use = 1;
+   memblock_add_node(base, size, 0);
} else {
 #ifdef CONFIG_HIGHMEM
high_mem_start = base;
high_mem_sz = size;
in_use = 1;
+   memblock_add_node(base, size, 1);
 #endif
}
 
@@ -83,8 +85,7 @@ void __init early_init_dt_add_memory_arch(u64 base, u64 size)
  */
 void __init setup_arch_memory(void)
 {
-   unsigned long zones_size[MAX_NR_ZONES];
-   unsigned long zones_holes[MAX_NR_ZONES];
+   unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 };
 
init_mm.start_code = (unsigned long)_text;
init_mm.end_code = (unsigned long)_etext;
@@ -115,7 +116,6 @@ void __init setup_arch_memory(void)
 * the crash
 */
 
-   memblock_add_node(low_mem_start, low_mem_sz, 0);
memblock_reserve(CONFIG_LINUX_LINK_BASE,
 __pa(_end) - CONFIG_LINUX_LINK_BASE);
 
@@ -133,22 +133,7 @@ void __init setup_arch_memory(void)
memblock_dump_all();
 
/*- node/zones setup --*/
-   memset(zones_size, 0, sizeof(zones_size));
-   memset(zones_holes, 0, sizeof(zones_holes));
-
-   zones_size[ZONE_NORMAL] = max_low_pfn - min_low_pfn;
-   zones_holes[ZONE_NORMAL] = 0;
-
-   /*
-* We can't use the helper free_area_init(zones[]) because it uses
-* PAGE_OFFSET to compute the @min_low_pfn which would be wrong
-* when our kernel doesn't start at PAGE_OFFSET, i.e.
-* PAGE_OFFSET != CONFIG_LINUX_RAM_BASE
-*/
-   free_area_init_node(0,  /* node-id */
-   zones_size, /* num pages per zone */
-   min_low_pfn,/* first pfn of node */
-   zones_holes);   /* holes */
+   max_zone_pfn[ZONE_NORMAL] = max_low_pfn;
 
 #ifdef CONFIG_HIGHMEM
/*
@@ -168,20 +153,13 @@ void __init setup_arch_memory(void)
min_high_pfn = PFN_DOWN(high_mem_start);
max_high_pfn = PFN_DOWN(high_mem_start + high_mem_sz);
 
-   zones_size[ZONE_NORMAL] = 0;
-   zones_holes[ZONE_NORMAL] = 0;
-
-   zones_size[ZONE_HIGHMEM] = max_high_pfn - min_high_pfn;
-   zones_holes[ZONE_HIGHMEM] = 0;
-
-   free_area_init_node(1,  /* node-id */
-   zones_size, /* num pages per zone */
-   min_high_pfn,   /* first pfn of node */
-   zones_holes);   /* holes */
+   max_zone_pfn[ZONE_HIGHMEM] = max_high_pfn;
 
high_memory = (void *)(min_high_pfn << PAGE_SHIFT);
kmap_init();
 #endif
+
+   free_area_init(max_zone_pfn);
 }
 
 /*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d35ca0996a09..98a47f90065a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -7408,7 +7408,8 @@ static void check_for_memory(pg_data_t *pgdat, int nid)
 void __init free_area_init(unsigned long *max_zone_pfn)
 {
unsigned long start_pfn, end_pfn;
-   int i, nid;
+   int i, nid, zone;
+   bool descending = false;
 
/* Record where the zone boundaries are */
memset(arch_zone_lowest_possible_pfn, 0,
@@ -7418,13 +7419,26 @@ void __init free_area_init(unsigned long *max_zone_pfn)
 
start_pfn = find_min_pfn_with_active_regions();
 
+   /*
+* Some architectures, e.g. ARC, may have ZONE_HIGHMEM below
+* ZONE_NORMAL. For such cases we allow max_zone_pfn sorted in
+* descending order
+*/
+   if (MAX_NR_ZONES > 1 && max_zone_pfn[0] > max_zone_pfn[1])
+   descending = true;
+
for (i = 0; i < MAX_NR_ZONES; i++) {
-   if (i == ZONE_MOVABLE)
+   if (descending)
+   zone = MAX_NR_ZONES - i - 1;
+   else
+   zone = i;
+
+   if (zone == ZONE_MOVABLE)
continue;
 
-   end_pfn = max(max_zone_pfn[i], start_pfn);
-   arch_zone_lowest_possible_pfn[i] = start_pfn;
-   arch_zone_highest_possible_pfn[i] = end_pfn;
+  

[PATCH v2 18/20] mm: clean up free_area_init_node() and its helpers

2020-04-29 Thread Mike Rapoport
From: Mike Rapoport 

free_area_init_node() now always uses memblock info and the zone PFN
limits, so it does not need the backwards compatibility functions to
calculate the zone spanned and absent pages. The removal of the compat_
versions of zone_{absent,spanned}_pages_in_node(), in turn, makes the
zone_size and zhole_size parameters unused.

The node_start_pfn is determined by get_pfn_range_for_nid(), so there is no
need to pass it to free_area_init_node().

As a result, the only required parameter to free_area_init_node() is the
node ID; all the rest are removed along with the no longer used
compat_zone_{absent,spanned}_pages_in_node() helpers.
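
The resulting interface change boils down to the following sketch (array
setup elided):

	/* before: callers fabricated per-zone size and hole arrays */
	free_area_init_node(nid, zones_size, node_start_pfn, zholes_size);

	/* after: the node ID is enough; everything else comes from
	 * memblock and the zone PFN limits recorded by free_area_init() */
	free_area_init_node(nid);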

Signed-off-by: Mike Rapoport 
---
 arch/x86/mm/numa.c |   2 +-
 include/linux/mm.h |   7 +--
 mm/page_alloc.c| 110 +
 3 files changed, 24 insertions(+), 95 deletions(-)

diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index fe024b2ac796..0e1b99f491e4 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -742,7 +742,7 @@ static void __init init_memory_less_node(int nid)
 
/* Allocate and initialize node data. Memory-less node is now online.*/
alloc_node_data(nid);
-   free_area_init_node(nid, zones_size, 0, zholes_size);
+   free_area_init_node(nid);
 
/*
 * All zonelists will be built later in start_kernel() after per cpu
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1c2ecb42e043..2c0d42b11f3c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2272,8 +2272,7 @@ static inline spinlock_t *pud_lock(struct mm_struct *mm, 
pud_t *pud)
 }
 
 extern void __init pagecache_init(void);
-extern void __init free_area_init_node(int nid, unsigned long * zones_size,
-   unsigned long zone_start_pfn, unsigned long *zholes_size);
+extern void __init free_area_init_node(int nid);
 extern void free_initmem(void);
 
 /*
@@ -2346,9 +2345,7 @@ static inline unsigned long get_num_physpages(void)
 /*
  * Using memblock node mappings, an architecture may initialise its
  * zones, allocate the backing mem_map and account for memory holes in a more
- * architecture independent manner. This is a substitute for creating the
- * zone_sizes[] and zholes_size[] arrays and passing them to
- * free_area_init_node()
+ * architecture independent manner.
  *
  * An architecture is expected to register range of page frames backed by
  * physical memory with memblock_add[_node]() before calling
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 98a47f90065a..30d171451d4c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6420,8 +6420,7 @@ static unsigned long __init 
zone_spanned_pages_in_node(int nid,
unsigned long node_start_pfn,
unsigned long node_end_pfn,
unsigned long *zone_start_pfn,
-   unsigned long *zone_end_pfn,
-   unsigned long *ignored)
+   unsigned long *zone_end_pfn)
 {
unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
@@ -6485,8 +6484,7 @@ unsigned long __init absent_pages_in_range(unsigned long 
start_pfn,
 static unsigned long __init zone_absent_pages_in_node(int nid,
unsigned long zone_type,
unsigned long node_start_pfn,
-   unsigned long node_end_pfn,
-   unsigned long *ignored)
+   unsigned long node_end_pfn)
 {
unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
@@ -6533,43 +6531,9 @@ static unsigned long __init 
zone_absent_pages_in_node(int nid,
return nr_absent;
 }
 
-static inline unsigned long __init compat_zone_spanned_pages_in_node(int nid,
-   unsigned long zone_type,
-   unsigned long node_start_pfn,
-   unsigned long node_end_pfn,
-   unsigned long *zone_start_pfn,
-   unsigned long *zone_end_pfn,
-   unsigned long *zones_size)
-{
-   unsigned int zone;
-
-   *zone_start_pfn = node_start_pfn;
-   for (zone = 0; zone < zone_type; zone++)
-   *zone_start_pfn += zones_size[zone];
-
-   *zone_end_pfn = *zone_start_pfn + zones_size[zone_type];
-
-   return zones_size[zone_type];
-}
-
-static inline unsigned long __init compat_zone_absent_pages_in_node(int nid,
-   unsigned long zone_type,
-   u

[PATCH v2 19/20] mm: simplify find_min_pfn_with_active_regions()

2020-04-29 Thread Mike Rapoport
From: Mike Rapoport 

find_min_pfn_with_active_regions() calls find_min_pfn_for_node() with the
nid parameter set to MAX_NUMNODES. This makes find_min_pfn_for_node()
traverse all memblock memory regions, although the first PFN in the system
can be easily found with memblock_start_of_DRAM().

Use memblock_start_of_DRAM() in find_min_pfn_with_active_regions() and drop
now unused find_min_pfn_for_node().

Signed-off-by: Mike Rapoport 
---
 mm/page_alloc.c | 20 +---
 1 file changed, 1 insertion(+), 19 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 30d171451d4c..b990e9734474 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -7045,24 +7045,6 @@ unsigned long __init node_map_pfn_alignment(void)
return ~accl_mask + 1;
 }
 
-/* Find the lowest pfn for a node */
-static unsigned long __init find_min_pfn_for_node(int nid)
-{
-   unsigned long min_pfn = ULONG_MAX;
-   unsigned long start_pfn;
-   int i;
-
-   for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL)
-   min_pfn = min(min_pfn, start_pfn);
-
-   if (min_pfn == ULONG_MAX) {
-   pr_warn("Could not find start_pfn for node %d\n", nid);
-   return 0;
-   }
-
-   return min_pfn;
-}
-
 /**
  * find_min_pfn_with_active_regions - Find the minimum PFN registered
  *
@@ -7071,7 +7053,7 @@ static unsigned long __init find_min_pfn_for_node(int nid)
  */
 unsigned long __init find_min_pfn_with_active_regions(void)
 {
-   return find_min_pfn_for_node(MAX_NUMNODES);
+   return PHYS_PFN(memblock_start_of_DRAM());
 }
 
 /*
-- 
2.26.1



[PATCH v2 20/20] docs/vm: update memory-models documentation

2020-04-29 Thread Mike Rapoport
From: Mike Rapoport 

to reflect the updates to the free_area_init() family of functions.

Signed-off-by: Mike Rapoport 
---
 Documentation/vm/memory-model.rst | 9 -
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/Documentation/vm/memory-model.rst 
b/Documentation/vm/memory-model.rst
index 58a12376b7df..91228044ed16 100644
--- a/Documentation/vm/memory-model.rst
+++ b/Documentation/vm/memory-model.rst
@@ -46,11 +46,10 @@ maps the entire physical memory. For most architectures, 
the holes
 have entries in the `mem_map` array. The `struct page` objects
 corresponding to the holes are never fully initialized.
 
-To allocate the `mem_map` array, architecture specific setup code
-should call :c:func:`free_area_init_node` function or its convenience
-wrapper :c:func:`free_area_init`. Yet, the mappings array is not
-usable until the call to :c:func:`memblock_free_all` that hands all
-the memory to the page allocator.
+To allocate the `mem_map` array, architecture specific setup code should
+call the :c:func:`free_area_init` function. Yet, the mappings array is not
+usable until the call to :c:func:`memblock_free_all` that hands all the
+memory to the page allocator.
 
 If an architecture enables `CONFIG_ARCH_HAS_HOLES_MEMORYMODEL` option,
 it may free parts of the `mem_map` array that do not cover the
-- 
2.26.1



Re: [PATCH v2 3/3] mm/page_alloc: Keep memoryless cpuless node 0 offline

2020-04-29 Thread Michal Hocko
On Wed 29-04-20 07:11:45, Srikar Dronamraju wrote:
> > > 
> > > By marking, N_ONLINE as NODE_MASK_NONE, lets stop assuming that Node 0 is
> > > always online.
> > > 
> > > ...
> > >
> > > --- a/mm/page_alloc.c
> > > +++ b/mm/page_alloc.c
> > > @@ -116,8 +116,10 @@ EXPORT_SYMBOL(latent_entropy);
> > >   */
> > >  nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
> > >   [N_POSSIBLE] = NODE_MASK_ALL,
> > > +#ifdef CONFIG_NUMA
> > > + [N_ONLINE] = NODE_MASK_NONE,
> > > +#else
> > >   [N_ONLINE] = { { [0] = 1UL } },
> > > -#ifndef CONFIG_NUMA
> > >   [N_NORMAL_MEMORY] = { { [0] = 1UL } },
> > >  #ifdef CONFIG_HIGHMEM
> > >   [N_HIGH_MEMORY] = { { [0] = 1UL } },
> > 
> > So on all other NUMA machines, when does node 0 get marked online?
> > 
> > This change means that for some time during boot, such machines will
> > now be running with node 0 marked as offline.  What are the
> > implications of this?  Will something break?
> 
> Till the nodes are detected, marking Node 0 as online tends to be redundant,
> because the system doesn't know if it's a NUMA or a non-NUMA system.
> Once we detect the nodes, we online them immediately. Hence I don't see any
> side-effects or negative implications of this change.
> 
> However if I am missing anything, please do let me know.
> 
> From my part, I have tested this on
> 1. Non-NUMA Single node but CPUs and memory coming from zero node.
> 2. Non-NUMA Single node but CPUs and memory coming from non-zero node.
> 3. NUMA Multi node but with CPUs and memory from node 0.
> 4. NUMA Multi node but with no CPUs and memory from node 0.

Have you tested on anything other than ppc? Each arch does the NUMA
setup separately and this is a big mess. E.g. x86 marks even memoryless
nodes (see init_memory_less_node) as online.

Honestly, I have a hard time evaluating the effect of this patch. It makes
some sense to assume all nodes are offline before they get onlined, but this
is land-mine territory.

I am also not sure what kind of problem this is going to address. You
have mentioned numa balancing without many details.
-- 
Michal Hocko
SUSE Labs


Re: [PATCH v4 3/7] KVM: PPC: Remove redundant kvm_run from vcpu_arch

2020-04-29 Thread Vitaly Kuznetsov
Tianjia Zhang  writes:

> The 'kvm_run' field already exists in the 'vcpu' structure and is the
> same as the 'kvm_run' in 'vcpu_arch', so the copy in 'vcpu_arch' is
> redundant and should be deleted.
>
> Signed-off-by: Tianjia Zhang 
> ---
>  arch/powerpc/include/asm/kvm_host.h | 1 -
>  arch/powerpc/kvm/book3s_hv.c| 6 ++
>  arch/powerpc/kvm/book3s_hv_nested.c | 3 +--
>  3 files changed, 3 insertions(+), 7 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/kvm_host.h 
> b/arch/powerpc/include/asm/kvm_host.h
> index 1dc63101ffe1..2745ff8faa01 100644
> --- a/arch/powerpc/include/asm/kvm_host.h
> +++ b/arch/powerpc/include/asm/kvm_host.h
> @@ -795,7 +795,6 @@ struct kvm_vcpu_arch {
>   struct mmio_hpte_cache_entry *pgfault_cache;
>  
>   struct task_struct *run_task;
> - struct kvm_run *kvm_run;
>  
>   spinlock_t vpa_update_lock;
>   struct kvmppc_vpa vpa;
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index 93493f0cbfe8..413ea2dcb10c 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -2934,7 +2934,7 @@ static void post_guest_process(struct kvmppc_vcore *vc, 
> bool is_master)
>  
>   ret = RESUME_GUEST;
>   if (vcpu->arch.trap)
> - ret = kvmppc_handle_exit_hv(vcpu->arch.kvm_run, vcpu,
> + ret = kvmppc_handle_exit_hv(vcpu->run, vcpu,
>   vcpu->arch.run_task);
>  
>   vcpu->arch.ret = ret;
> @@ -3920,7 +3920,6 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, 
> struct kvm_vcpu *vcpu)
>   spin_lock(&vc->lock);
>   vcpu->arch.ceded = 0;
>   vcpu->arch.run_task = current;
> - vcpu->arch.kvm_run = kvm_run;
>   vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb());
>   vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
>   vcpu->arch.busy_preempt = TB_NIL;
> @@ -3973,7 +3972,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, 
> struct kvm_vcpu *vcpu)
>   if (signal_pending(v->arch.run_task)) {
>   kvmppc_remove_runnable(vc, v);
>   v->stat.signal_exits++;
> - v->arch.kvm_run->exit_reason = KVM_EXIT_INTR;
> + v->run->exit_reason = KVM_EXIT_INTR;
>   v->arch.ret = -EINTR;
>   wake_up(&v->arch.cpu_run);
>   }
> @@ -4049,7 +4048,6 @@ int kvmhv_run_single_vcpu(struct kvm_run *kvm_run,
>   vc = vcpu->arch.vcore;
>   vcpu->arch.ceded = 0;
>   vcpu->arch.run_task = current;
> - vcpu->arch.kvm_run = kvm_run;
>   vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb());
>   vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
>   vcpu->arch.busy_preempt = TB_NIL;
> diff --git a/arch/powerpc/kvm/book3s_hv_nested.c 
> b/arch/powerpc/kvm/book3s_hv_nested.c
> index dc97e5be76f6..5a3987f3ebf3 100644
> --- a/arch/powerpc/kvm/book3s_hv_nested.c
> +++ b/arch/powerpc/kvm/book3s_hv_nested.c
> @@ -290,8 +290,7 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
>   r = RESUME_HOST;
>   break;
>   }
> - r = kvmhv_run_single_vcpu(vcpu->arch.kvm_run, vcpu, hdec_exp,
> -   lpcr);
> + r = kvmhv_run_single_vcpu(vcpu->run, vcpu, hdec_exp, lpcr);
>   } while (is_kvmppc_resume_guest(r));
>  
>   /* save L2 state for return */

FWIW,

Reviewed-by: Vitaly Kuznetsov 

-- 
Vitaly



Re: [PATCH v4 4/7] KVM: PPC: clean up redundant 'kvm_run' parameters

2020-04-29 Thread Vitaly Kuznetsov
Tianjia Zhang  writes:

> In the current kvm version, 'kvm_run' has been included in the 'kvm_vcpu'
> structure. For historical reasons, many kvm-related functions still take
> both 'kvm_run' and 'kvm_vcpu' parameters at the same time. This patch does
> a unified cleanup of these remaining redundant parameters.
>
> Signed-off-by: Tianjia Zhang 
> ---
>  arch/powerpc/include/asm/kvm_book3s.h| 16 +++---
>  arch/powerpc/include/asm/kvm_ppc.h   | 27 +
>  arch/powerpc/kvm/book3s.c|  4 +-
>  arch/powerpc/kvm/book3s.h|  2 +-
>  arch/powerpc/kvm/book3s_64_mmu_hv.c  | 12 ++--
>  arch/powerpc/kvm/book3s_64_mmu_radix.c   |  4 +-
>  arch/powerpc/kvm/book3s_emulate.c| 10 ++--
>  arch/powerpc/kvm/book3s_hv.c | 60 ++--
>  arch/powerpc/kvm/book3s_hv_nested.c  | 11 ++--
>  arch/powerpc/kvm/book3s_paired_singles.c | 72 
>  arch/powerpc/kvm/book3s_pr.c | 30 +-
>  arch/powerpc/kvm/booke.c | 36 ++--
>  arch/powerpc/kvm/booke.h |  8 +--
>  arch/powerpc/kvm/booke_emulate.c |  2 +-
>  arch/powerpc/kvm/e500_emulate.c  | 15 +++--
>  arch/powerpc/kvm/emulate.c   | 10 ++--
>  arch/powerpc/kvm/emulate_loadstore.c | 32 +--
>  arch/powerpc/kvm/powerpc.c   | 72 
>  arch/powerpc/kvm/trace_hv.h  |  6 +-
>  19 files changed, 212 insertions(+), 217 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/kvm_book3s.h 
> b/arch/powerpc/include/asm/kvm_book3s.h
> index 506e4df2d730..66dbb1f85d59 100644
> --- a/arch/powerpc/include/asm/kvm_book3s.h
> +++ b/arch/powerpc/include/asm/kvm_book3s.h
> @@ -155,12 +155,11 @@ extern void kvmppc_mmu_unmap_page(struct kvm_vcpu 
> *vcpu, struct kvmppc_pte *pte)
>  extern int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr);
>  extern void kvmppc_mmu_flush_segment(struct kvm_vcpu *vcpu, ulong eaddr, 
> ulong seg_size);
>  extern void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu);
> -extern int kvmppc_book3s_hv_page_fault(struct kvm_run *run,
> - struct kvm_vcpu *vcpu, unsigned long addr,
> - unsigned long status);
> +extern int kvmppc_book3s_hv_page_fault(struct kvm_vcpu *vcpu,
> + unsigned long addr, unsigned long status);
>  extern long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr,
>   unsigned long slb_v, unsigned long valid);
> -extern int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu,
> +extern int kvmppc_hv_emulate_mmio(struct kvm_vcpu *vcpu,
>   unsigned long gpa, gva_t ea, int is_store);
>  
>  extern void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct 
> hpte_cache *pte);
> @@ -174,8 +173,7 @@ extern void kvmppc_mmu_hpte_sysexit(void);
>  extern int kvmppc_mmu_hv_init(void);
>  extern int kvmppc_book3s_hcall_implemented(struct kvm *kvm, unsigned long 
> hc);
>  
> -extern int kvmppc_book3s_radix_page_fault(struct kvm_run *run,
> - struct kvm_vcpu *vcpu,
> +extern int kvmppc_book3s_radix_page_fault(struct kvm_vcpu *vcpu,
>   unsigned long ea, unsigned long dsisr);
>  extern unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid,
>   gva_t eaddr, void *to, void *from,
> @@ -234,7 +232,7 @@ extern void kvmppc_trigger_fac_interrupt(struct kvm_vcpu 
> *vcpu, ulong fac);
>  extern void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat,
>  bool upper, u32 val);
>  extern void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr);
> -extern int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu 
> *vcpu);
> +extern int kvmppc_emulate_paired_single(struct kvm_vcpu *vcpu);
>  extern kvm_pfn_t kvmppc_gpa_to_pfn(struct kvm_vcpu *vcpu, gpa_t gpa,
>   bool writing, bool *writable);
>  extern void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry 
> *rev,
> @@ -300,12 +298,12 @@ void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, 
> u64 dw1);
>  void kvmhv_release_all_nested(struct kvm *kvm);
>  long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu);
>  long kvmhv_do_nested_tlbie(struct kvm_vcpu *vcpu);
> -int kvmhv_run_single_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu,
> +int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu,
> u64 time_limit, unsigned long lpcr);
>  void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr);
>  void kvmhv_restore_hv_return_state(struct kvm_vcpu *vcpu,
>  struct hv_guest_state *hr);
> -long int kvmhv_nested_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu);
> +long int kvmhv_nested_page_fault(struct kvm_vcpu *vcpu);
>  
>  void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac);
>  
> diff --git a/arch/powerpc/include/asm/kvm_ppc

Re: [PATCH] fixup! signal: factor copy_siginfo_to_external32 from copy_siginfo_to_user32

2020-04-29 Thread Arnd Bergmann
On Wed, Apr 29, 2020 at 1:53 PM Christoph Hellwig  wrote:
>
> I did another pass at this, reducing the overhead of the x32 magic
> in common code down to renaming copy_siginfo_to_user32 to
> copy_siginfo_to_user32 and having a conditional #define to give it
> the old name back:

Nice! I guess this is about as good as it gets, so we can stop
spending more time on it now ;-)

   Arnd


Re: [RFC PATCH dpss_eth] Don't initialise ports with no PHY

2020-04-29 Thread Andrew Lunn
> Maybe we have to modify the dtb file.

Hi Christian

Could you point me at the DT file.

  Thanks
Andrew


Re: [Skiboot] [PATCH v8 1/3] Self Save: Introducing Support for SPR Self Save

2020-04-29 Thread Vaidyanathan Srinivasan
* Pratik Rajesh Sampat  [2020-04-23 16:24:36]:

> From: Prem Shanker Jha 
> 
> The commit is a merger of commits that makes the following changes:
> 1. Commit fixes some issues with code found during integration test
>   -  replacement of addi with xor instruction during self save API.
>   -  fixing instruction generation for MFMSR during self save
>   -  data struct updates in STOP API
>   -  error RC updates for hcode image build
>   -  HOMER parser updates.
>   -  removed self save support for URMOR and HRMOR
>   -  code changes for compilation with OPAL
>   -  populating CME Image header with unsecure HOMER address.
> 
> Key_Cronus_Test=PM_REGRESS
> 
> Change-Id: I7cedcc466267c4245255d8d75c01ed695e316720
> Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/66580
> Tested-by: FSP CI Jenkins 
> Tested-by: HWSV CI 
> Tested-by: PPE CI 
> Tested-by: Jenkins Server 
> Tested-by: Cronus HW CI 
> Tested-by: Hostboot CI 
> Reviewed-by: Gregory S. Still 
> Reviewed-by: RAHUL BATRA 
> Reviewed-by: Jennifer A. Stofer 
> Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/66587
> Reviewed-by: Christian R. Geddes 
> Signed-off-by: Prem Shanker Jha 
> Signed-off-by: Akshay Adiga 
> Signed-off-by: Pratik Rajesh Sampat 

Reviewed-by: Vaidyanathan Srinivasan 

> 2. The commit also incorporates changes that make STOP API project
> agnostic changes include defining wrapper functions which call legacy
> API. It also adds duplicate enum members which start with prefix PROC
> instead of P9.
> 
> Key_Cronus_Test=PM_REGRESS
> 
> Change-Id: If87970f3e8cf9b507f33eb1be249e03eb3836a5e
> RTC: 201128
> Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/71307
> Tested-by: FSP CI Jenkins 
> Tested-by: Jenkins Server 
> Tested-by: Hostboot CI 
> Tested-by: Cronus HW CI 
> Reviewed-by: RANGANATHPRASAD G. BRAHMASAMUDRA 
> Reviewed-by: Gregory S. Still 
> Reviewed-by: Jennifer A Stofer 
> Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/71314
> Tested-by: Jenkins OP Build CI 
> Tested-by: Jenkins OP HW 
> Reviewed-by: Daniel M. Crowell 
> Signed-off-by: Prem Shanker Jha 
> Signed-off-by: Pratik Rajesh Sampat 
> ---
>  include/p9_stop_api.H|  79 +-
>  libpore/p9_cpu_reg_restore_instruction.H |   4 +
>  libpore/p9_stop_api.C| 954 +--
>  libpore/p9_stop_api.H| 115 ++-
>  libpore/p9_stop_data_struct.H|   4 +-
>  libpore/p9_stop_util.H   |   7 +-
>  6 files changed, 721 insertions(+), 442 deletions(-)


These code changes are from the hcode component, copied as-is into the OPAL
project for integration and use of the stop-api.

The hcode component, in cooperation with hostboot, is loaded in memory
before OPAL is loaded. This code will allow runtime changes and usage
of various power management save/restore functions.

This patch specifically enables the self-save feature provided by the
microcode.

--Vaidy



Re: [PATCH v8 2/3] API to verify the STOP API and image compatibility

2020-04-29 Thread Vaidyanathan Srinivasan
* Pratik Rajesh Sampat  [2020-04-23 16:24:37]:

> From: Prem Shanker Jha 
> 
> Commit defines a new API primarily intended for OPAL to determine
> cpu register save API's compatibility with HOMER layout and
> self save restore. It can help OPAL determine if version of
> API integrated with OPAL is different from hostboot.
> 
> Change-Id: Ic0de45a336cfb8b6b6096a10ac1cd3ffbaa44fc0
> Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/77612
> Tested-by: FSP CI Jenkins 
> Tested-by: Jenkins Server 
> Tested-by: Hostboot CI 
> Reviewed-by: RANGANATHPRASAD G. BRAHMASAMUDRA 
> Reviewed-by: Gregory S Still 
> Reviewed-by: Jennifer A Stofer 
> Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/77614
> Tested-by: Jenkins OP Build CI 
> Tested-by: Jenkins OP HW 
> Reviewed-by: Daniel M Crowell 
> Signed-off-by: Pratik Rajesh Sampat 

Reviewed-by: Vaidyanathan Srinivasan 

> ---
>  include/p9_stop_api.H| 25 ++
>  libpore/p9_cpu_reg_restore_instruction.H |  7 ++-
>  libpore/p9_hcd_memmap_base.H |  7 +++
>  libpore/p9_stop_api.C| 58 +++-
>  libpore/p9_stop_api.H| 26 ++-
>  libpore/p9_stop_util.H   | 20 
>  6 files changed, 130 insertions(+), 13 deletions(-)

This stop-api code will help OPAL check for and use the self-save functions,
so that different versions of OPAL can be loaded and run with different
versions of the low-level firmware stack.
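
The intended usage pattern on the OPAL side is roughly the following sketch;
the function and flag names here are placeholders, the real ones live in
libpore/p9_stop_api.H:

	/* Hypothetical names: OPAL asks the STOP API whether the loaded
	 * hcode supports self-save before advertising the feature. */
	uint64_t caps = 0;

	if (stop_api_discover_capability(&caps) != STOP_SAVE_SUCCESS)
		return;	/* incompatible HOMER layout: stick to self-restore */

	if (caps & SELF_SAVE_SUPPORTED)
		advertise_self_save_in_device_tree();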

--Vaidy


Re: [Skiboot] [PATCH v8 3/3] Self save API integration

2020-04-29 Thread Vaidyanathan Srinivasan
* Pratik Rajesh Sampat  [2020-04-23 16:24:38]:

> The commit makes the self save API available outside the firmware by defining
> an OPAL wrapper.
> This wrapper has a similar interface to that of self restore and expects the
> cpu pir and SPR number (but, unlike self restore, not the value of that SPR)
> to be passed in its parameters, and returns OPAL_SUCCESS on success. It adds
> a device-tree node signifying support for self-save after verifying the stop
> API version compatibility.
> 
> The commit also documents both the self-save and the self-restore API
> calls along with their working and usage.
> 
> Signed-off-by: Pratik Rajesh Sampat 

Reviewed-by: Vaidyanathan Srinivasan 

> ---
>  doc/opal-api/opal-slw-self-save-reg-181.rst |  51 ++
>  doc/opal-api/opal-slw-set-reg-100.rst   |   5 +
>  doc/power-management.rst|  48 +
>  hw/slw.c| 106 
>  include/opal-api.h  |   3 +-
>  include/p9_stop_api.H   |  18 
>  include/skiboot.h   |   3 +
>  7 files changed, 233 insertions(+), 1 deletion(-)
>  create mode 100644 doc/opal-api/opal-slw-self-save-reg-181.rst

This patch enables the OPAL interface to call the stop-api and set up
self-save. It basically completes the infrastructure required to use the
new self-save function provided by the microcode.
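
As a sketch of how an OS would consume this (the self-save call is described
by doc/opal-api/opal-slw-self-save-reg-181.rst added in this patch; treat the
exact C prototypes below as assumptions):

	/* Self-save needs no value: firmware snapshots the SPR itself. */
	int64_t rc = opal_slw_self_save_reg(pir, SPRN_LPCR);
	if (rc != OPAL_SUCCESS)
		/* No self-save support: fall back to self-restore, which
		 * takes an explicit value (see opal-slw-set-reg-100.rst). */
		rc = opal_slw_set_reg(pir, SPRN_LPCR, lpcr_value);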

--Vaidy


Re: [PATCH] ASoC: fsl_easrc: Check NULL pointer before dereference

2020-04-29 Thread Mark Brown
On Fri, Apr 24, 2020 at 08:30:04PM +0800, Shengjiu Wang wrote:
> The patch 955ac624058f: "ASoC: fsl_easrc: Add EASRC ASoC CPU DAI
> drivers" from Apr 16, 2020, leads to the following Smatch complaint:

This doesn't apply against current code, please check and resend.


signature.asc
Description: PGP signature


Re: [RFC PATCH v2 7/7] powerpc/selftest: reuse ppc-opcode macros to avoid redundancy

2020-04-29 Thread Naveen N. Rao

Michael Ellerman wrote:

Balamuruhan S  writes:

Avoid redefining macros to encode ppc instructions; instead, reuse them from
ppc-opcode.h. Makefile changes are necessary to compile memcmp_64.S with
__ASSEMBLY__ defined from selftests.

Signed-off-by: Balamuruhan S 
---
 .../selftests/powerpc/stringloops/Makefile| 34 ++
 .../powerpc/stringloops/asm/asm-const.h   |  1 +
 .../powerpc/stringloops/asm/ppc-opcode.h  | 36 +--
 3 files changed, 29 insertions(+), 42 deletions(-)
 create mode 120000 tools/testing/selftests/powerpc/stringloops/asm/asm-const.h
 mode change 100644 => 120000 tools/testing/selftests/powerpc/stringloops/asm/ppc-opcode.h

diff --git a/tools/testing/selftests/powerpc/stringloops/Makefile 
b/tools/testing/selftests/powerpc/stringloops/Makefile
index 7fc0623d85c3..efe76c5a5b94 100644
--- a/tools/testing/selftests/powerpc/stringloops/Makefile
+++ b/tools/testing/selftests/powerpc/stringloops/Makefile
@@ -1,26 +1,44 @@
 # SPDX-License-Identifier: GPL-2.0
 # The loops are all 64-bit code
-CFLAGS += -I$(CURDIR)
+GIT_VERSION = $(shell git describe --always --long --dirty || echo "unknown")
+CFLAGS += -DGIT_VERSION='"$(GIT_VERSION)"' -I$(CURDIR) -I$(CURDIR)/../include
 
 EXTRA_SOURCES := ../harness.c
 
 build_32bit = $(shell if ($(CC) $(CFLAGS) -m32 -o /dev/null memcmp.c >/dev/null 2>&1) then echo "1"; fi)
 
+ifneq ($(build_32bit),1)

 TEST_GEN_PROGS := memcmp_64 strlen
+TEST_GEN_FILES := memcmp.o memcmp_64.o memcmp_64
+MEMCMP := $(OUTPUT)/memcmp.o
+MEMCMP_64 := $(OUTPUT)/memcmp_64.o
+HARNESS :=  $(OUTPUT)/../harness.o
+CFLAGS += -m64 -maltivec
 
-$(OUTPUT)/memcmp_64: memcmp.c

-$(OUTPUT)/memcmp_64: CFLAGS += -m64 -maltivec
+OVERRIDE_TARGETS := 1
+include ../../lib.mk
 
-ifeq ($(build_32bit),1)

+$(OUTPUT)/memcmp_64: $(MEMCMP_64) $(MEMCMP) $(HARNESS)
+   $(CC) $(CFLAGS) memcmp.o memcmp_64.o ../harness.o -o memcmp_64
+
+$(MEMCMP_64): memcmp_64.S
+   $(CC) $(CFLAGS) -D__ASSEMBLY__ -o memcmp_64.o -c memcmp_64.S
+
+$(MEMCMP): memcmp.c
+   $(CC) $(CFLAGS) -o memcmp.o -c memcmp.c
+
+$(HARNESS): $(EXTRA_SOURCES)
+   $(CC) $(CFLAGS) -DGIT_VERSION='"$(GIT_VERSION)"' -o ../harness.o -c 
$(EXTRA_SOURCES)


What are you actually trying to do here? Is it just that you need to
define __ASSEMBLY__ for memcmp_64.S?


Adding __ASSEMBLY__ while building memcmp_64.S would be the goal, so as to
reuse ppc-opcode.h. However, asm/ppc-opcode.h under the stringloops test is
tiny and doesn't seem to justify the change.




What you have breaks the build; it's not respecting $(OUTPUT).


I think we should just drop this patch from the series.

Bala,
Can you re-post this series without the RFC tag, with the last patch 
dropped?



- Naveen



Re: [PATCH v2 16/20] mm: remove early_pfn_in_nid() and CONFIG_NODES_SPAN_OTHER_NODES

2020-04-29 Thread Christoph Hellwig
On Wed, Apr 29, 2020 at 03:11:22PM +0300, Mike Rapoport wrote:
> From: Mike Rapoport 
> 
> > The commit f47ac088c406 ("mm: memmap_init: iterate over memblock regions
> > rather than check each PFN") made early_pfn_in_nid() obsolete, and since
> > CONFIG_NODES_SPAN_OTHER_NODES is only used to pick a stub or a real
> > implementation of early_pfn_in_nid(), it is also not needed anymore.

I don't think you can quote a commit id for something that hasn't been
committed to mainline yet.  Then again I would have just merged this
patch into the one that obsoleted early_pfn_in_nid anyway.


Re: [PATCH v2 16/20] mm: remove early_pfn_in_nid() and CONFIG_NODES_SPAN_OTHER_NODES

2020-04-29 Thread Mike Rapoport
On Wed, Apr 29, 2020 at 07:17:06AM -0700, Christoph Hellwig wrote:
> On Wed, Apr 29, 2020 at 03:11:22PM +0300, Mike Rapoport wrote:
> > From: Mike Rapoport 
> > 
> > The commit f47ac088c406 ("mm: memmap_init: iterate over memblock regions
> > rather than check each PFN") made early_pfn_in_nid() obsolete, and since
> > CONFIG_NODES_SPAN_OTHER_NODES is only used to pick a stub or a real
> > implementation of early_pfn_in_nid(), it is also not needed anymore.
> 
> I don't think you can quote a commit id for something that hasn't been
> committed to mainline yet.

Ouch, that was one of the things I've intended to fix in v2...

> Then again I would have just merged this
> patch into the one that obsoleted early_pfn_in_nid anyway.

I've kept these commits separate to preserve the authorship.
I'll update the changelog so that it won't mention commit id.

-- 
Sincerely yours,
Mike.


Re: [PATCH v8 0/7] powerpc/perf: Add json file metric support for the hv_24x7 socket/chip level events

2020-04-29 Thread Arnaldo Carvalho de Melo
On Tue, Apr 28, 2020 at 12:02:42PM +0530, kajoljain wrote:
> Hi Arnaldo,
>   Please let me know if there are any changes required in this patchset,
> as some of its patches are still not part of your perf/core tree.

Thanks, I checked and all seems ok, added Jiri's Acked-by, appreciated,
thanks,

- Arnaldo
 
> Thanks,
> Kajol Jain
> 
> On 4/2/20 2:03 AM, Kajol Jain wrote:
> > This patchset adds JSON file metric support for the hv_24x7 socket/chip
> > level events. "hv_24x7" PMU interface events need a system-dependent
> > parameter like socket/chip/core. For example, hv_24x7 chip level events
> > need the specific chip-id for which the data is requested, added as part
> > of the PMU events.
> > 
> > So to enable JSON file support for the "hv_24x7" interface, the patchset
> > reads the total number of sockets from sysfs under
> > "/sys/devices/hv_24x7/interface/".
> > 
> > The second patch of the patchset adds the expr_scanner_ctx object to hold
> > user data for the expr scanner, which can be used to hold the runtime
> > parameter.
> > 
> > Patches 4 & 6 of the patchset handle the perf tool plumbing needed to
> > replace the "?" character in the metric expression with the proper value,
> > and add the hv_24x7 JSON metric files for different socket/chip resources.
> > 
> > The patch set also enables Hz/hz printing for the --metric-only option to
> > print metric data for bus frequency.
> > 
> > Applied and tested all these patches cleanly on top of Jiri's flex changes,
> > with the changes done by Kan Liang for the "Support metric group constraint"
> > patchset, and made the required changes.
> > 
> > Also apply this patch set on top of the fix patch sent earlier
> > for printing the metric name in case of overlapping events.
> > https://git.kernel.org/pub/scm/linux/kernel/git/acme/linux.git/commit/?h=perf/core&id=37cd7f65bf71a48f25eeb6d9be5dacb20d008ea6
> > 
> > Changelog:
> > v7 -> v8
> > - Add a test case for parsing of "?" in a metric expression
> > - Rename variable name to runtime
> > 
> > v6 -> v7
> > - Split the patchset into two patch series, one for kernel changes and the
> >   other for tool side changes.
> > - Made changes suggested by Jiri: rather than reading the runtime
> >   parameter from the metric name, actually add it to the egroup and
> >   metric_expr structures.
> > - As we don't need to read the runtime parameter from the metric name,
> >   now I am not appending it and rather just printing it in the
> >   generic_metric function.
> > 
> > Kernel Side changes patch series: https://lkml.org/lkml/2020/3/27/58
> > 
> > v5 -> v6
> > - resolve compilation issue due to rearranging patch series.
> > - Rather than adding a new function to take care of the runtime param
> >   case in metricgroup__add_metric, use metricgroup__add_metric_param
> >   itself for that work.
> > - Address some suggested optimizations, like using the file path directly
> >   rather than adding a new macro in header.c
> > - Change the commit message on the patch adding "?" support
> >   by adding a simple example.
> > 
> > v4 -> v5
> > - Using sysfs__read_int instead of sysfs__read_ull while reading
> >   parameter value in powerpc/util/header.c file.
> > 
> > - Using asprintf rather than malloc and sprintf,
> >   suggested by Arnaldo Carvalho de Melo
> > 
> > - Break patch 6 from the previous version into two patches:
> >   - One to refactor the current "metricgroup__add_metric" function,
> >     and another where the actual "?" handling infra is added.
> > 
> > - Add expr__runtimeparam as part of the 'expr_scanner_ctx' struct
> >   rather than making it a global variable. Thanks Jiri for
> >   adding this structure to hold user data for the expr scanner.
> > 
> > - Add the runtime param as an argument to the functions 'expr__find_other'
> >   and 'expr__parse' and update references accordingly.
> > 
> > v3 -> v4
> > - Apply these patch on top of Kan liang changes.
> >   As suggested by Jiri.
> > 
> > v2 -> v3
> > - Remove the setting of event_count to 0 in 'h_24x7_event_read' with a
> >   comment, rather than adding 0 to the event_count value.
> >   Suggested by: Sukadev Bhattiprolu
> > 
> > - Apply the tool side changes required to replace "?" on top of Jiri's
> >   flex patch series and make all required changes to keep it compatible
> >   with the added flex changes.
> > 
> > v1 -> v2
> > - Rename hv-24x7 metric json file as nest_metrics.json
> > 
> > Jiri Olsa (2):
> >   perf expr: Add expr_ prefix for parse_ctx and parse_id
> >   perf expr: Add expr_scanner_ctx object
> > 
> > Kajol Jain (5):
> >   perf/tools: Refactoring metricgroup__add_metric function
> >   perf/tools: Enhance JSON/metric infrastructure to handle "?"
> >   perf/tests/expr: Added test for runtime param in metric expression
> >   tools/perf: Enable Hz/hz printing for --metric-only option
> >   perf/tools/pmu-events/powerpc: Add hv_24x7 socket/chip level metric
> > events
> > 
> >  tools/perf/arch/powerpc/util/header.c |  8 ++
> >  .../arch/powerpc/power9/nest_metrics.json | 19 +
> >  tools/perf/tests/expr.c   | 20 +++--
> >  tools/perf/util/
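
As a rough illustration of the "?" handling described in the cover letter
above -- this is not the actual perf code, and the helper name and event
string are invented -- the substitution amounts to expanding one metric
expression into one concrete expression per socket/chip:

/*
 * Hedged sketch of the "?" runtime-parameter expansion idea. The real
 * implementation lives in perf's metricgroup/expr code; this only shows
 * the string substitution step.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char *expand_runtime_param(const char *expr, int runtime)
{
	size_t n = 0;
	const char *q;
	char *out, *p;

	for (q = expr; *q; q++)		/* reserve room for each '?' -> digits */
		if (*q == '?')
			n++;
	out = malloc(strlen(expr) + n * 11 + 1);
	if (!out)
		return NULL;
	for (p = out; *expr; expr++) {
		if (*expr == '?')
			p += sprintf(p, "%d", runtime);
		else
			*p++ = *expr;
	}
	*p = '\0';
	return out;
}

int main(void)
{
	int i, nr_chips = 2;	/* would be read from sysfs in reality */

	for (i = 0; i < nr_chips; i++) {
		char *e = expand_runtime_param("hv_24x7@PM_XLINK_CYCLES,chip=?@", i);
		printf("%s\n", e);
		free(e);
	}
	return 0;
}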

Re: [PATCH 2/2] powerpc/spufs: stop using access_ok

2020-04-29 Thread Christophe Leroy




Le 29/04/2020 à 09:03, Jeremy Kerr a écrit :

From: Christoph Hellwig 

Just use the proper non __-prefixed get/put_user variants where that is
not done yet.


But it means you are doing the access_ok() check every time, which is
what is to be avoided by doing the access_ok() once then using the 
__-prefixed variant.


Christophe



Signed-off-by: Christoph Hellwig 
Signed-off-by: Jeremy Kerr 
---
  arch/powerpc/platforms/cell/spufs/file.c | 42 +---
  1 file changed, 8 insertions(+), 34 deletions(-)

diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c
index b4e1ef650b40..cd7d10f27fad 100644
--- a/arch/powerpc/platforms/cell/spufs/file.c
+++ b/arch/powerpc/platforms/cell/spufs/file.c
@@ -590,17 +590,12 @@ static ssize_t spufs_mbox_read(struct file *file, char __user *buf,
 			size_t len, loff_t *pos)
 {
 	struct spu_context *ctx = file->private_data;
-	u32 mbox_data, __user *udata;
+	u32 mbox_data, __user *udata = (void __user *)buf;
 	ssize_t count;
 
 	if (len < 4)
 		return -EINVAL;
 
-	if (!access_ok(buf, len))
-		return -EFAULT;
-
-	udata = (void __user *)buf;
-
 	count = spu_acquire(ctx);
 	if (count)
 		return count;
@@ -616,7 +611,7 @@ static ssize_t spufs_mbox_read(struct file *file, char __user *buf,
 	 * but still need to return the data we have
 	 * read successfully so far.
 	 */
-	ret = __put_user(mbox_data, udata);
+	ret = put_user(mbox_data, udata);
 	if (ret) {
 		if (!count)
 			count = -EFAULT;
@@ -698,17 +693,12 @@ static ssize_t spufs_ibox_read(struct file *file, char __user *buf,
 			size_t len, loff_t *pos)
 {
 	struct spu_context *ctx = file->private_data;
-	u32 ibox_data, __user *udata;
+	u32 ibox_data, __user *udata = (void __user *)buf;
 	ssize_t count;
 
 	if (len < 4)
 		return -EINVAL;
 
-	if (!access_ok(buf, len))
-		return -EFAULT;
-
-	udata = (void __user *)buf;
-
 	count = spu_acquire(ctx);
 	if (count)
 		goto out;
@@ -727,7 +717,7 @@ static ssize_t spufs_ibox_read(struct file *file, char __user *buf,
 	}
 
 	/* if we can't write at all, return -EFAULT */
-	count = __put_user(ibox_data, udata);
+	count = put_user(ibox_data, udata);
 	if (count)
 		goto out_unlock;
 
@@ -741,7 +731,7 @@ static ssize_t spufs_ibox_read(struct file *file, char __user *buf,
 		 * but still need to return the data we have
 		 * read successfully so far.
 		 */
-		ret = __put_user(ibox_data, udata);
+		ret = put_user(ibox_data, udata);
 		if (ret)
 			break;
 	}
@@ -836,17 +826,13 @@ static ssize_t spufs_wbox_write(struct file *file, const char __user *buf,
 			size_t len, loff_t *pos)
 {
 	struct spu_context *ctx = file->private_data;
-	u32 wbox_data, __user *udata;
+	u32 wbox_data, __user *udata = (void __user *)buf;
 	ssize_t count;
 
 	if (len < 4)
 		return -EINVAL;
 
-	udata = (void __user *)buf;
-	if (!access_ok(buf, len))
-		return -EFAULT;
-
-	if (__get_user(wbox_data, udata))
+	if (get_user(wbox_data, udata))
 		return -EFAULT;
 
 	count = spu_acquire(ctx);
@@ -873,7 +859,7 @@ static ssize_t spufs_wbox_write(struct file *file, const char __user *buf,
 	/* write as much as possible */
 	for (count = 4, udata++; (count + 4) <= len; count += 4, udata++) {
 		int ret;
-		ret = __get_user(wbox_data, udata);
+		ret = get_user(wbox_data, udata);
 		if (ret)
 			break;
 
@@ -1982,9 +1968,6 @@ static ssize_t spufs_mbox_info_read(struct file *file, char __user *buf,
 	u32 stat, data;
 	int ret;
 
-	if (!access_ok(buf, len))
-		return -EFAULT;
-
 	ret = spu_acquire_saved(ctx);
 	if (ret)
 		return ret;
@@ -2028,9 +2011,6 @@ static ssize_t spufs_ibox_info_read(struct file *file, char __user *buf,
 	u32 stat, data;
 	int ret;
 
-	if (!access_ok(buf, len))
-		return -EFAULT;
-
 	ret = spu_acquire_saved(ctx);
 	if (ret)
 		return ret;
@@ -2082,9 +2062,6 @@ static ssize_t spufs_wbox_info_read(struct file *file, char __user *buf,
 	u32 data[ARRAY_SIZE(ctx->csa.spu_mailbox_data)];
 	int ret, count;
 
-	if (!access_ok(buf, len))
-		return -EFAULT;
-
 	ret = spu_acquire_saved(ctx);
 	if (ret)
 		return ret;
@@ -2143,9 +2120,6 @@ static ssize_t spufs_dma_info_read(struct file *file, char __user *buf,

Re: [RFC PATCH dpss_eth] Don't initialise ports with no PHY

2020-04-29 Thread Andrew Lunn
On Wed, Apr 29, 2020 at 03:55:28PM +0200, Christian Zigotzky wrote:
> Hi Andrew,
> 
> You can find some dtb and source files in our kernel package.
> 
> Download: http://www.xenosoft.de/linux-image-5.7-rc3-X1000_X5000.tar.gz

I have the tarball. Are we talking about
linux-image-5.7-rc3-X1000_X5000/X5000_and_QEMU_e5500/dtbs/X5000_20/cyrus.eth.dtb

I don't see any status = "disabled"; in the blob. So i would expect
the driver to probe.

Andrew




Re: [RFC PATCH dpss_eth] Don't initialise ports with no PHY

2020-04-29 Thread Christian Zigotzky



> On 29. Apr 2020, at 17:22, Andrew Lunn  wrote:
> 
> On Wed, Apr 29, 2020 at 03:55:28PM +0200, Christian Zigotzky wrote:
>> Hi Andrew,
>> 
>> You can find some dtb and source files in our kernel package.
>> 
>> Download: http://www.xenosoft.de/linux-image-5.7-rc3-X1000_X5000.tar.gz
> 
> I have the tarball. Are we talking about
> linux-image-5.7-rc3-X1000_X5000/X5000_and_QEMU_e5500/dtbs/X5000_20/cyrus.eth.dtb
> 
> I don't see any status = "disabled"; in the blob. So i would expect
> the driver to probe.
> 
>Andrew
> 
> 

Yes, that’s correct but maybe Darren uses another dtb file.

@Darren
Which dtb file do you use?

[PATCH v1 0/3] mm/memory_hotplug: Make virtio-mem play nicely with kexec-tools

2020-04-29 Thread David Hildenbrand
This series is based on [1]:
[PATCH v2 00/10] virtio-mem: paravirtualized memory
That will hopefully get picked up soon, rebased to -next.

The following patches were reverted from -next [2]:
[PATCH 0/3] kexec/memory_hotplug: Prevent removal and accidental use
As discussed in that thread, they should be reverted from -next already.

In theory, if people agree, we could take the first two patches via the
-mm tree now and the last (virtio-mem) patch via MST's tree once picking up
virtio-mem. No strong feelings.


Memory added by virtio-mem is special and might contain logical holes,
especially after memory unplug, but also when adding memory in
sub-section size. While memory in these holes can usually be read, that
memory should not be touched. virtio-mem managed device memory is never
exposed via any firmware memmap (esp., e820). The device driver will
request to plug memory from the hypervisor and add it to Linux.

On a cold start, all memory is unplugged, and the guest driver will first
request to plug memory from the hypervisor, to then add it to Linux. After
a reboot, all memory will get unplugged (except in rare, special cases). In
case the device driver comes up and detects that some memory is still
plugged after a reboot, it will manually request to unplug all memory from
the hypervisor first - to then request to plug memory from the hypervisor
and add to Linux. This is essentially a defragmentation step, where all
logical holes are removed.

As the device driver is responsible for detecting, adding and managing that
memory, also kexec should treat it like that. It is special. We need a way
to teach kexec-tools to not add that memory to the fixed-up firmware
memmap, to not place kexec images onto this memory, but still allow kdump
to dump it. Add a flag to tell memory hotplug code to
not create /sys/firmware/memmap entries and to indicate it via
"System RAM (driver managed)" in /proc/iomem.

Before this series, kexec_file_load() already did the right thing (for
virtio-mem) by not adding that memory to the fixed-up firmware memmap and
letting the device driver handle it. With this series, also kexec_load() -
which relies on user space to provide a fixed up firmware memmap - does
the right thing with virtio-mem memory.

When the virtio-mem device driver(s) come up, they will request to unplug
all memory from the hypervisor first (esp. defragment), to then request to
plug consecutive memory ranges from the hypervisor, and add them to Linux
- just like on a reboot where we still have memory plugged.

[1] https://lore.kernel.org/r/20200311171422.10484-1-da...@redhat.com/
[2] https://lore.kernel.org/r/20200326180730.4754-1-james.mo...@arm.com

David Hildenbrand (3):
  mm/memory_hotplug: Prepare passing flags to add_memory() and friends
  mm/memory_hotplug: Introduce MHP_DRIVER_MANAGED
  virtio-mem: Add memory with MHP_DRIVER_MANAGED

 arch/powerpc/platforms/powernv/memtrace.c |  2 +-
 .../platforms/pseries/hotplug-memory.c|  2 +-
 drivers/acpi/acpi_memhotplug.c|  2 +-
 drivers/base/memory.c |  2 +-
 drivers/dax/kmem.c|  2 +-
 drivers/hv/hv_balloon.c   |  2 +-
 drivers/s390/char/sclp_cmd.c  |  2 +-
 drivers/virtio/virtio_mem.c   |  3 +-
 drivers/xen/balloon.c |  2 +-
 include/linux/memory_hotplug.h| 15 +++--
 mm/memory_hotplug.c   | 31 +--
 11 files changed, 44 insertions(+), 21 deletions(-)

-- 
2.25.3
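
For orientation, a driver-side caller after this series looks roughly like
the following. This is only a sketch of the kernel-internal API introduced
by patches 1 and 2 -- the 'example_' helper is made up; patch 3 shows the
real virtio-mem hunk:

#include <linux/memory_hotplug.h>

/* Sketch only: a made-up caller, not code from the series. */
static int example_add_driver_managed_block(u64 addr, u64 size)
{
	int nid = memory_add_physaddr_to_nid(addr);

	/*
	 * MHP_DRIVER_MANAGED: no /sys/firmware/memmap entry is created and
	 * the range appears as "System RAM (driver managed)" in /proc/iomem.
	 */
	return add_memory(nid, addr, size, MHP_DRIVER_MANAGED);
}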



[PATCH v1 1/3] mm/memory_hotplug: Prepare passing flags to add_memory() and friends

2020-04-29 Thread David Hildenbrand
We soon want to pass flags - prepare for that.

This patch is based on a similar patch by Oscar Salvador:

https://lkml.kernel.org/r/20190625075227.15193-3-osalva...@suse.de

Cc: Michael Ellerman 
Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: "Rafael J. Wysocki" 
Cc: Len Brown 
Cc: Greg Kroah-Hartman 
Cc: Dan Williams 
Cc: Vishal Verma 
Cc: Dave Jiang 
Cc: "K. Y. Srinivasan" 
Cc: Haiyang Zhang 
Cc: Stephen Hemminger 
Cc: Wei Liu 
Cc: Heiko Carstens 
Cc: Vasily Gorbik 
Cc: Christian Borntraeger 
Cc: "Michael S. Tsirkin" 
Cc: Jason Wang 
Cc: Boris Ostrovsky 
Cc: Juergen Gross 
Cc: Stefano Stabellini 
Cc: Andrew Morton 
Cc: Thomas Gleixner 
Cc: Pingfan Liu 
Cc: Leonardo Bras 
Cc: Nathan Lynch 
Cc: Oscar Salvador 
Cc: Michal Hocko 
Cc: Baoquan He 
Cc: Wei Yang 
Cc: Pankaj Gupta 
Cc: Eric Biederman 
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux-ker...@vger.kernel.org
Cc: linux-a...@vger.kernel.org
Cc: linux-nvd...@lists.01.org
Cc: linux-hyp...@vger.kernel.org
Cc: linux-s...@vger.kernel.org
Cc: virtualizat...@lists.linux-foundation.org
Cc: xen-de...@lists.xenproject.org
Signed-off-by: David Hildenbrand 
---
 arch/powerpc/platforms/powernv/memtrace.c   |  2 +-
 arch/powerpc/platforms/pseries/hotplug-memory.c |  2 +-
 drivers/acpi/acpi_memhotplug.c  |  2 +-
 drivers/base/memory.c   |  2 +-
 drivers/dax/kmem.c  |  2 +-
 drivers/hv/hv_balloon.c |  2 +-
 drivers/s390/char/sclp_cmd.c|  2 +-
 drivers/virtio/virtio_mem.c |  2 +-
 drivers/xen/balloon.c   |  2 +-
 include/linux/memory_hotplug.h  |  7 ---
 mm/memory_hotplug.c | 11 ++-
 11 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c
index 13b369d2cc45..a7475d18c671 100644
--- a/arch/powerpc/platforms/powernv/memtrace.c
+++ b/arch/powerpc/platforms/powernv/memtrace.c
@@ -224,7 +224,7 @@ static int memtrace_online(void)
ent->mem = 0;
}
 
-   if (add_memory(ent->nid, ent->start, ent->size)) {
+   if (add_memory(ent->nid, ent->start, ent->size, 0)) {
pr_err("Failed to add trace memory to node %d\n",
ent->nid);
ret += 1;
diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 5ace2f9a277e..ae44eba46ca0 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -646,7 +646,7 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb)
block_sz = memory_block_size_bytes();
 
/* Add the memory */
-   rc = __add_memory(lmb->nid, lmb->base_addr, block_sz);
+   rc = __add_memory(lmb->nid, lmb->base_addr, block_sz, 0);
if (rc) {
invalidate_lmb_associativity_index(lmb);
return rc;
diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index e294f44a7850..d91b3584d4b2 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -207,7 +207,7 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
if (node < 0)
node = memory_add_physaddr_to_nid(info->start_addr);
 
-   result = __add_memory(node, info->start_addr, info->length);
+   result = __add_memory(node, info->start_addr, info->length, 0);
 
/*
 * If the memory block has been used by the kernel, add_memory()
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 2b09b68b9f78..c0ef7d9e310a 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -432,7 +432,7 @@ static ssize_t probe_store(struct device *dev, struct device_attribute *attr,
 
nid = memory_add_physaddr_to_nid(phys_addr);
ret = __add_memory(nid, phys_addr,
-  MIN_MEMORY_BLOCK_SIZE * sections_per_block);
+  MIN_MEMORY_BLOCK_SIZE * sections_per_block, 0);
 
if (ret)
goto out;
diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
index 3d0a7e702c94..e159184e0ba0 100644
--- a/drivers/dax/kmem.c
+++ b/drivers/dax/kmem.c
@@ -65,7 +65,7 @@ int dev_dax_kmem_probe(struct device *dev)
new_res->flags = IORESOURCE_SYSTEM_RAM;
new_res->name = dev_name(dev);
 
-   rc = add_memory(numa_node, new_res->start, resource_size(new_res));
+   rc = add_memory(numa_node, new_res->start, resource_size(new_res), 0);
if (rc) {
release_resource(new_res);
kfree(new_res);
diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
index 32e3bc0aa665..0194bed1a573 100644
--- a/drivers/hv/hv_balloon.c
+++ b/drivers/hv/hv_balloon.c
@@ -72

[PATCH v1 2/3] mm/memory_hotplug: Introduce MHP_DRIVER_MANAGED

2020-04-29 Thread David Hildenbrand
Some paravirtualized devices that add memory via add_memory() and
friends (esp. virtio-mem) don't want to create entries in
/sys/firmware/memmap/ - primarily to hinder kexec from adding this
memory to the boot memmap of the kexec kernel.

In fact, such memory is never exposed via the firmware (e.g., e820), but
only via the device, so exposing this memory via /sys/firmware/memmap/ is
wrong:
 "kexec needs the raw firmware-provided memory map to setup the
  parameter segment of the kernel that should be booted with
  kexec. Also, the raw memory map is useful for debugging. For
  that reason, /sys/firmware/memmap is an interface that provides
  the raw memory map to userspace." [1]

We want to let user space know that memory which is always detected,
added, and managed via a (device) driver - like memory managed by
virtio-mem - is special. It cannot be used for placing kexec segments,
and the (device) driver is responsible for re-adding that (eventually
shrunk/grown/defragmented) memory after a reboot/kexec. It should, e.g.,
not be added to a fixed-up firmware memmap. However, it should be dumped
by kdump.

Also, such memory could behave differently than an ordinary DIMM - e.g.,
memory managed by virtio-mem can have holes inside added memory resource,
which should not be touched, especially for writing.

Let's expose that memory as "System RAM (driver managed)" e.g., via
/proc/iomem.

We don't have to worry about firmware_map_remove() on the removal path.
If there is no entry, it will simply return with -EINVAL.

[1] https://www.kernel.org/doc/Documentation/ABI/testing/sysfs-firmware-memmap

Cc: Andrew Morton 
Cc: Michal Hocko 
Cc: Pankaj Gupta 
Cc: Wei Yang 
Cc: Baoquan He 
Cc: Eric Biederman 
Signed-off-by: David Hildenbrand 
---
 include/linux/memory_hotplug.h |  8 
 mm/memory_hotplug.c| 20 
 2 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index bf0e3edb8688..cc538584b39e 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -68,6 +68,14 @@ struct mhp_params {
pgprot_t pgprot;
 };
 
+/* Flags used for add_memory() and friends. */
+
+/*
+ * Don't create entries in /sys/firmware/memmap/ and expose memory as
+ * "System RAM (driver managed)" in e.g., /proc/iomem
+ */
+#define MHP_DRIVER_MANAGED 1
+
 /*
  * Zone resizing functions
  *
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index ebdf6541d074..cfa0721280aa 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -98,11 +98,11 @@ void mem_hotplug_done(void)
 u64 max_mem_size = U64_MAX;
 
 /* add this memory to iomem resource */
-static struct resource *register_memory_resource(u64 start, u64 size)
+static struct resource *register_memory_resource(u64 start, u64 size,
+const char *resource_name)
 {
struct resource *res;
unsigned long flags =  IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
-   char *resource_name = "System RAM";
 
/*
 * Make sure value parsed from 'mem=' only restricts memory adding
@@ -1058,7 +1058,8 @@ int __ref add_memory_resource(int nid, struct resource *res,
BUG_ON(ret);
 
/* create new memmap entry */
-   firmware_map_add_hotplug(start, start + size, "System RAM");
+   if (!(flags & MHP_DRIVER_MANAGED))
+   firmware_map_add_hotplug(start, start + size, "System RAM");
 
/* device_online() will take the lock when calling online_pages() */
mem_hotplug_done();
@@ -1081,10 +1082,21 @@ int __ref add_memory_resource(int nid, struct resource *res,
 /* requires device_hotplug_lock, see add_memory_resource() */
 int __ref __add_memory(int nid, u64 start, u64 size, unsigned long flags)
 {
+   const char *resource_name = "System RAM";
struct resource *res;
int ret;
 
-   res = register_memory_resource(start, size);
+   /*
+* Indicate that memory managed by a driver is special. It's always
+* detected and added via a driver, should not be given to the kexec
+* kernel for booting when manually crafting the firmware memmap, and
+* no kexec segments should be placed on it. However, kdump should
+* dump this memory.
+*/
+   if (flags & MHP_DRIVER_MANAGED)
+   resource_name = "System RAM (driver managed)";
+
+   res = register_memory_resource(start, size, resource_name);
if (IS_ERR(res))
return PTR_ERR(res);
 
-- 
2.25.3



[PATCH v1 3/3] virtio-mem: Add memory with MHP_DRIVER_MANAGED

2020-04-29 Thread David Hildenbrand
We don't want /sys/firmware/memmap entries and we want to indicate
our memory as "System RAM (driver managed)" in /proc/iomem. This is
especially relevant for kexec-tools, which have to be updated to
support dumping virtio-mem memory after this patch. Expected behavior in
kexec-tools:
- Don't use this memory when creating a fixed-up firmware memmap. Works
  now out of the box on x86-64.
- Don't use this memory for placing kexec segments. Works now out of the
  box on x86-64.
- Consider "System RAM (driver managed)" when creating the elfcorehdr
  for kdump. This memory has to be dumped. Needs update of kexec-tools.

With this patch on x86-64:

/proc/iomem:
-0fff : Reserved
1000-0009fbff : System RAM
[...]
fffc- : Reserved
1-13fff : System RAM
14000-147ff : System RAM (driver managed)
34000-347ff : System RAM (driver managed)
34800-34fff : System RAM (driver managed)
[..]
328000-32 : PCI Bus :00

/sys/firmware/memmap:
-0009fc00 (System RAM)
0009fc00-000a (Reserved)
000f-0010 (Reserved)
0010-bffe (System RAM)
bffe-c000 (Reserved)
feffc000-ff00 (Reserved)
fffc-0001 (Reserved)
0001-00014000 (System RAM)

Cc: "Michael S. Tsirkin" 
Cc: Jason Wang 
Cc: Michal Hocko 
Cc: Eric Biederman 
Signed-off-by: David Hildenbrand 
---
 drivers/virtio/virtio_mem.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/virtio/virtio_mem.c b/drivers/virtio/virtio_mem.c
index 3101cbf9e59d..6f658d1aeac4 100644
--- a/drivers/virtio/virtio_mem.c
+++ b/drivers/virtio/virtio_mem.c
@@ -421,7 +421,8 @@ static int virtio_mem_mb_add(struct virtio_mem *vm, unsigned long mb_id)
nid = memory_add_physaddr_to_nid(addr);
 
dev_dbg(&vm->vdev->dev, "adding memory block: %lu\n", mb_id);
-   return add_memory(nid, addr, memory_block_size_bytes(), 0);
+   return add_memory(nid, addr, memory_block_size_bytes(),
+ MHP_DRIVER_MANAGED);
 }
 
 /*
-- 
2.25.3
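
The kexec-tools side is not implemented here. As a sketch of the expected
classification listed in the commit message -- hypothetical user-space code,
not actual kexec-tools source -- parsing /proc/iomem might look like:

/*
 * Userspace sketch: classify /proc/iomem ranges the way the commit
 * message asks kexec-tools to treat them.
 */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

struct range { unsigned long long start, end; bool segment_ok, dump; };

static int classify_iomem(FILE *f, struct range *r)
{
	char name[64];

	/* Top-level lines look like "100000000-13fffffff : System RAM" */
	if (fscanf(f, "%llx-%llx : %63[^\n]", &r->start, &r->end, name) != 3)
		return -1;
	if (!strcmp(name, "System RAM")) {
		r->segment_ok = true;	/* usable for kexec segments */
		r->dump = true;		/* include in the elfcorehdr */
	} else if (!strcmp(name, "System RAM (driver managed)")) {
		r->segment_ok = false;	/* driver re-adds it after kexec */
		r->dump = true;		/* kdump must still dump it */
	} else {
		r->segment_ok = false;
		r->dump = false;
	}
	return 0;
}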



Re: [PATCH] ASoC: fsl_easrc: Check NULL pinter before dereference

2020-04-29 Thread Mark Brown
On Fri, 24 Apr 2020 20:30:04 +0800, Shengjiu Wang wrote:
> The patch 955ac624058f: "ASoC: fsl_easrc: Add EASRC ASoC CPU DAI
> drivers" from Apr 16, 2020, leads to the following Smatch complaint:
> 
> sound/soc/fsl/fsl_easrc.c:1529 fsl_easrc_hw_free()
> warn: variable dereferenced before check 'ctx' (see line 1527)
> 
> sound/soc/fsl/fsl_easrc.c
>   1526  struct fsl_asrc_pair *ctx = runtime->private_data;
>   1527  struct fsl_easrc_ctx_priv *ctx_priv = ctx->private;
>   ^
> Dereference
> 
> [...]

Applied to

   https://git.kernel.org/pub/scm/linux/kernel/git/broonie/sound.git for-5.8

Thanks!

[1/1] ASoC: fsl_easrc: Check for null pointer before dereferencing "ctx" in 
fsl_easrc_hw_free()
  commit: f3fc1ea011f09156886e8f4beb240ea814f2197a

All being well this means that it will be integrated into the linux-next
tree (usually sometime in the next 24 hours) and sent to Linus during
the next merge window (or sooner if it is a bug fix), however if
problems are discovered then the patch may be dropped or reverted.

You may get further e-mails resulting from automated or manual testing
and review of the tree, please engage with people reporting problems and
send followup patches addressing any issues that are reported if needed.

If any updates are required or you are submitting further changes they
should be sent as incremental updates against current git, existing
patches will not be replaced.

Please add any relevant lists and maintainers to the CCs when replying
to this mail.

Thanks,
Mark
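
For readers following along, the shape of the fix is simply to move the
dereference after a NULL check. A hedged reconstruction -- the signature
follows the usual ASoC hw_free prototype and the body is invented; the
literal fix is the commit referenced above:

/* Hedged reconstruction of the fix shape, not the literal commit. */
static int fsl_easrc_hw_free(struct snd_pcm_substream *substream,
			     struct snd_soc_dai *dai)
{
	struct snd_pcm_runtime *runtime = substream->runtime;
	struct fsl_asrc_pair *ctx = runtime->private_data;
	struct fsl_easrc_ctx_priv *ctx_priv;

	if (!ctx)
		return -EINVAL;

	ctx_priv = ctx->private;	/* dereference only after the check */
	/* ... release context resources using ctx_priv ... */
	return 0;
}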


[PATCH v2.5 16/20] mm: remove early_pfn_in_nid() and CONFIG_NODES_SPAN_OTHER_NODES

2020-04-29 Thread Mike Rapoport
On Wed, Apr 29, 2020 at 03:11:22PM +0300, Mike Rapoport wrote:
> From: Mike Rapoport 
> 
> The commit f47ac088c406 ("mm: memmap_init: iterate over memblock regions
> rather that check each PFN") made early_pfn_in_nid() obsolete and since
> CONFIG_NODES_SPAN_OTHER_NODES is only used to pick a stub or a real
> implementation of early_pfn_in_nid() it is also not needed anymore.
> 
> Remove both early_pfn_in_nid() and the CONFIG_NODES_SPAN_OTHER_NODES.
> 
> Co-developed-by: Hoan Tran 
> Signed-off-by: Hoan Tran 
> Signed-off-by: Mike Rapoport 
> ---

Here's the version with the updated changelog:

From 7415d1a9b7000c6eecd9f63770592e4d4a8d2463 Mon Sep 17 00:00:00 2001
From: Mike Rapoport 
Date: Sat, 11 Apr 2020 11:26:49 +0300
Subject: [PATCH v2.5] mm: remove early_pfn_in_nid() and 
CONFIG_NODES_SPAN_OTHER_NODES

The memmap_init() function was made to iterate over memblock regions and as
the result the early_pfn_in_nid() function became obsolete.
Since CONFIG_NODES_SPAN_OTHER_NODES is only used to pick a stub or a real
implementation of early_pfn_in_nid(), it is also not needed anymore.

Remove both early_pfn_in_nid() and the CONFIG_NODES_SPAN_OTHER_NODES.

Co-developed-by: Hoan Tran 
Signed-off-by: Hoan Tran 
Signed-off-by: Mike Rapoport 
---
 arch/powerpc/Kconfig |  9 -
 arch/sparc/Kconfig   |  9 -
 arch/x86/Kconfig |  9 -
 mm/page_alloc.c  | 20 
 4 files changed, 47 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 5f86b22b7d2c..74f316deeae1 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -685,15 +685,6 @@ config ARCH_MEMORY_PROBE
def_bool y
depends on MEMORY_HOTPLUG
 
-# Some NUMA nodes have memory ranges that span
-# other nodes.  Even though a pfn is valid and
-# between a node's start and end pfns, it may not
-# reside on that node.  See memmap_init_zone()
-# for details.
-config NODES_SPAN_OTHER_NODES
-   def_bool y
-   depends on NEED_MULTIPLE_NODES
-
 config STDBINUTILS
bool "Using standard binutils settings"
depends on 44x
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 795206b7b552..0e4f3891b904 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -286,15 +286,6 @@ config NODES_SHIFT
  Specify the maximum number of NUMA Nodes available on the target
  system.  Increases memory reserved to accommodate various tables.
 
-# Some NUMA nodes have memory ranges that span
-# other nodes.  Even though a pfn is valid and
-# between a node's start and end pfns, it may not
-# reside on that node.  See memmap_init_zone()
-# for details.
-config NODES_SPAN_OTHER_NODES
-   def_bool y
-   depends on NEED_MULTIPLE_NODES
-
 config ARCH_SPARSEMEM_ENABLE
def_bool y if SPARC64
select SPARSEMEM_VMEMMAP_ENABLE
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index f8bf218a169c..1ec2a5e2fef6 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1581,15 +1581,6 @@ config X86_64_ACPI_NUMA
---help---
  Enable ACPI SRAT based node topology detection.
 
-# Some NUMA nodes have memory ranges that span
-# other nodes.  Even though a pfn is valid and
-# between a node's start and end pfns, it may not
-# reside on that node.  See memmap_init_zone()
-# for details.
-config NODES_SPAN_OTHER_NODES
-   def_bool y
-   depends on X86_64_ACPI_NUMA
-
 config NUMA_EMU
bool "NUMA emulation"
depends on NUMA
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8d112defaead..d35ca0996a09 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1541,26 +1541,6 @@ int __meminit early_pfn_to_nid(unsigned long pfn)
 }
 #endif /* CONFIG_NEED_MULTIPLE_NODES */
 
-#ifdef CONFIG_NODES_SPAN_OTHER_NODES
-/* Only safe to use early in boot when initialisation is single-threaded */
-static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
-{
-   int nid;
-
-   nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
-   if (nid >= 0 && nid != node)
-   return false;
-   return true;
-}
-
-#else
-static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
-{
-   return true;
-}
-#endif
-
-
 void __init memblock_free_pages(struct page *page, unsigned long pfn,
unsigned int order)
 {
-- 
2.26.1



Re: [PATCH v1 1/3] mm/memory_hotplug: Prepare passing flags to add_memory() and friends

2020-04-29 Thread Wei Liu
On Wed, Apr 29, 2020 at 06:08:01PM +0200, David Hildenbrand wrote:
> We soon want to pass flags - prepare for that.
> 
> This patch is based on a similar patch by Oscar Salvador:
> 
> https://lkml.kernel.org/r/20190625075227.15193-3-osalva...@suse.de
> 
[...]
> ---
>  drivers/hv/hv_balloon.c |  2 +-

> diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
> index 32e3bc0aa665..0194bed1a573 100644
> --- a/drivers/hv/hv_balloon.c
> +++ b/drivers/hv/hv_balloon.c
> @@ -726,7 +726,7 @@ static void hv_mem_hot_add(unsigned long start, unsigned 
> long size,
>  
>   nid = memory_add_physaddr_to_nid(PFN_PHYS(start_pfn));
>   ret = add_memory(nid, PFN_PHYS((start_pfn)),
> - (HA_CHUNK << PAGE_SHIFT));
> + (HA_CHUNK << PAGE_SHIFT), 0);
>  
>   if (ret) {
>   pr_err("hot_add memory failed error is %d\n", ret);

Acked-by: Wei Liu 


[PATCH] net/bonding: Do not transition down slave after speed/duplex check

2020-04-29 Thread Thomas Falcon
The following behavior has been observed when testing logical partition
migration of LACP-bonded VNIC devices in a PowerVM pseries environment.

1. When performing the migration, the bond master detects that a slave has
   lost its link, deactivates the LACP port, and sets the port's
   is_enabled flag to false.
2. The slave device then updates its carrier state to off while it resets
   itself. This update triggers a NETDEV_CHANGE notification, which performs
   a speed and duplex update. The device does not return a valid speed
   and duplex, so the master sets the slave link state to BOND_LINK_FAIL.
3. When the slave VNIC device(s) are active again, some operations, such
   as setting the port's is_enabled flag, are not performed when transitioning
   the link state back to BOND_LINK_UP from BOND_LINK_FAIL, though the state
   prior to the speed check was BOND_LINK_DOWN.

Affected devices are therefore not utilized in the aggregation though they
are operational. The simplest way to fix this seems to be to restrict the
link state change to devices that are currently up and running.

CC: Jay Vosburgh 
CC: Veaceslav Falico 
CC: Andy Gospodarek 
Signed-off-by: Thomas Falcon 
---
 drivers/net/bonding/bond_main.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 2e70e43c5df5..d840da7cd379 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -3175,7 +3175,8 @@ static int bond_slave_netdev_event(unsigned long event,
 * speeds/duplex are available.
 */
if (bond_update_speed_duplex(slave) &&
-   BOND_MODE(bond) == BOND_MODE_8023AD) {
+   BOND_MODE(bond) == BOND_MODE_8023AD &&
+   slave->link == BOND_LINK_UP) {
if (slave->last_link_up)
slave->link = BOND_LINK_FAIL;
else
-- 
2.18.2



Re: [PATCH] net/bonding: Do not transition down slave after speed/duplex check

2020-04-29 Thread Thomas Falcon



On 4/29/20 1:38 PM, Jay Vosburgh wrote:

Thomas Falcon  wrote:


The following behavior has been observed when testing logical partition
migration of LACP-bonded VNIC devices in a PowerVM pseries environment.

1. When performing the migration, the bond master detects that a slave has
   lost its link, deactivates the LACP port, and sets the port's
   is_enabled flag to false.
2. The slave device then updates its carrier state to off while it resets
   itself. This update triggers a NETDEV_CHANGE notification, which performs
   a speed and duplex update. The device does not return a valid speed
   and duplex, so the master sets the slave link state to BOND_LINK_FAIL.
3. When the slave VNIC device(s) are active again, some operations, such
   as setting the port's is_enabled flag, are not performed when transitioning
   the link state back to BOND_LINK_UP from BOND_LINK_FAIL, though the state
   prior to the speed check was BOND_LINK_DOWN.

Just to make sure I'm understanding correctly, in regards to
"the state prior to the speed check was BOND_LINK_DOWN," do you mean
that during step 1, the slave link is set to BOND_LINK_DOWN, and then in
step 2 changed from _DOWN to _FAIL?


Yes, that's what I meant, thanks.




Affected devices are therefore not utilized in the aggregation though they
are operational. The simplest way to fix this seems to be to restrict the
link state change to devices that are currently up and running.

This sounds similar to an issue from last fall; can you confirm
that you're running with a kernel that includes:

1899bb325149 bonding: fix state transition issue in link monitoring

-J



I think so, but I will confirm ASAP.

Tom



CC: Jay Vosburgh 
CC: Veaceslav Falico 
CC: Andy Gospodarek 
Signed-off-by: Thomas Falcon 
---
drivers/net/bonding/bond_main.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 2e70e43c5df5..d840da7cd379 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -3175,7 +3175,8 @@ static int bond_slave_netdev_event(unsigned long event,
 * speeds/duplex are available.
 */
if (bond_update_speed_duplex(slave) &&
-   BOND_MODE(bond) == BOND_MODE_8023AD) {
+   BOND_MODE(bond) == BOND_MODE_8023AD &&
+   slave->link == BOND_LINK_UP) {
if (slave->last_link_up)
slave->link = BOND_LINK_FAIL;
else
--
2.18.2


---
-Jay Vosburgh, jay.vosbu...@canonical.com


Re: [RFC PATCH dpss_eth] Don't initialise ports with no PHY

2020-04-29 Thread Christian Zigotzky
Hi Andrew,

You can find some dtb and source files in our kernel package.

Download: http://www.xenosoft.de/linux-image-5.7-rc3-X1000_X5000.tar.gz

Thanks,
Christian

> On 29. Apr 2020, at 15:13, Andrew Lunn  wrote:
> 
> 
>> 
>> Maybe we have to modify the dtb file.
> 
> Hi Christian
> 
> Could you point me at the DT file.
> 
>  Thanks
>Andrew


Re: [PATCH] net/bonding: Do not transition down slave after speed/duplex check

2020-04-29 Thread Jay Vosburgh
Thomas Falcon  wrote:

>The following behavior has been observed when testing logical partition
>migration of LACP-bonded VNIC devices in a PowerVM pseries environment.
>
>1. When performing the migration, the bond master detects that a slave has
>   lost its link, deactivates the LACP port, and sets the port's
>   is_enabled flag to false.
>2. The slave device then updates its carrier state to off while it resets
>   itself. This update triggers a NETDEV_CHANGE notification, which performs
>   a speed and duplex update. The device does not return a valid speed
>   and duplex, so the master sets the slave link state to BOND_LINK_FAIL.
>3. When the slave VNIC device(s) are active again, some operations, such
>   as setting the port's is_enabled flag, are not performed when transitioning
>   the link state back to BOND_LINK_UP from BOND_LINK_FAIL, though the state
>   prior to the speed check was BOND_LINK_DOWN.

Just to make sure I'm understanding correctly, in regards to
"the state prior to the speed check was BOND_LINK_DOWN," do you mean
that during step 1, the slave link is set to BOND_LINK_DOWN, and then in
step 2 changed from _DOWN to _FAIL?

>Affected devices are therefore not utilized in the aggregation though they
>are operational. The simplest way to fix this seems to be to restrict the
>link state change to devices that are currently up and running.

This sounds similar to an issue from last fall; can you confirm
that you're running with a kernel that includes:

1899bb325149 bonding: fix state transition issue in link monitoring

-J


>CC: Jay Vosburgh 
>CC: Veaceslav Falico 
>CC: Andy Gospodarek 
>Signed-off-by: Thomas Falcon 
>---
> drivers/net/bonding/bond_main.c | 3 ++-
> 1 file changed, 2 insertions(+), 1 deletion(-)
>
>diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
>index 2e70e43c5df5..d840da7cd379 100644
>--- a/drivers/net/bonding/bond_main.c
>+++ b/drivers/net/bonding/bond_main.c
>@@ -3175,7 +3175,8 @@ static int bond_slave_netdev_event(unsigned long event,
>* speeds/duplex are available.
>*/
>   if (bond_update_speed_duplex(slave) &&
>-  BOND_MODE(bond) == BOND_MODE_8023AD) {
>+  BOND_MODE(bond) == BOND_MODE_8023AD &&
>+  slave->link == BOND_LINK_UP) {
>   if (slave->last_link_up)
>   slave->link = BOND_LINK_FAIL;
>   else
>-- 
>2.18.2
>

---
-Jay Vosburgh, jay.vosbu...@canonical.com


Re: Re: [RESEND PATCH v5 2/5] arm64/crash_core: Export TCR_EL1.T1SZ in vmcoreinfo

2020-04-29 Thread Scott Branden

Hi Bhupesh,

On 2020-02-23 10:25 p.m., Bhupesh Sharma wrote:

Hi Amit,

On Fri, Feb 21, 2020 at 2:36 PM Amit Kachhap  wrote:

Hi Bhupesh,

On 1/13/20 5:44 PM, Bhupesh Sharma wrote:

Hi James,

On 01/11/2020 12:30 AM, Dave Anderson wrote:

- Original Message -

Hi Bhupesh,

On 25/12/2019 19:01, Bhupesh Sharma wrote:

On 12/12/2019 04:02 PM, James Morse wrote:

On 29/11/2019 19:59, Bhupesh Sharma wrote:

vabits_actual variable on arm64 indicates the actual VA space size,
and allows a single binary to support both 48-bit and 52-bit VA
spaces.

If the ARMv8.2-LVA optional feature is present, and we are running
with a 64KB page size; then it is possible to use 52-bits of address
space for both userspace and kernel addresses. However, any kernel
binary that supports 52-bit must also be able to fall back to 48-bit
at early boot time if the hardware feature is not present.

Since TCR_EL1.T1SZ indicates the size offset of the memory region
addressed by TTBR1_EL1 (and hence can be used for determining the
vabits_actual value) it makes more sense to export the same in
vmcoreinfo rather than vabits_actual variable, as the name of the
variable can change in future kernel versions, but the architectural
constructs like TCR_EL1.T1SZ can be used better to indicate intended
specific fields to user-space.

User-space utilities like makedumpfile and crash-utility, need to
read/write this value from/to vmcoreinfo

(write?)

Yes, also write, so that the vmcoreinfo from a (crashing) arm64 system can
be used for analysis of the root-cause of the panic/crash on, say, an x86_64
host using utilities like crash-utility/gdb.

I read this as "User-space [...] needs to write to vmcoreinfo".

That's correct. But for writing to vmcore dump in the kdump kernel, we
need to read the symbols from the vmcoreinfo in the primary kernel.


for determining if a virtual address lies in the linear map range.

I think this is a fragile example. The debugger shouldn't need to know
this.

Well, that's the current user-space utility design, so I am not sure we can
tweak that too much.


The user-space computation for determining whether an address lies in
the linear map range is the same as we have in kernel-space:

 #define __is_lm_address(addr)	(!(((u64)addr) & BIT(vabits_actual - 1)))

This was changed with 14c127c957c1 ("arm64: mm: Flip kernel VA space"). If
user-space tools rely on 'knowing' the kernel memory layout, they will have
to constantly be fixed and updated. This is a poor argument for adding this
to something that ends up as ABI.

See above. The user-space has to rely on some ABI/guaranteed hardware
symbols which can be used for 'determining' the kernel memory layout.

I disagree. Everything and anything in the kernel will change. The ABI
rules apply to stuff exposed via syscalls and kernel filesystems. It does
not apply to kernel internals, like the memory layout we used yesterday.
14c127c957c1 is a case in point.

A debugger trying to rely on this sort of thing would have to play catchup
whenever it changes.

Exactly.  That's the whole point.

The crash utility and makedumpfile are not in the same league as other
user-space tools. They have always had to "play catchup" precisely because
they depend upon kernel internals, which constantly change.

I agree with you and DaveA here. Software user-space debuggers are
dependent on kernel internals (which can change from time-to-time) and
will have to play catch-up (which has been the case since the very start).

Unfortunately we don't have any clear ABI for software debugging tools -
may be something to look for in future.

A case in point is gdb/kgdb, which still needs to run with KASLR turned off
(nokaslr) for debugging, as KASLR confuses gdb, which resolves kernel symbol
addresses from the symbol table of vmlinux. But we can work around the same
in makedumpfile/crash by reading the 'kaslr_offset' value. And I have
several users telling me now that they cannot use gdb on a KASLR-enabled
kernel to debug panics, but can use the makedumpfile + crash combination to
achieve the same.

So, we should be looking to fix these utilities which are broken since
the 52-bit changes for arm64. Accordingly, I will try to send the v6
soon while incorporating the comments posted on the v5.

Any update on the next v6 version? Since this patch series fixes the
currently broken kdump, we need it in order to add some more fields in
vmcoreinfo for the Pointer Authentication work.

Sorry for the delay. I was caught up in some other urgent arm64
user-space issues.
I am preparing the v6 now and hopefully will be able to post it out
for review later today.


Did v6 get sent out?



Thanks,
Bhupesh



Regards,
Scott
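
For reference, the arithmetic the tools need is small: the architecture
defines the TTBR1 VA size as 64 - T1SZ, and the linear-map check quoted
earlier in the thread follows from it. A user-space sketch -- the vmcoreinfo
plumbing is omitted and the helper names are illustrative:

/*
 * Sketch: derive vabits_actual from the TCR_EL1.T1SZ value exported in
 * vmcoreinfo, then apply the linear-map test quoted above.
 */
#include <stdbool.h>
#include <stdint.h>

static inline unsigned int vabits_actual_from_t1sz(unsigned int t1sz)
{
	return 64 - t1sz;	/* e.g. T1SZ=12 -> 52-bit VA space */
}

static inline bool is_lm_address(uint64_t addr, unsigned int vabits_actual)
{
	/* mirrors __is_lm_address() after commit 14c127c957c1 */
	return !(addr & (1ULL << (vabits_actual - 1)));
}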


Re: [PATCH 2/2] powerpc/spufs: stop using access_ok

2020-04-29 Thread Jeremy Kerr
Hi Christophe,

> > Just use the proper non __-prefixed get/put_user variants where
> > that is not done yet.
> 
> But it means you are doing the access_ok() check everytime, which is 
> what is to be avoided by doing the access_ok() once then using the 
> __-prefixed variant.

5 out of 8 of these are just an access_ok(); simple_read_from_buffer().

For the cases where it's multiple __put/get_user()s, the max will be 5.
(for the mbox access). Is that worth optimising the access_ok() checks?

Cheers,


Jeremy
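
For readers skimming the thread, the two patterns under discussion look
roughly like this -- a sketch using the kernel uaccess API, not the spufs
code itself:

#include <linux/types.h>
#include <linux/uaccess.h>

/* Pattern A: one access_ok() up front, then unchecked copies. */
static int copy_words_prechecked(u32 __user *udata, u32 *data, int n)
{
	int i;

	if (!access_ok(udata, n * sizeof(u32)))
		return -EFAULT;
	for (i = 0; i < n; i++)
		if (__put_user(data[i], udata + i))
			return -EFAULT;
	return 0;
}

/* Pattern B: a checked copy on every iteration (at most ~5 here). */
static int copy_words_checked(u32 __user *udata, u32 *data, int n)
{
	int i;

	for (i = 0; i < n; i++)
		if (put_user(data[i], udata + i))
			return -EFAULT;
	return 0;
}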



Re: [PATCH v5 0/5] Track and expose idle PURR and SPURR ticks

2020-04-29 Thread Michael Ellerman
Gautham R Shenoy  writes:
> On Mon, Apr 20, 2020 at 03:46:35PM -0700, Tyrel Datwyler wrote:
>> On 4/7/20 1:47 AM, Gautham R. Shenoy wrote:
>> > From: "Gautham R. Shenoy" 
>> > 
>> > Hi,
>> > 
>> > This is the fifth version of the patches to track and expose idle PURR
>> > and SPURR ticks. These patches are required by tools such as lparstat
>> > to compute system utilization for capacity planning purposes.
...
>> > 
>> > Gautham R. Shenoy (5):
>> >   powerpc: Move idle_loop_prolog()/epilog() functions to header file
>> >   powerpc/idle: Store PURR snapshot in a per-cpu global variable
>> >   powerpc/pseries: Account for SPURR ticks on idle CPUs
>> >   powerpc/sysfs: Show idle_purr and idle_spurr for every CPU
>> >   Documentation: Document sysfs interfaces purr, spurr, idle_purr,
>> > idle_spurr
>> > 
>> >  Documentation/ABI/testing/sysfs-devices-system-cpu | 39 +
>> >  arch/powerpc/include/asm/idle.h| 93 
>> > ++
>> >  arch/powerpc/kernel/sysfs.c| 82 
>> > ++-
>> >  arch/powerpc/platforms/pseries/setup.c |  8 +-
>> >  drivers/cpuidle/cpuidle-pseries.c  | 39 ++---
>> >  5 files changed, 224 insertions(+), 37 deletions(-)
>> >  create mode 100644 arch/powerpc/include/asm/idle.h
>> > 
>> 
>> Reviewed-by: Tyrel Datwyler 
>
> Thanks for reviewing the patches.
>
>> 
>> Any chance this is going to be merged in the near future? There is a 
>> patchset to
>> update lparstat in the powerpc-utils package to calculate PURR/SPURR cpu
>> utilization that I would like to merge, but have been holding off to make 
>> sure
>> we are synced with this proposed patchset.
>
> Michael, could you please consider this for 5.8 ?

Yes. Has it been tested on KVM at all?

cheers


Re: [RFC PATCH v2 7/7] powerpc/selftest: reuse ppc-opcode macros to avoid redundancy

2020-04-29 Thread Michael Ellerman
"Naveen N. Rao"  writes:
> Michael Ellerman wrote:
>> Balamuruhan S  writes:
>>> Avoid redefining macros to encode ppc instructions instead reuse it from
>>> ppc-opcode.h, Makefile changes are necessary to compile memcmp_64.S with
>>> __ASSEMBLY__ defined from selftests.
>>>
>>> Signed-off-by: Balamuruhan S 
>>> ---
>>>  .../selftests/powerpc/stringloops/Makefile| 34 ++
>>>  .../powerpc/stringloops/asm/asm-const.h   |  1 +
>>>  .../powerpc/stringloops/asm/ppc-opcode.h  | 36 +--
>>>  3 files changed, 29 insertions(+), 42 deletions(-)
>>>  create mode 120000 tools/testing/selftests/powerpc/stringloops/asm/asm-const.h
>>>  mode change 100644 => 120000 tools/testing/selftests/powerpc/stringloops/asm/ppc-opcode.h
>>>
>>> diff --git a/tools/testing/selftests/powerpc/stringloops/Makefile 
>>> b/tools/testing/selftests/powerpc/stringloops/Makefile
>>> index 7fc0623d85c3..efe76c5a5b94 100644
>>> --- a/tools/testing/selftests/powerpc/stringloops/Makefile
>>> +++ b/tools/testing/selftests/powerpc/stringloops/Makefile
>>> @@ -1,26 +1,44 @@
>>>  # SPDX-License-Identifier: GPL-2.0
>>>  # The loops are all 64-bit code
>>> -CFLAGS += -I$(CURDIR)
>>> +GIT_VERSION = $(shell git describe --always --long --dirty || echo 
>>> "unknown")
>>> +CFLAGS += -DGIT_VERSION='"$(GIT_VERSION)"' -I$(CURDIR) 
>>> -I$(CURDIR)/../include
>>>  
>>>  EXTRA_SOURCES := ../harness.c
>>>  
>>>  build_32bit = $(shell if ($(CC) $(CFLAGS) -m32 -o /dev/null memcmp.c 
>>> >/dev/null 2>&1) then echo "1"; fi)
>>>  
>>> +ifneq ($(build_32bit),1)
>>>  TEST_GEN_PROGS := memcmp_64 strlen
>>> +TEST_GEN_FILES := memcmp.o memcmp_64.o memcmp_64
>>> +MEMCMP := $(OUTPUT)/memcmp.o
>>> +MEMCMP_64 := $(OUTPUT)/memcmp_64.o
>>> +HARNESS :=  $(OUTPUT)/../harness.o
>>> +CFLAGS += -m64 -maltivec
>>>  
>>> -$(OUTPUT)/memcmp_64: memcmp.c
>>> -$(OUTPUT)/memcmp_64: CFLAGS += -m64 -maltivec
>>> +OVERRIDE_TARGETS := 1
>>> +include ../../lib.mk
>>>  
>>> -ifeq ($(build_32bit),1)
>>> +$(OUTPUT)/memcmp_64: $(MEMCMP_64) $(MEMCMP) $(HARNESS)
>>> +   $(CC) $(CFLAGS) memcmp.o memcmp_64.o ../harness.o -o memcmp_64
>>> +
>>> +$(MEMCMP_64): memcmp_64.S
>>> +   $(CC) $(CFLAGS) -D__ASSEMBLY__ -o memcmp_64.o -c memcmp_64.S
>>> +
>>> +$(MEMCMP): memcmp.c
>>> +   $(CC) $(CFLAGS) -o memcmp.o -c memcmp.c
>>> +
>>> +$(HARNESS): $(EXTRA_SOURCES)
>>> +   $(CC) $(CFLAGS) -DGIT_VERSION='"$(GIT_VERSION)"' -o ../harness.o -c 
>>> $(EXTRA_SOURCES)
>> 
>> What are you actually trying to do here? Is it just that you need to
>> define __ASSEMBLY__ for memcmp_64.S?
>
> Adding __ASSEMBLY__ while building memcmp_64.S would be the goal, so as 
> to reuse ppc-opcode.h. However, asm/ppc-opcode.h under stringloops test 
> is tiny and doesn't seem to justify the change.

I don't see ppc-opcode.h testing __ASSEMBLY__ though, so I don't think
we even need to define it?

cheers


Re: [musl] Re: New powerpc vdso calling convention

2020-04-29 Thread Michael Ellerman
Rich Felker  writes:
> On Sat, Apr 25, 2020 at 08:56:54PM +1000, Nicholas Piggin wrote:
>> >> The ELF v2 ABI convention would suit it well, because the caller already
>> >> requires the function address for ctr, so having it in r12 will
>> >> eliminate the need for address calculation, which suits the vdso data
>> >> page access.
>> >> 
>> >> Is there a need for ELF v1 specific calls as well, or could those just be
>> >> deprecated and remain on existing functions or required to use the ELF
>> >> v2 calls using asm wrappers?
>> > 
>> > What's ELF v1 and ELF v2 ? Is ELF v1 what PPC32 uses ? If so, I'd say 
>> > yes, it would be good to have it to avoid going through ASM in the middle..
>> 
>> I'm not sure about PPC32. On PPC64, ELFv2 functions must be called with 
>> their address in r12 if called at their global entry point. ELFv1 have a 
>> function descriptor with call address and TOC in it, caller has to load 
>> the TOC if it's global.
>> 
>> The vdso doesn't have TOC, it has one global address (the vdso data 
>> page) which it loads by calculating its own address.
>
> A function descriptor could be put in the VDSO data page, or as it's
> done now by glibc the vdso linkage code could create it. My leaning is
> to at least have a version of the code that's callable (with the right
> descriptor around it) by v1 binaries, but since musl does not use
> ELFv1 at all we really have no stake in this and I'm fine with
> whatever outcome users of v1 decide on.
>
>> The kernel doesn't change the vdso based on whether it's called by a v1 
>> or v2 userspace (it doesn't really know itself and would have to export 
>> different functions). glibc has a hack to create something:
>
> I'm pretty sure it does know because signal invocation has to know
> whether the function pointer points to a descriptor or code. At least
> for FDPIC archs (similar to PPC64 ELFv1 function descriptors) it knows
> and has to know.

It does know, see TIF_ELF2ABI which is tested by is_elf2_task(), and as
you say is used by the signal delivery code.

Currently the VDSO entry points are not functions, so they don't need to
change based on the ABI.

cheers
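
For context, an ELFv1 function descriptor is three doublewords (entry
address, TOC pointer, environment pointer), and an ELFv1 function pointer is
the address of the descriptor. A sketch of the glibc-style wrapping
mentioned above -- the struct and helper names here are invented:

#include <stdint.h>

struct elfv1_func_desc {
	uint64_t entry;	/* code address of the function */
	uint64_t toc;	/* TOC pointer to load into r2 */
	uint64_t env;	/* environment pointer (unused by C) */
};

static struct elfv1_func_desc vdso_desc;

/* Hypothetical: build a callable descriptor around a raw vdso entry. */
static void *make_elfv1_callable(uint64_t vdso_entry)
{
	vdso_desc.entry = vdso_entry;
	vdso_desc.toc = 0;	/* the vdso computes its own data address */
	vdso_desc.env = 0;
	return &vdso_desc;	/* an ELFv1 function pointer is &descriptor */
}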


[RFC PATCH 0/2] powerpc/64s: scv support

2020-04-29 Thread Nicholas Piggin
Another round of scv, which is getting closer to done. ABI and
compatibility / feature testing still not set in stone, but some
good discussion among the various libcs etc. and it's close enough
that changes should just be small tweaks to clobbers etc. Posting
now because there is some interest to prototype userspace support
which we should do before fixing the ABI.

This relies on some of the signal handling and kuap patches I
already posted, so tree is here:

https://github.com/npiggin/linux/commits/next-test

I have qemu scv support patches I need to resend, but they're not
merged yet. The POWER9 system simulator should support it, but I have
not tested the public version:

https://www14.software.ibm.com/support/customercare/sas/f/pwrfs/pwr9/home.html

Thanks,
Nick

Nicholas Piggin (2):
  powerpc/64s/exception: treat NIA below __end_interrupts as soft-masked
  powerpc/64s: system call support for scv/rfscv instructions

 Documentation/powerpc/syscall64-abi.rst   |  42 --
 arch/powerpc/include/asm/asm-prototypes.h |   2 +-
 arch/powerpc/include/asm/exception-64s.h  |   6 +
 arch/powerpc/include/asm/head-64.h|   2 +-
 arch/powerpc/include/asm/ppc-opcode.h |   2 +
 arch/powerpc/include/asm/ppc_asm.h|   2 +
 arch/powerpc/include/asm/processor.h  |   2 +-
 arch/powerpc/include/asm/ptrace.h |   8 +-
 arch/powerpc/include/asm/setup.h  |   4 +-
 arch/powerpc/include/asm/sstep.h  |   1 +
 arch/powerpc/include/asm/vdso.h   |   1 +
 arch/powerpc/kernel/cpu_setup_power.S |   2 +-
 arch/powerpc/kernel/cputable.c|   3 +-
 arch/powerpc/kernel/dt_cpu_ftrs.c |   1 +
 arch/powerpc/kernel/entry_64.S| 158 +-
 arch/powerpc/kernel/exceptions-64s.S  | 150 +++-
 arch/powerpc/kernel/process.c |  10 +-
 arch/powerpc/kernel/setup_64.c|   5 +-
 arch/powerpc/kernel/signal.c  |  19 ++-
 arch/powerpc/kernel/signal_64.c   |  28 +++-
 arch/powerpc/kernel/syscall_64.c  |  32 +++--
 arch/powerpc/kernel/vdso.c|   2 +
 arch/powerpc/kernel/vdso64/sigtramp.S |  34 -
 arch/powerpc/kernel/vdso64/vdso64.lds.S   |   1 +
 arch/powerpc/lib/sstep.c  |  14 ++
 arch/powerpc/perf/callchain_64.c  |   9 +-
 arch/powerpc/platforms/pseries/setup.c|   8 +-
 arch/powerpc/xmon/xmon.c  |   1 +
 28 files changed, 492 insertions(+), 57 deletions(-)

-- 
2.23.0
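
To prototype userspace support against this, the call sequences look roughly
as below. This is a sketch only: the scv ABI is explicitly not final, the
clobber lists are a conservative guess, and error reporting is assumed to be
cr0.SO plus a positive errno in r3 for sc versus a negative return value for
scv 0.

/* Prototype-only sketch; do not treat these as the final ABI. */
static long sc_getpid(void)
{
	register long r0 __asm__("r0") = 20;	/* __NR_getpid */
	register long r3 __asm__("r3");
	long cr;

	__asm__ volatile("sc\n\t"
			 "mfcr %1"		/* capture cr0.SO after sc */
			 : "=r"(r3), "=r"(cr), "+r"(r0)
			 :
			 : "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
			   "r12", "ctr", "xer", "cr0", "memory");
	return (cr & 0x10000000) ? -r3 : r3;	/* cr0.SO set => error */
}

static long scv_getpid(void)
{
	register long r0 __asm__("r0") = 20;
	register long r3 __asm__("r3");

	/* scv does not preserve lr, hence the extra clobber. */
	__asm__ volatile("scv 0"
			 : "=r"(r3), "+r"(r0)
			 :
			 : "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11",
			   "r12", "lr", "ctr", "xer", "cr0", "memory");
	return r3;	/* negative errno on failure (proposed) */
}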



[RFC PATCH 1/2] powerpc/64s/exception: treat NIA below __end_interrupts as soft-masked

2020-04-29 Thread Nicholas Piggin
The scv instruction causes an interrupt which can enter the kernel with
MSR[EE]=1, thus allowing interrupts to hit at any time. These must not
be taken as normal interrupts, because they come from MSR[PR]=0 context,
and yet the kernel stack is not yet set up and r13 is not set to the
PACA.

Treat this as a soft-masked interrupt regardless of the soft masked
state. This does not affect behaviour yet, because currently all
interrupts are taken with MSR[EE]=0.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/exceptions-64s.S | 27 ---
 1 file changed, 24 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index ef4a90212664..8dfebf6c6a1e 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -508,8 +508,24 @@ DEFINE_FIXED_SYMBOL(\name\()_common_real)
 
 .macro __GEN_COMMON_BODY name
.if IMASK
+   .if ! ISTACK
+   .error "No support for masked interrupt to use custom stack"
+   .endif
+
+   /* If coming from user, skip soft-mask tests. */
+   andi.   r10,r12,MSR_PR
+   bne 2f
+
+   /* Kernel code running below __end_interrupts is implicitly
+* soft-masked */
+   LOAD_HANDLER(r10, __end_interrupts)
+   cmpdr11,r10
+   li  r10,IMASK
+   blt-1f
+
+   /* Test the soft mask state against our interrupt's bit */
lbz r10,PACAIRQSOFTMASK(r13)
-   andi.   r10,r10,IMASK
+1: andi.   r10,r10,IMASK
/* Associate vector numbers with bits in paca->irq_happened */
.if IVEC == 0x500 || IVEC == 0xea0
li  r10,PACA_IRQ_EE
@@ -540,7 +556,7 @@ DEFINE_FIXED_SYMBOL(\name\()_common_real)
 
.if ISTACK
andi.   r10,r12,MSR_PR  /* See if coming from user  */
-   mr  r10,r1  /* Save r1  */
+2: mr  r10,r1  /* Save r1  */
subir1,r1,INT_FRAME_SIZE/* alloc frame on kernel stack  */
beq-100f
ld  r1,PACAKSAVE(r13)   /* kernel stack to use  */
@@ -2811,7 +2827,8 @@ masked_interrupt:
ld  r10,PACA_EXGEN+EX_R10(r13)
ld  r11,PACA_EXGEN+EX_R11(r13)
ld  r12,PACA_EXGEN+EX_R12(r13)
-   /* returns to kernel where r13 must be set up, so don't restore it */
+   ld  r13,PACA_EXGEN+EX_R13(r13)
+   /* May return to masked low address where r13 is not set up */
.if \hsrr
HRFI_TO_KERNEL
.else
@@ -2970,6 +2987,10 @@ EXC_COMMON_BEGIN(ppc64_runlatch_on_trampoline)
 
 USE_FIXED_SECTION(virt_trampolines)
/*
+* All code below __end_interrupts is treated as soft-masked. If
+* any code runs here with MSR[EE]=1, it must then cope with pending
+* soft interrupt being raised (i.e., by ensuring it is replayed).
+*
 * The __end_interrupts marker must be past the out-of-line (OOL)
 * handlers, so that they are copied to real address 0x100 when running
 * a relocatable kernel. This ensures they can be reached from the short
-- 
2.23.0



[RFC PATCH 2/2] powerpc/64s: system call support for scv/rfscv instructions

2020-04-29 Thread Nicholas Piggin
Add support for the scv instruction on POWER9 and later CPUs.

For now this implements the zeroth scv vector, 'scv 0', as identical
to 'sc' system calls, with the exceptions that lr is not preserved and
it is 64-bit only. There may yet be changes made to this ABI, so it is
for testing only.

rfscv is implemented to return from scv-type system calls. It cannot
be used to return from sc system calls because those are defined to
preserve lr.

In a getpid syscall comparison, the test program had scv taking about
3 more cycles in user mode (92 vs 89 for sc) due to lr handling, while
getpid syscall throughput on POWER9 improved by 33%, mostly due to
the reduction in mtmsr and mtspr instructions.
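
For illustration (not part of the patch), user code could detect and
invoke scv 0 roughly as below. The PPC_FEATURE2_SCV value and the
clobber list are my assumptions from this proposal; it also assumes a
binutils new enough to assemble scv and a kernel with this series:

#include <stdio.h>
#include <sys/auxv.h>

#ifndef PPC_FEATURE2_SCV
#define PPC_FEATURE2_SCV 0x00100000 /* assumed AT_HWCAP2 bit for scv */
#endif

/*
 * Minimal 'scv 0' getpid. The syscall number goes in r0, the result
 * comes back in r3 as a negated errno on failure, and lr is not
 * preserved (hence the "lr" clobber).
 */
static long getpid_scv(void)
{
	register long r0 asm("r0") = 20; /* __NR_getpid on powerpc64 */
	register long r3 asm("r3");

	asm volatile("scv 0"
		     : "=r" (r3)
		     : "r" (r0)
		     : "lr", "ctr", "memory",
		       "cr0", "cr1", "cr5", "cr6", "cr7",
		       "r4", "r5", "r6", "r7", "r8",
		       "r9", "r10", "r11", "r12");
	return r3;
}

int main(void)
{
	if (getauxval(AT_HWCAP2) & PPC_FEATURE2_SCV)
		printf("pid via scv: %ld\n", getpid_scv());
	else
		printf("scv 0 not advertised; fall back to sc\n");
	return 0;
}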

Signed-off-by: Nicholas Piggin 
---
 Documentation/powerpc/syscall64-abi.rst   |  42 --
 arch/powerpc/include/asm/asm-prototypes.h |   2 +-
 arch/powerpc/include/asm/exception-64s.h  |   6 +
 arch/powerpc/include/asm/head-64.h|   2 +-
 arch/powerpc/include/asm/ppc-opcode.h |   2 +
 arch/powerpc/include/asm/ppc_asm.h|   2 +
 arch/powerpc/include/asm/processor.h  |   2 +-
 arch/powerpc/include/asm/ptrace.h |   8 +-
 arch/powerpc/include/asm/setup.h  |   4 +-
 arch/powerpc/include/asm/sstep.h  |   1 +
 arch/powerpc/include/asm/vdso.h   |   1 +
 arch/powerpc/kernel/cpu_setup_power.S |   2 +-
 arch/powerpc/kernel/cputable.c|   3 +-
 arch/powerpc/kernel/dt_cpu_ftrs.c |   1 +
 arch/powerpc/kernel/entry_64.S| 158 +-
 arch/powerpc/kernel/exceptions-64s.S  | 123 -
 arch/powerpc/kernel/process.c |  10 +-
 arch/powerpc/kernel/setup_64.c|   5 +-
 arch/powerpc/kernel/signal.c  |  19 ++-
 arch/powerpc/kernel/signal_64.c   |  28 +++-
 arch/powerpc/kernel/syscall_64.c  |  32 +++--
 arch/powerpc/kernel/vdso.c|   2 +
 arch/powerpc/kernel/vdso64/sigtramp.S |  34 -
 arch/powerpc/kernel/vdso64/vdso64.lds.S   |   1 +
 arch/powerpc/lib/sstep.c  |  14 ++
 arch/powerpc/perf/callchain_64.c  |   9 +-
 arch/powerpc/platforms/pseries/setup.c|   8 +-
 arch/powerpc/xmon/xmon.c  |   1 +
 28 files changed, 468 insertions(+), 54 deletions(-)

diff --git a/Documentation/powerpc/syscall64-abi.rst 
b/Documentation/powerpc/syscall64-abi.rst
index e49f69f941b9..6f311ad37211 100644
--- a/Documentation/powerpc/syscall64-abi.rst
+++ b/Documentation/powerpc/syscall64-abi.rst
@@ -5,6 +5,15 @@ Power Architecture 64-bit Linux system call ABI
 syscall
 ===
 
+Invocation
+--
+The syscall is made with the sc instruction, and returns with execution
+continuing at the instruction following the sc instruction.
+
+If PPC_FEATURE2_SCV appears in the AT_HWCAP2 ELF auxiliary vector, the
+scv 0 instruction is an alternative that may provide better performance,
+with some differences to calling sequence.
+
 syscall calling sequence\ [1]_ matches the Power Architecture 64-bit ELF ABI
 specification C function calling sequence, including register preservation
 rules, with the following differences.
@@ -12,16 +21,23 @@ rules, with the following differences.
 .. [1] Some syscalls (typically low-level management functions) may have
different calling sequences (e.g., rt_sigreturn).
 
-Parameters and return value

+Parameters
+--
 The system call number is specified in r0.
 
 There is a maximum of 6 integer parameters to a syscall, passed in r3-r8.
 
-Both a return value and a return error code are returned. cr0.SO is the return
-error code, and r3 is the return value or error code. When cr0.SO is clear,
-the syscall succeeded and r3 is the return value. When cr0.SO is set, the
-syscall failed and r3 is the error code that generally corresponds to errno.
+Return value
+
+- For the sc instruction, both a return value and a return error code are
+  returned. cr0.SO is the return error code, and r3 is the return value or
+  error code. When cr0.SO is clear, the syscall succeeded and r3 is the return
+  value. When cr0.SO is set, the syscall failed and r3 is the error code that
+  generally corresponds to errno.
+
+- For the scv 0 instruction, the return value indicates failure if it is
+  >= -MAX_ERRNO (-4095) as an unsigned comparison, in which case it is the
+  negated return error code. Otherwise it is the successful return value.
 
 Stack
 -
@@ -34,22 +50,23 @@ Register preservation rules match the ELF ABI calling 
sequence with the
 following differences:
 
 === = 
+--- For the sc instruction ---
 r0  Volatile  (System call number.)
 r3  Volatile  (Parameter 1, and return value.)
 r4-r8   Volatile  (Parameters 2-6.)
-cr0 Volatile  (cr0.SO is the return error condition)
+cr0 Volatile  (cr0.SO is the return error condition.)
 cr1, cr5-7  Nonvolatile
 lr  

Re: [PATCH v5 0/5] Track and expose idle PURR and SPURR ticks

2020-04-29 Thread Gautham R Shenoy
Hello Michael,

On Thu, Apr 30, 2020 at 12:34:52PM +1000, Michael Ellerman wrote:
> Gautham R Shenoy  writes:
> > On Mon, Apr 20, 2020 at 03:46:35PM -0700, Tyrel Datwyler wrote:
> >> On 4/7/20 1:47 AM, Gautham R. Shenoy wrote:
> >> > From: "Gautham R. Shenoy" 
> >> > 
> >> > Hi,
> >> > 
> >> > This is the fifth version of the patches to track and expose idle PURR
> >> > and SPURR ticks. These patches are required by tools such as lparstat
> >> > to compute system utilization for capacity planning purposes.
> ...
> >> > 
> >> > Gautham R. Shenoy (5):
> >> >   powerpc: Move idle_loop_prolog()/epilog() functions to header file
> >> >   powerpc/idle: Store PURR snapshot in a per-cpu global variable
> >> >   powerpc/pseries: Account for SPURR ticks on idle CPUs
> >> >   powerpc/sysfs: Show idle_purr and idle_spurr for every CPU
> >> >   Documentation: Document sysfs interfaces purr, spurr, idle_purr,
> >> > idle_spurr
> >> > 
> >> >  Documentation/ABI/testing/sysfs-devices-system-cpu | 39 +
> >> >  arch/powerpc/include/asm/idle.h| 93 
> >> > ++
> >> >  arch/powerpc/kernel/sysfs.c| 82 
> >> > ++-
> >> >  arch/powerpc/platforms/pseries/setup.c |  8 +-
> >> >  drivers/cpuidle/cpuidle-pseries.c  | 39 ++---
> >> >  5 files changed, 224 insertions(+), 37 deletions(-)
> >> >  create mode 100644 arch/powerpc/include/asm/idle.h
> >> > 
> >> 
> >> Reviewed-by: Tyrel Datwyler 
> >
> > Thanks for reviewing the patches.
> >
> >> 
> >> Any chance this is going to be merged in the near future? There is a 
> >> patchset to
> >> update lparstat in the powerpc-utils package to calculate PURR/SPURR cpu
> >> utilization that I would like to merge, but have been holding off to make 
> >> sure
> >> we are synced with this proposed patchset.
> >
> > Michael, could you please consider this for 5.8 ?
> 
> Yes. Has it been tested on KVM at all?

No. I haven't tested this on KVM. Will do that today.


> 
> cheers

--
Thanks and Regards
gautham.


[PATCH v4 01/16] powerpc/watchpoint: Rename current DAWR macros

2020-04-29 Thread Ravi Bangoria
A future Power architecture is introducing a second DAWR. Use the real
register names from the ISA for the current macros:
  s/SPRN_DAWR/SPRN_DAWR0/
  s/SPRN_DAWRX/SPRN_DAWRX0/

Signed-off-by: Ravi Bangoria 
Reviewed-by: Michael Neuling 
---
 arch/powerpc/include/asm/reg.h  |  4 ++--
 arch/powerpc/kernel/dawr.c  |  4 ++--
 arch/powerpc/kvm/book3s_hv.c| 12 ++--
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 18 +-
 arch/powerpc/xmon/xmon.c|  2 +-
 5 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index da5cab038e25..156ee89fa9be 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -283,14 +283,14 @@
 #define   CTRL_CT1 0x4000  /* thread 1 */
 #define   CTRL_TE  0x00c0  /* thread enable */
 #define   CTRL_RUNLATCH0x1
-#define SPRN_DAWR  0xB4
+#define SPRN_DAWR0 0xB4
 #define SPRN_RPR   0xBA/* Relative Priority Register */
 #define SPRN_CIABR 0xBB
 #define   CIABR_PRIV   0x3
 #define   CIABR_PRIV_USER  1
 #define   CIABR_PRIV_SUPER 2
 #define   CIABR_PRIV_HYPER 3
-#define SPRN_DAWRX 0xBC
+#define SPRN_DAWRX00xBC
 #define   DAWRX_USER   __MASK(0)
 #define   DAWRX_KERNEL __MASK(1)
 #define   DAWRX_HYP__MASK(2)
diff --git a/arch/powerpc/kernel/dawr.c b/arch/powerpc/kernel/dawr.c
index cc14aa6c4a1b..e91b613bf137 100644
--- a/arch/powerpc/kernel/dawr.c
+++ b/arch/powerpc/kernel/dawr.c
@@ -39,8 +39,8 @@ int set_dawr(struct arch_hw_breakpoint *brk)
if (ppc_md.set_dawr)
return ppc_md.set_dawr(dawr, dawrx);
 
-   mtspr(SPRN_DAWR, dawr);
-   mtspr(SPRN_DAWRX, dawrx);
+   mtspr(SPRN_DAWR0, dawr);
+   mtspr(SPRN_DAWRX0, dawrx);
 
return 0;
 }
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 93493f0cbfe8..db07199f0977 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3392,8 +3392,8 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu 
*vcpu, u64 time_limit,
int trap;
unsigned long host_hfscr = mfspr(SPRN_HFSCR);
unsigned long host_ciabr = mfspr(SPRN_CIABR);
-   unsigned long host_dawr = mfspr(SPRN_DAWR);
-   unsigned long host_dawrx = mfspr(SPRN_DAWRX);
+   unsigned long host_dawr = mfspr(SPRN_DAWR0);
+   unsigned long host_dawrx = mfspr(SPRN_DAWRX0);
unsigned long host_psscr = mfspr(SPRN_PSSCR);
unsigned long host_pidr = mfspr(SPRN_PID);
 
@@ -3422,8 +3422,8 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu 
*vcpu, u64 time_limit,
mtspr(SPRN_SPURR, vcpu->arch.spurr);
 
if (dawr_enabled()) {
-   mtspr(SPRN_DAWR, vcpu->arch.dawr);
-   mtspr(SPRN_DAWRX, vcpu->arch.dawrx);
+   mtspr(SPRN_DAWR0, vcpu->arch.dawr);
+   mtspr(SPRN_DAWRX0, vcpu->arch.dawrx);
}
mtspr(SPRN_CIABR, vcpu->arch.ciabr);
mtspr(SPRN_IC, vcpu->arch.ic);
@@ -3475,8 +3475,8 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu 
*vcpu, u64 time_limit,
  (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG));
mtspr(SPRN_HFSCR, host_hfscr);
mtspr(SPRN_CIABR, host_ciabr);
-   mtspr(SPRN_DAWR, host_dawr);
-   mtspr(SPRN_DAWRX, host_dawrx);
+   mtspr(SPRN_DAWR0, host_dawr);
+   mtspr(SPRN_DAWRX0, host_dawrx);
mtspr(SPRN_PID, host_pidr);
 
/*
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 780a499c7114..70de3325d0e9 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -707,8 +707,8 @@ BEGIN_FTR_SECTION
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
 BEGIN_FTR_SECTION
mfspr   r5, SPRN_CIABR
-   mfspr   r6, SPRN_DAWR
-   mfspr   r7, SPRN_DAWRX
+   mfspr   r6, SPRN_DAWR0
+   mfspr   r7, SPRN_DAWRX0
mfspr   r8, SPRN_IAMR
std r5, STACK_SLOT_CIABR(r1)
std r6, STACK_SLOT_DAWR(r1)
@@ -803,8 +803,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
beq 1f
ld  r5, VCPU_DAWR(r4)
ld  r6, VCPU_DAWRX(r4)
-   mtspr   SPRN_DAWR, r5
-   mtspr   SPRN_DAWRX, r6
+   mtspr   SPRN_DAWR0, r5
+   mtspr   SPRN_DAWRX0, r6
 1:
ld  r7, VCPU_CIABR(r4)
ld  r8, VCPU_TAR(r4)
@@ -1766,8 +1766,8 @@ BEGIN_FTR_SECTION
 * If the DAWR doesn't work, it's ok to write these here as
 * this value should always be zero
*/
-   mtspr   SPRN_DAWR, r6
-   mtspr   SPRN_DAWRX, r7
+   mtspr   SPRN_DAWR0, r6
+   mtspr   SPRN_DAWRX0, r7
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 BEGIN_FTR_SECTION
ld  r5, STACK_SLOT_TID(r1)
@@ -2577,8 +2577,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
mfmsr   r6
andi.   r6, r6, MSR_DR  /* in real mode? */
bne 4f

[PATCH v4 00/16] powerpc/watchpoint: Preparation for more than one watchpoint

2020-04-29 Thread Ravi Bangoria
So far, powerpc Book3S code has been written with the assumption of
only one watchpoint. But a future Power architecture is introducing a
second watchpoint register (DAWR). Even though this patchset does not
enable the 2nd DAWR, it makes the infrastructure ready so that enabling
the 2nd DAWR should just be a matter of changing a count.

Existing functionality works fine with the patchset. I've tested it
with perf, ptrace (gdb) and xmon. All hw-breakpoint selftests are
passing as well, and I've build-tested for 8xx and 'AMCC 44x, 46x or
47x'.

Note: kvm or PowerVM guest is not enabled yet.

v3: 
https://lore.kernel.org/linuxppc-dev/20200414031659.58875-1-ravi.bango...@linux.ibm.com

v3->v4:
 - Reduce the scope of local variables to increase readability at some
   places, suggested by Christophe.
 - Added Michael Neuling's Reviewed by for the series.
 - Rebased to powerpc/next

Ravi Bangoria (16):
  powerpc/watchpoint: Rename current DAWR macros
  powerpc/watchpoint: Add SPRN macros for second DAWR
  powerpc/watchpoint: Introduce function to get nr watchpoints
dynamically
  powerpc/watchpoint/ptrace: Return actual num of available watchpoints
  powerpc/watchpoint: Provide DAWR number to set_dawr
  powerpc/watchpoint: Provide DAWR number to __set_breakpoint
  powerpc/watchpoint: Get watchpoint count dynamically while disabling
them
  powerpc/watchpoint: Disable all available watchpoints when
!dawr_force_enable
  powerpc/watchpoint: Convert thread_struct->hw_brk to an array
  powerpc/watchpoint: Use loop for thread_struct->ptrace_bps
  powerpc/watchpoint: Introduce is_ptrace_bp() function
  powerpc/watchpoint: Use builtin ALIGN*() macros
  powerpc/watchpoint: Prepare handler to handle more than one
watchpoint
  powerpc/watchpoint: Don't allow concurrent perf and ptrace events
  powerpc/watchpoint/xmon: Don't allow breakpoint overwriting
  powerpc/watchpoint/xmon: Support 2nd dawr

 arch/powerpc/include/asm/cputable.h   |   6 +-
 arch/powerpc/include/asm/debug.h  |   2 +-
 arch/powerpc/include/asm/hw_breakpoint.h  |  32 +-
 arch/powerpc/include/asm/processor.h  |   6 +-
 arch/powerpc/include/asm/reg.h|   6 +-
 arch/powerpc/include/asm/sstep.h  |   2 +
 arch/powerpc/kernel/dawr.c|  23 +-
 arch/powerpc/kernel/hw_breakpoint.c   | 645 ++
 arch/powerpc/kernel/process.c |  85 +--
 arch/powerpc/kernel/ptrace/ptrace-noadv.c |  72 ++-
 arch/powerpc/kernel/ptrace/ptrace32.c |   4 +-
 arch/powerpc/kernel/signal.c  |  13 +-
 arch/powerpc/kvm/book3s_hv.c  |  12 +-
 arch/powerpc/kvm/book3s_hv_rmhandlers.S   |  18 +-
 arch/powerpc/xmon/xmon.c  |  99 +++-
 kernel/events/hw_breakpoint.c |  16 +
 16 files changed, 814 insertions(+), 227 deletions(-)

-- 
2.21.1



[PATCH v4 03/16] powerpc/watchpoint: Introduce function to get nr watchpoints dynamically

2020-04-29 Thread Ravi Bangoria
So far we have had only one watchpoint, so HBP_NUM was hardcoded to 1.
But a future Power architecture is introducing a 2nd DAWR, and thus the
kernel should be able to dynamically find the actual number of
watchpoints supported by the hardware it's running on. Introduce a
function for this. Also convert the HBP_NUM macro to HBP_NUM_MAX, which
will now represent the maximum number of watchpoints supported by
powerpc.
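
The intent is that later enablement only touches this one function. For
example, once a CPU feature bit advertises the 2nd DAWR (call it
CPU_FTR_DAWR1, hypothetical at this point in the series), a sketch of
what it could become:

static inline int nr_wp_slots(void)
{
	/* CPU_FTR_DAWR1 is a hypothetical feature bit for the 2nd DAWR */
	return cpu_has_feature(CPU_FTR_DAWR1) ? 2 : 1;
}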

Signed-off-by: Ravi Bangoria 
Reviewed-by: Michael Neuling 
---
 arch/powerpc/include/asm/cputable.h  | 6 +-
 arch/powerpc/include/asm/hw_breakpoint.h | 5 +
 arch/powerpc/include/asm/processor.h | 2 +-
 arch/powerpc/kernel/hw_breakpoint.c  | 2 +-
 4 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/cputable.h 
b/arch/powerpc/include/asm/cputable.h
index 40a4d3c6fd99..c67b94f3334c 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -614,7 +614,11 @@ enum {
 };
 #endif /* __powerpc64__ */
 
-#define HBP_NUM 1
+/*
+ * Maximum number of hw breakpoints supported on powerpc. The number
+ * of breakpoints supported by the actual hw might be less than this.
+ */
+#define HBP_NUM_MAX1
 
 #endif /* !__ASSEMBLY__ */
 
diff --git a/arch/powerpc/include/asm/hw_breakpoint.h 
b/arch/powerpc/include/asm/hw_breakpoint.h
index f2f8d8aa8e3b..518b41eef924 100644
--- a/arch/powerpc/include/asm/hw_breakpoint.h
+++ b/arch/powerpc/include/asm/hw_breakpoint.h
@@ -43,6 +43,11 @@ struct arch_hw_breakpoint {
 #define DABR_MAX_LEN   8
 #define DAWR_MAX_LEN   512
 
+static inline int nr_wp_slots(void)
+{
+   return HBP_NUM_MAX;
+}
+
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 #include 
 #include 
diff --git a/arch/powerpc/include/asm/processor.h 
b/arch/powerpc/include/asm/processor.h
index bfa336fbcfeb..a71bdd6bc284 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -180,7 +180,7 @@ struct thread_struct {
int fpexc_mode; /* floating-point exception mode */
unsigned intalign_ctl;  /* alignment handling control */
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
-   struct perf_event *ptrace_bps[HBP_NUM];
+   struct perf_event *ptrace_bps[HBP_NUM_MAX];
/*
 * Helps identify source of single-step exception and subsequent
 * hw-breakpoint enablement
diff --git a/arch/powerpc/kernel/hw_breakpoint.c 
b/arch/powerpc/kernel/hw_breakpoint.c
index 72f461bd70fb..4120349e2abe 100644
--- a/arch/powerpc/kernel/hw_breakpoint.c
+++ b/arch/powerpc/kernel/hw_breakpoint.c
@@ -38,7 +38,7 @@ static DEFINE_PER_CPU(struct perf_event *, bp_per_reg);
 int hw_breakpoint_slots(int type)
 {
if (type == TYPE_DATA)
-   return HBP_NUM;
+   return nr_wp_slots();
return 0;   /* no instruction breakpoints available */
 }
 
-- 
2.21.1



[PATCH v4 02/16] powerpc/watchpoint: Add SPRN macros for second DAWR

2020-04-29 Thread Ravi Bangoria
A future Power architecture is introducing a second DAWR. Add SPRN_
macros for it.

Signed-off-by: Ravi Bangoria 
Reviewed-by: Michael Neuling 
---
 arch/powerpc/include/asm/reg.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 156ee89fa9be..062e74cf41fd 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -284,6 +284,7 @@
 #define   CTRL_TE  0x00c0  /* thread enable */
 #define   CTRL_RUNLATCH0x1
 #define SPRN_DAWR0 0xB4
+#define SPRN_DAWR1 0xB5
 #define SPRN_RPR   0xBA/* Relative Priority Register */
 #define SPRN_CIABR 0xBB
 #define   CIABR_PRIV   0x3
@@ -291,6 +292,7 @@
 #define   CIABR_PRIV_SUPER 2
 #define   CIABR_PRIV_HYPER 3
 #define SPRN_DAWRX00xBC
+#define SPRN_DAWRX10xBD
 #define   DAWRX_USER   __MASK(0)
 #define   DAWRX_KERNEL __MASK(1)
 #define   DAWRX_HYP__MASK(2)
-- 
2.21.1



[PATCH v4 04/16] powerpc/watchpoint/ptrace: Return actual num of available watchpoints

2020-04-29 Thread Ravi Bangoria
Userspace can query the number of available watchpoints
(dbginfo.num_data_bps) using ptrace(PPC_PTRACE_GETHWDBGINFO). Return
the actual number of watchpoints available on the machine rather than
a hardcoded 1.
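
From the user side, a debugger would then see the real count. A minimal
sketch (assuming the glibc ptrace() wrapper and the struct
ppc_debug_info / PPC_PTRACE_GETHWDBGINFO definitions from
<asm/ptrace.h>; header interplay may vary):

#include <stdio.h>
#include <sys/types.h>
#include <sys/ptrace.h>
#include <asm/ptrace.h> /* struct ppc_debug_info, PPC_PTRACE_GETHWDBGINFO */

/* Ask the kernel how many data watchpoints are available for a tracee. */
static int query_num_watchpoints(pid_t pid)
{
	struct ppc_debug_info info;

	if (ptrace(PPC_PTRACE_GETHWDBGINFO, pid, NULL, &info) < 0) {
		perror("PPC_PTRACE_GETHWDBGINFO");
		return -1;
	}
	printf("data watchpoints available: %u\n", info.num_data_bps);
	return (int)info.num_data_bps;
}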

Signed-off-by: Ravi Bangoria 
Reviewed-by: Michael Neuling 
---
 arch/powerpc/kernel/ptrace/ptrace-noadv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/ptrace/ptrace-noadv.c 
b/arch/powerpc/kernel/ptrace/ptrace-noadv.c
index f87e7c5c3bf3..12962302d6a4 100644
--- a/arch/powerpc/kernel/ptrace/ptrace-noadv.c
+++ b/arch/powerpc/kernel/ptrace/ptrace-noadv.c
@@ -44,7 +44,7 @@ void ppc_gethwdinfo(struct ppc_debug_info *dbginfo)
dbginfo->version = 1;
dbginfo->num_instruction_bps = 0;
if (ppc_breakpoint_available())
-   dbginfo->num_data_bps = 1;
+   dbginfo->num_data_bps = nr_wp_slots();
else
dbginfo->num_data_bps = 0;
dbginfo->num_condition_regs = 0;
-- 
2.21.1



Re: [PATCH v2 2/3] powerpc/numa: Prefer node id queried from vphn

2020-04-29 Thread Srikar Dronamraju
* Gautham R Shenoy  [2020-04-29 12:22:29]:

> Hello Srikar,
> 
> 
> > +   if (nid == NUMA_NO_NODE) {
> > +   cpu = of_get_cpu_node(i, NULL);
> > +   if (cpu) {
> 
> Why are we not retaining the BUG_ON(!cpu) assert here ?
> 
> > +   nid = of_node_to_nid_single(cpu);
> > +   of_node_put(cpu);
> > +   }
> > +   }
> 
> Is it possible at this point that both vphn_get_nid(i) and
> of_node_to_nid_single(cpu) returns NUMA_NO_NODE ? If so,
> should we still call node_set_online() below ?

Yeah, I think it makes sense to retain the BUG_ON and the if check.

Will incorporate both of them in the next version.
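
Roughly, the next version would look something like this (sketch only,
untested):

	if (nid == NUMA_NO_NODE) {
		cpu = of_get_cpu_node(i, NULL);
		BUG_ON(!cpu);
		nid = of_node_to_nid_single(cpu);
		of_node_put(cpu);
	}
	if (nid != NUMA_NO_NODE)
		node_set_online(nid);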

> 
> 
> > node_set_online(nid);
> > }
> > 
> > -- 
> > 2.20.1
> > 
> --
> Thanks and Regards
> gautham.

-- 
Thanks and Regards
Srikar Dronamraju


[PATCH v4 05/16] powerpc/watchpoint: Provide DAWR number to set_dawr

2020-04-29 Thread Ravi Bangoria
Introduce a new parameter 'nr' to set_dawr() which indicates which
DAWR should be programmed.

Signed-off-by: Ravi Bangoria 
Reviewed-by: Michael Neuling 
---
 arch/powerpc/include/asm/hw_breakpoint.h |  4 ++--
 arch/powerpc/kernel/dawr.c   | 15 ++-
 arch/powerpc/kernel/process.c|  2 +-
 3 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/include/asm/hw_breakpoint.h 
b/arch/powerpc/include/asm/hw_breakpoint.h
index 518b41eef924..5b3b02834e0b 100644
--- a/arch/powerpc/include/asm/hw_breakpoint.h
+++ b/arch/powerpc/include/asm/hw_breakpoint.h
@@ -104,10 +104,10 @@ static inline bool dawr_enabled(void)
 {
return dawr_force_enable;
 }
-int set_dawr(struct arch_hw_breakpoint *brk);
+int set_dawr(int nr, struct arch_hw_breakpoint *brk);
 #else
 static inline bool dawr_enabled(void) { return false; }
-static inline int set_dawr(struct arch_hw_breakpoint *brk) { return -1; }
+static inline int set_dawr(int nr, struct arch_hw_breakpoint *brk) { return 
-1; }
 #endif
 
 #endif /* __KERNEL__ */
diff --git a/arch/powerpc/kernel/dawr.c b/arch/powerpc/kernel/dawr.c
index e91b613bf137..8114ad3a8574 100644
--- a/arch/powerpc/kernel/dawr.c
+++ b/arch/powerpc/kernel/dawr.c
@@ -16,7 +16,7 @@
 bool dawr_force_enable;
 EXPORT_SYMBOL_GPL(dawr_force_enable);
 
-int set_dawr(struct arch_hw_breakpoint *brk)
+int set_dawr(int nr, struct arch_hw_breakpoint *brk)
 {
unsigned long dawr, dawrx, mrd;
 
@@ -39,15 +39,20 @@ int set_dawr(struct arch_hw_breakpoint *brk)
if (ppc_md.set_dawr)
return ppc_md.set_dawr(dawr, dawrx);
 
-   mtspr(SPRN_DAWR0, dawr);
-   mtspr(SPRN_DAWRX0, dawrx);
+   if (nr == 0) {
+   mtspr(SPRN_DAWR0, dawr);
+   mtspr(SPRN_DAWRX0, dawrx);
+   } else {
+   mtspr(SPRN_DAWR1, dawr);
+   mtspr(SPRN_DAWRX1, dawrx);
+   }
 
return 0;
 }
 
 static void set_dawr_cb(void *info)
 {
-   set_dawr(info);
+   set_dawr(0, info);
 }
 
 static ssize_t dawr_write_file_bool(struct file *file,
@@ -60,7 +65,7 @@ static ssize_t dawr_write_file_bool(struct file *file,
	/* Send error to user if the hypervisor won't allow us to write DAWR */
if (!dawr_force_enable &&
firmware_has_feature(FW_FEATURE_LPAR) &&
-   set_dawr(&null_brk) != H_SUCCESS)
+   set_dawr(0, &null_brk) != H_SUCCESS)
return -ENODEV;
 
rc = debugfs_write_file_bool(file, user_buf, count, ppos);
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 8479c762aef2..7488adf4d61c 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -806,7 +806,7 @@ void __set_breakpoint(struct arch_hw_breakpoint *brk)
 
if (dawr_enabled())
// Power8 or later
-   set_dawr(brk);
+   set_dawr(0, brk);
else if (IS_ENABLED(CONFIG_PPC_8xx))
set_breakpoint_8xx(brk);
else if (!cpu_has_feature(CPU_FTR_ARCH_207S))
-- 
2.21.1



[PATCH v4 06/16] powerpc/watchpoint: Provide DAWR number to __set_breakpoint

2020-04-29 Thread Ravi Bangoria
Introduce a new parameter 'nr' to __set_breakpoint() which indicates
which DAWR should be programmed. Also convert the current_brk variable
to an array.

Signed-off-by: Ravi Bangoria 
Reviewed-by: Michael Neuling 
---
 arch/powerpc/include/asm/debug.h |  2 +-
 arch/powerpc/include/asm/hw_breakpoint.h |  2 +-
 arch/powerpc/kernel/hw_breakpoint.c  |  8 
 arch/powerpc/kernel/process.c| 14 +++---
 arch/powerpc/kernel/signal.c |  2 +-
 arch/powerpc/xmon/xmon.c |  2 +-
 6 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/include/asm/debug.h b/arch/powerpc/include/asm/debug.h
index 7756026b95ca..ec57daf87f40 100644
--- a/arch/powerpc/include/asm/debug.h
+++ b/arch/powerpc/include/asm/debug.h
@@ -45,7 +45,7 @@ static inline int debugger_break_match(struct pt_regs *regs) 
{ return 0; }
 static inline int debugger_fault_handler(struct pt_regs *regs) { return 0; }
 #endif
 
-void __set_breakpoint(struct arch_hw_breakpoint *brk);
+void __set_breakpoint(int nr, struct arch_hw_breakpoint *brk);
 bool ppc_breakpoint_available(void);
 #ifdef CONFIG_PPC_ADV_DEBUG_REGS
 extern void do_send_trap(struct pt_regs *regs, unsigned long address,
diff --git a/arch/powerpc/include/asm/hw_breakpoint.h 
b/arch/powerpc/include/asm/hw_breakpoint.h
index 5b3b02834e0b..1120c7d9db58 100644
--- a/arch/powerpc/include/asm/hw_breakpoint.h
+++ b/arch/powerpc/include/asm/hw_breakpoint.h
@@ -85,7 +85,7 @@ static inline void hw_breakpoint_disable(void)
brk.len = 0;
brk.hw_len = 0;
if (ppc_breakpoint_available())
-   __set_breakpoint(&brk);
+   __set_breakpoint(0, &brk);
 }
 extern void thread_change_pc(struct task_struct *tsk, struct pt_regs *regs);
 int hw_breakpoint_handler(struct die_args *args);
diff --git a/arch/powerpc/kernel/hw_breakpoint.c 
b/arch/powerpc/kernel/hw_breakpoint.c
index 4120349e2abe..5826f1f2cab9 100644
--- a/arch/powerpc/kernel/hw_breakpoint.c
+++ b/arch/powerpc/kernel/hw_breakpoint.c
@@ -63,7 +63,7 @@ int arch_install_hw_breakpoint(struct perf_event *bp)
 * If so, DABR will be populated in single_step_dabr_instruction().
 */
if (current->thread.last_hit_ubp != bp)
-   __set_breakpoint(info);
+   __set_breakpoint(0, info);
 
return 0;
 }
@@ -221,7 +221,7 @@ void thread_change_pc(struct task_struct *tsk, struct 
pt_regs *regs)
 
info = counter_arch_bp(tsk->thread.last_hit_ubp);
regs->msr &= ~MSR_SE;
-   __set_breakpoint(info);
+   __set_breakpoint(0, info);
tsk->thread.last_hit_ubp = NULL;
 }
 
@@ -346,7 +346,7 @@ int hw_breakpoint_handler(struct die_args *args)
if (!(info->type & HW_BRK_TYPE_EXTRANEOUS_IRQ))
perf_bp_event(bp, regs);
 
-   __set_breakpoint(info);
+   __set_breakpoint(0, info);
 out:
rcu_read_unlock();
return rc;
@@ -379,7 +379,7 @@ static int single_step_dabr_instruction(struct die_args 
*args)
if (!(info->type & HW_BRK_TYPE_EXTRANEOUS_IRQ))
perf_bp_event(bp, regs);
 
-   __set_breakpoint(info);
+   __set_breakpoint(0, info);
current->thread.last_hit_ubp = NULL;
 
/*
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 7488adf4d61c..351fbd8d2c5b 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -637,7 +637,7 @@ void do_break (struct pt_regs *regs, unsigned long address,
 }
 #endif /* CONFIG_PPC_ADV_DEBUG_REGS */
 
-static DEFINE_PER_CPU(struct arch_hw_breakpoint, current_brk);
+static DEFINE_PER_CPU(struct arch_hw_breakpoint, current_brk[HBP_NUM_MAX]);
 
 #ifdef CONFIG_PPC_ADV_DEBUG_REGS
 /*
@@ -714,7 +714,7 @@ EXPORT_SYMBOL_GPL(switch_booke_debug_regs);
 static void set_breakpoint(struct arch_hw_breakpoint *brk)
 {
preempt_disable();
-   __set_breakpoint(brk);
+   __set_breakpoint(0, brk);
preempt_enable();
 }
 
@@ -800,13 +800,13 @@ static inline int set_breakpoint_8xx(struct 
arch_hw_breakpoint *brk)
return 0;
 }
 
-void __set_breakpoint(struct arch_hw_breakpoint *brk)
+void __set_breakpoint(int nr, struct arch_hw_breakpoint *brk)
 {
-   memcpy(this_cpu_ptr(&current_brk), brk, sizeof(*brk));
+   memcpy(this_cpu_ptr(&current_brk[nr]), brk, sizeof(*brk));
 
if (dawr_enabled())
// Power8 or later
-   set_dawr(0, brk);
+   set_dawr(nr, brk);
else if (IS_ENABLED(CONFIG_PPC_8xx))
set_breakpoint_8xx(brk);
else if (!cpu_has_feature(CPU_FTR_ARCH_207S))
@@ -1174,8 +1174,8 @@ struct task_struct *__switch_to(struct task_struct *prev,
  * schedule DABR
  */
 #ifndef CONFIG_HAVE_HW_BREAKPOINT
-   if (unlikely(!hw_brk_match(this_cpu_ptr(&current_brk), 
&new->thread.hw_brk)))
-   __set_breakpoint(&new->thread.hw_brk);
+   if (unlikely(!hw_brk_match(this_cpu_ptr(&current_brk[0]), 
&new->thread.hw_brk)))
+   __set_breakpoint(0, &new->thread.hw_brk);

[PATCH v4 07/16] powerpc/watchpoint: Get watchpoint count dynamically while disabling them

2020-04-29 Thread Ravi Bangoria
Instead of disabling only one watchpoint, get the number of available
watchpoints dynamically and disable all of them.

Signed-off-by: Ravi Bangoria 
Reviewed-by: Michael Neuling 
---
 arch/powerpc/include/asm/hw_breakpoint.h | 16 
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/include/asm/hw_breakpoint.h 
b/arch/powerpc/include/asm/hw_breakpoint.h
index 1120c7d9db58..d472b2eb757e 100644
--- a/arch/powerpc/include/asm/hw_breakpoint.h
+++ b/arch/powerpc/include/asm/hw_breakpoint.h
@@ -78,14 +78,14 @@ extern void ptrace_triggered(struct perf_event *bp,
struct perf_sample_data *data, struct pt_regs *regs);
 static inline void hw_breakpoint_disable(void)
 {
-   struct arch_hw_breakpoint brk;
-
-   brk.address = 0;
-   brk.type = 0;
-   brk.len = 0;
-   brk.hw_len = 0;
-   if (ppc_breakpoint_available())
-   __set_breakpoint(0, &brk);
+   int i;
+   struct arch_hw_breakpoint null_brk = {0};
+
+   if (!ppc_breakpoint_available())
+   return;
+
+   for (i = 0; i < nr_wp_slots(); i++)
+   __set_breakpoint(i, &null_brk);
 }
 extern void thread_change_pc(struct task_struct *tsk, struct pt_regs *regs);
 int hw_breakpoint_handler(struct die_args *args);
-- 
2.21.1


