[PATCH] powerpc/8xx: Load modules closer to kernel text

2021-03-29 Thread Christophe Leroy
On the 8xx, TASK_SIZE is 0x80000000. The space between TASK_SIZE and
PAGE_OFFSET is not used.

Use it to load modules, in order to minimise the distance between
kernel text and modules and to avoid trampolines in modules when calling
kernel functions or other module functions.
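
For context: a REL24 relocation encodes a signed 26-bit displacement, so a
direct branch can only reach about +/- 32 MB from the call site. A minimal,
hypothetical sketch of the reach check that decides whether a PLT trampoline
is needed (illustration only, not code from module_32.c):

#include <stdbool.h>
#include <stdint.h>

/* A REL24 branch target must lie within a signed 26-bit displacement. */
static bool rel24_in_range(uint32_t location, uint32_t target)
{
	int32_t delta = (int32_t)(target - location);

	return delta >= -0x02000000 && delta < 0x02000000;
}

With modules around 0xcae00000 and kernel text near 0xc0067000, the
displacement overflows and a trampoline is required; with MODULES_VADDR at
PAGE_OFFSET - SZ_16M, the kernel is always within reach.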

Define a 16 Mbyte area for modules; that's more than enough.

DEBUG logs in module_32.c without the patch:

[ 1572.588822] module_32: Applying ADD relocate section 13 to 12
[ 1572.588891] module_32: Doing plt for call to 0xc00671a4 at 0xcae04024
[ 1572.588964] module_32: Initialized plt for 0xc00671a4 at cae04000
[ 1572.589037] module_32: REL24 value = CAE04000. location = CAE04024
[ 1572.589110] module_32: Location before: 4801.
[ 1572.589171] module_32: Location after: 4BDD.
[ 1572.589231] module_32: ie. jump to 03DC+CAE04024 = CEE04000
[ 1572.589317] module_32: Applying ADD relocate section 15 to 14
[ 1572.589386] module_32: Doing plt for call to 0xc00671a4 at 0xcadfc018
[ 1572.589457] module_32: Initialized plt for 0xc00671a4 at cadfc000
[ 1572.589529] module_32: REL24 value = CADFC000. location = CADFC018
[ 1572.589601] module_32: Location before: 4800.
[ 1572.589661] module_32: Location after: 4BE8.
[ 1572.589723] module_32: ie. jump to 03E8+CADFC018 = CEDFC000

With the patch:

[  279.404671] module_32: Applying ADD relocate section 13 to 12
[  279.404741] module_32: REL24 value = C00671B4. location = BF808024
[  279.404814] module_32: Location before: 4801.
[  279.404874] module_32: Location after: 4885F191.
[  279.404933] module_32: ie. jump to 0085F190+BF808024 = C00671B4
[  279.405016] module_32: Applying ADD relocate section 15 to 14
[  279.405085] module_32: REL24 value = C00671B4. location = BF800018
[  279.405156] module_32: Location before: 4800.
[  279.405215] module_32: Location after: 4886719C.
[  279.405275] module_32: ie. jump to 0086719C+BF800018 = C00671B4

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/nohash/32/mmu-8xx.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h 
b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h
index 478249959baa..7902a42d6d3e 100644
--- a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h
+++ b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h
@@ -172,6 +172,9 @@
 
 #define mmu_linear_psize   MMU_PAGE_8M
 
+#define MODULES_VADDR  (PAGE_OFFSET - SZ_16M)
+#define MODULES_END    PAGE_OFFSET
+
 #ifndef __ASSEMBLY__
 
 #include 
-- 
2.25.0



Re: [PATCH] powerpc/64s: power4 nap fixup in C

2021-03-29 Thread Benjamin Herrenschmidt
On Fri, 2021-03-12 at 11:20 +1000, Nicholas Piggin wrote:
> 
> +static inline void nap_adjust_return(struct pt_regs *regs)
> 
> +{
> 
> +#ifdef CONFIG_PPC_970_NAP
> 
> +   if (unlikely(test_thread_local_flags(_TLF_NAPPING))) {
> +   /* Can avoid a test-and-clear because NMIs do not call this */
> +   clear_thread_local_flags(_TLF_NAPPING);
> +   regs->nip = (unsigned long)power4_idle_nap_return;
> +   }

Is this a pointer to a function descriptor or the actual code ?

Cheers,
Ben.

> +#endif
> 
> +}
> 
> +
> 
>  struct interrupt_state {
> 
>  #ifdef CONFIG_PPC_BOOK3E_64
> 
> enum ctx_state ctx_state;
> 
> @@ -111,6 +122,9 @@ static inline void interrupt_async_exit_prepare(struct 
> pt_regs *regs, struct int
> 
>  {
> 
> irq_exit();
> 
> interrupt_exit_prepare(regs, state);
> 
> +
> 
> +   /* Adjust at exit so the main handler sees the true NIA */
> 
> +   nap_adjust_return(regs);
> 
>  }
> 
>  
> 
>  struct interrupt_nmi_state {
> 
> @@ -164,6 +178,11 @@ static inline void interrupt_nmi_exit_prepare(struct 
> pt_regs *regs, struct inter
> 
> radix_enabled() || (mfmsr() & MSR_DR))
> 
> nmi_exit();
> 
>  
> 
> +   /*
> 
> +* nmi does not call nap_adjust_return because nmi should not create
> 
> +* new work to do (must use irq_work for that).
> 
> +*/
> 
> +
> 
>  #ifdef CONFIG_PPC64
> 
> if (TRAP(regs) != 0x900 && TRAP(regs) != 0xf00 && TRAP(regs) != 0x260)
> 
> this_cpu_set_ftrace_enabled(state->ftrace_enabled);
> 
> diff --git a/arch/powerpc/include/asm/processor.h 
> b/arch/powerpc/include/asm/processor.h
> 
> index 8acc3590c971..eedc3c775141 100644
> 
> --- a/arch/powerpc/include/asm/processor.h
> 
> +++ b/arch/powerpc/include/asm/processor.h
> 
> @@ -393,6 +393,7 @@ extern unsigned long isa300_idle_stop_mayloss(unsigned 
> long psscr_val);
> 
>  extern unsigned long isa206_idle_insn_mayloss(unsigned long type);
> 
>  #ifdef CONFIG_PPC_970_NAP
> 
>  extern void power4_idle_nap(void);
> 
> +void power4_idle_nap_return(void);
> 
>  #endif
> 
>  
> 
>  extern unsigned long cpuidle_disable;
> 
> diff --git a/arch/powerpc/include/asm/thread_info.h 
> b/arch/powerpc/include/asm/thread_info.h
> 
> index 386d576673a1..bf137151100b 100644
> 
> --- a/arch/powerpc/include/asm/thread_info.h
> 
> +++ b/arch/powerpc/include/asm/thread_info.h
> 
> @@ -152,6 +152,12 @@ void arch_setup_new_exec(void);
> 
>  
> 
>  #ifndef __ASSEMBLY__
> 
>  
> 
> +static inline void clear_thread_local_flags(unsigned int flags)
> 
> +{
> 
> +   struct thread_info *ti = current_thread_info();
> 
> +   ti->local_flags &= ~flags;
> 
> +}
> 
> +
> 
>  static inline bool test_thread_local_flags(unsigned int flags)
> 
>  {
> 
> struct thread_info *ti = current_thread_info();
> 
> diff --git a/arch/powerpc/kernel/exceptions-64s.S 
> b/arch/powerpc/kernel/exceptions-64s.S
> 
> index 60d3051a8bc8..ea7a443488d2 100644
> 
> --- a/arch/powerpc/kernel/exceptions-64s.S
> 
> +++ b/arch/powerpc/kernel/exceptions-64s.S
> 
> @@ -692,25 +692,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
> 
> ld  r1,GPR1(r1)
> 
>  .endm
> 
>  
> 
> -/*
> 
> - * When the idle code in power4_idle puts the CPU into NAP mode,
> 
> - * it has to do so in a loop, and relies on the external interrupt
> 
> - * and decrementer interrupt entry code to get it out of the loop.
> 
> - * It sets the _TLF_NAPPING bit in current_thread_info()->local_flags
> 
> - * to signal that it is in the loop and needs help to get out.
> 
> - */
> 
> -#ifdef CONFIG_PPC_970_NAP
> 
> -#define FINISH_NAP \
> 
> -BEGIN_FTR_SECTION  \
> 
> -   ld  r11, PACA_THREAD_INFO(r13); \
> 
> -   ld  r9,TI_LOCAL_FLAGS(r11); \
> 
> -   andi.   r10,r9,_TLF_NAPPING;\
> 
> -   bnelpower4_fixup_nap;   \
> 
> -END_FTR_SECTION_IFSET(CPU_FTR_CAN_NAP)
> 
> -#else
> 
> -#define FINISH_NAP
> 
> -#endif
> 
> -
> 
>  /*
> 
>   * There are a few constraints to be concerned with.
> 
>   * - Real mode exceptions code/data must be located at their physical 
> location.
> 
> @@ -1248,7 +1229,6 @@ EXC_COMMON_BEGIN(machine_check_common)
> 
>  */
> 
> GEN_COMMON machine_check
> 
>  
> 
> -   FINISH_NAP
> 
> /* Enable MSR_RI when finished with PACA_EXMC */
> 
> li  r10,MSR_RI
> 
> mtmsrd  r10,1
> 
> @@ -1571,7 +1551,6 @@ EXC_VIRT_BEGIN(hardware_interrupt, 0x4500, 0x100)
> 
>  EXC_VIRT_END(hardware_interrupt, 0x4500, 0x100)
> 
>  EXC_COMMON_BEGIN(hardware_interrupt_common)
> 
> GEN_COMMON hardware_interrupt
> 
> -   FINISH_NAP
> 
> addir3,r1,STACK_FRAME_OVERHEAD
> 
> bl  do_IRQ
> 
> b   interrupt_return
> 
> @@ -1801,7 +1780,6 @@ EXC_VIRT_BEGIN(decrementer, 0x4900, 0x80)
> 
>  EXC_VIRT_END(decrementer, 0x4900, 0x80)
> 
>  EXC_COMMON_BEGIN(dec

[PATCH] PCI: Try to find two continuous regions for child resource

2021-03-29 Thread Kai-Heng Feng
Built-in grahpics on HP EliteDesk 805 G6 doesn't work because graphics
can't get the BAR it needs:
[0.611504] pci_bus :00: root bus resource [mem 
0x1002020-0x100303f window]
[0.611505] pci_bus :00: root bus resource [mem 
0x1003040-0x100401f window]
...
[0.638083] pci :00:08.1:   bridge window [mem 0xd200-0xd23f]
[0.638086] pci :00:08.1:   bridge window [mem 
0x1003000-0x100401f 64bit pref]
[0.962086] pci :00:08.1: can't claim BAR 15 [mem 
0x1003000-0x100401f 64bit pref]: no compatible bridge window
[0.962086] pci :00:08.1: [mem 0x1003000-0x100401f 64bit pref] 
clipped to [mem 0x1003000-0x100303f 64bit pref]
[0.962086] pci :00:08.1:   bridge window [mem 
0x1003000-0x100303f 64bit pref]
[0.962086] pci :07:00.0: can't claim BAR 0 [mem 
0x1003000-0x1003fff 64bit pref]: no compatible bridge window
[0.962086] pci :07:00.0: can't claim BAR 2 [mem 
0x1004000-0x100401f 64bit pref]: no compatible bridge window

However, the root bus has two continuous regions that can contain the
child resource requested.

So try to find another parent region if two regions are continuous and
can contain child resource. This change makes the grahpics works on the
system in question.

Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=212013
Signed-off-by: Kai-Heng Feng 
---
 arch/microblaze/pci/pci-common.c |  4 +--
 arch/powerpc/kernel/pci-common.c |  8 ++---
 arch/sparc/kernel/pci.c  |  4 +--
 drivers/pci/pci.c| 60 +++-
 drivers/pci/setup-res.c  | 21 +++
 drivers/pcmcia/rsrc_nonstatic.c  |  4 +--
 include/linux/pci.h  |  6 ++--
 7 files changed, 80 insertions(+), 27 deletions(-)

diff --git a/arch/microblaze/pci/pci-common.c b/arch/microblaze/pci/pci-common.c
index 557585f1be41..8e65832fb510 100644
--- a/arch/microblaze/pci/pci-common.c
+++ b/arch/microblaze/pci/pci-common.c
@@ -669,7 +669,7 @@ static void pcibios_allocate_bus_resources(struct pci_bus 
*bus)
 {
struct pci_bus *b;
int i;
-   struct resource *res, *pr;
+   struct resource *res, *pr = NULL;
 
pr_debug("PCI: Allocating bus resources for %04x:%02x...\n",
 pci_domain_nr(bus), bus->number);
@@ -688,7 +688,7 @@ static void pcibios_allocate_bus_resources(struct pci_bus 
*bus)
 * and as such ensure proper re-allocation
 * later.
 */
-   pr = pci_find_parent_resource(bus->self, res);
+   pci_find_parent_resource(bus->self, res, &pr, NULL);
if (pr == res) {
/* this happens when the generic PCI
 * code (wrongly) decides that this
diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index 001e90cd8948..f865354b746d 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -1196,7 +1196,7 @@ static void pcibios_allocate_bus_resources(struct pci_bus 
*bus)
 {
struct pci_bus *b;
int i;
-   struct resource *res, *pr;
+   struct resource *res, *pr = NULL;
 
pr_debug("PCI: Allocating bus resources for %04x:%02x...\n",
 pci_domain_nr(bus), bus->number);
@@ -1213,7 +1213,7 @@ static void pcibios_allocate_bus_resources(struct pci_bus 
*bus)
pr = (res->flags & IORESOURCE_IO) ?
&ioport_resource : &iomem_resource;
else {
-   pr = pci_find_parent_resource(bus->self, res);
+   pci_find_parent_resource(bus->self, res, &pr, NULL);
if (pr == res) {
/* this happens when the generic PCI
 * code (wrongly) decides that this
@@ -1265,12 +1265,12 @@ static void pcibios_allocate_bus_resources(struct 
pci_bus *bus)
 
 static inline void alloc_resource(struct pci_dev *dev, int idx)
 {
-   struct resource *pr, *r = &dev->resource[idx];
+   struct resource *pr = NULL, *r = &dev->resource[idx];
 
pr_debug("PCI: Allocating %s: Resource %d: %pR\n",
 pci_name(dev), idx, r);
 
-   pr = pci_find_parent_resource(dev, r);
+   pci_find_parent_resource(dev, r, &pr, NULL);
if (!pr || (pr->flags & IORESOURCE_UNSET) ||
request_resource(pr, r) < 0) {
printk(KERN_WARNING "PCI: Cannot allocate resource region %d"
diff --git a/arch/sparc/kernel/pci.c b/arch/sparc/kernel/pci.c
index 9c2b720bfd20..b4006798e4e1 100644
--- a/arch/sparc/kernel/pci.c
+++ b/arch/sparc/kernel/pci.c
@@ -621,7 +621,7 @@ static void pci_bus_register_of_sysfs(struct pci_bus *bus)
 static void pci_claim_legacy_resources(struct pci_dev *dev)
 {
struct pci_bus_region region;
-   struct resou

Re: [PATCH] powerpc/64s: power4 nap fixup in C

2021-03-29 Thread Christophe Leroy




On 29/03/2021 at 10:33, Benjamin Herrenschmidt wrote:

On Fri, 2021-03-12 at 11:20 +1000, Nicholas Piggin wrote:


+static inline void nap_adjust_return(struct pt_regs *regs)

+{

+#ifdef CONFIG_PPC_970_NAP

+   if (unlikely(test_thread_local_flags(_TLF_NAPPING))) {
+   /* Can avoid a test-and-clear because NMIs do not call this */
+   clear_thread_local_flags(_TLF_NAPPING);
+   regs->nip = (unsigned long)power4_idle_nap_return;
+   }


Is this a pointer to a function descriptor or the actual code ?



--- a/arch/powerpc/kernel/idle_book3s.S
+++ b/arch/powerpc/kernel/idle_book3s.S
@@ -209,4 +209,8 @@ _GLOBAL(power4_idle_nap)
mtmsrd  r7
isync
b   1b
+
+   .globl power4_idle_nap_return
+power4_idle_nap_return:
+   blr
 #endif



Re: [PATCH] powerpc/vdso: Separate vvar vma from vdso

2021-03-29 Thread Laurent Dufour

Hi Christophe and Dmitry,

On 27/03/2021 at 18:43, Dmitry Safonov wrote:

Hi Christophe,

On 3/27/21 5:19 PM, Christophe Leroy wrote:
[..]

I opportunistically Cc stable on it: I understand that usually such
stuff isn't a stable material, but that will allow us in CRIU have
one workaround less that is needed just for one release (v5.11) on
one platform (ppc64), which we otherwise have to maintain.


Why is that a workaround, and why for one release only ? I think the
solution proposed by Laurent to use the aux vector AT_SYSINFO_EHDR should
work with any past and future release.


Yeah, I guess.
Previously, (before v5.11/power) all kernels had ELF start at "[vdso]"
VMA start, now we'll have to carry the offset in the VMA. Probably, not
the worst thing, but as it will be only for v5.11 release it can break,
so needs separate testing.
Kinda life was a bit easier without this additional code.
The assumption that the ELF header is at the start of "[vdso]" is perhaps not a good 
one, but using a "[vvar]" section looks more conventional and makes it possible to clearly 
identify the data part. I'd argue for this option.





I wouldn't go as far as to say that the commit 511157ab641e is ABI
regression as no other userspace got broken, but I'd really appreciate
if it gets backported to v5.11 after v5.12 is released, so as not
to complicate already non-simple CRIU-vdso code. Thanks!

Cc: Andrei Vagin 
Cc: Andy Lutomirski 
Cc: Benjamin Herrenschmidt 
Cc: Christophe Leroy 
Cc: Laurent Dufour 
Cc: Michael Ellerman 
Cc: Paul Mackerras 
Cc: linuxppc-dev@lists.ozlabs.org
Cc: sta...@vger.kernel.org # v5.11
[1]: https://github.com/checkpoint-restore/criu/issues/1417
Signed-off-by: Dmitry Safonov 
Tested-by: Christophe Leroy 


I tested it with the sigreturn_vdso selftest and it worked, because that
selftest doesn't involve VDSO data.


Thanks again on helping with testing it, I appreciate it!


But if I do a mremap() on the VDSO text vma without remapping VVAR to
keep the same distance between the two vmas, gettimeofday() crashes. The
reason is that the code obtains the address of the data by calculating a
fix difference from its own address with the below macro, the delta
being resolved at link time:

.macro get_datapage ptr
 bcl    20, 31, .+4
999:
 mflr    \ptr
#if CONFIG_PPC_PAGE_SHIFT > 14
 addis    \ptr, \ptr, (_vdso_datapage - 999b)@ha
#endif
 addi    \ptr, \ptr, (_vdso_datapage - 999b)@l
.endm

So the datapage needs to remain at the same distance from the code at
all times.

I wonder how the other architectures manage to have two independent VMAs
and still be able to move one independently of the other.


It's alright as far as I know. If userspace remaps vdso/vvar it should
be aware of this (CRIU keeps this in mind, also old vdso image is dumped
to compare on restore with the one that the host has).


I do agree: playing with the VDSO mapping requires the application to be aware of 
the mapping details, and prior to 83d3f0e90c6c ("powerpc/mm: tracking vDSO 
remap"), remapping the VDSO was not working on PowerPC and nobody complained...


Laurent.



Re: [PATCH v2 3/7] powerpc: convert config files to generic cmdline

2021-03-29 Thread Will Deacon
On Thu, Mar 25, 2021 at 12:59:56PM -0700, Daniel Walker wrote:
> On Thu, Mar 25, 2021 at 01:03:55PM +0100, Christophe Leroy wrote:
> > 
> > Ok, so you agree we don't need to provide two CMDLINE, one to be appended 
> > and one to be prepended.
> > 
> > Let's only provide one CMDLINE as of today, and ask the user to select
> > whether he wants it appended or prepended or replaced. Then no need to
> > change all existing configs to rename CONFIG_CMDLINE into either of the new
> > ones.
> > 
> > That's the main difference between my series and Daniel's series. So I'll
> > finish taking Will's comment into account and we'll send out a v3 soon.
> 
> It doesn't solve the needs of Cisco, I've stated many times your changes have
> little value. Please stop submitting them.

FWIW, they're useful for arm64 and I will gladly review the updated series.

I don't think asking people to stop submitting patches is ever the right
answer. Please don't do that.

Will


[PATCH] powerpc/papr_scm: Mark nvdimm as unarmed if needed during probe

2021-03-29 Thread Vaibhav Jain
If an nvdimm is found to be unarmed during probe, set its
NDD_UNARMED flag before nvdimm_create(). This enforces read-only
access to the nvdimm region. Presently, even if an nvdimm is
unarmed, it is not marked as read-only on ppc64 guests.

The patch updates papr_scm_nvdimm_init() to force a query of nvdimm
health via __drc_pmem_query_health(), and if the nvdimm is found to be
unarmed, to set the nvdimm flag NDD_UNARMED for nvdimm_create().

Signed-off-by: Vaibhav Jain 
---
 arch/powerpc/platforms/pseries/papr_scm.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/arch/powerpc/platforms/pseries/papr_scm.c 
b/arch/powerpc/platforms/pseries/papr_scm.c
index 835163f54244..7e8168e19427 100644
--- a/arch/powerpc/platforms/pseries/papr_scm.c
+++ b/arch/powerpc/platforms/pseries/papr_scm.c
@@ -914,6 +914,15 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
dimm_flags = 0;
set_bit(NDD_LABELING, &dimm_flags);
 
+   /*
+* Check if the nvdimm is unarmed. No locking needed as we are still
+* initializing. Ignore error encountered if any.
+*/
+   __drc_pmem_query_health(p);
+
+   if (p->health_bitmap & PAPR_PMEM_UNARMED_MASK)
+   set_bit(NDD_UNARMED, &dimm_flags);
+
p->nvdimm = nvdimm_create(p->bus, p, papr_nd_attr_groups,
  dimm_flags, PAPR_SCM_DIMM_CMD_MASK, 0, NULL);
if (!p->nvdimm) {
-- 
2.30.2



Re: [PATCH 00/30] DMA: Mundane typo fixes

2021-03-29 Thread Greg KH
On Mon, Mar 29, 2021 at 11:25:11AM +0530, Bhaskar Chowdhury wrote:
> On 07:29 Mon 29 Mar 2021, Christoph Hellwig wrote:
> > I really don't think these typo patchbomb are that useful.  I'm all
> > for fixing typos when working with a subsystem, but I'm not sure these
> > patchbombs help anything.
> > 
> I am sure you are holding the wrong end of the wand and grossly failing to
> understand.

Please stop statements like this, it is not helpful and is doing nothing
but ensure that your patches will not be looked at in the future.

> Anyway, I hope I give a heads up ...find "your way" to fix those damn
> thing...it's glaring

There is no requirement that anyone accept patches that are sent to
them.  When you complain when receiving comments on them, that
shows you do not wish to work with others.

Sorry, but you are now on my local blacklist for a while, and I
encourage other maintainers to just ignore these patches as well.

thanks,

greg k-h


Re: [PATCH] powerpc/64s: power4 nap fixup in C

2021-03-29 Thread Andreas Schwab
On Mär 29 2021, Christophe Leroy wrote:

> On 29/03/2021 at 10:33, Benjamin Herrenschmidt wrote:
>> On Fri, 2021-03-12 at 11:20 +1000, Nicholas Piggin wrote:
>>>
>>> +static inline void nap_adjust_return(struct pt_regs *regs)
>>>
>>> +{
>>>
>>> +#ifdef CONFIG_PPC_970_NAP
>>>
>>> +   if (unlikely(test_thread_local_flags(_TLF_NAPPING))) {
>>> +   /* Can avoid a test-and-clear because NMIs do not call this 
>>> */
>>> +   clear_thread_local_flags(_TLF_NAPPING);
>>> +   regs->nip = (unsigned long)power4_idle_nap_return;
>>> +   }
>> Is this a pointer to a function descriptor or the actual code ?
>> 
>
> --- a/arch/powerpc/kernel/idle_book3s.S
> +++ b/arch/powerpc/kernel/idle_book3s.S
> @@ -209,4 +209,8 @@ _GLOBAL(power4_idle_nap)
>   mtmsrd  r7
>   isync
>   b   1b
> +
> + .globl power4_idle_nap_return
> +power4_idle_nap_return:
> + blr
>  #endif

The problem is not the definition, it is the reference.  In C, a
function symbol always resolves to the address of the descriptor.
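
For reference, a rough sketch of what an ELFv1 (big-endian ppc64) function
descriptor contains; this only illustrates the concept under discussion, it
is not code from the patch and the names are made up:

struct func_descriptor {
	unsigned long entry;	/* address of the first instruction */
	unsigned long toc;	/* TOC (r2) value for the function */
	unsigned long env;	/* environment pointer, unused by C */
};

Whether a reference like &power4_idle_nap_return yields the address of such
a descriptor or the code entry point is exactly the question being discussed.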

Andreas.

-- 
Andreas Schwab, sch...@linux-m68k.org
GPG Key fingerprint = 7578 EB47 D4E5 4D69 2510  2552 DF73 E780 A9DA AEC1
"And now for something completely different."


Re: [PATCH 00/30] DMA: Mundane typo fixes

2021-03-29 Thread Bhaskar Chowdhury

On 13:48 Mon 29 Mar 2021, Greg KH wrote:

On Mon, Mar 29, 2021 at 11:25:11AM +0530, Bhaskar Chowdhury wrote:

On 07:29 Mon 29 Mar 2021, Christoph Hellwig wrote:
> I really don't think these typo patchbomb are that useful.  I'm all
> for fixing typos when working with a subsystem, but I'm not sure these
> patchbombs help anything.
>
I am sure you are holding the wrong end of the wand and grossly failing to
understand.


Please stop statements like this, it is not helpful and is doing nothing
but ensure that your patches will not be looked at in the future.


Greg, don't you think you are a bit harsh and have a one-sided view? People can
say it in a better way if they don't like some work. I have always tried to get
along.

Anyway, I hope I give a heads up ...find "your way" to fix those damn
thing...it's glaring


There is no requirement that anyone accept patches that are sent to
them.  When you complain when receiving comments on them, that
shows you do not wish to work with others.


Unfortunately you are only seeing my complaints... I don't know why you are so
blindfolded.

Sorry, but you are now on my local blacklist for a while, and I
encourage other maintainers to just ignore these patches as well.


I cannot overrule that... I know my patches are trivial... but it seems some
other problems are looming large.

NOT good Greg... not good, seriously.

thanks,

greg k-h


signature.asc
Description: PGP signature


Re: [PATCH] powerpc/mm/book3s64: Use the correct storage key value when calling H_PROTECT

2021-03-29 Thread Michael Ellerman
On Fri, 26 Mar 2021 12:37:55 +0530, Aneesh Kumar K.V wrote:
> H_PROTECT expects the flag value to include
> flags: AVPN, pp0, pp1, pp2, key0-key4, Noexec, CMO Option flags
> 
> This patch updates hpte_updatepp() to fetch the storage key value from the 
> linux page
> table and use the same in H_PROTECT hcall.
> 
> native_hpte_updatepp() is not updated because the kernel doesn't clear the 
> existing
> storage key value there. The kernel also doesn't use hpte_updatepp() callback 
> for
> updating storage keys.
> 
> [...]

Applied to powerpc/fixes.

[1/1] powerpc/mm/book3s64: Use the correct storage key value when calling 
H_PROTECT
  https://git.kernel.org/powerpc/c/53f1d31708f6240e4615b0927df31f182e389e2f

cheers


Re: [PATCH next v1 2/3] printk: remove safe buffers

2021-03-29 Thread Petr Mladek
On Fri 2021-03-26 12:12:37, John Ogness wrote:
> On 2021-03-23, Petr Mladek  wrote:
> >> --- a/kernel/printk/printk.c
> >> +++ b/kernel/printk/printk.c
> >> -
> >>if (seq != prb_next_seq(&printk_rb_static)) {
> >>pr_err("dropped %llu messages\n",
> >>   prb_next_seq(&printk_rb_static) - seq);
> >> @@ -2666,7 +2631,6 @@ void console_unlock(void)
> >>size_t ext_len = 0;
> >>size_t len;
> >>  
> >> -  printk_safe_enter_irqsave(flags);
> >>  skip:
> >>if (!prb_read_valid(prb, console_seq, &r))
> >>break;
> >> @@ -2711,6 +2675,8 @@ void console_unlock(void)
> >>printk_time);
> >>console_seq++;
> >>  
> >> +  printk_safe_enter_irqsave(flags);
> >
> > What is the purpose of the printk_safe context here, please?
> 
> console_lock_spinning_enable() needs to be called with interrupts
> disabled. I should have just used local_irq_save().
> 
> I could add local_irq_save() to console_lock_spinning_enable() and
> restore them at the end of console_lock_spinning_disable_and_check(),
> but then I would need to add a @flags argument to both functions. I
> think it is simpler to just do the disable/enable from the caller,
> console_unlock().

I see. I had missed that all this code has to be called with
interrupts disabled.

OK, it is a must-have because of the spinning. But I wonder if some
console drivers rely on the fact that the write() callback is
called with interrupts disabled.

IMHO, it would be a bug when any write() callback expects that
callers disabled the interrupts.

Do you plan to remove the console-spinning stuff after offloading
consoles to the kthreads?

Will you call console write() callback with irq enabled from
the kthread?

Anyway, we should at least add a comment why the interrupts are
disabled.


> BTW, I could not find any sane way of disabling interrupts via a
> raw_spin_lock_irqsave() of @console_owner_lock because of the how it is
> used with lockdep. In particular for
> console_lock_spinning_disable_and_check().

I see. IMHO, we would need to explicitly call local_irq_save()/restore()
if we moved them to console_lock_spinning_enable()/disable_and_check().
I mean to do:


static void console_lock_spinning_enable(unsigned long *flags)
{
local_irq_save(*flags);

raw_spin_lock(&console_owner_lock);
console_owner = current;
raw_spin_unlock(&console_owner_lock);

/* The waiter may spin on us after setting console_owner */
spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_);
}

...

Best Regards,
Petr


Re: [PATCH next v1 2/3] printk: remove safe buffers

2021-03-29 Thread John Ogness
On 2021-03-29, Petr Mladek  wrote:
> I wonder if some console drivers rely on the fact that the write()
> callback is called with interrupts disabled.
>
> IMHO, it would be a bug when any write() callback expects that
> callers disabled the interrupts.

Agreed.

> Do you plan to remove the console-spinning stuff after offloading
> consoles to the kthreads?

Yes. Although a similar concept will be introduced to allow the threaded
printers and the atomic consoles to compete.

> Will you call console write() callback with irq enabled from the
> kthread?

No. That defeats the fundamental purpose of this entire rework
exercise. ;-)

> Anyway, we should at least add a comment why the interrupts are
> disabled.

I decided to move the local_irq_save/restore inside the console-spinning
functions and added a comment for v2.

John Ogness


Re: [PATCH next v1 2/3] printk: remove safe buffers

2021-03-29 Thread John Ogness
On 2021-03-29, John Ogness  wrote:
>> Will you call console write() callback with irq enabled from the
>> kthread?
>
> No. That defeats the fundamental purpose of this entire rework
> exercise. ;-)

Sorry, I misread your question. The answer is "yes". We want to avoid a
local_irq_save() when calling into console->write().

John Ogness


Re: [PATCH] powerpc/vdso: Separate vvar vma from vdso

2021-03-29 Thread Laurent Dufour

On 26/03/2021 at 20:17, Dmitry Safonov wrote:

Since commit 511157ab641e ("powerpc/vdso: Move vdso datapage up front")
VVAR page is in front of the VDSO area. As a result it breaks CRIU
(Checkpoint Restore In Userspace) [1], where CRIU expects that "[vdso]"
from /proc/../maps points at ELF/vdso image, rather than at VVAR data page.
Laurent made a patch to keep CRIU working (by reading aux vector).
But I think it still makes sense to separate two mappings into different
VMAs. It will also make ppc64 less "special" for userspace and as
a side-bonus will make VVAR page un-writable by debugger (which previously
would COW page and can be unexpected).

I opportunistically Cc stable on it: I understand that usually such
stuff isn't a stable material, but that will allow us in CRIU have
one workaround less that is needed just for one release (v5.11) on
one platform (ppc64), which we otherwise have to maintain.
I wouldn't go as far as to say that the commit 511157ab641e is ABI
regression as no other userspace got broken, but I'd really appreciate
if it gets backported to v5.11 after v5.12 is released, so as not
to complicate already non-simple CRIU-vdso code. Thanks!

Cc: Andrei Vagin 
Cc: Andy Lutomirski 
Cc: Benjamin Herrenschmidt 
Cc: Christophe Leroy 
Cc: Laurent Dufour 
Cc: Michael Ellerman 
Cc: Paul Mackerras 
Cc: linuxppc-dev@lists.ozlabs.org
Cc: sta...@vger.kernel.org # v5.11
[1]: https://github.com/checkpoint-restore/criu/issues/1417
Signed-off-by: Dmitry Safonov 
Tested-by: Christophe Leroy 


I run the CRIU's test suite and except the usual suspects, all the tests passed.

Tested-by: Laurent Dufour 


---
  arch/powerpc/include/asm/mmu_context.h |  2 +-
  arch/powerpc/kernel/vdso.c | 54 +++---
  2 files changed, 40 insertions(+), 16 deletions(-)

diff --git a/arch/powerpc/include/asm/mmu_context.h 
b/arch/powerpc/include/asm/mmu_context.h
index 652ce85f9410..4bc45d3ed8b0 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -263,7 +263,7 @@ extern void arch_exit_mmap(struct mm_struct *mm);
  static inline void arch_unmap(struct mm_struct *mm,
  unsigned long start, unsigned long end)
  {
-   unsigned long vdso_base = (unsigned long)mm->context.vdso - PAGE_SIZE;
+   unsigned long vdso_base = (unsigned long)mm->context.vdso;
  
  	if (start <= vdso_base && vdso_base < end)

mm->context.vdso = NULL;
diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c
index e839a906fdf2..b14907209822 100644
--- a/arch/powerpc/kernel/vdso.c
+++ b/arch/powerpc/kernel/vdso.c
@@ -55,10 +55,10 @@ static int vdso_mremap(const struct vm_special_mapping *sm, 
struct vm_area_struc
  {
unsigned long new_size = new_vma->vm_end - new_vma->vm_start;
  
-	if (new_size != text_size + PAGE_SIZE)

+   if (new_size != text_size)
return -EINVAL;
  
-	current->mm->context.vdso = (void __user *)new_vma->vm_start + PAGE_SIZE;

+   current->mm->context.vdso = (void __user *)new_vma->vm_start;
  
  	return 0;

  }
@@ -73,6 +73,10 @@ static int vdso64_mremap(const struct vm_special_mapping 
*sm, struct vm_area_str
return vdso_mremap(sm, new_vma, &vdso64_end - &vdso64_start);
  }
  
+static struct vm_special_mapping vvar_spec __ro_after_init = {

+   .name = "[vvar]",
+};
+
  static struct vm_special_mapping vdso32_spec __ro_after_init = {
.name = "[vdso]",
.mremap = vdso32_mremap,
@@ -89,11 +93,11 @@ static struct vm_special_mapping vdso64_spec 
__ro_after_init = {
   */
  static int __arch_setup_additional_pages(struct linux_binprm *bprm, int 
uses_interp)
  {
-   struct mm_struct *mm = current->mm;
+   unsigned long vdso_size, vdso_base, mappings_size;
struct vm_special_mapping *vdso_spec;
+   unsigned long vvar_size = PAGE_SIZE;
+   struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
-   unsigned long vdso_size;
-   unsigned long vdso_base;
  
  	if (is_32bit_task()) {

vdso_spec = &vdso32_spec;
@@ -110,8 +114,8 @@ static int __arch_setup_additional_pages(struct 
linux_binprm *bprm, int uses_int
vdso_base = 0;
}
  
-	/* Add a page to the vdso size for the data page */

-   vdso_size += PAGE_SIZE;
+   mappings_size = vdso_size + vvar_size;
+   mappings_size += (VDSO_ALIGNMENT - 1) & PAGE_MASK;
  
  	/*

 * pick a base address for the vDSO in process space. We try to put it
@@ -119,9 +123,7 @@ static int __arch_setup_additional_pages(struct 
linux_binprm *bprm, int uses_int
 * and end up putting it elsewhere.
 * Add enough to the size so that the result can be aligned.
 */
-   vdso_base = get_unmapped_area(NULL, vdso_base,
- vdso_size + ((VDSO_ALIGNMENT - 1) & 
PAGE_MASK),
- 0, 0);
+   vdso_base = get_unmapped_area(NULL, vdso_base, 

Re: [PATCH] powerpc/64s: power4 nap fixup in C

2021-03-29 Thread Andreas Schwab
On Mär 29 2021, Michael Ellerman wrote:

> Nicholas Piggin  writes:
>> There is no need for this to be in asm, use the new interrupt entry wrapper.
>>
>> Signed-off-by: Nicholas Piggin 
>> ---
>> Hopefully this works on a real G5 now, but I couldn't reproduce the
>> problem with QEMU.
>
> It still prevents my G5 from booting.

I see differing failures.  What's common is that there is a pause of
about 60 seconds before the crash occurs.  It looks like the crash
occurs in power4_idle_nap+0x30/0x34.  Unfortunately, the BootX console is
too small to see enough.

Andreas.

-- 
Andreas Schwab, sch...@linux-m68k.org
GPG Key fingerprint = 7578 EB47 D4E5 4D69 2510  2552 DF73 E780 A9DA AEC1
"And now for something completely different."


Re: [PATCH] PCI: Try to find two continuous regions for child resource

2021-03-29 Thread Bjorn Helgaas
On Mon, Mar 29, 2021 at 04:47:59PM +0800, Kai-Heng Feng wrote:
> Built-in grahpics on HP EliteDesk 805 G6 doesn't work because graphics
> can't get the BAR it needs:
> [0.611504] pci_bus :00: root bus resource [mem 
> 0x1002020-0x100303f window]
> [0.611505] pci_bus :00: root bus resource [mem 
> 0x1003040-0x100401f window]
> ...
> [0.638083] pci :00:08.1:   bridge window [mem 0xd200-0xd23f]
> [0.638086] pci :00:08.1:   bridge window [mem 
> 0x1003000-0x100401f 64bit pref]
> [0.962086] pci :00:08.1: can't claim BAR 15 [mem 
> 0x1003000-0x100401f 64bit pref]: no compatible bridge window
> [0.962086] pci :00:08.1: [mem 0x1003000-0x100401f 64bit pref] 
> clipped to [mem 0x1003000-0x100303f 64bit pref]
> [0.962086] pci :00:08.1:   bridge window [mem 
> 0x1003000-0x100303f 64bit pref]
> [0.962086] pci :07:00.0: can't claim BAR 0 [mem 
> 0x1003000-0x1003fff 64bit pref]: no compatible bridge window
> [0.962086] pci :07:00.0: can't claim BAR 2 [mem 
> 0x1004000-0x100401f 64bit pref]: no compatible bridge window
>
> However, the root bus has two continuous regions that can contain the
> child resource requested.
>
> So try to find another parent region if two regions are continuous and
> can contain child resource. This change makes the grahpics works on the
> system in question.

The BIOS description of PCI0 is interesting:

  pci_bus :00: root bus resource [mem 0x100-0x100201f window]
  pci_bus :00: root bus resource [mem 0x1002020-0x100303f window]
  pci_bus :00: root bus resource [mem 0x1003040-0x100401f window]

So the PCI0 _CRS apparently gave us:

  [mem 0x100-0x100201f] size 0x2020 (512MB + 2MB)
  [mem 0x1002020-0x100303f] size 0x1020 (256MB + 2MB)
  [mem 0x1003040-0x100401f] size 0x0fe0 (254MB)

These are all contiguous, so we'd have no problem if we coalesced them
into a single window:

  [mem 0x100-0x100401f window] size 0x4020 (1GB + 2MB)

I think we currently keep these root bus resources separate because if
we ever support _SRS for host bridges, the argument we give to _SRS
must be exactly the same format as what we got from _CRS (see ACPI
v6.3, sec 6.2.16, and pnpacpi_set_resources()).

pnpacpi_encode_resources() is currently very simple-minded and copies
each device resource back into a single _SRS entry.  But (1) we don't
support _SRS for host bridges, and (2) if we ever do, we can make
pnpacpi_encode_resources() smarter so it breaks things back up.

So I think we should try to fix this by coalescing these adjacent
resources from _CRS so we end up with a single root bus resource that
covers all contiguous regions.
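
A rough sketch of that idea (illustration only, not an actual implementation;
the helper name is made up): merge each window into the previous one when the
two are of the same type and adjacent.

#include <linux/ioport.h>

/* Merge [a, b] followed by [b+1, c] into a single window [a, c]. */
static void coalesce_windows(struct resource *win, int *count)
{
	int i, j;

	for (i = 1; i < *count; ) {
		struct resource *prev = &win[i - 1];
		struct resource *cur = &win[i];

		if (resource_type(prev) == resource_type(cur) &&
		    prev->end + 1 == cur->start) {
			prev->end = cur->end;
			for (j = i; j < *count - 1; j++)	/* drop the merged entry */
				win[j] = win[j + 1];
			(*count)--;
		} else {
			i++;
		}
	}
}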

Typos, etc:
  - No need for the timestamps; they're not relevant to the problem.
  - s/grahpics/graphics/ (two occurrences above)
  - s/continuous/contiguous/ (three occurrences above)

> Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=212013
> Signed-off-by: Kai-Heng Feng 
> ---
>  arch/microblaze/pci/pci-common.c |  4 +--
>  arch/powerpc/kernel/pci-common.c |  8 ++---
>  arch/sparc/kernel/pci.c  |  4 +--
>  drivers/pci/pci.c| 60 +++-
>  drivers/pci/setup-res.c  | 21 +++
>  drivers/pcmcia/rsrc_nonstatic.c  |  4 +--
>  include/linux/pci.h  |  6 ++--
>  7 files changed, 80 insertions(+), 27 deletions(-)
> 
> diff --git a/arch/microblaze/pci/pci-common.c 
> b/arch/microblaze/pci/pci-common.c
> index 557585f1be41..8e65832fb510 100644
> --- a/arch/microblaze/pci/pci-common.c
> +++ b/arch/microblaze/pci/pci-common.c
> @@ -669,7 +669,7 @@ static void pcibios_allocate_bus_resources(struct pci_bus 
> *bus)
>  {
>   struct pci_bus *b;
>   int i;
> - struct resource *res, *pr;
> + struct resource *res, *pr = NULL;
>  
>   pr_debug("PCI: Allocating bus resources for %04x:%02x...\n",
>pci_domain_nr(bus), bus->number);
> @@ -688,7 +688,7 @@ static void pcibios_allocate_bus_resources(struct pci_bus 
> *bus)
>* and as such ensure proper re-allocation
>* later.
>*/
> - pr = pci_find_parent_resource(bus->self, res);
> + pci_find_parent_resource(bus->self, res, &pr, NULL);
>   if (pr == res) {
>   /* this happens when the generic PCI
>* code (wrongly) decides that this
> diff --git a/arch/powerpc/kernel/pci-common.c 
> b/arch/powerpc/kernel/pci-common.c
> index 001e90cd8948..f865354b746d 100644
> --- a/arch/powerpc/kernel/pci-common.c
> +++ b/arch/powerpc/kernel/pci-common.c
> @@ -1196,7 +1196,7 @@ static void pcibios_allocate_bus_resources(struct 
> pci_bus *bus)
>  {
>   struct pci_bus *b;
>   int i;
> - struct resource *res, *pr;
> + s

Re: [PATCH] powerpc/64s: power4 nap fixup in C

2021-03-29 Thread Andreas Schwab
On Mär 29 2021, Andreas Schwab wrote:

> On Mär 29 2021, Michael Ellerman wrote:
>
>> Nicholas Piggin  writes:
>>> There is no need for this to be in asm, use the new interrupt entry wrapper.
>>>
>>> Signed-off-by: Nicholas Piggin 
>>> ---
>>> Hopefully this works on a real G5 now, but I couldn't reproduce the
>>> problem with QEMU.
>>
>> It still prevents my G5 from booting.
>
> I see differing failures.  What's common is that there is a pause of
> about 60 seconds before the crash occurs.  It looks like the crash
> occurs in power4_idle_nap+0x30/0x34.  Unfortuately, the BootX console is
> too small to see enough.

I was now able to see the messages on the VGA console, and the problem
is actually that the cpus are starting to stall.

Andreas.

-- 
Andreas Schwab, sch...@linux-m68k.org
GPG Key fingerprint = 7578 EB47 D4E5 4D69 2510  2552 DF73 E780 A9DA AEC1
"And now for something completely different."


Re: [PATCH] powerpc/64s: power4 nap fixup in C

2021-03-29 Thread Andreas Schwab
On Mär 29 2021, Andreas Schwab wrote:

> On Mär 29 2021, Christophe Leroy wrote:
>
>> On 29/03/2021 at 10:33, Benjamin Herrenschmidt wrote:
>>> On Fri, 2021-03-12 at 11:20 +1000, Nicholas Piggin wrote:

 +static inline void nap_adjust_return(struct pt_regs *regs)

 +{

 +#ifdef CONFIG_PPC_970_NAP

 +   if (unlikely(test_thread_local_flags(_TLF_NAPPING))) {
 +   /* Can avoid a test-and-clear because NMIs do not call 
 this */
 +   clear_thread_local_flags(_TLF_NAPPING);
 +   regs->nip = (unsigned long)power4_idle_nap_return;
 +   }
>>> Is this a pointer to a function descriptor or the actual code ?
>>> 
>>
>> --- a/arch/powerpc/kernel/idle_book3s.S
>> +++ b/arch/powerpc/kernel/idle_book3s.S
>> @@ -209,4 +209,8 @@ _GLOBAL(power4_idle_nap)
>>  mtmsrd  r7
>>  isync
>>  b   1b
>> +
>> +.globl power4_idle_nap_return
>> +power4_idle_nap_return:
>> +blr
>>  #endif
>
> The problem is not the definition, it is the reference.  In C, a
> function symbol always resolves to the address of the descriptor.

Sorry, this is wrong, I have misremembered how function descriptors work
on ppc64.  The address is really pointing to the actual code.

Andreas.

-- 
Andreas Schwab, sch...@linux-m68k.org
GPG Key fingerprint = 7578 EB47 D4E5 4D69 2510  2552 DF73 E780 A9DA AEC1
"And now for something completely different."


[PATCH v3] powerpc/papr_scm: Implement support for H_SCM_FLUSH hcall

2021-03-29 Thread Shivaprasad G Bhat
Add support for ND_REGION_ASYNC capability if the device tree
indicates 'ibm,hcall-flush-required' property in the NVDIMM node.
Flush is done by issuing H_SCM_FLUSH hcall to the hypervisor.

If the flush request failed, the hypervisor is expected to
reflect the problem in the subsequent nvdimm H_SCM_HEALTH call.

This patch prevents mmap of namespaces with MAP_SYNC flag if the
nvdimm requires an explicit flush[1].
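
For illustration, a minimal user-space probe in the spirit of the map_sync.c
test referenced below (path and size are made up; this is not part of the
patch): on a region that needs an explicit flush, the MAP_SYNC mapping is
expected to be refused.

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>

#ifndef MAP_SHARED_VALIDATE
#define MAP_SHARED_VALIDATE	0x03
#endif
#ifndef MAP_SYNC
#define MAP_SYNC		0x080000
#endif

int main(void)
{
	int fd = open("/mnt/pmem0/testfile", O_RDWR);
	void *p;

	if (fd < 0)
		return 1;

	p = mmap(NULL, 1 << 21, PROT_READ | PROT_WRITE,
		 MAP_SHARED_VALIDATE | MAP_SYNC, fd, 0);

	/* Expected to fail (EOPNOTSUPP) when the region is ND_REGION_ASYNC. */
	printf("MAP_SYNC mmap %s\n", p == MAP_FAILED ? "refused" : "succeeded");
	return 0;
}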

References:
[1] 
https://github.com/avocado-framework-tests/avocado-misc-tests/blob/master/memory/ndctl.py.data/map_sync.c

Signed-off-by: Shivaprasad G Bhat 
---
v2 - https://www.spinics.net/lists/kvm-ppc/msg18799.html
Changes from v2:
   - Fixed the commit message.
   - Add dev_dbg before the H_SCM_FLUSH hcall

v1 - https://www.spinics.net/lists/kvm-ppc/msg18272.html
Changes from v1:
   - Hcall semantics finalized, all changes are to accommodate them.

 Documentation/powerpc/papr_hcalls.rst |   14 ++
 arch/powerpc/include/asm/hvcall.h |3 +-
 arch/powerpc/platforms/pseries/papr_scm.c |   40 +
 3 files changed, 56 insertions(+), 1 deletion(-)

diff --git a/Documentation/powerpc/papr_hcalls.rst 
b/Documentation/powerpc/papr_hcalls.rst
index 48fcf1255a33..648f278eea8f 100644
--- a/Documentation/powerpc/papr_hcalls.rst
+++ b/Documentation/powerpc/papr_hcalls.rst
@@ -275,6 +275,20 @@ Health Bitmap Flags:
 Given a DRC Index collect the performance statistics for NVDIMM and copy them
 to the resultBuffer.
 
+**H_SCM_FLUSH**
+
+| Input: *drcIndex, continue-token*
+| Out: *continue-token*
+| Return Value: *H_SUCCESS, H_Parameter, H_P2, H_BUSY*
+
+Given a DRC Index, flush the data to the backend NVDIMM device.
+
+The hcall returns H_BUSY when the flush takes a long time and the hcall needs
+to be issued multiple times in order to be completely serviced. The
+*continue-token* from the output is to be passed in the argument list of
+subsequent hcalls to the hypervisor until the hcall is completely serviced,
+at which point H_SUCCESS or another error is returned by the hypervisor.
+
 References
 ==
 .. [1] "Power Architecture Platform Reference"
diff --git a/arch/powerpc/include/asm/hvcall.h 
b/arch/powerpc/include/asm/hvcall.h
index ed6086d57b22..9f7729a97ebd 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -315,7 +315,8 @@
 #define H_SCM_HEALTH0x400
 #define H_SCM_PERFORMANCE_STATS 0x418
 #define H_RPT_INVALIDATE   0x448
-#define MAX_HCALL_OPCODE   H_RPT_INVALIDATE
+#define H_SCM_FLUSH0x44C
+#define MAX_HCALL_OPCODE   H_SCM_FLUSH
 
 /* Scope args for H_SCM_UNBIND_ALL */
 #define H_UNBIND_SCOPE_ALL (0x1)
diff --git a/arch/powerpc/platforms/pseries/papr_scm.c 
b/arch/powerpc/platforms/pseries/papr_scm.c
index 835163f54244..b7a47fcc5aa5 100644
--- a/arch/powerpc/platforms/pseries/papr_scm.c
+++ b/arch/powerpc/platforms/pseries/papr_scm.c
@@ -93,6 +93,7 @@ struct papr_scm_priv {
uint64_t block_size;
int metadata_size;
bool is_volatile;
+   bool hcall_flush_required;
 
uint64_t bound_addr;
 
@@ -117,6 +118,39 @@ struct papr_scm_priv {
size_t stat_buffer_len;
 };
 
+static int papr_scm_pmem_flush(struct nd_region *nd_region,
+  struct bio *bio __maybe_unused)
+{
+   struct papr_scm_priv *p = nd_region_provider_data(nd_region);
+   unsigned long ret_buf[PLPAR_HCALL_BUFSIZE];
+   uint64_t token = 0;
+   int64_t rc;
+
+   dev_dbg(&p->pdev->dev, "flush drc 0x%x", p->drc_index);
+
+   do {
+   rc = plpar_hcall(H_SCM_FLUSH, ret_buf, p->drc_index, token);
+   token = ret_buf[0];
+
+   /* Check if we are stalled for some time */
+   if (H_IS_LONG_BUSY(rc)) {
+   msleep(get_longbusy_msecs(rc));
+   rc = H_BUSY;
+   } else if (rc == H_BUSY) {
+   cond_resched();
+   }
+   } while (rc == H_BUSY);
+
+   if (rc) {
+   dev_err(&p->pdev->dev, "flush error: %lld", rc);
+   rc = -EIO;
+   } else {
+   dev_dbg(&p->pdev->dev, "flush drc 0x%x complete", p->drc_index);
+   }
+
+   return rc;
+}
+
 static LIST_HEAD(papr_nd_regions);
 static DEFINE_MUTEX(papr_ndr_lock);
 
@@ -943,6 +977,11 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
ndr_desc.num_mappings = 1;
ndr_desc.nd_set = &p->nd_set;
 
+   if (p->hcall_flush_required) {
+   set_bit(ND_REGION_ASYNC, &ndr_desc.flags);
+   ndr_desc.flush = papr_scm_pmem_flush;
+   }
+
if (p->is_volatile)
p->region = nvdimm_volatile_region_create(p->bus, &ndr_desc);
else {
@@ -1088,6 +1127,7 @@ static int papr_scm_probe(struct platform_device *pdev)
p->block_size = block_size;
p->blocks = blocks;
p->is_volatile = !of_property_read_bool(dn, "ibm,cache-flush-required"

Re: [PATCH v11 0/6] KASAN for powerpc64 radix

2021-03-29 Thread Michael Ellerman
Christophe Leroy  writes:
> On 23/03/2021 at 02:21, Daniel Axtens wrote:
>> Hi Christophe,
>> 
>>> In the discussion we had long time ago,
>>> https://patchwork.ozlabs.org/project/linuxppc-dev/patch/20190806233827.16454-5-...@axtens.net/#2321067
>>> , I challenged you on why it was not possible to implement things the same 
>>> way as other
>>> architectures, in extenso with an early mapping.
>>>
>>> Your first answer was that too many things were done in real mode at 
>>> startup. After some discussion
> you said that finally there were not that many things at startup but the 
>>> issue was KVM.
>>>
>>> Now you say that instrumentation on KVM is fully disabled.
>>>
>>> So my question is, if KVM is not a problem anymore, why not go the standard 
>>> way with an early shadow
>>> ? Then you could also support inline instrumentation.
>> 
>> Fair enough, I've had some trouble both understanding the problem myself
>> and clearly articulating it. Let me try again.
>> 
>> We need translations on to access the shadow area.
>> 
>> We reach setup_64.c::early_setup() with translations off. At this point
>> we don't know what MMU we're running under, or our CPU features.
>
> What do you need to know ? Whether it is Hash or Radix, or
> more/different details ?

Yes, as well as some other details like SLB size, supported segment &
page sizes, possibly the CPU version for workarounds, various other
device tree things.

You also need to know if you're bare metal or in a guest, or on a PS3 ...

> IIUC, today we only support KASAN on Radix. Would it make sense to say that a 
> kernel built with 
> KASAN can only run on processors having Radix capability ? Then select 
> CONFIG_PPC_RADIX_MMU_DEFAULT 
> when KASAN is set, and accept that the kernel crashes if Radix is not 
> available ?

I would rather not. We already have some options like that
(EARLY_DEBUG), and they have caused people to waste time debugging
crashes over the years that turned out to just due to the wrong CONFIG
selected.

>> To determine our MMU and CPU features, early_setup() calls functions
>> (dt_cpu_ftrs_init, early_init_devtree) that call out to generic code
>> like of_scan_flat_dt. We need to do this before we turn on translations
>> because we can't set up the MMU until we know what MMU we have.
>> 
>> So this puts us in a bind:
>> 
>>   - We can't set up an early shadow until we have translations on, which
>> requires that the MMU is set up.
>> 
>>   - We can't set up an MMU until we call out to generic code for FDT
>> parsing.
>> 
>> So there will be calls to generic FDT parsing code that happen before the
>> early shadow is set up.
>
> I see some logic in kernel/prom_init.c for detecting MMU. Can we get the 
> information from there in 
> order to setup the MMU ?

You could find some of the information, but you'd need to stash it
somewhere (like the flat device tree :P) because you can't turn the MMU
on until we shutdown open firmware.

That also doesn't help you on bare metal where we don't use prom_init.

>> The setup code also prints a bunch of information about the platform
>> with printk() while translations are off, so it wouldn't even be enough
>> to disable instrumentation for bits of the generic DT code on ppc64.
>
> I'm sure the printk() stuff can be avoided or delayed without much problems, 
> I guess the main 
> problem is the DT code, isn't it ?

We spent many years making printk() work for early boot messages,
because it has the nice property of being persisted in dmesg.

But possibly we could come up with some workaround for that.

Disabling KASAN for the flat DT code seems like it wouldn't be a huge
loss, most (all?) of that code should only run at boot anyway.

But we also have code spread out in various files that would need to be
built without KASAN. See eg. everything called by of_scan_flat_dt(),
mmu_early_init_devtree(), pseries_probe_fw_features()
pkey_early_init_devtree() etc.

Because we can only disable KASAN per-file that would require quite a
bit of code movement and related churn.

> As far as I can see the code only uses udbg_printf() before MMU is on, and 
> this could be simply 
> skipped when KASAN is selected, I see no situation where you need early 
> printk together with KASAN.

We definitely use printk() before the MMU is on.

>> Does that make sense? If you can figure out how to 'square the circle'
>> here I'm all ears.
>
> Yes it is a lot more clear now, thanks you. Gave a few ideas above,
> does it help ?

A little? :)

It's possible we could do slightly less of the current boot sequence
before turning the MMU on. But we would still need to scan the flat
device tree, so all that code would be implicated either way.

We could also rearrange the early boot code to put bits in separate
files so they can be built without KASAN, but like I said above that
would be a lot of churn.

I don't see a way to fix printk() though, other than not using it during
early boot. Maybe that's OK but it feels like a bit

Re: [PATCH] powerpc/vdso: Separate vvar vma from vdso

2021-03-29 Thread Dmitry Safonov
On 3/29/21 4:14 PM, Laurent Dufour wrote:
> On 26/03/2021 at 20:17, Dmitry Safonov wrote:
>> Since commit 511157ab641e ("powerpc/vdso: Move vdso datapage up front")
>> VVAR page is in front of the VDSO area. As a result it breaks CRIU
>> (Checkpoint Restore In Userspace) [1], where CRIU expects that "[vdso]"
>> from /proc/../maps points at ELF/vdso image, rather than at VVAR data
>> page.
>> Laurent made a patch to keep CRIU working (by reading aux vector).
>> But I think it still makes sense to separate two mappings into different
>> VMAs. It will also make ppc64 less "special" for userspace and as
>> a side-bonus will make VVAR page un-writable by debugger (which
>> previously
>> would COW page and can be unexpected).
>>
>> I opportunistically Cc stable on it: I understand that usually such
>> stuff isn't a stable material, but that will allow us in CRIU have
>> one workaround less that is needed just for one release (v5.11) on
>> one platform (ppc64), which we otherwise have to maintain.
>> I wouldn't go as far as to say that the commit 511157ab641e is ABI
>> regression as no other userspace got broken, but I'd really appreciate
>> if it gets backported to v5.11 after v5.12 is released, so as not
>> to complicate already non-simple CRIU-vdso code. Thanks!
>>
>> Cc: Andrei Vagin 
>> Cc: Andy Lutomirski 
>> Cc: Benjamin Herrenschmidt 
>> Cc: Christophe Leroy 
>> Cc: Laurent Dufour 
>> Cc: Michael Ellerman 
>> Cc: Paul Mackerras 
>> Cc: linuxppc-dev@lists.ozlabs.org
>> Cc: sta...@vger.kernel.org # v5.11
>> [1]: https://github.com/checkpoint-restore/criu/issues/1417
>> Signed-off-by: Dmitry Safonov 
>> Tested-by: Christophe Leroy 
> 
> I run the CRIU's test suite and except the usual suspects, all the tests
> passed.
> 
> Tested-by: Laurent Dufour 

Thank you, Laurent!

-- 
  Dmitry


Re: [PATCH v3 11/17] riscv: Convert to GENERIC_CMDLINE

2021-03-29 Thread Nick Kossifidis

On 2021-03-26 17:26, Rob Herring wrote:

On Fri, Mar 26, 2021 at 8:20 AM Christophe Leroy
 wrote:




On 26/03/2021 at 15:08, Andreas Schwab wrote:
> On Mär 26 2021, Christophe Leroy wrote:
>
>> diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c
>> index f8f15332caa2..e7c91ee478d1 100644
>> --- a/arch/riscv/kernel/setup.c
>> +++ b/arch/riscv/kernel/setup.c
>> @@ -20,6 +20,7 @@
>>   #include 
>>   #include 
>>   #include 
>> +#include 
>>
>>   #include 
>>   #include 
>> @@ -228,10 +229,8 @@ static void __init parse_dtb(void)
>>  }
>>
>>  pr_err("No DTB passed to the kernel\n");
>> -#ifdef CONFIG_CMDLINE_FORCE
>> -strlcpy(boot_command_line, CONFIG_CMDLINE, COMMAND_LINE_SIZE);
>> +cmdline_build(boot_command_line, NULL, COMMAND_LINE_SIZE);
>>  pr_info("Forcing kernel command line to: %s\n", boot_command_line);
>
> Shouldn't that message become conditional in some way?
>

You are right, I did something similar on ARM but looks like I missed 
it on RISCV.


How is this hunk even useful? Under what conditions can you boot
without a DTB? Even with a built-in DTB, the DT cmdline handling would
be called.

Rob



CCed Paul, who introduced this:
https://git.kernel.org/pub/scm/linux/kernel/git/riscv/linux.git/commit/arch/riscv/kernel/setup.c?id=8fd6e05c7463b635e51ec7df0a1858c1b5a6e350



[powerpc:fixes-test] BUILD SUCCESS 53f1d31708f6240e4615b0927df31f182e389e2f

2021-03-29 Thread kernel test robot
 allyesconfig
s390 allmodconfig
s390defconfig
sparc   defconfig
i386defconfig
i386   tinyconfig
mips allyesconfig
mips allmodconfig
powerpc  allyesconfig
powerpc  allmodconfig
powerpc   allnoconfig
x86_64   randconfig-a002-20210329
x86_64   randconfig-a003-20210329
x86_64   randconfig-a006-20210329
x86_64   randconfig-a001-20210329
x86_64   randconfig-a005-20210329
x86_64   randconfig-a004-20210329
i386 randconfig-a003-20210329
i386 randconfig-a004-20210329
i386 randconfig-a001-20210329
i386 randconfig-a002-20210329
i386 randconfig-a006-20210329
i386 randconfig-a005-20210329
i386 randconfig-a003-20210327
i386 randconfig-a004-20210327
i386 randconfig-a001-20210327
i386 randconfig-a002-20210327
i386 randconfig-a006-20210327
i386 randconfig-a005-20210327
x86_64   randconfig-a015-20210328
x86_64   randconfig-a012-20210328
x86_64   randconfig-a013-20210328
x86_64   randconfig-a014-20210328
x86_64   randconfig-a016-20210328
x86_64   randconfig-a011-20210328
i386 randconfig-a011-20210329
i386 randconfig-a016-20210329
i386 randconfig-a013-20210329
i386 randconfig-a012-20210329
i386 randconfig-a014-20210329
i386 randconfig-a015-20210329
i386 randconfig-a014-20210326
i386 randconfig-a011-20210326
i386 randconfig-a015-20210326
i386 randconfig-a016-20210326
i386 randconfig-a012-20210326
i386 randconfig-a013-20210326
i386 randconfig-a014-20210327
i386 randconfig-a011-20210327
i386 randconfig-a015-20210327
i386 randconfig-a016-20210327
i386 randconfig-a012-20210327
i386 randconfig-a013-20210327
x86_64   randconfig-a002-20210327
x86_64   randconfig-a003-20210327
x86_64   randconfig-a006-20210327
x86_64   randconfig-a001-20210327
x86_64   randconfig-a004-20210327
x86_64   randconfig-a005-20210327
riscv allnoconfig
riscv   defconfig
riscv  rv32_defconfig
um   allmodconfig
umallnoconfig
um   allyesconfig
um  defconfig
x86_64rhel-8.3-kselftests
x86_64  defconfig
x86_64   rhel-8.3
x86_64  rhel-8.3-kbuiltin
x86_64  kexec

clang tested configs:
x86_64   randconfig-a002-20210328
x86_64   randconfig-a003-20210328
x86_64   randconfig-a001-20210328
x86_64   randconfig-a006-20210328
x86_64   randconfig-a005-20210328
x86_64   randconfig-a004-20210328
x86_64   randconfig-a015-20210329
x86_64   randconfig-a012-20210329
x86_64   randconfig-a013-20210329
x86_64   randconfig-a014-20210329
x86_64   randconfig-a011-20210329
x86_64   randconfig-a016-20210329
x86_64   randconfig-a012-20210327
x86_64   randconfig-a015-20210327
x86_64   randconfig-a014-20210327
x86_64   randconfig-a013-20210327
x86_64   randconfig-a011-20210327
x86_64   randconfig-a016-20210327

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org


Re: [PATCH] scsi: ibmvscsi: delete the useless casting value returned

2021-03-29 Thread Martin K. Petersen
On Fri, 12 Mar 2021 10:18:53 +0800, Wang Qing wrote:

> Fix the following coccicheck warning:
> WARNING: casting value returned by memory allocation function is useless.

Applied to 5.13/scsi-queue, thanks!

[1/1] scsi: ibmvscsi: delete the useless casting value returned
  https://git.kernel.org/mkp/scsi/c/0d556a21a9da

-- 
Martin K. Petersen  Oracle Linux Engineering


[PATCH v10 00/10] powerpc: Further Strict RWX support

2021-03-29 Thread Jordan Niethe
Another revision to this series adding more Strict RWX support on powerpc, in
particular Strict Module RWX.  This revision adds consideration for bpf.

The changes in v10 for each patch:

Christophe Leroy (2):
  powerpc/mm: implement set_memory_attr()
  powerpc/32: use set_memory_attr()

Jordan Niethe (3):
  powerpc/lib/code-patching: Set up Strict RWX patching earlier
  powerpc: Always define MODULES_{VADDR,END}
v10: - New to series

  powerpc/bpf: Write protect JIT code
v10: - New to series

Russell Currey (5):
  powerpc/mm: Implement set_memory() routines
v10: - WARN if trying to change the hash linear map

  powerpc/kprobes: Mark newly allocated probes as ROX
v10: - Use __vmalloc_node_range()

  powerpc/mm/ptdump: debugfs handler for W+X checks at runtime
v10: check_wx_pages now affects kernel_page_tables rather
     than triggering its own action.

  powerpc: Set ARCH_HAS_STRICT_MODULE_RWX
v10: - Predicate on !PPC_BOOK3S_604
 - Make module_alloc() use PAGE_KERNEL protection

  powerpc/configs: Enable STRICT_MODULE_RWX in skiroot_defconfig

 arch/powerpc/Kconfig   |   2 +
 arch/powerpc/Kconfig.debug |   6 +-
 arch/powerpc/configs/skiroot_defconfig |   1 +
 arch/powerpc/include/asm/pgtable.h |   5 +
 arch/powerpc/include/asm/set_memory.h  |  34 +++
 arch/powerpc/kernel/kprobes.c  |  14 +++
 arch/powerpc/kernel/module.c   |  14 +--
 arch/powerpc/lib/code-patching.c   |  12 +--
 arch/powerpc/mm/Makefile   |   2 +-
 arch/powerpc/mm/pageattr.c | 121 +
 arch/powerpc/mm/pgtable_32.c   |  60 ++--
 arch/powerpc/mm/ptdump/ptdump.c|  34 ++-
 arch/powerpc/net/bpf_jit_comp.c|   5 +-
 arch/powerpc/net/bpf_jit_comp64.c  |   4 +
 14 files changed, 245 insertions(+), 69 deletions(-)
 create mode 100644 arch/powerpc/include/asm/set_memory.h
 create mode 100644 arch/powerpc/mm/pageattr.c

-- 
2.25.1



[PATCH v10 01/10] powerpc/mm: Implement set_memory() routines

2021-03-29 Thread Jordan Niethe
From: Russell Currey 

The set_memory_{ro/rw/nx/x}() functions are required for STRICT_MODULE_RWX,
and are generally useful primitives to have.  This implementation is
designed to be completely generic across powerpc's many MMUs.

It's possible that this could be optimised to be faster for specific
MMUs, but the focus is on having a generic and safe implementation for
now.

This implementation does not handle cases where the caller is attempting
to change the mapping of the page it is executing from, or if another
CPU is concurrently using the page being altered.  These cases likely
shouldn't happen, but a more complex implementation with MMU-specific code
could safely handle them, so that is left as a TODO for now.

On hash the linear mapping is not kept in the linux pagetable, so this
will not change the protection if used on that range. Currently these
functions are not used on the linear map so just WARN for now.

These functions do nothing if STRICT_KERNEL_RWX is not enabled.
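
As a usage sketch only (not part of this patch; the buffer below is
hypothetical), a caller that has finished writing generated code into a
vmalloc'd region would flip its protection with these routines:

	#include <linux/vmalloc.h>
	#include <linux/set_memory.h>

	void *code = vmalloc(PAGE_SIZE);	/* illustrative buffer */

	/* ... write instructions into the page ... */

	set_memory_ro((unsigned long)code, 1);	/* 1 page: drop write access */
	set_memory_x((unsigned long)code, 1);	/* and allow execution */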

Reviewed-by: Daniel Axtens 
Signed-off-by: Russell Currey 
Signed-off-by: Christophe Leroy 
[jpn: -rebase on next plus "powerpc/mm/64s: Allow STRICT_KERNEL_RWX again"
  - WARN on hash linear map]
Signed-off-by: Jordan Niethe 
---
v10: WARN if trying to change the hash linear map
---
 arch/powerpc/Kconfig  |  1 +
 arch/powerpc/include/asm/set_memory.h | 32 ++
 arch/powerpc/mm/Makefile  |  2 +-
 arch/powerpc/mm/pageattr.c| 88 +++
 4 files changed, 122 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/include/asm/set_memory.h
 create mode 100644 arch/powerpc/mm/pageattr.c

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index fc7f5c5933e6..4498a27ac9db 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -135,6 +135,7 @@ config PPC
select ARCH_HAS_MEMBARRIER_CALLBACKS
select ARCH_HAS_MEMBARRIER_SYNC_CORE
select ARCH_HAS_SCALED_CPUTIME  if VIRT_CPU_ACCOUNTING_NATIVE 
&& PPC_BOOK3S_64
+   select ARCH_HAS_SET_MEMORY
select ARCH_HAS_STRICT_KERNEL_RWX   if ((PPC_BOOK3S_64 || PPC32) && 
!HIBERNATION)
select ARCH_HAS_TICK_BROADCAST  if GENERIC_CLOCKEVENTS_BROADCAST
select ARCH_HAS_UACCESS_FLUSHCACHE
diff --git a/arch/powerpc/include/asm/set_memory.h 
b/arch/powerpc/include/asm/set_memory.h
new file mode 100644
index ..64011ea444b4
--- /dev/null
+++ b/arch/powerpc/include/asm/set_memory.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_POWERPC_SET_MEMORY_H
+#define _ASM_POWERPC_SET_MEMORY_H
+
+#define SET_MEMORY_RO  0
+#define SET_MEMORY_RW  1
+#define SET_MEMORY_NX  2
+#define SET_MEMORY_X   3
+
+int change_memory_attr(unsigned long addr, int numpages, long action);
+
+static inline int set_memory_ro(unsigned long addr, int numpages)
+{
+   return change_memory_attr(addr, numpages, SET_MEMORY_RO);
+}
+
+static inline int set_memory_rw(unsigned long addr, int numpages)
+{
+   return change_memory_attr(addr, numpages, SET_MEMORY_RW);
+}
+
+static inline int set_memory_nx(unsigned long addr, int numpages)
+{
+   return change_memory_attr(addr, numpages, SET_MEMORY_NX);
+}
+
+static inline int set_memory_x(unsigned long addr, int numpages)
+{
+   return change_memory_attr(addr, numpages, SET_MEMORY_X);
+}
+
+#endif
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 3b4e9e4e25ea..d8a08abde1ae 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -5,7 +5,7 @@
 
 ccflags-$(CONFIG_PPC64):= $(NO_MINIMAL_TOC)
 
-obj-y  := fault.o mem.o pgtable.o mmap.o maccess.o \
+obj-y  := fault.o mem.o pgtable.o mmap.o maccess.o 
pageattr.o \
   init_$(BITS).o pgtable_$(BITS).o \
   pgtable-frag.o ioremap.o ioremap_$(BITS).o \
   init-common.o mmu_context.o drmem.o
diff --git a/arch/powerpc/mm/pageattr.c b/arch/powerpc/mm/pageattr.c
new file mode 100644
index ..9efcb01088da
--- /dev/null
+++ b/arch/powerpc/mm/pageattr.c
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * MMU-generic set_memory implementation for powerpc
+ *
+ * Copyright 2019, IBM Corporation.
+ */
+
+#include 
+#include 
+
+#include 
+#include 
+#include 
+
+
+/*
+ * Updates the attributes of a page in three steps:
+ *
+ * 1. invalidate the page table entry
+ * 2. flush the TLB
+ * 3. install the new entry with the updated attributes
+ *
+ * This is unsafe if the caller is attempting to change the mapping of the
+ * page it is executing from, or if another CPU is concurrently using the
+ * page being altered.
+ *
+ * TODO make the implementation resistant to this.
+ *
+ * NOTE: can be dangerous to call without STRICT_KERNEL_RWX
+ */
+static int change_page_attr(pte_t *ptep, unsigned long addr, void *data)
+{
+   long action = (long)data;
+   pte_t pt

[PATCH v10 02/10] powerpc/lib/code-patching: Set up Strict RWX patching earlier

2021-03-29 Thread Jordan Niethe
setup_text_poke_area() is a late init call so it runs before
mark_rodata_ro() and after the init calls. This lets all the init code
patching simply write to their locations. In the future, kprobes is
going to allocate its instruction pages RO which means they will need
setup_text_poke_area() to have already been called for their code
patching. However, init_kprobes() (which allocates and patches some
instruction pages) is an early init call so it happens before
setup_text_poke_area().

start_kernel() calls poking_init() before any of the init calls. On
powerpc, poking_init() is currently a nop. setup_text_poke_area() relies
on kernel virtual memory, cpu hotplug and per_cpu_areas being setup.
setup_per_cpu_areas(), boot_cpu_hotplug_init() and mm_init() are called
before poking_init().

Turn setup_text_poke_area() into poking_init().

Reviewed-by: Russell Currey 
Signed-off-by: Jordan Niethe 
---
v9: New to series
---
 arch/powerpc/lib/code-patching.c | 12 
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index 2333625b5e31..b28afa1133db 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -65,14 +65,11 @@ static int text_area_cpu_down(unsigned int cpu)
 }
 
 /*
- * Run as a late init call. This allows all the boot time patching to be done
- * simply by patching the code, and then we're called here prior to
- * mark_rodata_ro(), which happens after all init calls are run. Although
- * BUG_ON() is rude, in this case it should only happen if ENOMEM, and we judge
- * it as being preferable to a kernel that will crash later when someone tries
- * to use patch_instruction().
+ * Although BUG_ON() is rude, in this case it should only happen if ENOMEM, and
+ * we judge it as being preferable to a kernel that will crash later when
+ * someone tries to use patch_instruction().
  */
-static int __init setup_text_poke_area(void)
+int __init poking_init(void)
 {
BUG_ON(!cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
"powerpc/text_poke:online", text_area_cpu_up,
@@ -80,7 +77,6 @@ static int __init setup_text_poke_area(void)
 
return 0;
 }
-late_initcall(setup_text_poke_area);
 
 /*
  * This can be called for kernel text or a module.
-- 
2.25.1



[PATCH v10 03/10] powerpc: Always define MODULES_{VADDR,END}

2021-03-29 Thread Jordan Niethe
If MODULES_{VADDR,END} are not defined, set them to VMALLOC_START and
VMALLOC_END respectively. This reduces the need for special cases. For
example, powerpc's module_alloc() was previously predicated on
MODULES_VADDR being defined but is now defined unconditionally.

This will be useful for reducing conditional code in other places that
need to allocate from the module region (i.e., kprobes).

Signed-off-by: Jordan Niethe 
---
v10: New to series
---
 arch/powerpc/include/asm/pgtable.h | 5 +
 arch/powerpc/kernel/module.c   | 5 +
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/pgtable.h 
b/arch/powerpc/include/asm/pgtable.h
index 4eed82172e33..014c2921f26a 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -167,6 +167,11 @@ struct seq_file;
 void arch_report_meminfo(struct seq_file *m);
 #endif /* CONFIG_PPC64 */
 
+#ifndef MODULES_VADDR
+#define MODULES_VADDR VMALLOC_START
+#define MODULES_END VMALLOC_END
+#endif
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_POWERPC_PGTABLE_H */
diff --git a/arch/powerpc/kernel/module.c b/arch/powerpc/kernel/module.c
index a211b0253cdb..f1fb58389d58 100644
--- a/arch/powerpc/kernel/module.c
+++ b/arch/powerpc/kernel/module.c
@@ -14,6 +14,7 @@
 #include 
 #include 
 #include 
+#include 
 
 static LIST_HEAD(module_bug_list);
 
@@ -87,13 +88,9 @@ int module_finalize(const Elf_Ehdr *hdr,
return 0;
 }
 
-#ifdef MODULES_VADDR
 void *module_alloc(unsigned long size)
 {
-   BUILD_BUG_ON(TASK_SIZE > MODULES_VADDR);
-
return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, 
GFP_KERNEL,
PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, 
NUMA_NO_NODE,
__builtin_return_address(0));
 }
-#endif
-- 
2.25.1



[PATCH v10 04/10] powerpc/kprobes: Mark newly allocated probes as ROX

2021-03-29 Thread Jordan Niethe
From: Russell Currey 

Add the arch specific insn page allocator for powerpc. This allocates
ROX pages if STRICT_KERNEL_RWX is enabled. These pages are only written
to with patch_instruction() which is able to write RO pages.

Reviewed-by: Daniel Axtens 
Signed-off-by: Russell Currey 
Signed-off-by: Christophe Leroy 
[jpn: Reword commit message, switch to __vmalloc_node_range()]
Signed-off-by: Jordan Niethe 
---
v9: - vmalloc_exec() no longer exists
- Set the page to RW before freeing it
v10: - use __vmalloc_node_range()
---
 arch/powerpc/kernel/kprobes.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index 01ab2163659e..3ae27af9b094 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -25,6 +25,7 @@
 #include 
 #include 
 #include 
+#include 
 
 DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
 DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
@@ -103,6 +104,19 @@ kprobe_opcode_t *kprobe_lookup_name(const char *name, 
unsigned int offset)
return addr;
 }
 
+void *alloc_insn_page(void)
+{
+   if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX)) {
+   return __vmalloc_node_range(PAGE_SIZE, 1, MODULES_VADDR, 
MODULES_END,
+   GFP_KERNEL, PAGE_KERNEL_ROX, 
VM_FLUSH_RESET_PERMS,
+   NUMA_NO_NODE, __builtin_return_address(0));
+   } else {
+   return __vmalloc_node_range(PAGE_SIZE, 1, MODULES_VADDR, 
MODULES_END,
+   GFP_KERNEL, PAGE_KERNEL_EXEC, 
VM_FLUSH_RESET_PERMS,
+   NUMA_NO_NODE, __builtin_return_address(0));
+   }
+}
+
 int arch_prepare_kprobe(struct kprobe *p)
 {
int ret = 0;
-- 
2.25.1



[PATCH v10 05/10] powerpc/bpf: Write protect JIT code

2021-03-29 Thread Jordan Niethe
Once CONFIG_STRICT_MODULE_RWX is enabled there will be no need to
override bpf_jit_free() because it is now possible to set images
read-only. So use the default implementation.

Also add the necessary call to bpf_jit_binary_lock_ro(), which marks the
JIT image read-only and executable after it has finished being written.
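
For reference, the generic bpf_jit_binary_lock_ro() helper in
include/linux/filter.h is (roughly, at this point in time) a thin wrapper
around the set_memory_*() primitives:

	static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr)
	{
		set_vm_flush_reset_perms(hdr);
		set_memory_ro((unsigned long)hdr, hdr->pages);
		set_memory_x((unsigned long)hdr, hdr->pages);
	}

so once powerpc provides set_memory_ro()/set_memory_x(), the default
bpf_jit_free() can be used as described above.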

Signed-off-by: Jordan Niethe 
---
v10: New to series
---
 arch/powerpc/net/bpf_jit_comp.c   | 5 -
 arch/powerpc/net/bpf_jit_comp64.c | 4 
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
index e809cb5a1631..8015e4a7d2d4 100644
--- a/arch/powerpc/net/bpf_jit_comp.c
+++ b/arch/powerpc/net/bpf_jit_comp.c
@@ -659,12 +659,15 @@ void bpf_jit_compile(struct bpf_prog *fp)
bpf_jit_dump(flen, proglen, pass, code_base);
 
bpf_flush_icache(code_base, code_base + (proglen/4));
-
 #ifdef CONFIG_PPC64
/* Function descriptor nastiness: Address + TOC */
((u64 *)image)[0] = (u64)code_base;
((u64 *)image)[1] = local_paca->kernel_toc;
 #endif
+   if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) {
+   set_memory_ro((unsigned long)image, alloclen >> PAGE_SHIFT);
+   set_memory_x((unsigned long)image, alloclen >> PAGE_SHIFT);
+   }
 
fp->bpf_func = (void *)image;
fp->jited = 1;
diff --git a/arch/powerpc/net/bpf_jit_comp64.c 
b/arch/powerpc/net/bpf_jit_comp64.c
index aaf1a887f653..1484ad588685 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -1240,6 +1240,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
fp->jited_len = alloclen;
 
bpf_flush_icache(bpf_hdr, (u8 *)bpf_hdr + (bpf_hdr->pages * PAGE_SIZE));
+   if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
+   bpf_jit_binary_lock_ro(bpf_hdr);
if (!fp->is_func || extra_pass) {
bpf_prog_fill_jited_linfo(fp, addrs);
 out_addrs:
@@ -1262,6 +1264,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
 }
 
 /* Overriding bpf_jit_free() as we don't set images read-only. */
+#ifndef CONFIG_STRICT_MODULE_RWX
 void bpf_jit_free(struct bpf_prog *fp)
 {
unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK;
@@ -1272,3 +1275,4 @@ void bpf_jit_free(struct bpf_prog *fp)
 
bpf_prog_unlock_free(fp);
 }
+#endif
-- 
2.25.1



[PATCH v10 06/10] powerpc/mm/ptdump: debugfs handler for W+X checks at runtime

2021-03-29 Thread Jordan Niethe
From: Russell Currey 

Optionally run W+X checks when dumping pagetable information to
debugfs' kernel_page_tables.

To use:
$ echo 1 > /sys/kernel/debug/check_wx_pages
$ cat /sys/kernel/debug/kernel_page_tables

and check the kernel log.  Useful for testing strict module RWX.

To disable W+X checks:
$ echo 0 > /sys/kernel/debug/check_wx_pages

Update the Kconfig entry to reflect this.

Also fix a typo.

Reviewed-by: Kees Cook 
Signed-off-by: Russell Currey 
[jpn: Change check_wx_pages to act as a mode bit affecting
  kernel_page_tables instead of triggering an action on its own]
Signed-off-by: Jordan Niethe 
---
v10: check_wx_pages now affects kernel_page_tables rather than triggering
 its own action.
---
 arch/powerpc/Kconfig.debug  |  6 --
 arch/powerpc/mm/ptdump/ptdump.c | 34 -
 2 files changed, 37 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug
index ae084357994e..56e99e9a30d9 100644
--- a/arch/powerpc/Kconfig.debug
+++ b/arch/powerpc/Kconfig.debug
@@ -371,7 +371,7 @@ config PPC_PTDUMP
  If you are unsure, say N.
 
 config PPC_DEBUG_WX
-   bool "Warn on W+X mappings at boot"
+   bool "Warn on W+X mappings at boot & enable manual checks at runtime"
depends on PPC_PTDUMP && STRICT_KERNEL_RWX
help
  Generate a warning if any W+X mappings are found at boot.
@@ -385,7 +385,9 @@ config PPC_DEBUG_WX
  of other unfixed kernel bugs easier.
 
  There is no runtime or memory usage effect of this option
- once the kernel has booted up - it's a one time check.
+ once the kernel has booted up, it only automatically checks once.
+
+ Enables the "check_wx_pages" debugfs entry for checking at runtime.
 
  If in doubt, say "Y".
 
diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c
index aca354fb670b..6592f7a48c96 100644
--- a/arch/powerpc/mm/ptdump/ptdump.c
+++ b/arch/powerpc/mm/ptdump/ptdump.c
@@ -4,7 +4,7 @@
  *
  * This traverses the kernel pagetables and dumps the
  * information about the used sections of memory to
- * /sys/kernel/debug/kernel_pagetables.
+ * /sys/kernel/debug/kernel_page_tables.
  *
  * Derived from the arm64 implementation:
  * Copyright (c) 2014, The Linux Foundation, Laura Abbott.
@@ -27,6 +27,8 @@
 
 #include "ptdump.h"
 
+static bool check_wx;
+
 /*
  * To visualise what is happening,
  *
@@ -410,6 +412,9 @@ static int ptdump_show(struct seq_file *m, void *v)
/* Traverse kernel page tables */
walk_pagetables(&st);
note_page(&st, 0, 0, 0, 0);
+
+   if (check_wx)
+   ptdump_check_wx();
return 0;
 }
 
@@ -459,6 +464,33 @@ void ptdump_check_wx(void)
else
pr_info("Checked W+X mappings: passed, no W+X pages found\n");
 }
+
+static int check_wx_debugfs_set(void *data, u64 val)
+{
+   if (val == 1ULL)
+   check_wx = true;
+   else if (val == 0ULL)
+   check_wx = false;
+   else
+   return -EINVAL;
+
+   return 0;
+}
+
+static int check_wx_debugfs_get(void *data, u64 *val)
+{
+   *val = check_wx ? 1 : 0;
+   return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(check_wx_fops, check_wx_debugfs_get, 
check_wx_debugfs_set, "%llu\n");
+
+static int ptdump_check_wx_init(void)
+{
+   return debugfs_create_file("check_wx_pages", 0200, NULL,
+  NULL, &check_wx_fops) ? 0 : -ENOMEM;
+}
+device_initcall(ptdump_check_wx_init);
 #endif
 
 static int ptdump_init(void)
-- 
2.25.1



[PATCH v10 07/10] powerpc: Set ARCH_HAS_STRICT_MODULE_RWX

2021-03-29 Thread Jordan Niethe
From: Russell Currey 

To enable strict module RWX on powerpc, set:

CONFIG_STRICT_MODULE_RWX=y

You should also have CONFIG_STRICT_KERNEL_RWX=y set to have any real
security benefit.

ARCH_HAS_STRICT_MODULE_RWX is set to require ARCH_HAS_STRICT_KERNEL_RWX.
This is due to a quirk in arch/Kconfig and arch/powerpc/Kconfig that
makes STRICT_MODULE_RWX *on by default* in configurations where
STRICT_KERNEL_RWX is *unavailable*.

Since this doesn't make much sense, and module RWX without kernel RWX
doesn't make much sense, having the same dependencies as kernel RWX
works around this problem.

With STRICT_MODULE_RWX, now make module_alloc() allocate pages with
PAGE_KERNEL protection rather than PAGE_KERNEL_EXEC.

Book3S/32 processors with a hash MMU (i.e. 604 core) cannot set memory
protection on a page-by-page basis, so do not enable it there.

Signed-off-by: Russell Currey 
[jpn: - predicate on !PPC_BOOK3S_604
  - make module_alloc() use PAGE_KERNEL protection]
Signed-off-by: Jordan Niethe 
---
v10: - Predicate on !PPC_BOOK3S_604
 - Make module_alloc() use PAGE_KERNEL protection
---
 arch/powerpc/Kconfig |  1 +
 arch/powerpc/kernel/module.c | 11 ---
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 4498a27ac9db..97c0c3540bfd 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -137,6 +137,7 @@ config PPC
select ARCH_HAS_SCALED_CPUTIME  if VIRT_CPU_ACCOUNTING_NATIVE 
&& PPC_BOOK3S_64
select ARCH_HAS_SET_MEMORY
select ARCH_HAS_STRICT_KERNEL_RWX   if ((PPC_BOOK3S_64 || PPC32) && 
!HIBERNATION)
+   select ARCH_HAS_STRICT_MODULE_RWX   if ARCH_HAS_STRICT_KERNEL_RWX 
&& !PPC_BOOK3S_604
select ARCH_HAS_TICK_BROADCAST  if GENERIC_CLOCKEVENTS_BROADCAST
select ARCH_HAS_UACCESS_FLUSHCACHE
select ARCH_HAS_COPY_MC if PPC64
diff --git a/arch/powerpc/kernel/module.c b/arch/powerpc/kernel/module.c
index f1fb58389d58..d086f5534fac 100644
--- a/arch/powerpc/kernel/module.c
+++ b/arch/powerpc/kernel/module.c
@@ -90,7 +90,12 @@ int module_finalize(const Elf_Ehdr *hdr,
 
 void *module_alloc(unsigned long size)
 {
-   return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, 
GFP_KERNEL,
-   PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, 
NUMA_NO_NODE,
-   __builtin_return_address(0));
+   pgprot_t prot = PAGE_KERNEL_EXEC;
+
+   if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
+   prot = PAGE_KERNEL;
+
+   return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
+   GFP_KERNEL, prot, VM_FLUSH_RESET_PERMS,
+   NUMA_NO_NODE, __builtin_return_address(0));
 }
-- 
2.25.1



[PATCH v10 08/10] powerpc/configs: Enable STRICT_MODULE_RWX in skiroot_defconfig

2021-03-29 Thread Jordan Niethe
From: Russell Currey 

skiroot_defconfig is the only powerpc defconfig with STRICT_KERNEL_RWX
enabled, and if you want memory protection for kernel text you'd want it
for modules too, so enable STRICT_MODULE_RWX there.

Acked-by: Joel Stanley 
Signed-off-by: Russell Currey 
Signed-off-by: Jordan Niethe 
---
 arch/powerpc/configs/skiroot_defconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/configs/skiroot_defconfig 
b/arch/powerpc/configs/skiroot_defconfig
index b806a5d3a695..50fe06cb3a31 100644
--- a/arch/powerpc/configs/skiroot_defconfig
+++ b/arch/powerpc/configs/skiroot_defconfig
@@ -50,6 +50,7 @@ CONFIG_CMDLINE="console=tty0 console=hvc0 ipr.fast_reboot=1 
quiet"
 # CONFIG_PPC_MEM_KEYS is not set
 CONFIG_JUMP_LABEL=y
 CONFIG_STRICT_KERNEL_RWX=y
+CONFIG_STRICT_MODULE_RWX=y
 CONFIG_MODULES=y
 CONFIG_MODULE_UNLOAD=y
 CONFIG_MODULE_SIG_FORCE=y
-- 
2.25.1



[PATCH v10 09/10] powerpc/mm: implement set_memory_attr()

2021-03-29 Thread Jordan Niethe
From: Christophe Leroy 

In addition to the set_memory_xx() functions, which allow changing
the memory attributes of not (yet) used memory regions, implement a
set_memory_attr() function to:
- set the final memory protection after init on currently used
kernel regions.
- enable/disable kernel memory regions in the scope of DEBUG_PAGEALLOC.

Unlike the set_memory_xx() functions, which can act in three steps because
the regions are unused, this function must modify the mappings 'on the fly'
as the kernel is executing from them. At the moment only PPC32 will use it,
and changing page attributes on the fly is not an issue there.
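
For example (as the PPC32 conversion later in this series does), marking
kernel text read-only and executable at the end of boot becomes:

	set_memory_attr((unsigned long)_stext, numpages, PAGE_KERNEL_ROX);

and rodata read-only:

	set_memory_attr((unsigned long)__start_rodata, numpages, PAGE_KERNEL_RO);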

Signed-off-by: Christophe Leroy 
Reported-by: kbuild test robot 
[ruscur: cast "data" to unsigned long instead of int]
Signed-off-by: Russell Currey 
Signed-off-by: Jordan Niethe 
---
 arch/powerpc/include/asm/set_memory.h |  2 ++
 arch/powerpc/mm/pageattr.c| 33 +++
 2 files changed, 35 insertions(+)

diff --git a/arch/powerpc/include/asm/set_memory.h 
b/arch/powerpc/include/asm/set_memory.h
index 64011ea444b4..b040094f7920 100644
--- a/arch/powerpc/include/asm/set_memory.h
+++ b/arch/powerpc/include/asm/set_memory.h
@@ -29,4 +29,6 @@ static inline int set_memory_x(unsigned long addr, int 
numpages)
return change_memory_attr(addr, numpages, SET_MEMORY_X);
 }
 
+int set_memory_attr(unsigned long addr, int numpages, pgprot_t prot);
+
 #endif
diff --git a/arch/powerpc/mm/pageattr.c b/arch/powerpc/mm/pageattr.c
index 9efcb01088da..9611dfaebd45 100644
--- a/arch/powerpc/mm/pageattr.c
+++ b/arch/powerpc/mm/pageattr.c
@@ -86,3 +86,36 @@ int change_memory_attr(unsigned long addr, int numpages, 
long action)
return apply_to_existing_page_range(&init_mm, start, sz,
change_page_attr, (void *)action);
 }
+
+/*
+ * Set the attributes of a page:
+ *
+ * This function is used by PPC32 at the end of init to set final kernel memory
+ * protection. It includes changing the mapping of the page it is executing from
+ * and data pages it is using.
+ */
+static int set_page_attr(pte_t *ptep, unsigned long addr, void *data)
+{
+   pgprot_t prot = __pgprot((unsigned long)data);
+
+   spin_lock(&init_mm.page_table_lock);
+
+   set_pte_at(&init_mm, addr, ptep, pte_modify(*ptep, prot));
+   flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
+
+   spin_unlock(&init_mm.page_table_lock);
+
+   return 0;
+}
+
+int set_memory_attr(unsigned long addr, int numpages, pgprot_t prot)
+{
+   unsigned long start = ALIGN_DOWN(addr, PAGE_SIZE);
+   unsigned long sz = numpages * PAGE_SIZE;
+
+   if (numpages <= 0)
+   return 0;
+
+   return apply_to_existing_page_range(&init_mm, start, sz, set_page_attr,
+   (void *)pgprot_val(prot));
+}
-- 
2.25.1



[PATCH v10 10/10] powerpc/32: use set_memory_attr()

2021-03-29 Thread Jordan Niethe
From: Christophe Leroy 

Use set_memory_attr() instead of the PPC32-specific change_page_attr().

change_page_attr() was checking that the address was not mapped by
blocks and was handling highmem, but that's unneeded because the
affected pages can't be in highmem and block mapping verification
is already done by the callers.

Signed-off-by: Christophe Leroy 
[ruscur: rebase on powerpc/merge with Christophe's new patches]
Signed-off-by: Russell Currey 
Signed-off-by: Jordan Niethe 
---
 arch/powerpc/mm/pgtable_32.c | 60 ++--
 1 file changed, 10 insertions(+), 50 deletions(-)

diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c
index e0ec67a16887..dcf5ecca19d9 100644
--- a/arch/powerpc/mm/pgtable_32.c
+++ b/arch/powerpc/mm/pgtable_32.c
@@ -23,6 +23,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -132,64 +133,20 @@ void __init mapin_ram(void)
}
 }
 
-static int __change_page_attr_noflush(struct page *page, pgprot_t prot)
-{
-   pte_t *kpte;
-   unsigned long address;
-
-   BUG_ON(PageHighMem(page));
-   address = (unsigned long)page_address(page);
-
-   if (v_block_mapped(address))
-   return 0;
-   kpte = virt_to_kpte(address);
-   if (!kpte)
-   return -EINVAL;
-   __set_pte_at(&init_mm, address, kpte, mk_pte(page, prot), 0);
-
-   return 0;
-}
-
-/*
- * Change the page attributes of an page in the linear mapping.
- *
- * THIS DOES NOTHING WITH BAT MAPPINGS, DEBUG USE ONLY
- */
-static int change_page_attr(struct page *page, int numpages, pgprot_t prot)
-{
-   int i, err = 0;
-   unsigned long flags;
-   struct page *start = page;
-
-   local_irq_save(flags);
-   for (i = 0; i < numpages; i++, page++) {
-   err = __change_page_attr_noflush(page, prot);
-   if (err)
-   break;
-   }
-   wmb();
-   local_irq_restore(flags);
-   flush_tlb_kernel_range((unsigned long)page_address(start),
-  (unsigned long)page_address(page));
-   return err;
-}
-
 void mark_initmem_nx(void)
 {
-   struct page *page = virt_to_page(_sinittext);
unsigned long numpages = PFN_UP((unsigned long)_einittext) -
 PFN_DOWN((unsigned long)_sinittext);
 
if (v_block_mapped((unsigned long)_sinittext))
mmu_mark_initmem_nx();
else
-   change_page_attr(page, numpages, PAGE_KERNEL);
+   set_memory_attr((unsigned long)_sinittext, numpages, 
PAGE_KERNEL);
 }
 
 #ifdef CONFIG_STRICT_KERNEL_RWX
 void mark_rodata_ro(void)
 {
-   struct page *page;
unsigned long numpages;
 
if (v_block_mapped((unsigned long)_stext + 1)) {
@@ -198,20 +155,18 @@ void mark_rodata_ro(void)
return;
}
 
-   page = virt_to_page(_stext);
numpages = PFN_UP((unsigned long)_etext) -
   PFN_DOWN((unsigned long)_stext);
 
-   change_page_attr(page, numpages, PAGE_KERNEL_ROX);
+   set_memory_attr((unsigned long)_stext, numpages, PAGE_KERNEL_ROX);
/*
 * mark .rodata as read only. Use __init_begin rather than __end_rodata
 * to cover NOTES and EXCEPTION_TABLE.
 */
-   page = virt_to_page(__start_rodata);
numpages = PFN_UP((unsigned long)__init_begin) -
   PFN_DOWN((unsigned long)__start_rodata);
 
-   change_page_attr(page, numpages, PAGE_KERNEL_RO);
+   set_memory_attr((unsigned long)__start_rodata, numpages, 
PAGE_KERNEL_RO);
 
// mark_initmem_nx() should have already run by now
ptdump_check_wx();
@@ -221,9 +176,14 @@ void mark_rodata_ro(void)
 #ifdef CONFIG_DEBUG_PAGEALLOC
 void __kernel_map_pages(struct page *page, int numpages, int enable)
 {
+   unsigned long addr = (unsigned long)page_address(page);
+
if (PageHighMem(page))
return;
 
-   change_page_attr(page, numpages, enable ? PAGE_KERNEL : __pgprot(0));
+   if (enable)
+   set_memory_attr(addr, numpages, PAGE_KERNEL);
+   else
+   set_memory_attr(addr, numpages, __pgprot(0));
 }
 #endif /* CONFIG_DEBUG_PAGEALLOC */
-- 
2.25.1



Re: [PATCH v3] powerpc/papr_scm: Implement support for H_SCM_FLUSH hcall

2021-03-29 Thread Aneesh Kumar K.V
Shivaprasad G Bhat  writes:

> Add support for ND_REGION_ASYNC capability if the device tree
> indicates 'ibm,hcall-flush-required' property in the NVDIMM node.
> Flush is done by issuing H_SCM_FLUSH hcall to the hypervisor.
>
> If the flush request fails, the hypervisor is expected to
> reflect the problem in the subsequent nvdimm H_SCM_HEALTH call.
>
> This patch prevents mmap of namespaces with MAP_SYNC flag if the
> nvdimm requires an explicit flush[1].
>
> References:
> [1] 
> https://github.com/avocado-framework-tests/avocado-misc-tests/blob/master/memory/ndctl.py.data/map_sync.c


Reviewed-by: Aneesh Kumar K.V 

>
> Signed-off-by: Shivaprasad G Bhat 
> ---
> v2 - https://www.spinics.net/lists/kvm-ppc/msg18799.html
> Changes from v2:
>- Fixed the commit message.
>- Add dev_dbg before the H_SCM_FLUSH hcall
>
> v1 - https://www.spinics.net/lists/kvm-ppc/msg18272.html
> Changes from v1:
>- Hcall semantics finalized, all changes are to accommodate them.
>
>  Documentation/powerpc/papr_hcalls.rst |   14 ++
>  arch/powerpc/include/asm/hvcall.h |3 +-
>  arch/powerpc/platforms/pseries/papr_scm.c |   40 
> +
>  3 files changed, 56 insertions(+), 1 deletion(-)
>
> diff --git a/Documentation/powerpc/papr_hcalls.rst 
> b/Documentation/powerpc/papr_hcalls.rst
> index 48fcf1255a33..648f278eea8f 100644
> --- a/Documentation/powerpc/papr_hcalls.rst
> +++ b/Documentation/powerpc/papr_hcalls.rst
> @@ -275,6 +275,20 @@ Health Bitmap Flags:
>  Given a DRC Index collect the performance statistics for NVDIMM and copy them
>  to the resultBuffer.
>  
> +**H_SCM_FLUSH**
> +
> +| Input: *drcIndex, continue-token*
> +| Out: *continue-token*
> +| Return Value: *H_SUCCESS, H_Parameter, H_P2, H_BUSY*
> +
> +Given a DRC Index Flush the data to backend NVDIMM device.
> +
> +The hcall returns H_BUSY when the flush takes longer time and the hcall needs
> +to be issued multiple times in order to be completely serviced. The
> +*continue-token* from the output to be passed in the argument list of
> +subsequent hcalls to the hypervisor until the hcall is completely serviced
> +at which point H_SUCCESS or other error is returned by the hypervisor.
> +
>  References
>  ==
>  .. [1] "Power Architecture Platform Reference"
> diff --git a/arch/powerpc/include/asm/hvcall.h 
> b/arch/powerpc/include/asm/hvcall.h
> index ed6086d57b22..9f7729a97ebd 100644
> --- a/arch/powerpc/include/asm/hvcall.h
> +++ b/arch/powerpc/include/asm/hvcall.h
> @@ -315,7 +315,8 @@
>  #define H_SCM_HEALTH0x400
>  #define H_SCM_PERFORMANCE_STATS 0x418
>  #define H_RPT_INVALIDATE 0x448
> -#define MAX_HCALL_OPCODE H_RPT_INVALIDATE
> +#define H_SCM_FLUSH  0x44C
> +#define MAX_HCALL_OPCODE H_SCM_FLUSH
>  
>  /* Scope args for H_SCM_UNBIND_ALL */
>  #define H_UNBIND_SCOPE_ALL (0x1)
> diff --git a/arch/powerpc/platforms/pseries/papr_scm.c 
> b/arch/powerpc/platforms/pseries/papr_scm.c
> index 835163f54244..b7a47fcc5aa5 100644
> --- a/arch/powerpc/platforms/pseries/papr_scm.c
> +++ b/arch/powerpc/platforms/pseries/papr_scm.c
> @@ -93,6 +93,7 @@ struct papr_scm_priv {
>   uint64_t block_size;
>   int metadata_size;
>   bool is_volatile;
> + bool hcall_flush_required;
>  
>   uint64_t bound_addr;
>  
> @@ -117,6 +118,39 @@ struct papr_scm_priv {
>   size_t stat_buffer_len;
>  };
>  
> +static int papr_scm_pmem_flush(struct nd_region *nd_region,
> +struct bio *bio __maybe_unused)
> +{
> + struct papr_scm_priv *p = nd_region_provider_data(nd_region);
> + unsigned long ret_buf[PLPAR_HCALL_BUFSIZE];
> + uint64_t token = 0;
> + int64_t rc;
> +
> + dev_dbg(&p->pdev->dev, "flush drc 0x%x", p->drc_index);
> +
> + do {
> + rc = plpar_hcall(H_SCM_FLUSH, ret_buf, p->drc_index, token);
> + token = ret_buf[0];
> +
> + /* Check if we are stalled for some time */
> + if (H_IS_LONG_BUSY(rc)) {
> + msleep(get_longbusy_msecs(rc));
> + rc = H_BUSY;
> + } else if (rc == H_BUSY) {
> + cond_resched();
> + }
> + } while (rc == H_BUSY);
> +
> + if (rc) {
> + dev_err(&p->pdev->dev, "flush error: %lld", rc);
> + rc = -EIO;
> + } else {
> + dev_dbg(&p->pdev->dev, "flush drc 0x%x complete", p->drc_index);
> + }
> +
> + return rc;
> +}
> +
>  static LIST_HEAD(papr_nd_regions);
>  static DEFINE_MUTEX(papr_ndr_lock);
>  
> @@ -943,6 +977,11 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
>   ndr_desc.num_mappings = 1;
>   ndr_desc.nd_set = &p->nd_set;
>  
> + if (p->hcall_flush_required) {
> + set_bit(ND_REGION_ASYNC, &ndr_desc.flags);
> + ndr_desc.flush = papr_scm_pmem_flush;
> + }
> +
>   if (p->is_volatile)
>   p->region = nvdimm_volatile_region_create(p->bus, &n

Re: [PATCH v10 03/10] powerpc: Always define MODULES_{VADDR,END}

2021-03-29 Thread Christophe Leroy




On 30/03/2021 at 06:51, Jordan Niethe wrote:

If MODULES_{VADDR,END} are not defined, set them to VMALLOC_START and
VMALLOC_END respectively. This reduces the need for special cases. For
example, powerpc's module_alloc() was previously predicated on
MODULES_VADDR being defined but is now defined unconditionally.

This will be useful for reducing conditional code in other places that
need to allocate from the module region (i.e., kprobes).

Signed-off-by: Jordan Niethe 
---
v10: New to series
---
  arch/powerpc/include/asm/pgtable.h | 5 +
  arch/powerpc/kernel/module.c   | 5 +
  2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/pgtable.h 
b/arch/powerpc/include/asm/pgtable.h
index 4eed82172e33..014c2921f26a 100644
--- a/arch/powerpc/include/asm/pgtable.h
+++ b/arch/powerpc/include/asm/pgtable.h
@@ -167,6 +167,11 @@ struct seq_file;
  void arch_report_meminfo(struct seq_file *m);
  #endif /* CONFIG_PPC64 */
  
+#ifndef MODULES_VADDR

+#define MODULES_VADDR VMALLOC_START
+#define MODULES_END VMALLOC_END
+#endif
+
  #endif /* __ASSEMBLY__ */
  
  #endif /* _ASM_POWERPC_PGTABLE_H */

diff --git a/arch/powerpc/kernel/module.c b/arch/powerpc/kernel/module.c
index a211b0253cdb..f1fb58389d58 100644
--- a/arch/powerpc/kernel/module.c
+++ b/arch/powerpc/kernel/module.c
@@ -14,6 +14,7 @@
  #include 
  #include 
  #include 
+#include 
  
  static LIST_HEAD(module_bug_list);
  
@@ -87,13 +88,9 @@ int module_finalize(const Elf_Ehdr *hdr,

return 0;
  }
  
-#ifdef MODULES_VADDR

  void *module_alloc(unsigned long size)
  {
-   BUILD_BUG_ON(TASK_SIZE > MODULES_VADDR);
-


This check is important; if we remove it from here, it should be done somewhere else, for instance in
asm/task_size_32.h.



return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, 
GFP_KERNEL,
PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, 
NUMA_NO_NODE,
__builtin_return_address(0));
  }
-#endif



Re: [PATCH] powerpc/papr_scm: Mark nvdimm as unarmed if needed during probe

2021-03-29 Thread Aneesh Kumar K.V
Vaibhav Jain  writes:

> In case an nvdimm is found to be unarmed during probe, set its
> NDD_UNARMED flag before nvdimm_create(). This enforces read-only
> access to the nvdimm region. Presently, even if an nvdimm is
> unarmed, it's not marked as read-only on ppc64 guests.
>
> The patch updates papr_scm_nvdimm_init() to force a query of nvdimm
> health via __drc_pmem_query_health() and, if the nvdimm is found to be
> unarmed, to set the nvdimm flag NDD_UNARMED for nvdimm_create().
>

Reviewed-by: Aneesh Kumar K.V 

> Signed-off-by: Vaibhav Jain 
> ---
>  arch/powerpc/platforms/pseries/papr_scm.c | 9 +
>  1 file changed, 9 insertions(+)
>
> diff --git a/arch/powerpc/platforms/pseries/papr_scm.c 
> b/arch/powerpc/platforms/pseries/papr_scm.c
> index 835163f54244..7e8168e19427 100644
> --- a/arch/powerpc/platforms/pseries/papr_scm.c
> +++ b/arch/powerpc/platforms/pseries/papr_scm.c
> @@ -914,6 +914,15 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
>   dimm_flags = 0;
>   set_bit(NDD_LABELING, &dimm_flags);
>  
> + /*
> +  * Check if the nvdimm is unarmed. No locking needed as we are still
> +  * initializing. Ignore error encountered if any.
> +  */
> + __drc_pmem_query_health(p);
> +
> + if (p->health_bitmap & PAPR_PMEM_UNARMED_MASK)
> + set_bit(NDD_UNARMED, &dimm_flags);
> +
>   p->nvdimm = nvdimm_create(p->bus, p, papr_nd_attr_groups,
> dimm_flags, PAPR_SCM_DIMM_CMD_MASK, 0, NULL);
>   if (!p->nvdimm) {
> -- 
> 2.30.2
> ___
> Linux-nvdimm mailing list -- linux-nvd...@lists.01.org
> To unsubscribe send an email to linux-nvdimm-le...@lists.01.org


Re: [PATCH v10 04/10] powerpc/kprobes: Mark newly allocated probes as ROX

2021-03-29 Thread Christophe Leroy




On 30/03/2021 at 06:51, Jordan Niethe wrote:

From: Russell Currey 

Add the arch specific insn page allocator for powerpc. This allocates
ROX pages if STRICT_KERNEL_RWX is enabled. These pages are only written
to with patch_instruction() which is able to write RO pages.

Reviewed-by: Daniel Axtens 
Signed-off-by: Russell Currey 
Signed-off-by: Christophe Leroy 
[jpn: Reword commit message, switch to __vmalloc_node_range()]
Signed-off-by: Jordan Niethe 
---
v9: - vmalloc_exec() no longer exists
 - Set the page to RW before freeing it
v10: - use __vmalloc_node_range()
---
  arch/powerpc/kernel/kprobes.c | 14 ++
  1 file changed, 14 insertions(+)

diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index 01ab2163659e..3ae27af9b094 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -25,6 +25,7 @@
  #include 
  #include 
  #include 
+#include 
  
  DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;

  DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
@@ -103,6 +104,19 @@ kprobe_opcode_t *kprobe_lookup_name(const char *name, 
unsigned int offset)
return addr;
  }
  
+void *alloc_insn_page(void)

+{
+   if (IS_ENABLED(CONFIG_STRICT_KERNEL_RWX)) {
+   return __vmalloc_node_range(PAGE_SIZE, 1, MODULES_VADDR, 
MODULES_END,
+   GFP_KERNEL, PAGE_KERNEL_ROX, 
VM_FLUSH_RESET_PERMS,
+   NUMA_NO_NODE, __builtin_return_address(0));
+   } else {
+   return __vmalloc_node_range(PAGE_SIZE, 1, MODULES_VADDR, 
MODULES_END,
+   GFP_KERNEL, PAGE_KERNEL_EXEC, 
VM_FLUSH_RESET_PERMS,
+   NUMA_NO_NODE, __builtin_return_address(0));
+   }
+}
+


What about

void *alloc_insn_page(void)
{
pgprot_t prot = IS_ENABLED(CONFIG_STRICT_KERNEL_RWX) ? PAGE_KERNEL_ROX 
: PAGE_KERNEL_EXEC;

return __vmalloc_node_range(PAGE_SIZE, 1, MODULES_VADDR, MODULES_END,
GFP_KERNEL, prot, VM_FLUSH_RESET_PERMS,
NUMA_NO_NODE, __builtin_return_address(0));
}


  int arch_prepare_kprobe(struct kprobe *p)
  {
int ret = 0;



Re: [PATCH v10 01/10] powerpc/mm: Implement set_memory() routines

2021-03-29 Thread Christophe Leroy




On 30/03/2021 at 06:51, Jordan Niethe wrote:

From: Russell Currey 

The set_memory_{ro/rw/nx/x}() functions are required for STRICT_MODULE_RWX,
and are generally useful primitives to have.  This implementation is
designed to be completely generic across powerpc's many MMUs.

It's possible that this could be optimised to be faster for specific
MMUs, but the focus is on having a generic and safe implementation for
now.

This implementation does not handle cases where the caller is attempting
to change the mapping of the page it is executing from, or if another
CPU is concurrently using the page being altered.  These cases likely
shouldn't happen, but a more complex implementation with MMU-specific code
could safely handle them, so that is left as a TODO for now.

On hash the linear mapping is not kept in the linux pagetable, so this
will not change the protection if used on that range. Currently these
functions are not used on the linear map so just WARN for now.

These functions do nothing if STRICT_KERNEL_RWX is not enabled.

Reviewed-by: Daniel Axtens 
Signed-off-by: Russell Currey 
Signed-off-by: Christophe Leroy 
[jpn: -rebase on next plus "powerpc/mm/64s: Allow STRICT_KERNEL_RWX again"
   - WARN on hash linear map]
Signed-off-by: Jordan Niethe 
---
v10: WARN if trying to change the hash linear map
---
  arch/powerpc/Kconfig  |  1 +
  arch/powerpc/include/asm/set_memory.h | 32 ++
  arch/powerpc/mm/Makefile  |  2 +-
  arch/powerpc/mm/pageattr.c| 88 +++
  4 files changed, 122 insertions(+), 1 deletion(-)
  create mode 100644 arch/powerpc/include/asm/set_memory.h
  create mode 100644 arch/powerpc/mm/pageattr.c

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index fc7f5c5933e6..4498a27ac9db 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -135,6 +135,7 @@ config PPC
select ARCH_HAS_MEMBARRIER_CALLBACKS
select ARCH_HAS_MEMBARRIER_SYNC_CORE
select ARCH_HAS_SCALED_CPUTIME  if VIRT_CPU_ACCOUNTING_NATIVE 
&& PPC_BOOK3S_64
+   select ARCH_HAS_SET_MEMORY
select ARCH_HAS_STRICT_KERNEL_RWX   if ((PPC_BOOK3S_64 || PPC32) && 
!HIBERNATION)
select ARCH_HAS_TICK_BROADCAST  if GENERIC_CLOCKEVENTS_BROADCAST
select ARCH_HAS_UACCESS_FLUSHCACHE
diff --git a/arch/powerpc/include/asm/set_memory.h 
b/arch/powerpc/include/asm/set_memory.h
new file mode 100644
index ..64011ea444b4
--- /dev/null
+++ b/arch/powerpc/include/asm/set_memory.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_POWERPC_SET_MEMORY_H
+#define _ASM_POWERPC_SET_MEMORY_H
+
+#define SET_MEMORY_RO  0
+#define SET_MEMORY_RW  1
+#define SET_MEMORY_NX  2
+#define SET_MEMORY_X   3
+
+int change_memory_attr(unsigned long addr, int numpages, long action);
+
+static inline int set_memory_ro(unsigned long addr, int numpages)
+{
+   return change_memory_attr(addr, numpages, SET_MEMORY_RO);
+}
+
+static inline int set_memory_rw(unsigned long addr, int numpages)
+{
+   return change_memory_attr(addr, numpages, SET_MEMORY_RW);
+}
+
+static inline int set_memory_nx(unsigned long addr, int numpages)
+{
+   return change_memory_attr(addr, numpages, SET_MEMORY_NX);
+}
+
+static inline int set_memory_x(unsigned long addr, int numpages)
+{
+   return change_memory_attr(addr, numpages, SET_MEMORY_X);
+}
+
+#endif
diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
index 3b4e9e4e25ea..d8a08abde1ae 100644
--- a/arch/powerpc/mm/Makefile
+++ b/arch/powerpc/mm/Makefile
@@ -5,7 +5,7 @@
  
  ccflags-$(CONFIG_PPC64)	:= $(NO_MINIMAL_TOC)
  
-obj-y  := fault.o mem.o pgtable.o mmap.o maccess.o \

+obj-y  := fault.o mem.o pgtable.o mmap.o maccess.o 
pageattr.o \
   init_$(BITS).o pgtable_$(BITS).o \
   pgtable-frag.o ioremap.o ioremap_$(BITS).o \
   init-common.o mmu_context.o drmem.o
diff --git a/arch/powerpc/mm/pageattr.c b/arch/powerpc/mm/pageattr.c
new file mode 100644
index ..9efcb01088da
--- /dev/null
+++ b/arch/powerpc/mm/pageattr.c
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * MMU-generic set_memory implementation for powerpc
+ *
+ * Copyright 2019, IBM Corporation.
+ */
+
+#include 
+#include 
+
+#include 
+#include 
+#include 
+
+
+/*
+ * Updates the attributes of a page in three steps:
+ *
+ * 1. invalidate the page table entry
+ * 2. flush the TLB
+ * 3. install the new entry with the updated attributes
+ *
+ * This is unsafe if the caller is attempting to change the mapping of the
+ * page it is executing from, or if another CPU is concurrently using the
+ * page being altered.
+ *
+ * TODO make the implementation resistant to this.
+ *
+ * NOTE: can be dangerous to call without STRICT_KERNEL_RWX
+ */
+static int change_page_attr(pte_t *ptep, unsigned long addr, void *data)
+{
+   long ac

Re: [PATCH v10 08/10] powerpc/configs: Enable STRICT_MODULE_RWX in skiroot_defconfig

2021-03-29 Thread Christophe Leroy




On 30/03/2021 at 06:51, Jordan Niethe wrote:

From: Russell Currey 

skiroot_defconfig is the only powerpc defconfig with STRICT_KERNEL_RWX
enabled, and if you want memory protection for kernel text you'd want it
for modules too, so enable STRICT_MODULE_RWX there.


Maybe we could now select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT in
arch/powerpc/Kconfig.

Then this change would not be necessary.

Would be in line with https://github.com/linuxppc/issues/issues/223




Acked-by: Joel Stanley 
Signed-off-by: Russell Currey 
Signed-off-by: Jordan Niethe 
---
  arch/powerpc/configs/skiroot_defconfig | 1 +
  1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/configs/skiroot_defconfig 
b/arch/powerpc/configs/skiroot_defconfig
index b806a5d3a695..50fe06cb3a31 100644
--- a/arch/powerpc/configs/skiroot_defconfig
+++ b/arch/powerpc/configs/skiroot_defconfig
@@ -50,6 +50,7 @@ CONFIG_CMDLINE="console=tty0 console=hvc0 ipr.fast_reboot=1 
quiet"
  # CONFIG_PPC_MEM_KEYS is not set
  CONFIG_JUMP_LABEL=y
  CONFIG_STRICT_KERNEL_RWX=y
+CONFIG_STRICT_MODULE_RWX=y
  CONFIG_MODULES=y
  CONFIG_MODULE_UNLOAD=y
  CONFIG_MODULE_SIG_FORCE=y



[powerpc:next] BUILD SUCCESS 69931cc387cca289e0415c79ce5389119670066d

2021-03-29 Thread kernel test robot
fconfig
sh   edosk7705_defconfig
powerpc mpc5200_defconfig
powerpc  mpc885_ads_defconfig
mips loongson1b_defconfig
mips decstation_defconfig
sh apsh4a3a_defconfig
arm assabet_defconfig
sh   alldefconfig
arm s3c6400_defconfig
xtensa   common_defconfig
powerpc mpc834x_mds_defconfig
powerpc  ppc64e_defconfig
powerpc  ppc6xx_defconfig
powerpc wii_defconfig
arc   nsimosci_defconfig
powerpc ps3_defconfig
arm   mainstone_defconfig
arm   omap1_defconfig
arc  axs103_smp_defconfig
arm   spitz_defconfig
arm s3c2410_defconfig
arm  jornada720_defconfig
mips  pic32mzda_defconfig
mips   jmr3927_defconfig
m68k   m5275evb_defconfig
mips   nlm_xlp_defconfig
sh   se7206_defconfig
mips cu1830-neo_defconfig
nios2 10m50_defconfig
ia64 allmodconfig
ia64   defconfig
ia64 allyesconfig
m68k allmodconfig
m68k   defconfig
m68k allyesconfig
nios2   defconfig
arc  allyesconfig
nds32 allnoconfig
nds32   defconfig
nios2   allyesconfig
csky   defconfig
alpha   allyesconfig
h8300   allyesconfig
arc defconfig
xtensa   allyesconfig
sh   allmodconfig
s390   defconfig
parisc  defconfig
s390 allyesconfig
s390 allmodconfig
sparc   defconfig
i386   defconfig
mips allyesconfig
mips allmodconfig
powerpc  allyesconfig
powerpc  allmodconfig
powerpc   allnoconfig
x86_64   randconfig-a002-20210329
x86_64   randconfig-a003-20210329
x86_64   randconfig-a006-20210329
x86_64   randconfig-a001-20210329
x86_64   randconfig-a005-20210329
x86_64   randconfig-a004-20210329
i386 randconfig-a003-20210329
i386 randconfig-a004-20210329
i386 randconfig-a001-20210329
i386 randconfig-a002-20210329
i386 randconfig-a006-20210329
i386 randconfig-a005-20210329
i386 randconfig-a004-20210330
i386 randconfig-a006-20210330
i386 randconfig-a003-20210330
i386 randconfig-a002-20210330
i386 randconfig-a001-20210330
i386 randconfig-a005-20210330
i386 randconfig-a011-20210329
i386 randconfig-a016-20210329
i386 randconfig-a013-20210329
i386 randconfig-a012-20210329
i386 randconfig-a014-20210329
i386 randconfig-a015-20210329
i386 randconfig-a015-20210330
i386 randconfig-a011-20210330
i386 randconfig-a014-20210330
i386 randconfig-a013-20210330
i386 randconfig-a016-20210330
i386 randconfig-a012-20210330
riscv   nommu_k210_defconfig
riscv allnoconfig
riscv   defconfig
riscv  rv32_defconfig
um   allmodconfig
um   allnoconfig
um   allyesconfig
um  defconfig
x86_64   rhel-8.3-kselftests
x86_64  defconfig
x86_64   rhel-8.3
x86_64  rhel-8.3-kbuiltin
x86_64  kexec

clang tested configs:
x86_64   randconfig-a015-20210329
x86_64   randconfig-a012-20210329
x86_64   randconfig-a013-20210329
x86_64   randconfig-a014-20210329
x86_64   randconfig-a011-20210329
x86_64   randconfig-a016-20210329

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org


[PATCH v3 1/9] selftest/mremap_test: Update the test to handle pagesize other than 4K

2021-03-29 Thread Aneesh Kumar K.V
Instead of hardcoding the 4K page size, fetch it using sysconf(). For the
performance measurements, the test still assumes 2M and 1G hugepage sizes.

Signed-off-by: Aneesh Kumar K.V 
---
 tools/testing/selftests/vm/mremap_test.c | 113 ---
 1 file changed, 61 insertions(+), 52 deletions(-)

diff --git a/tools/testing/selftests/vm/mremap_test.c 
b/tools/testing/selftests/vm/mremap_test.c
index 9c391d016922..c9a5461eb786 100644
--- a/tools/testing/selftests/vm/mremap_test.c
+++ b/tools/testing/selftests/vm/mremap_test.c
@@ -45,14 +45,15 @@ enum {
_4MB = 4ULL << 20,
_1GB = 1ULL << 30,
_2GB = 2ULL << 30,
-   PTE = _4KB,
PMD = _2MB,
PUD = _1GB,
 };
 
+#define PTE page_size
+
 #define MAKE_TEST(source_align, destination_align, size,   \
  overlaps, should_fail, test_name) \
-{  \
+(struct test){ \
.name = test_name,  \
.config = { \
.src_alignment = source_align,  \
@@ -252,12 +253,17 @@ static int parse_args(int argc, char **argv, unsigned int 
*threshold_mb,
return 0;
 }
 
+#define MAX_TEST 13
+#define MAX_PERF_TEST 3
 int main(int argc, char **argv)
 {
int failures = 0;
int i, run_perf_tests;
unsigned int threshold_mb = VALIDATION_DEFAULT_THRESHOLD;
unsigned int pattern_seed;
+   struct test test_cases[MAX_TEST];
+   struct test perf_test_cases[MAX_PERF_TEST];
+   int page_size;
time_t t;
 
pattern_seed = (unsigned int) time(&t);
@@ -268,56 +274,59 @@ int main(int argc, char **argv)
ksft_print_msg("Test 
configs:\n\tthreshold_mb=%u\n\tpattern_seed=%u\n\n",
   threshold_mb, pattern_seed);
 
-   struct test test_cases[] = {
-   /* Expected mremap failures */
-   MAKE_TEST(_4KB, _4KB, _4KB, OVERLAPPING, EXPECT_FAILURE,
- "mremap - Source and Destination Regions Overlapping"),
-   MAKE_TEST(_4KB, _1KB, _4KB, NON_OVERLAPPING, EXPECT_FAILURE,
- "mremap - Destination Address Misaligned (1KB-aligned)"),
-   MAKE_TEST(_1KB, _4KB, _4KB, NON_OVERLAPPING, EXPECT_FAILURE,
- "mremap - Source Address Misaligned (1KB-aligned)"),
-
-   /* Src addr PTE aligned */
-   MAKE_TEST(PTE, PTE, _8KB, NON_OVERLAPPING, EXPECT_SUCCESS,
- "8KB mremap - Source PTE-aligned, Destination PTE-aligned"),
-
-   /* Src addr 1MB aligned */
-   MAKE_TEST(_1MB, PTE, _2MB, NON_OVERLAPPING, EXPECT_SUCCESS,
- "2MB mremap - Source 1MB-aligned, Destination PTE-aligned"),
-   MAKE_TEST(_1MB, _1MB, _2MB, NON_OVERLAPPING, EXPECT_SUCCESS,
- "2MB mremap - Source 1MB-aligned, Destination 1MB-aligned"),
-
-   /* Src addr PMD aligned */
-   MAKE_TEST(PMD, PTE, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS,
- "4MB mremap - Source PMD-aligned, Destination PTE-aligned"),
-   MAKE_TEST(PMD, _1MB, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS,
- "4MB mremap - Source PMD-aligned, Destination 1MB-aligned"),
-   MAKE_TEST(PMD, PMD, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS,
- "4MB mremap - Source PMD-aligned, Destination PMD-aligned"),
-
-   /* Src addr PUD aligned */
-   MAKE_TEST(PUD, PTE, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS,
- "2GB mremap - Source PUD-aligned, Destination PTE-aligned"),
-   MAKE_TEST(PUD, _1MB, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS,
- "2GB mremap - Source PUD-aligned, Destination 1MB-aligned"),
-   MAKE_TEST(PUD, PMD, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS,
- "2GB mremap - Source PUD-aligned, Destination PMD-aligned"),
-   MAKE_TEST(PUD, PUD, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS,
- "2GB mremap - Source PUD-aligned, Destination PUD-aligned"),
-   };
-
-   struct test perf_test_cases[] = {
-   /*
-* mremap 1GB region - Page table level aligned time
-* comparison.
-*/
-   MAKE_TEST(PTE, PTE, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS,
- "1GB mremap - Source PTE-aligned, Destination PTE-aligned"),
-   MAKE_TEST(PMD, PMD, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS,
- "1GB mremap - Source PMD-aligned, Destination PMD-aligned"),
-   MAKE_TEST(PUD, PUD, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS,
- "1GB mremap - Source PUD-aligned, Destination PUD-aligned"),
-   };
+   page_size = sysconf(_SC_PAGESIZE);
+
+   /* Expected mremap failures */
+   test_cases[0] = MAKE_TEST(page_size, page_size, pag

[PATCH v3 0/9] Speedup mremap on ppc64

2021-03-29 Thread Aneesh Kumar K.V
This patchset enables MOVE_PMD/MOVE_PUD support on power. This requires
the platform to support updating higher-level page tables without
updating page table entries. This also needs to invalidate the Page Walk
Cache on architectures supporting it.
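
As a simplified sketch of the PMD-level case (see the move_normal_pmd()
changes later in this series for the real code), the idea is to move the
page-table page itself instead of copying the individual PTEs it maps:

	/* old_pmd/new_pmd point at the source/destination PMD slots */
	pmd_t pmd = *old_pmd;

	pmd_clear(old_pmd);
	pmd_populate(mm, new_pmd, (pgtable_t)pmd_page_vaddr(pmd));
	/* flush both the TLB and the page walk cache for the old range */
	flush_pte_tlb_pwc_range(vma, old_addr, old_addr + PMD_SIZE, true);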

Changes from v2:
* switch from using mmu_gather to flush_pte_tlb_pwc_range() 

Changes from v1:
* Rebase to recent upstream
* Fix build issues with tlb_gather_mmu changes



Aneesh Kumar K.V (9):
  selftest/mremap_test: Update the test to handle pagesize other than 4K
  selftest/mremap_test: Avoid crash with static build
  mm/mremap: Use pmd/pud_populate to update page table entries
  powerpc/mm/book3s64: Fix possible build error
  powerpc/mm/book3s64: Update tlb flush routines to take a page walk
cache flush argument
  mm/mremap: Use range flush that does TLB and page walk cache flush
  mm/mremap: Move TLB flush outside page table lock
  mm/mremap: Allow arch runtime override
  powerpc/mm: Enable move pmd/pud

 arch/arc/include/asm/tlb.h|   5 +
 arch/arm64/include/asm/tlb.h  |   6 +
 .../include/asm/book3s/64/tlbflush-radix.h|  19 +--
 arch/powerpc/include/asm/book3s/64/tlbflush.h |  30 -
 arch/powerpc/include/asm/tlb.h|   6 +
 arch/powerpc/mm/book3s64/radix_hugetlbpage.c  |   4 +-
 arch/powerpc/mm/book3s64/radix_tlb.c  |  49 
 arch/powerpc/platforms/Kconfig.cputype|   2 +
 arch/x86/include/asm/tlb.h|   5 +
 mm/mremap.c   |  40 --
 tools/testing/selftests/vm/mremap_test.c  | 118 ++
 11 files changed, 187 insertions(+), 97 deletions(-)

-- 
2.30.2



[PATCH v3 2/9] selftest/mremap_test: Avoid crash with static build

2021-03-29 Thread Aneesh Kumar K.V
With a large mmap size, we can overlap with the text area, and using
MAP_FIXED results in unmapping that area. Switch to MAP_FIXED_NOREPLACE
and handle the EEXIST error.

Signed-off-by: Aneesh Kumar K.V 
---
 tools/testing/selftests/vm/mremap_test.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/vm/mremap_test.c 
b/tools/testing/selftests/vm/mremap_test.c
index c9a5461eb786..0624d1bd71b5 100644
--- a/tools/testing/selftests/vm/mremap_test.c
+++ b/tools/testing/selftests/vm/mremap_test.c
@@ -75,9 +75,10 @@ static void *get_source_mapping(struct config c)
 retry:
addr += c.src_alignment;
src_addr = mmap((void *) addr, c.region_size, PROT_READ | PROT_WRITE,
-   MAP_FIXED | MAP_ANONYMOUS | MAP_SHARED, -1, 0);
+   MAP_FIXED_NOREPLACE | MAP_ANONYMOUS | MAP_SHARED,
+   -1, 0);
if (src_addr == MAP_FAILED) {
-   if (errno == EPERM)
+   if (errno == EPERM || errno == EEXIST)
goto retry;
goto error;
}
-- 
2.30.2



[PATCH v3 3/9] mm/mremap: Use pmd/pud_populate to update page table entries

2021-03-29 Thread Aneesh Kumar K.V
pmd/pud_populate is the right interface to be used to set the respective
page table entries. Some architectures like ppc64 do assume that set_pmd/pud_at
can only be used to set a hugepage PTE. Since we are not setting up a hugepage
PTE here, use the pmd/pud_populate interface.

Signed-off-by: Aneesh Kumar K.V 
---
 mm/mremap.c | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/mm/mremap.c b/mm/mremap.c
index ec8f840399ed..574287f9bb39 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -26,6 +26,7 @@
 
 #include 
 #include 
+#include 
 
 #include "internal.h"
 
@@ -257,9 +258,8 @@ static bool move_normal_pmd(struct vm_area_struct *vma, 
unsigned long old_addr,
pmd_clear(old_pmd);
 
VM_BUG_ON(!pmd_none(*new_pmd));
+   pmd_populate(mm, new_pmd, (pgtable_t)pmd_page_vaddr(pmd));
 
-   /* Set the new pmd */
-   set_pmd_at(mm, new_addr, new_pmd, pmd);
flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
if (new_ptl != old_ptl)
spin_unlock(new_ptl);
@@ -306,8 +306,7 @@ static bool move_normal_pud(struct vm_area_struct *vma, 
unsigned long old_addr,
 
VM_BUG_ON(!pud_none(*new_pud));
 
-   /* Set the new pud */
-   set_pud_at(mm, new_addr, new_pud, pud);
+   pud_populate(mm, new_pud, (pmd_t *)pud_page_vaddr(pud));
flush_tlb_range(vma, old_addr, old_addr + PUD_SIZE);
if (new_ptl != old_ptl)
spin_unlock(new_ptl);
-- 
2.30.2



[PATCH v3 4/9] powerpc/mm/book3s64: Fix possible build error

2021-03-29 Thread Aneesh Kumar K.V
Update _tlbiel_pid() such that we can avoid build errors like below when
using this function in other places.

arch/powerpc/mm/book3s64/radix_tlb.c: In function 
‘__radix__flush_tlb_range_psize’:
arch/powerpc/mm/book3s64/radix_tlb.c:114:2: warning: ‘asm’ operand 3 probably 
does not match constraints
  114 |  asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
  |  ^~~
arch/powerpc/mm/book3s64/radix_tlb.c:114:2: error: impossible constraint in 
‘asm’
make[4]: *** [scripts/Makefile.build:271: arch/powerpc/mm/book3s64/radix_tlb.o] 
Error 1

With this fix, we can also drop the __always_inline in
__radix__flush_tlb_range_psize(), which was added by commit e12d6d7d46a6
("powerpc/mm/radix: mark __radix__flush_tlb_range_psize() as __always_inline").

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/mm/book3s64/radix_tlb.c | 26 +-
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c 
b/arch/powerpc/mm/book3s64/radix_tlb.c
index 409e61210789..817a02ef6032 100644
--- a/arch/powerpc/mm/book3s64/radix_tlb.c
+++ b/arch/powerpc/mm/book3s64/radix_tlb.c
@@ -291,22 +291,30 @@ static inline void fixup_tlbie_lpid(unsigned long lpid)
 /*
  * We use 128 set in radix mode and 256 set in hpt mode.
  */
-static __always_inline void _tlbiel_pid(unsigned long pid, unsigned long ric)
+static inline void _tlbiel_pid(unsigned long pid, unsigned long ric)
 {
int set;
 
asm volatile("ptesync": : :"memory");
 
-   /*
-* Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL,
-* also flush the entire Page Walk Cache.
-*/
-   __tlbiel_pid(pid, 0, ric);
+   switch (ric) {
+   case RIC_FLUSH_PWC:
 
-   /* For PWC, only one flush is needed */
-   if (ric == RIC_FLUSH_PWC) {
+   /* For PWC, only one flush is needed */
+   __tlbiel_pid(pid, 0, RIC_FLUSH_PWC);
ppc_after_tlbiel_barrier();
return;
+   case RIC_FLUSH_TLB:
+   __tlbiel_pid(pid, 0, RIC_FLUSH_TLB);
+   break;
+   case RIC_FLUSH_ALL:
+   default:
+   /*
+* Flush the first set of the TLB, and if
+* we're doing a RIC_FLUSH_ALL, also flush
+* the entire Page Walk Cache.
+*/
+   __tlbiel_pid(pid, 0, RIC_FLUSH_ALL);
}
 
if (!cpu_has_feature(CPU_FTR_ARCH_31)) {
@@ -1176,7 +1184,7 @@ void radix__tlb_flush(struct mmu_gather *tlb)
}
 }
 
-static __always_inline void __radix__flush_tlb_range_psize(struct mm_struct 
*mm,
+static void __radix__flush_tlb_range_psize(struct mm_struct *mm,
unsigned long start, unsigned long end,
int psize, bool also_pwc)
 {
-- 
2.30.2



[PATCH v3 6/9] mm/mremap: Use range flush that does TLB and page walk cache flush

2021-03-29 Thread Aneesh Kumar K.V
Some architectures have the concept of a page walk cache, which needs
to be flushed when updating higher levels of page tables. A fast mremap
that moves page table pages instead of copying pte entries should flush
the page walk cache, since the old translation cache is no longer
valid.

Add a new helper, flush_pte_tlb_pwc_range(), which invalidates both the
TLB and the page walk cache for ranges whose TLB entries are mapped with
page size PAGE_SIZE.
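
The helper uses the usual "define the function plus a same-named macro"
override idiom, so the generic code in mm/mremap.c only supplies a
fallback when the architecture has not provided one. A minimal userspace
sketch of that idiom (simplified, invented signatures, not the kernel
headers):

#include <stdio.h>

/* --- stand-in for the arch header (tlbflush.h on powerpc) --- */
#define flush_pte_tlb_pwc_range flush_pte_tlb_pwc_range
static inline void flush_pte_tlb_pwc_range(unsigned long start,
					   unsigned long end, int also_pwc)
{
	printf("arch flush %#lx-%#lx, also_pwc=%d\n", start, end, also_pwc);
}

/* --- stand-in for the generic fallback in mm/mremap.c --- */
#ifndef flush_pte_tlb_pwc_range
#define flush_pte_tlb_pwc_range flush_pte_tlb_pwc_range
static inline void flush_pte_tlb_pwc_range(unsigned long start,
					   unsigned long end, int also_pwc)
{
	/* no page walk cache concept: an ordinary range flush is enough */
	printf("generic flush %#lx-%#lx\n", start, end);
}
#endif

int main(void)
{
	flush_pte_tlb_pwc_range(0x1000, 0x200000, 1);
	return 0;
}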

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/book3s/64/tlbflush.h | 11 +++
 mm/mremap.c   | 15 +--
 2 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h 
b/arch/powerpc/include/asm/book3s/64/tlbflush.h
index efe5336e2b6f..b9022eb9f20e 100644
--- a/arch/powerpc/include/asm/book3s/64/tlbflush.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h
@@ -80,6 +80,17 @@ static inline void flush_hugetlb_tlb_range(struct 
vm_area_struct *vma,
return flush_hugetlb_tlb_pwc_range(vma, start, end, false);
 }
 
+#define flush_pte_tlb_pwc_range flush_tlb_pwc_range
+static inline void flush_pte_tlb_pwc_range(struct vm_area_struct *vma,
+  unsigned long start, unsigned long 
end,
+  bool also_pwc)
+{
+   if (radix_enabled())
+   return radix__flush_tlb_pwc_range_psize(vma->vm_mm, start,
+   end, mmu_virtual_psize, 
also_pwc);
+   return hash__flush_tlb_range(vma, start, end);
+}
+
 static inline void flush_tlb_range(struct vm_area_struct *vma,
   unsigned long start, unsigned long end)
 {
diff --git a/mm/mremap.c b/mm/mremap.c
index 574287f9bb39..0e7b11daafee 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -210,6 +210,17 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t 
*old_pmd,
drop_rmap_locks(vma);
 }
 
+#ifndef flush_pte_tlb_pwc_range
+#define flush_pte_tlb_pwc_range flush_pte_tlb_pwc_range
+static inline void flush_pte_tlb_pwc_range(struct vm_area_struct *vma,
+  unsigned long start,
+  unsigned long end,
+  bool also_pwc)
+{
+   return flush_tlb_range(vma, start, end);
+}
+#endif
+
 #ifdef CONFIG_HAVE_MOVE_PMD
 static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
  unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
@@ -260,7 +271,7 @@ static bool move_normal_pmd(struct vm_area_struct *vma, 
unsigned long old_addr,
VM_BUG_ON(!pmd_none(*new_pmd));
pmd_populate(mm, new_pmd, (pgtable_t)pmd_page_vaddr(pmd));
 
-   flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
+   flush_pte_tlb_pwc_range(vma, old_addr, old_addr + PMD_SIZE, true);
if (new_ptl != old_ptl)
spin_unlock(new_ptl);
spin_unlock(old_ptl);
@@ -307,7 +318,7 @@ static bool move_normal_pud(struct vm_area_struct *vma, 
unsigned long old_addr,
VM_BUG_ON(!pud_none(*new_pud));
 
pud_populate(mm, new_pud, (pmd_t *)pud_page_vaddr(pud));
-   flush_tlb_range(vma, old_addr, old_addr + PUD_SIZE);
+   flush_pte_tlb_pwc_range(vma, old_addr, old_addr + PUD_SIZE, true);
if (new_ptl != old_ptl)
spin_unlock(new_ptl);
spin_unlock(old_ptl);
-- 
2.30.2



[PATCH v3 5/9] powerpc/mm/book3s64: Update tlb flush routines to take a page walk cache flush argument

2021-03-29 Thread Aneesh Kumar K.V
Signed-off-by: Aneesh Kumar K.V 
---
 .../include/asm/book3s/64/tlbflush-radix.h| 19 ---
 arch/powerpc/include/asm/book3s/64/tlbflush.h | 23 +++
 arch/powerpc/mm/book3s64/radix_hugetlbpage.c  |  4 ++--
 arch/powerpc/mm/book3s64/radix_tlb.c  | 23 ---
 4 files changed, 42 insertions(+), 27 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h 
b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
index 8b33601cdb9d..90c91f7b526f 100644
--- a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
@@ -56,15 +56,18 @@ static inline void radix__flush_all_lpid_guest(unsigned int 
lpid)
 }
 #endif
 
-extern void radix__flush_hugetlb_tlb_range(struct vm_area_struct *vma,
-  unsigned long start, unsigned long 
end);
-extern void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long 
start,
-unsigned long end, int psize);
-extern void radix__flush_pmd_tlb_range(struct vm_area_struct *vma,
-  unsigned long start, unsigned long end);
-extern void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long 
start,
+void radix__flush_hugetlb_tlb_range(struct vm_area_struct *vma,
+   unsigned long start, unsigned long end,
+   bool also_pwc);
+void radix__flush_pmd_tlb_range(struct vm_area_struct *vma,
+   unsigned long start, unsigned long end,
+   bool also_pwc);
+void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long 
start,
+ unsigned long end, int psize, bool 
also_pwc);
+void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
unsigned long end);
-extern void radix__flush_tlb_kernel_range(unsigned long start, unsigned long 
end);
+void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end);
+
 
 extern void radix__local_flush_tlb_mm(struct mm_struct *mm);
 extern void radix__local_flush_all_mm(struct mm_struct *mm);
diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h 
b/arch/powerpc/include/asm/book3s/64/tlbflush.h
index 215973b4cb26..efe5336e2b6f 100644
--- a/arch/powerpc/include/asm/book3s/64/tlbflush.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h
@@ -45,13 +45,30 @@ static inline void tlbiel_all_lpid(bool radix)
hash__tlbiel_all(TLB_INVAL_SCOPE_LPID);
 }
 
+static inline void flush_pmd_tlb_pwc_range(struct vm_area_struct *vma,
+  unsigned long start,
+  unsigned long end,
+  bool also_pwc)
+{
+   if (radix_enabled())
+   return radix__flush_pmd_tlb_range(vma, start, end, also_pwc);
+   return hash__flush_tlb_range(vma, start, end);
+}
 
 #define __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
 static inline void flush_pmd_tlb_range(struct vm_area_struct *vma,
   unsigned long start, unsigned long end)
+{
+   return flush_pmd_tlb_pwc_range(vma, start, end, false);
+}
+
+static inline void flush_hugetlb_tlb_pwc_range(struct vm_area_struct *vma,
+  unsigned long start,
+  unsigned long end,
+  bool also_pwc)
 {
if (radix_enabled())
-   return radix__flush_pmd_tlb_range(vma, start, end);
+   return radix__flush_hugetlb_tlb_range(vma, start, end, 
also_pwc);
return hash__flush_tlb_range(vma, start, end);
 }
 
@@ -60,9 +77,7 @@ static inline void flush_hugetlb_tlb_range(struct 
vm_area_struct *vma,
   unsigned long start,
   unsigned long end)
 {
-   if (radix_enabled())
-   return radix__flush_hugetlb_tlb_range(vma, start, end);
-   return hash__flush_tlb_range(vma, start, end);
+   return flush_hugetlb_tlb_pwc_range(vma, start, end, false);
 }
 
 static inline void flush_tlb_range(struct vm_area_struct *vma,
diff --git a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c 
b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c
index cb91071eef52..55c5c9c39ae2 100644
--- a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c
+++ b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c
@@ -26,13 +26,13 @@ void radix__local_flush_hugetlb_page(struct vm_area_struct 
*vma, unsigned long v
 }
 
 void radix__flush_hugetlb_tlb_range(struct vm_area_struct *vma, unsigned long 
start,
-  unsigned long end)
+   unsigned long end, bool also_pwc)
 {
int psize;
struct hstate *hstate = hstate_file(vma->vm_file);
 
psize =

[PATCH v3 7/9] mm/mremap: Move TLB flush outside page table lock

2021-03-29 Thread Aneesh Kumar K.V
Move the TLB flush outside the page table lock so that the kernel does
less work while holding the lock. Releasing the ptl while old TLB
contents are still valid behaves as if any such access happened before
the level 3 or level 2 entry update.
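
As a rough userspace analogue (pthreads, invented names, not the kernel
code): the expensive flush moves out of the critical section, so the
lock is held only for the pointer update, while correctness still relies
on the caller's higher-level exclusion, the way mremap() relies on
holding mmap_lock for write.

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long table_entry;

static void expensive_flush(void)
{
	usleep(1000);		/* stands in for a broadcast TLB flush */
}

static void move_entry(unsigned long new_val)
{
	pthread_mutex_lock(&table_lock);
	table_entry = new_val;	/* the page-table update itself */
	pthread_mutex_unlock(&table_lock);

	/*
	 * Flushing after the unlock means other CPUs may briefly use the
	 * old translation, which is equivalent to their access having
	 * happened before the update.
	 */
	expensive_flush();
}

int main(void)
{
	move_entry(0x42);
	printf("entry=%#lx\n", table_entry);
	return 0;
}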

Signed-off-by: Aneesh Kumar K.V 
---
 mm/mremap.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/mm/mremap.c b/mm/mremap.c
index 0e7b11daafee..7ac1df8e6d51 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -259,7 +259,7 @@ static bool move_normal_pmd(struct vm_area_struct *vma, 
unsigned long old_addr,
 * We don't have to worry about the ordering of src and dst
 * ptlocks because exclusive mmap_lock prevents deadlock.
 */
-   old_ptl = pmd_lock(vma->vm_mm, old_pmd);
+   old_ptl = pmd_lock(mm, old_pmd);
new_ptl = pmd_lockptr(mm, new_pmd);
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
@@ -271,11 +271,11 @@ static bool move_normal_pmd(struct vm_area_struct *vma, 
unsigned long old_addr,
VM_BUG_ON(!pmd_none(*new_pmd));
pmd_populate(mm, new_pmd, (pgtable_t)pmd_page_vaddr(pmd));
 
-   flush_pte_tlb_pwc_range(vma, old_addr, old_addr + PMD_SIZE, true);
if (new_ptl != old_ptl)
spin_unlock(new_ptl);
spin_unlock(old_ptl);
 
+   flush_pte_tlb_pwc_range(vma, old_addr, old_addr + PMD_SIZE, true);
return true;
 }
 #else
@@ -306,7 +306,7 @@ static bool move_normal_pud(struct vm_area_struct *vma, 
unsigned long old_addr,
 * We don't have to worry about the ordering of src and dst
 * ptlocks because exclusive mmap_lock prevents deadlock.
 */
-   old_ptl = pud_lock(vma->vm_mm, old_pud);
+   old_ptl = pud_lock(mm, old_pud);
new_ptl = pud_lockptr(mm, new_pud);
if (new_ptl != old_ptl)
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
@@ -318,11 +318,11 @@ static bool move_normal_pud(struct vm_area_struct *vma, 
unsigned long old_addr,
VM_BUG_ON(!pud_none(*new_pud));
 
pud_populate(mm, new_pud, (pmd_t *)pud_page_vaddr(pud));
-   flush_pte_tlb_pwc_range(vma, old_addr, old_addr + PUD_SIZE, true);
if (new_ptl != old_ptl)
spin_unlock(new_ptl);
spin_unlock(old_ptl);
 
+   flush_pte_tlb_pwc_range(vma, old_addr, old_addr + PUD_SIZE, true);
return true;
 }
 #else
-- 
2.30.2



[PATCH v3 8/9] mm/mremap: Allow arch runtime override

2021-03-29 Thread Aneesh Kumar K.V
Architectures like ppc64 can support faster mremap only with radix
translation. Hence allow a runtime check for fast mremap support.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/arc/include/asm/tlb.h |  5 +
 arch/arm64/include/asm/tlb.h   |  6 ++
 arch/powerpc/include/asm/tlb.h |  6 ++
 arch/x86/include/asm/tlb.h |  5 +
 mm/mremap.c| 14 +-
 5 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/arch/arc/include/asm/tlb.h b/arch/arc/include/asm/tlb.h
index 975b35d3738d..22b8cfb46cbf 100644
--- a/arch/arc/include/asm/tlb.h
+++ b/arch/arc/include/asm/tlb.h
@@ -9,4 +9,9 @@
 #include 
 #include 
 
+#define arch_supports_page_tables_move arch_supports_page_tables_move
+static inline bool arch_supports_page_tables_move(void)
+{
+   return true;
+}
 #endif /* _ASM_ARC_TLB_H */
diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h
index 61c97d3b58c7..fe209efc6a10 100644
--- a/arch/arm64/include/asm/tlb.h
+++ b/arch/arm64/include/asm/tlb.h
@@ -94,4 +94,10 @@ static inline void __pud_free_tlb(struct mmu_gather *tlb, 
pud_t *pudp,
 }
 #endif
 
+#define arch_supports_page_tables_move arch_supports_page_tables_move
+static inline bool arch_supports_page_tables_move(void)
+{
+   return true;
+}
+
 #endif
diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h
index 160422a439aa..058918a7cd3c 100644
--- a/arch/powerpc/include/asm/tlb.h
+++ b/arch/powerpc/include/asm/tlb.h
@@ -83,5 +83,11 @@ static inline int mm_is_thread_local(struct mm_struct *mm)
 }
 #endif
 
+#define arch_supports_page_tables_move arch_supports_page_tables_move
+static inline bool arch_supports_page_tables_move(void)
+{
+   return radix_enabled();
+}
+
 #endif /* __KERNEL__ */
 #endif /* __ASM_POWERPC_TLB_H */
diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h
index 1bfe979bb9bc..62915238bb36 100644
--- a/arch/x86/include/asm/tlb.h
+++ b/arch/x86/include/asm/tlb.h
@@ -37,4 +37,9 @@ static inline void __tlb_remove_table(void *table)
free_page_and_swap_cache(table);
 }
 
+#define arch_supports_page_tables_move arch_supports_page_tables_move
+static inline bool arch_supports_page_tables_move(void)
+{
+   return true;
+}
 #endif /* _ASM_X86_TLB_H */
diff --git a/mm/mremap.c b/mm/mremap.c
index 7ac1df8e6d51..4d812af3e44b 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -25,7 +25,7 @@
 #include 
 
 #include 
-#include 
+#include 
 #include 
 
 #include "internal.h"
@@ -221,6 +221,14 @@ static inline void flush_pte_tlb_pwc_range(struct 
vm_area_struct *vma,
 }
 #endif
 
+#ifndef arch_supports_page_tables_move
+#define arch_supports_page_tables_move arch_supports_page_tables_move
+static inline bool arch_supports_page_tables_move(void)
+{
+   return false;
+}
+#endif
+
 #ifdef CONFIG_HAVE_MOVE_PMD
 static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
  unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
@@ -229,6 +237,8 @@ static bool move_normal_pmd(struct vm_area_struct *vma, 
unsigned long old_addr,
struct mm_struct *mm = vma->vm_mm;
pmd_t pmd;
 
+   if (!arch_supports_page_tables_move())
+   return false;
/*
 * The destination pmd shouldn't be established, free_pgtables()
 * should have released it.
@@ -295,6 +305,8 @@ static bool move_normal_pud(struct vm_area_struct *vma, 
unsigned long old_addr,
struct mm_struct *mm = vma->vm_mm;
pud_t pud;
 
+   if (!arch_supports_page_tables_move())
+   return false;
/*
 * The destination pud shouldn't be established, free_pgtables()
 * should have released it.
-- 
2.30.2



[PATCH v3 9/9] powerpc/mm: Enable move pmd/pud

2021-03-29 Thread Aneesh Kumar K.V
mremap HAVE_MOVE_PMD/PUD optimization time comparison for 1GB region:
1GB mremap - Source PTE-aligned, Destination PTE-aligned
  mremap time:  1127034ns
1GB mremap - Source PMD-aligned, Destination PMD-aligned
  mremap time:   508817ns
1GB mremap - Source PUD-aligned, Destination PUD-aligned
  mremap time:23046ns
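
Relative to the PTE-aligned baseline, that is roughly a 2.2x improvement
for the PMD-aligned case (1127034 / 508817) and roughly a 49x improvement
for the PUD-aligned case (1127034 / 23046), assuming the three runs are
directly comparable.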

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/platforms/Kconfig.cputype | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/powerpc/platforms/Kconfig.cputype 
b/arch/powerpc/platforms/Kconfig.cputype
index 3ce907523b1e..2e666e569fdf 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -97,6 +97,8 @@ config PPC_BOOK3S_64
select PPC_HAVE_PMU_SUPPORT
select SYS_SUPPORTS_HUGETLBFS
select HAVE_ARCH_TRANSPARENT_HUGEPAGE
+   select HAVE_MOVE_PMD
+   select HAVE_MOVE_PUD
select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE
select ARCH_SUPPORTS_NUMA_BALANCING
select IRQ_WORK
-- 
2.30.2