Re: [PATCH v4] mm: Avoid unnecessary page fault retires on shared memory types

2022-05-30 Thread Christian Borntraeger




On 30.05.22 at 18:00, Peter Xu wrote:

On Mon, May 30, 2022 at 11:52:54AM -0400, Peter Xu wrote:

On Mon, May 30, 2022 at 11:35:10AM +0200, Christian Borntraeger wrote:



On 29.05.22 at 22:33, Heiko Carstens wrote:
[...]


Guess the patch below on top of your patch is what we want.
Just for clarification: if gmap is not NULL then the process is a kvm
process. So, depending on the workload, this optimization makes sense.

diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 4608cc962ecf..e1d40ca341b7 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -436,12 +436,11 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
 
 	/* The fault is fully completed (including releasing mmap lock) */
 	if (fault & VM_FAULT_COMPLETED) {
-		/*
-		 * Gmap will need the mmap lock again, so retake it.  TODO:
-		 * only conditionally take the lock when CONFIG_PGSTE set.
-		 */
-		mmap_read_lock(mm);
-		goto out_gmap;
+		if (gmap) {
+			mmap_read_lock(mm);
+			goto out_gmap;
+		}
+		goto out;


Hmm, right after I replied I found "goto out" could be problematic, since
all s390 callers of do_exception() will treat it as an error condition (side
note: "goto out_gmap" includes a step that clears "fault" to 0).  I'll
replace this with "return 0" instead if it looks good to both of you.

I'll wait for a confirmation before reposting.  Thanks,


Yes, that sounds good and thank you for double checking.
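For reference, a sketch of how the resulting hunk might read with that
change (assuming the "return 0" fixup Peter describes; the reposted
patch may differ in detail):

	/* The fault is fully completed (including releasing mmap lock) */
	if (fault & VM_FAULT_COMPLETED) {
		if (gmap) {
			/* KVM guests: gmap needs the mmap lock again. */
			mmap_read_lock(mm);
			goto out_gmap;
		}
		/*
		 * Non-KVM: callers treat "goto out" as an error, so
		 * report success directly instead.
		 */
		return 0;
	}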



Re: [PATCH v4] mm: Avoid unnecessary page fault retires on shared memory types

2022-05-30 Thread Christian Borntraeger




On 29.05.22 at 22:33, Heiko Carstens wrote:
[...]


Guess the patch below on top of your patch is what we want.
Just for clarification: if gmap is not NULL then the process is a kvm
process. So, depending on the workload, this optimization makes sense.

diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 4608cc962ecf..e1d40ca341b7 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -436,12 +436,11 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
 
 	/* The fault is fully completed (including releasing mmap lock) */
 	if (fault & VM_FAULT_COMPLETED) {
-		/*
-		 * Gmap will need the mmap lock again, so retake it.  TODO:
-		 * only conditionally take the lock when CONFIG_PGSTE set.
-		 */
-		mmap_read_lock(mm);
-		goto out_gmap;
+		if (gmap) {
+			mmap_read_lock(mm);
+			goto out_gmap;
+		}
+		goto out;


Yes, that makes sense. With that

Acked-by: Christian Borntraeger 




Re: [PATCH 2/2] futex: remove futex_cmpxchg detection

2021-10-26 Thread Christian Borntraeger

On 26.10.21 at 12:03, Arnd Bergmann wrote:

From: Arnd Bergmann 

Now that all architectures have a working futex implementation
in any configuration, remove the runtime detection code.

Signed-off-by: Arnd Bergmann 


s390 part
Acked-by: Christian Borntraeger 
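For context, the runtime detection this series removes was a boot-time
probe in kernel/futex/core.c; it looked roughly like the following
(simplified sketch, not the exact deleted code):

	static void __init futex_detect_cmpxchg(void)
	{
	#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
		u32 curval;

		/*
		 * A NULL user address is guaranteed to fault: a working
		 * futex_atomic_cmpxchg_inatomic() returns -EFAULT, a
		 * non-functional one returns -ENOSYS.
		 */
		if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
			futex_cmpxchg_enabled = 1;
	#endif
	}

With every architecture now providing a working implementation, both the
probe and the futex_cmpxchg_enabled checks sprinkled through the futex
code can go.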


---
  arch/arc/Kconfig  |  1 -
  arch/arm/Kconfig  |  1 -
  arch/arm64/Kconfig|  1 -
  arch/csky/Kconfig |  1 -
  arch/m68k/Kconfig |  1 -
  arch/riscv/Kconfig|  1 -
  arch/s390/Kconfig |  1 -
  arch/sh/Kconfig   |  1 -
  arch/um/Kconfig   |  1 -
  arch/um/kernel/skas/uaccess.c |  1 -
  arch/xtensa/Kconfig   |  1 -
  init/Kconfig  |  8 
  kernel/futex/core.c   | 35 ---
  kernel/futex/futex.h  |  6 --
  kernel/futex/syscalls.c   | 22 --
  15 files changed, 82 deletions(-)

diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index 248389278e8f..f9413041686f 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -31,7 +31,6 @@ config ARC
select HAVE_ARCH_TRANSPARENT_HUGEPAGE if ARC_MMU_V4
select HAVE_DEBUG_STACKOVERFLOW
select HAVE_DEBUG_KMEMLEAK
-   select HAVE_FUTEX_CMPXCHG if FUTEX
select HAVE_IOREMAP_PROT
select HAVE_KERNEL_GZIP
select HAVE_KERNEL_LZMA
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index bb5d2c45477b..6448d311635d 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -93,7 +93,6 @@ config ARM
select HAVE_FTRACE_MCOUNT_RECORD if !XIP_KERNEL
select HAVE_FUNCTION_GRAPH_TRACER if !THUMB2_KERNEL && !CC_IS_CLANG
	select HAVE_FUNCTION_TRACER if !XIP_KERNEL && !(THUMB2_KERNEL && CC_IS_CLANG)
-   select HAVE_FUTEX_CMPXCHG if FUTEX
select HAVE_GCC_PLUGINS
	select HAVE_HW_BREAKPOINT if PERF_EVENTS && (CPU_V6 || CPU_V6K || CPU_V7)
select HAVE_IRQ_TIME_ACCOUNTING
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 0efc501f77aa..6c3c2ff5cef8 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -193,7 +193,6 @@ config ARM64
select HAVE_PERF_USER_STACK_DUMP
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_FUNCTION_ARG_ACCESS_API
-   select HAVE_FUTEX_CMPXCHG if FUTEX
select MMU_GATHER_RCU_TABLE_FREE
select HAVE_RSEQ
select HAVE_STACKPROTECTOR
diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig
index 823d3d5a9e11..efd7c5feac8b 100644
--- a/arch/csky/Kconfig
+++ b/arch/csky/Kconfig
@@ -53,7 +53,6 @@ config CSKY
select HAVE_FUNCTION_TRACER
select HAVE_FUNCTION_GRAPH_TRACER
select HAVE_FUNCTION_ERROR_INJECTION
-   select HAVE_FUTEX_CMPXCHG if FUTEX && SMP
select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_KERNEL_GZIP
select HAVE_KERNEL_LZO
diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig
index 0b50da08a9c5..15a793c5b2dc 100644
--- a/arch/m68k/Kconfig
+++ b/arch/m68k/Kconfig
@@ -20,7 +20,6 @@ config M68K
select HAVE_ASM_MODVERSIONS
select HAVE_DEBUG_BUGVERBOSE
select HAVE_EFFICIENT_UNALIGNED_ACCESS if !CPU_HAS_NO_UNALIGNED
-   select HAVE_FUTEX_CMPXCHG if MMU && FUTEX
select HAVE_MOD_ARCH_SPECIFIC
select HAVE_UID16
select MMU_GATHER_NO_RANGE if MMU
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 77a088d0a7e9..037fea9fac14 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -84,7 +84,6 @@ config RISCV
select HAVE_DMA_CONTIGUOUS if MMU
select HAVE_EBPF_JIT if MMU
select HAVE_FUNCTION_ERROR_INJECTION
-   select HAVE_FUTEX_CMPXCHG if FUTEX
select HAVE_GCC_PLUGINS
select HAVE_GENERIC_VDSO if MMU && 64BIT
select HAVE_IRQ_TIME_ACCOUNTING
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index f615c3f65f5a..1c9ecf619e04 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -164,7 +164,6 @@ config S390
select HAVE_FUNCTION_ERROR_INJECTION
select HAVE_FUNCTION_GRAPH_TRACER
select HAVE_FUNCTION_TRACER
-   select HAVE_FUTEX_CMPXCHG if FUTEX
select HAVE_GCC_PLUGINS
select HAVE_GENERIC_VDSO
select HAVE_IOREMAP_PROT if PCI
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 6904f4bdbf00..93195d3368c0 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -34,7 +34,6 @@ config SUPERH
select HAVE_FAST_GUP if MMU
select HAVE_FUNCTION_GRAPH_TRACER
select HAVE_FUNCTION_TRACER
-   select HAVE_FUTEX_CMPXCHG if FUTEX
select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_HW_BREAKPOINT
select HAVE_IOREMAP_PROT if MMU && !X2TLB
diff --git a/arch/um/Kconfig b/arch/um/Kconfig
index c18b45f75d41..c906250d4970 100644
--- a/arch/um/Kconfig
+++ b/arch/um/Kconfig
@@ -14,7 +14,6 @@ config UML
select HAVE_ARCH_SECCOMP_FILTER
select HAVE_ASM_MODVERSIONS

Re: [PATCH v2 2/2] mm: speed up mremap by 500x on large regions

2018-10-15 Thread Christian Borntraeger



On 10/12/2018 03:37 AM, Joel Fernandes (Google) wrote:
> Android needs to mremap large regions of memory during memory-management
> related operations. The mremap system call can be really slow if THP is
> not enabled. The bottleneck is move_page_tables, which copies one
> pte at a time, and can be really slow across a large map. Turning on THP
> may not be a viable option, and is not for us. This patch speeds up the
> performance for non-THP systems by copying at the PMD level when possible.
> 
> The speed up is three orders of magnitude. On a 1GB mremap, the mremap
> completion time drops from 160-250 milliseconds to 380-400 microseconds.
> 
> Before:
> Total mremap time for 1GB data: 242321014 nanoseconds.
> Total mremap time for 1GB data: 196842467 nanoseconds.
> Total mremap time for 1GB data: 167051162 nanoseconds.
> 
> After:
> Total mremap time for 1GB data: 385781 nanoseconds.
> Total mremap time for 1GB data: 388959 nanoseconds.
> Total mremap time for 1GB data: 402813 nanoseconds.
> 
> In case THP is enabled, the optimization is skipped. I also flush the
> tlb every time we do this optimization since I couldn't find a way to
> determine if the low-level PTEs are dirty. The measured cost of doing
> so is small compared to the improvement, on both x86-64 and arm64.
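[A minimal userspace harness along these lines could produce numbers of
the kind quoted above; an illustrative sketch only, not necessarily the
benchmark actually used for these measurements:]

	#define _GNU_SOURCE
	#include <stdio.h>
	#include <string.h>
	#include <sys/mman.h>
	#include <time.h>

	#define SZ (1UL << 30)	/* 1 GB */

	int main(void)
	{
		struct timespec t0, t1;

		/* Populate 1 GB of anonymous memory so page tables exist. */
		char *src = mmap(NULL, SZ, PROT_READ | PROT_WRITE,
				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		memset(src, 1, SZ);

		/* Reserve a destination range; MREMAP_FIXED replaces it. */
		void *dst = mmap(NULL, SZ, PROT_NONE,
				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		clock_gettime(CLOCK_MONOTONIC, &t0);
		void *p = mremap(src, SZ, SZ, MREMAP_MAYMOVE | MREMAP_FIXED, dst);
		clock_gettime(CLOCK_MONOTONIC, &t1);

		printf("Total mremap time for 1GB data: %ld nanoseconds.\n",
		       (t1.tv_sec - t0.tv_sec) * 1000000000L +
		       (t1.tv_nsec - t0.tv_nsec));
		return p == MAP_FAILED;
	}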
> 
> Cc: minc...@kernel.org
> Cc: pan...@google.com
> Cc: hu...@google.com
> Cc: lokeshgi...@google.com
> Cc: dan...@google.com
> Cc: mho...@kernel.org
> Cc: kir...@shutemov.name
> Cc: a...@linux-foundation.org
> Signed-off-by: Joel Fernandes (Google) 
> ---
>  mm/mremap.c | 62 +
>  1 file changed, 62 insertions(+)
> 
> diff --git a/mm/mremap.c b/mm/mremap.c
> index 9e68a02a52b1..d82c485822ef 100644
> --- a/mm/mremap.c
> +++ b/mm/mremap.c
> @@ -191,6 +191,54 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
>   drop_rmap_locks(vma);
>  }
>  
> +static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
> +		unsigned long new_addr, unsigned long old_end,
> +		pmd_t *old_pmd, pmd_t *new_pmd, bool *need_flush)
> +{
> + spinlock_t *old_ptl, *new_ptl;
> + struct mm_struct *mm = vma->vm_mm;
> +
> + if ((old_addr & ~PMD_MASK) || (new_addr & ~PMD_MASK)
> + || old_end - old_addr < PMD_SIZE)
> + return false;
> +
> + /*
> +  * The destination pmd shouldn't be established, free_pgtables()
> +  * should have released it.
> +  */
> + if (WARN_ON(!pmd_none(*new_pmd)))
> + return false;
> +
> + /*
> +  * We don't have to worry about the ordering of src and dst
> +  * ptlocks because exclusive mmap_sem prevents deadlock.
> +  */
> + old_ptl = pmd_lock(vma->vm_mm, old_pmd);
> + if (old_ptl) {
> + pmd_t pmd;
> +
> + new_ptl = pmd_lockptr(mm, new_pmd);
> + if (new_ptl != old_ptl)
> + spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
> +
> + /* Clear the pmd */
> + pmd = *old_pmd;
> + pmd_clear(old_pmd);

Adding Martin Schwidefsky.
Is this mapping maybe still in use on other CPUs? If yes, I think for
s390 we need to flush here as well (in other words, we might need to
introduce pmd_clear_flush). On s390 you have to use instructions like
CRDTE, IPTE or IDTE to modify page table entries that are still in use.
Otherwise you can get a delayed access exception which is - in contrast
to page faults - not recoverable.
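A possible shape for such a helper (an illustrative sketch only: the
name, signature and generic fallback are assumptions, and s390 would
override it with an IPTE/IDTE-based variant):

	/*
	 * Clear a pmd and flush the TLB entry in one step, so that
	 * architectures which must not leave live entries behind
	 * (e.g. s390) can provide an atomic implementation.
	 */
	static inline pmd_t pmd_clear_flush(struct vm_area_struct *vma,
					    unsigned long addr, pmd_t *pmdp)
	{
		pmd_t pmd = *pmdp;

		pmd_clear(pmdp);
		flush_tlb_range(vma, addr, addr + PMD_SIZE);
		return pmd;
	}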



> +
> + VM_BUG_ON(!pmd_none(*new_pmd));
> +
> + /* Set the new pmd */
> + set_pmd_at(mm, new_addr, new_pmd, pmd);
> + if (new_ptl != old_ptl)
> + spin_unlock(new_ptl);
> + spin_unlock(old_ptl);
> +
> + *need_flush = true;
> + return true;
> + }
> + return false;
> +}
> +
>  unsigned long move_page_tables(struct vm_area_struct *vma,
>   unsigned long old_addr, struct vm_area_struct *new_vma,
>   unsigned long new_addr, unsigned long len,
> @@ -239,7 +287,21 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
>   split_huge_pmd(vma, old_pmd, old_addr);
>   if (pmd_trans_unstable(old_pmd))
>   continue;
> + } else if (extent == PMD_SIZE) {
> + bool moved;
> +
> + /* See comment in move_ptes() */
> + if (need_rmap_locks)
> + take_rmap_locks(vma);
> + moved = move_normal_pmd(vma, old_addr, new_addr,
> + old_end, old_pmd, new_pmd,
> + &need_flush);
> + if (need_rmap_locks)
> + drop_rmap_locks(vma);
> + if (moved)
> + continue;
>