Re: [PATCH V3 2/2] mm: THP page cache support for ppc64

2016-11-13 Thread Kirill A. Shutemov
On Sun, Nov 13, 2016 at 08:30:25PM +0530, Aneesh Kumar K.V wrote:
> Add an arch-specific callback in the generic THP page cache code that will
> deposit and withdraw the preallocated page table. Archs like ppc64 use
> this preallocated table to store the hash pte slot information.
> 
> Testing:
> kernel build of the patch series on tmpfs mounted with option huge=always
> 
> The related thp stat:
> thp_fault_alloc 72939
> thp_fault_fallback 60547
> thp_collapse_alloc 603
> thp_collapse_alloc_failed 0
> thp_file_alloc 253763
> thp_file_mapped 4251
> thp_split_page 51518
> thp_split_page_failed 1
> thp_deferred_split_page 73566
> thp_split_pmd 665
> thp_zero_page_alloc 3
> thp_zero_page_alloc_failed 0
> 
> Signed-off-by: Aneesh Kumar K.V 

One nit-pick below, but otherwise

Acked-by: Kirill A. Shutemov 

> @@ -2975,6 +3004,13 @@ static int do_set_pmd(struct fault_env *fe, struct page *page)
>   ret = 0;
>   count_vm_event(THP_FILE_MAPPED);
>  out:
> + /*
> +  * If we are going to fallback to pte mapping, do a
> +  * withdraw with pmd lock held.
> +  */
> + if (arch_needs_pgtable_deposit() && (ret == VM_FAULT_FALLBACK))

Parentheses are redundant around the ret check; see the sketch below the quoted hunk.

> + fe->prealloc_pte = pgtable_trans_huge_withdraw(vma->vm_mm,
> +                                                fe->pmd);
>   spin_unlock(fe->ptl);
>   return ret;
>  }
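
For reference, with the parentheses dropped that hunk would read like this
(just a sketch of the suggested cleanup, not a new version of the patch):

	if (arch_needs_pgtable_deposit() && ret == VM_FAULT_FALLBACK)
		fe->prealloc_pte = pgtable_trans_huge_withdraw(vma->vm_mm,
							       fe->pmd);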

-- 
 Kirill A. Shutemov


[PATCH V3 2/2] mm: THP page cache support for ppc64

2016-11-13 Thread Aneesh Kumar K.V
Add an arch-specific callback in the generic THP page cache code that will
deposit and withdraw the preallocated page table. Archs like ppc64 use
this preallocated table to store the hash pte slot information.
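
To illustrate the pattern (a simplified sketch only, assuming the deposit
happens in the generic file-THP mapping path and pairs with the nr_ptes
accounting done by zap_deposited_table(); the real hunks are in the
mm/memory.c part of the diff below):

	if (arch_needs_pgtable_deposit()) {
		/* Stash the preallocated page table below the huge PMD. */
		pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, fe->prealloc_pte);
		fe->prealloc_pte = NULL;
		atomic_long_inc(&vma->vm_mm->nr_ptes);
	}

Architectures that do not need the deposit (e.g. ppc64 with radix) skip both
the deposit and the later withdraw.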

Testing:
kernel build of the patch series on tmpfs mounted with option huge=always

The related thp stat:
thp_fault_alloc 72939
thp_fault_fallback 60547
thp_collapse_alloc 603
thp_collapse_alloc_failed 0
thp_file_alloc 253763
thp_file_mapped 4251
thp_split_page 51518
thp_split_page_failed 1
thp_deferred_split_page 73566
thp_split_pmd 665
thp_zero_page_alloc 3
thp_zero_page_alloc_failed 0

Signed-off-by: Aneesh Kumar K.V 
---
Changes from V2:
* Handle page table allocation failures.
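
The failure handling added in V3 is roughly of this shape (an illustrative
sketch only; the exact code and placement are in the mm/memory.c hunk):

	/*
	 * If the architecture needs a deposited page table and none was
	 * preallocated, try to allocate one now and fail the fault if
	 * that is not possible.
	 */
	if (arch_needs_pgtable_deposit() && !fe->prealloc_pte) {
		fe->prealloc_pte = pte_alloc_one(vma->vm_mm, fe->address);
		if (!fe->prealloc_pte)
			return VM_FAULT_OOM;
	}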

 arch/powerpc/include/asm/book3s/64/pgtable.h | 10 +
 include/asm-generic/pgtable.h                |  3 ++
 mm/Kconfig                                   |  6 +--
 mm/huge_memory.c                             | 17 
 mm/khugepaged.c                              | 21 +-
 mm/memory.c                                  | 60 +++-
 6 files changed, 100 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 700301bc5190..0ebfbc8f0449 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -1021,6 +1021,16 @@ static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
 */
return true;
 }
+
+
+#define arch_needs_pgtable_deposit arch_needs_pgtable_deposit
+static inline bool arch_needs_pgtable_deposit(void)
+{
+   if (radix_enabled())
+   return false;
+   return true;
+}
+
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 #endif /* __ASSEMBLY__ */
 #endif /* _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ */
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 324990273ad2..e00e3b7cf6a8 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -653,6 +653,9 @@ static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
 }
 #endif
 
+#ifndef arch_needs_pgtable_deposit
+#define arch_needs_pgtable_deposit() (false)
+#endif
 /*
  * This function is meant to be used by sites walking pagetables with
  * the mmap_sem hold in read mode to protect against MADV_DONTNEED and
diff --git a/mm/Kconfig b/mm/Kconfig
index be0ee11fa0d9..0a279d399722 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -447,13 +447,9 @@ choice
  benefit.
 endchoice
 
-#
-# We don't deposit page tables on file THP mapping,
-# but Power makes use of them to address MMU quirk.
-#
 config TRANSPARENT_HUGE_PAGECACHE
def_bool y
-   depends on TRANSPARENT_HUGEPAGE && !PPC
+   depends on TRANSPARENT_HUGEPAGE
 
 #
 # UP and nommu archs use km based percpu allocator
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 54f265ec902e..a6f1e4443adc 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1377,6 +1386,15 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
return ret;
 }
 
+static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
+{
+   pgtable_t pgtable;
+
+   pgtable = pgtable_trans_huge_withdraw(mm, pmd);
+   pte_free(mm, pgtable);
+	atomic_long_dec(&mm->nr_ptes);
+}
+
 int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 pmd_t *pmd, unsigned long addr)
 {
@@ -1416,6 +1425,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
	atomic_long_dec(&tlb->mm->nr_ptes);
add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
} else {
+   if (arch_needs_pgtable_deposit())
+   zap_deposited_table(tlb->mm, pmd);
add_mm_counter(tlb->mm, MM_FILEPAGES, -HPAGE_PMD_NR);
}
spin_unlock(ptl);
@@ -1595,6 +1606,12 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 
if (!vma_is_anonymous(vma)) {
_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
+   /*
+* We are going to unmap this huge page. So
+* just go ahead and zap it
+*/
+   if (arch_needs_pgtable_deposit())
+   zap_deposited_table(mm, pmd);
if (vma_is_dax(vma))
return;
page = pmd_page(_pmd);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 728d7790dc2d..9fb7b275cb63 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1240,6 +1240,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
struct vm_area_struct *vma;
unsigned long addr;
pmd_t *pmd, _pmd;
+   bool deposited = false;
 
i_mmap_lock_write(mapping);
	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
@@ -1264,10