On Tue, Nov 08, 2016 at 08:31:50AM +0900, Naoya Horiguchi wrote:
> This patch prepares thp migration's core code. These code will be open when
> unmap_and_move() stops unconditionally splitting thp and get_new_page() starts
> to allocate destination thps.
> 
> Signed-off-by: Naoya Horiguchi <n-horigu...@ah.jp.nec.com>
> ---
> ChangeLog v1 -> v2:
> - support pte-mapped thp, doubly-mapped thp
> ---
>  arch/x86/include/asm/pgtable_64.h |   2 +
>  include/linux/swapops.h           |  61 +++++++++++++++
>  mm/huge_memory.c                  | 154 
> ++++++++++++++++++++++++++++++++++++++
>  mm/migrate.c                      |  44 ++++++++++-
>  mm/pgtable-generic.c              |   3 +-
>  5 files changed, 262 insertions(+), 2 deletions(-)
> 
> diff --git v4.9-rc2-mmotm-2016-10-27-18-27/arch/x86/include/asm/pgtable_64.h 
> v4.9-rc2-mmotm-2016-10-27-18-27_patched/arch/x86/include/asm/pgtable_64.h
> index 1cc82ec..3a1b48e 100644
> --- v4.9-rc2-mmotm-2016-10-27-18-27/arch/x86/include/asm/pgtable_64.h
> +++ v4.9-rc2-mmotm-2016-10-27-18-27_patched/arch/x86/include/asm/pgtable_64.h
> @@ -167,7 +167,9 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
>                                        ((type) << (SWP_TYPE_FIRST_BIT)) \
>                                        | ((offset) << SWP_OFFSET_FIRST_BIT) })
>  #define __pte_to_swp_entry(pte)              ((swp_entry_t) { pte_val((pte)) 
> })
> +#define __pmd_to_swp_entry(pte)              ((swp_entry_t) { pmd_val((pmd)) 
> })
>  #define __swp_entry_to_pte(x)                ((pte_t) { .pte = (x).val })
> +#define __swp_entry_to_pmd(x)                ((pmd_t) { .pmd = (x).val })
>  
>  extern int kern_addr_valid(unsigned long addr);
>  extern void cleanup_highmap(void);
> diff --git v4.9-rc2-mmotm-2016-10-27-18-27/include/linux/swapops.h 
> v4.9-rc2-mmotm-2016-10-27-18-27_patched/include/linux/swapops.h
> index 5c3a5f3..b6b22a2 100644
> --- v4.9-rc2-mmotm-2016-10-27-18-27/include/linux/swapops.h
> +++ v4.9-rc2-mmotm-2016-10-27-18-27_patched/include/linux/swapops.h
> @@ -163,6 +163,67 @@ static inline int is_write_migration_entry(swp_entry_t 
> entry)
>  
>  #endif
>  
> +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
> +extern void set_pmd_migration_entry(struct page *page,
> +             struct vm_area_struct *vma, unsigned long address);
> +
> +extern int remove_migration_pmd(struct page *new, pmd_t *pmd,
> +             struct vm_area_struct *vma, unsigned long addr, void *old);
> +
> +extern void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd);
> +
> +static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd)
> +{
> +     swp_entry_t arch_entry;
> +
> +     arch_entry = __pmd_to_swp_entry(pmd);
> +     return swp_entry(__swp_type(arch_entry), __swp_offset(arch_entry));
> +}
> +
> +static inline pmd_t swp_entry_to_pmd(swp_entry_t entry)
> +{
> +     swp_entry_t arch_entry;
> +
> +     arch_entry = __swp_entry(swp_type(entry), swp_offset(entry));
> +     return __swp_entry_to_pmd(arch_entry);
> +}
> +
> +static inline int is_pmd_migration_entry(pmd_t pmd)
> +{
> +     return !pmd_present(pmd) && is_migration_entry(pmd_to_swp_entry(pmd));
> +}
> +#else
> +static inline void set_pmd_migration_entry(struct page *page,
> +                     struct vm_area_struct *vma, unsigned long address)
> +{

VM_BUG()? Or BUILD_BUG()?

> +}
> +
> +static inline int remove_migration_pmd(struct page *new, pmd_t *pmd,
> +             struct vm_area_struct *vma, unsigned long addr, void *old)
> +{
> +     return 0;

Ditto.

> +}
> +
> +static inline void pmd_migration_entry_wait(struct mm_struct *m, pmd_t *p) { 
> }
> +
> +static inline swp_entry_t pmd_to_swp_entry(pmd_t pmd)
> +{
> +     return swp_entry(0, 0);

Ditto.

> +}
> +
> +static inline pmd_t swp_entry_to_pmd(swp_entry_t entry)
> +{
> +     pmd_t pmd = {};

Ditto.

> +     return pmd;
> +}
> +
> +static inline int is_pmd_migration_entry(pmd_t pmd)
> +{
> +     return 0;
> +}
> +#endif
> +
>  #ifdef CONFIG_MEMORY_FAILURE
>  
>  extern atomic_long_t num_poisoned_pages __read_mostly;
> diff --git v4.9-rc2-mmotm-2016-10-27-18-27/mm/huge_memory.c 
> v4.9-rc2-mmotm-2016-10-27-18-27_patched/mm/huge_memory.c
> index 0509d17..b3022b3 100644
> --- v4.9-rc2-mmotm-2016-10-27-18-27/mm/huge_memory.c
> +++ v4.9-rc2-mmotm-2016-10-27-18-27_patched/mm/huge_memory.c
> @@ -2310,3 +2310,157 @@ static int __init split_huge_pages_debugfs(void)
>  }
>  late_initcall(split_huge_pages_debugfs);
>  #endif
> +
> +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
> +void set_pmd_migration_entry(struct page *page, struct vm_area_struct *vma,
> +                             unsigned long addr)
> +{
> +     struct mm_struct *mm = vma->vm_mm;
> +     pgd_t *pgd;
> +     pud_t *pud;
> +     pmd_t *pmd;
> +     pmd_t pmdval;
> +     swp_entry_t entry;
> +     spinlock_t *ptl;
> +
> +     pgd = pgd_offset(mm, addr);
> +     if (!pgd_present(*pgd))
> +             return;
> +     pud = pud_offset(pgd, addr);
> +     if (!pud_present(*pud))
> +             return;
> +     pmd = pmd_offset(pud, addr);
> +     pmdval = *pmd;
> +     barrier();
> +     if (!pmd_present(pmdval))
> +             return;
> +
> +     mmu_notifier_invalidate_range_start(mm, addr, addr + HPAGE_PMD_SIZE);
> +     if (pmd_trans_huge(pmdval)) {
> +             pmd_t pmdswp;
> +
> +             ptl = pmd_lock(mm, pmd);
> +             if (!pmd_present(*pmd))
> +                     goto unlock_pmd;
> +             if (unlikely(!pmd_trans_huge(*pmd)))
> +                     goto unlock_pmd;

Just check *pmd == pmdval?

> +             if (pmd_page(*pmd) != page)
> +                     goto unlock_pmd;
> +
> +             pmdval = pmdp_huge_get_and_clear(mm, addr, pmd);
> +             if (pmd_dirty(pmdval))
> +                     set_page_dirty(page);
> +             entry = make_migration_entry(page, pmd_write(pmdval));
> +             pmdswp = swp_entry_to_pmd(entry);
> +             pmdswp = pmd_mkhuge(pmdswp);
> +             set_pmd_at(mm, addr, pmd, pmdswp);
> +             page_remove_rmap(page, true);
> +             put_page(page);
> +unlock_pmd:
> +             spin_unlock(ptl);
> +     } else { /* pte-mapped thp */
> +             pte_t *pte;
> +             pte_t pteval;
> +             struct page *tmp = compound_head(page);
> +             unsigned long address = addr & HPAGE_PMD_MASK;
> +             pte_t swp_pte;
> +             int i;
> +
> +             pte = pte_offset_map(pmd, address);
> +             ptl = pte_lockptr(mm, pmd);
> +             spin_lock(ptl);

pte_offset_map_lock() ?

> +             for (i = 0; i < HPAGE_PMD_NR; i++, pte++, tmp++) {
> +                     if (!(pte_present(*pte) &&
> +                           page_to_pfn(tmp) == pte_pfn(*pte)))

                        if (!pte_present(*pte) || pte_page(*pte) != tmp) ?

> +                             continue;
> +                     pteval = ptep_clear_flush(vma, address, pte);
> +                     if (pte_dirty(pteval))
> +                             set_page_dirty(tmp);
> +                     entry = make_migration_entry(tmp, pte_write(pteval));
> +                     swp_pte = swp_entry_to_pte(entry);
> +                     set_pte_at(mm, address, pte, swp_pte);
> +                     page_remove_rmap(tmp, false);
> +                     put_page(tmp);
> +             }
> +             pte_unmap_unlock(pte, ptl);
> +     }
> +     mmu_notifier_invalidate_range_end(mm, addr, addr + HPAGE_PMD_SIZE);
> +     return;
> +}
> +
> +int remove_migration_pmd(struct page *new, pmd_t *pmd,
> +             struct vm_area_struct *vma, unsigned long addr, void *old)
> +{
> +     struct mm_struct *mm = vma->vm_mm;
> +     spinlock_t *ptl;
> +     pmd_t pmde;
> +     swp_entry_t entry;
> +
> +     pmde = *pmd;
> +     barrier();
> +
> +     if (!pmd_present(pmde)) {
> +             if (is_migration_entry(pmd_to_swp_entry(pmde))) {

                if (!is_migration_entry(pmd_to_swp_entry(pmde)))
                        return SWAP_AGAIN;

And one level less indentation below.

> +                     unsigned long mmun_start = addr & HPAGE_PMD_MASK;
> +                     unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE;
> +
> +                     ptl = pmd_lock(mm, pmd);
> +                     entry = pmd_to_swp_entry(*pmd);
> +                     if (migration_entry_to_page(entry) != old)
> +                             goto unlock_ptl;
> +                     get_page(new);
> +                     pmde = pmd_mkold(mk_huge_pmd(new, vma->vm_page_prot));
> +                     if (is_write_migration_entry(entry))
> +                             pmde = maybe_pmd_mkwrite(pmde, vma);
> +                     flush_cache_range(vma, mmun_start, mmun_end);
> +                     page_add_anon_rmap(new, vma, mmun_start, true);
> +                     pmdp_huge_clear_flush_notify(vma, mmun_start, pmd);
> +                     set_pmd_at(mm, mmun_start, pmd, pmde);
> +                     flush_tlb_range(vma, mmun_start, mmun_end);
> +                     if (vma->vm_flags & VM_LOCKED)
> +                             mlock_vma_page(new);
> +                     update_mmu_cache_pmd(vma, addr, pmd);
> +unlock_ptl:
> +                     spin_unlock(ptl);

                        return SWAP_AGAIN;

And one level less indentation below.

> +             }
> +     } else { /* pte-mapped thp */
> +             pte_t *ptep;
> +             pte_t pte;
> +             int i;
> +             struct page *tmpnew = compound_head(new);
> +             struct page *tmpold = compound_head((struct page *)old);
> +             unsigned long address = addr & HPAGE_PMD_MASK;
> +
> +             ptep = pte_offset_map(pmd, addr);
> +             ptl = pte_lockptr(mm, pmd);
> +             spin_lock(ptl);

pte_offset_map_lock() ?

> +
> +             for (i = 0; i < HPAGE_PMD_NR;
> +                  i++, ptep++, tmpnew++, tmpold++, address += PAGE_SIZE) {
> +                     pte = *ptep;
> +                     if (!is_swap_pte(pte))
> +                             continue;
> +                     entry = pte_to_swp_entry(pte);
> +                     if (!is_migration_entry(entry) ||
> +                         migration_entry_to_page(entry) != tmpold)
> +                             continue;
> +                     get_page(tmpnew);
> +                     pte = pte_mkold(mk_pte(tmpnew,
> +                                            READ_ONCE(vma->vm_page_prot)));

READ_ONCE()? Do we get here under mmap_sem, right?

> +                     if (pte_swp_soft_dirty(*ptep))
> +                             pte = pte_mksoft_dirty(pte);
> +                     if (is_write_migration_entry(entry))
> +                             pte = maybe_mkwrite(pte, vma);
> +                     flush_dcache_page(tmpnew);
> +                     set_pte_at(mm, address, ptep, pte);
> +                     if (PageAnon(new))
> +                             page_add_anon_rmap(tmpnew, vma, address, false);
> +                     else
> +                             page_add_file_rmap(tmpnew, false);
> +                     update_mmu_cache(vma, address, ptep);
> +             }
> +             pte_unmap_unlock(ptep, ptl);
> +     }
> +     return SWAP_AGAIN;
> +}
> +#endif
> diff --git v4.9-rc2-mmotm-2016-10-27-18-27/mm/migrate.c 
> v4.9-rc2-mmotm-2016-10-27-18-27_patched/mm/migrate.c
> index 66ce6b4..54f2eb6 100644
> --- v4.9-rc2-mmotm-2016-10-27-18-27/mm/migrate.c
> +++ v4.9-rc2-mmotm-2016-10-27-18-27_patched/mm/migrate.c
> @@ -198,6 +198,8 @@ static int remove_migration_pte(struct page *new, struct 
> vm_area_struct *vma,
>  {
>       struct mm_struct *mm = vma->vm_mm;
>       swp_entry_t entry;
> +     pgd_t *pgd;
> +     pud_t *pud;
>       pmd_t *pmd;
>       pte_t *ptep, pte;
>       spinlock_t *ptl;
> @@ -208,10 +210,29 @@ static int remove_migration_pte(struct page *new, 
> struct vm_area_struct *vma,
>                       goto out;
>               ptl = huge_pte_lockptr(hstate_vma(vma), mm, ptep);
>       } else {
> -             pmd = mm_find_pmd(mm, addr);
> +             pmd_t pmde;
> +
> +             pgd = pgd_offset(mm, addr);
> +             if (!pgd_present(*pgd))
> +                     goto out;
> +             pud = pud_offset(pgd, addr);
> +             if (!pud_present(*pud))
> +                     goto out;
> +             pmd = pmd_offset(pud, addr);
>               if (!pmd)
>                       goto out;
>  
> +             if (PageTransCompound(new)) {
> +                     remove_migration_pmd(new, pmd, vma, addr, old);
> +                     goto out;
> +             }
> +
> +             pmde = *pmd;
> +             barrier();
> +
> +             if (!pmd_present(pmde) || pmd_trans_huge(pmde))
> +                     goto out;
> +
>               ptep = pte_offset_map(pmd, addr);
>  
>               /*
> @@ -344,6 +365,27 @@ void migration_entry_wait_huge(struct vm_area_struct 
> *vma,
>       __migration_entry_wait(mm, pte, ptl);
>  }
>  
> +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
> +void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
> +{
> +     spinlock_t *ptl;
> +     struct page *page;
> +
> +     ptl = pmd_lock(mm, pmd);
> +     if (!is_pmd_migration_entry(*pmd))
> +             goto unlock;
> +     page = migration_entry_to_page(pmd_to_swp_entry(*pmd));
> +     if (!get_page_unless_zero(page))
> +             goto unlock;
> +     spin_unlock(ptl);
> +     wait_on_page_locked(page);
> +     put_page(page);
> +     return;
> +unlock:
> +     spin_unlock(ptl);
> +}
> +#endif
> +
>  #ifdef CONFIG_BLOCK
>  /* Returns true if all buffers are successfully locked */
>  static bool buffer_migrate_lock_buffers(struct buffer_head *head,
> diff --git v4.9-rc2-mmotm-2016-10-27-18-27/mm/pgtable-generic.c 
> v4.9-rc2-mmotm-2016-10-27-18-27_patched/mm/pgtable-generic.c
> index 71c5f91..6012343 100644
> --- v4.9-rc2-mmotm-2016-10-27-18-27/mm/pgtable-generic.c
> +++ v4.9-rc2-mmotm-2016-10-27-18-27_patched/mm/pgtable-generic.c
> @@ -118,7 +118,8 @@ pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, 
> unsigned long address,
>  {
>       pmd_t pmd;
>       VM_BUG_ON(address & ~HPAGE_PMD_MASK);
> -     VM_BUG_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
> +     VM_BUG_ON(pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) &&
> +               !pmd_devmap(*pmdp));
>       pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
>       flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
>       return pmd;
> -- 
> 2.7.0
> 

-- 
 Kirill A. Shutemov

Reply via email to