Yang Shi <[email protected]> writes:

> When the THP NUMA fault support was added THP migration was not supported yet.
> So the ad hoc THP migration was implemented in NUMA fault handling.  Since 
> v4.14
> THP migration has been supported so it doesn't make too much sense to still 
> keep
> another THP migration implementation rather than using the generic migration
> code.
>
> This patch reworked the NUMA fault handling to use generic migration 
> implementation
> to migrate misplaced page.  There is no functional change.
>
> After the refactor the flow of NUMA fault handling looks just like its
> PTE counterpart:
>   Acquire ptl
>   Restore PMD
>   Prepare for migration (elevate page refcount)
>   Release ptl
>   Isolate page from lru and elevate page refcount
>   Migrate the misplaced THP

There's some behavior change between the original implementation and
your new implementation.  Originally, PMD is restored after trying to
migrate the misplaced THP.  I think this can reduce the TLB
shooting-down IPI.

Best Regards,
Huang, Ying

> In the old code anon_vma lock was needed to serialize THP migration
> against THP split, but since then the THP code has been reworked a lot,
> it seems anon_vma lock is not required anymore to avoid the race.
>
> The page refcount elevation when holding ptl should prevent from THP
> split.
>
> Signed-off-by: Yang Shi <[email protected]>
> ---
>  include/linux/migrate.h |  23 ------
>  mm/huge_memory.c        | 132 ++++++++----------------------
>  mm/migrate.c            | 173 ++++++----------------------------------
>  3 files changed, 57 insertions(+), 271 deletions(-)
>
> diff --git a/include/linux/migrate.h b/include/linux/migrate.h
> index 6abd34986cc5..6c8640e9af4f 100644
> --- a/include/linux/migrate.h
> +++ b/include/linux/migrate.h
> @@ -100,15 +100,10 @@ static inline void __ClearPageMovable(struct page *page)
>  #endif
>  
>  #ifdef CONFIG_NUMA_BALANCING
> -extern bool pmd_trans_migrating(pmd_t pmd);
>  extern int migrate_misplaced_page(struct page *page,
>                                 struct vm_area_struct *vma, int node,
>                                 bool compound);
>  #else
> -static inline bool pmd_trans_migrating(pmd_t pmd)
> -{
> -     return false;
> -}
>  static inline int migrate_misplaced_page(struct page *page,
>                                        struct vm_area_struct *vma, int node,
>                                        bool compound)
> @@ -117,24 +112,6 @@ static inline int migrate_misplaced_page(struct page 
> *page,
>  }
>  #endif /* CONFIG_NUMA_BALANCING */
>  
> -#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
> -extern int migrate_misplaced_transhuge_page(struct mm_struct *mm,
> -                     struct vm_area_struct *vma,
> -                     pmd_t *pmd, pmd_t entry,
> -                     unsigned long address,
> -                     struct page *page, int node);
> -#else
> -static inline int migrate_misplaced_transhuge_page(struct mm_struct *mm,
> -                     struct vm_area_struct *vma,
> -                     pmd_t *pmd, pmd_t entry,
> -                     unsigned long address,
> -                     struct page *page, int node)
> -{
> -     return -EAGAIN;
> -}
> -#endif /* CONFIG_NUMA_BALANCING && CONFIG_TRANSPARENT_HUGEPAGE*/
> -
> -
>  #ifdef CONFIG_MIGRATION
>  
>  /*
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 53f3843ce72a..157c63b0fd95 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -1419,94 +1419,20 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
>  {
>       struct vm_area_struct *vma = vmf->vma;
>       pmd_t pmd = vmf->orig_pmd;
> -     struct anon_vma *anon_vma = NULL;
>       struct page *page;
>       unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
> -     int page_nid = NUMA_NO_NODE, this_nid = numa_node_id();
> +     int page_nid = NUMA_NO_NODE;
>       int target_nid, last_cpupid = -1;
> -     bool page_locked;
>       bool migrated = false;
> -     bool was_writable;
> +     bool was_writable = pmd_savedwrite(pmd);
>       int flags = 0;
>  
>       vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
> -     if (unlikely(!pmd_same(pmd, *vmf->pmd)))
> -             goto out_unlock;
> -
> -     /*
> -      * If there are potential migrations, wait for completion and retry
> -      * without disrupting NUMA hinting information. Do not relock and
> -      * check_same as the page may no longer be mapped.
> -      */
> -     if (unlikely(pmd_trans_migrating(*vmf->pmd))) {
> -             page = pmd_page(*vmf->pmd);
> -             if (!get_page_unless_zero(page))
> -                     goto out_unlock;
> -             spin_unlock(vmf->ptl);
> -             put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
> -             goto out;
> -     }
> -
> -     page = pmd_page(pmd);
> -     BUG_ON(is_huge_zero_page(page));
> -     page_nid = page_to_nid(page);
> -     last_cpupid = page_cpupid_last(page);
> -     count_vm_numa_event(NUMA_HINT_FAULTS);
> -     if (page_nid == this_nid) {
> -             count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
> -             flags |= TNF_FAULT_LOCAL;
> -     }
> -
> -     /* See similar comment in do_numa_page for explanation */
> -     if (!pmd_savedwrite(pmd))
> -             flags |= TNF_NO_GROUP;
> -
> -     /*
> -      * Acquire the page lock to serialise THP migrations but avoid dropping
> -      * page_table_lock if at all possible
> -      */
> -     page_locked = trylock_page(page);
> -     target_nid = mpol_misplaced(page, vma, haddr);
> -     if (target_nid == NUMA_NO_NODE) {
> -             /* If the page was locked, there are no parallel migrations */
> -             if (page_locked)
> -                     goto clear_pmdnuma;
> -     }
> -
> -     /* Migration could have started since the pmd_trans_migrating check */
> -     if (!page_locked) {
> -             page_nid = NUMA_NO_NODE;
> -             if (!get_page_unless_zero(page))
> -                     goto out_unlock;
> +     if (unlikely(!pmd_same(pmd, *vmf->pmd))) {
>               spin_unlock(vmf->ptl);
> -             put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
>               goto out;
>       }
>  
> -     /*
> -      * Page is misplaced. Page lock serialises migrations. Acquire anon_vma
> -      * to serialises splits
> -      */
> -     get_page(page);
> -     spin_unlock(vmf->ptl);
> -     anon_vma = page_lock_anon_vma_read(page);
> -
> -     /* Confirm the PMD did not change while page_table_lock was released */
> -     spin_lock(vmf->ptl);
> -     if (unlikely(!pmd_same(pmd, *vmf->pmd))) {
> -             unlock_page(page);
> -             put_page(page);
> -             page_nid = NUMA_NO_NODE;
> -             goto out_unlock;
> -     }
> -
> -     /* Bail if we fail to protect against THP splits for any reason */
> -     if (unlikely(!anon_vma)) {
> -             put_page(page);
> -             page_nid = NUMA_NO_NODE;
> -             goto clear_pmdnuma;
> -     }
> -
>       /*
>        * Since we took the NUMA fault, we must have observed the !accessible
>        * bit. Make sure all other CPUs agree with that, to avoid them
> @@ -1533,38 +1459,44 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
>                                             haddr + HPAGE_PMD_SIZE);
>       }
>  
> -     /*
> -      * Migrate the THP to the requested node, returns with page unlocked
> -      * and access rights restored.
> -      */
> -     spin_unlock(vmf->ptl);
> -
> -     migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma,
> -                             vmf->pmd, pmd, vmf->address, page, target_nid);
> -     if (migrated) {
> -             flags |= TNF_MIGRATED;
> -             page_nid = target_nid;
> -     } else
> -             flags |= TNF_MIGRATE_FAIL;
> -
> -     goto out;
> -clear_pmdnuma:
> -     BUG_ON(!PageLocked(page));
> -     was_writable = pmd_savedwrite(pmd);
> +     /* Restore the PMD */
>       pmd = pmd_modify(pmd, vma->vm_page_prot);
>       pmd = pmd_mkyoung(pmd);
>       if (was_writable)
>               pmd = pmd_mkwrite(pmd);
>       set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
>       update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
> -     unlock_page(page);
> -out_unlock:
> +
> +     page = vm_normal_page_pmd(vma, haddr, pmd);
> +     if (!page) {
> +             spin_unlock(vmf->ptl);
> +             goto out;
> +     }
> +
> +     /* See similar comment in do_numa_page for explanation */
> +     if (!was_writable)
> +             flags |= TNF_NO_GROUP;
> +
> +     page_nid = page_to_nid(page);
> +     last_cpupid = page_cpupid_last(page);
> +     target_nid = numa_migrate_prep(page, vma, haddr, page_nid,
> +                                    &flags);
> +
>       spin_unlock(vmf->ptl);
>  
> -out:
> -     if (anon_vma)
> -             page_unlock_anon_vma_read(anon_vma);
> +     if (target_nid == NUMA_NO_NODE) {
> +             put_page(page);
> +             goto out;
> +     }
> +
> +     migrated = migrate_misplaced_page(page, vma, target_nid, true);
> +     if (migrated) {
> +             flags |= TNF_MIGRATED;
> +             page_nid = target_nid;
> +     } else
> +             flags |= TNF_MIGRATE_FAIL;
>  
> +out:
>       if (page_nid != NUMA_NO_NODE)
>               task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR,
>                               flags);
> diff --git a/mm/migrate.c b/mm/migrate.c
> index 9c4ae5132919..86325c750c14 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -2066,6 +2066,23 @@ static struct page *alloc_misplaced_dst_page(struct 
> page *page,
>       return newpage;
>  }
>  
> +static struct page *alloc_misplaced_dst_page_thp(struct page *page,
> +                                              unsigned long data)
> +{
> +     int nid = (int) data;
> +     struct page *newpage;
> +
> +     newpage = alloc_pages_node(nid, (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE),
> +                                HPAGE_PMD_ORDER);
> +     if (!newpage)
> +             goto out;
> +
> +     prep_transhuge_page(newpage);
> +
> +out:
> +     return newpage;
> +}
> +
>  static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
>  {
>       int page_lru;
> @@ -2104,12 +2121,6 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, 
> struct page *page)
>       return 1;
>  }
>  
> -bool pmd_trans_migrating(pmd_t pmd)
> -{
> -     struct page *page = pmd_page(pmd);
> -     return PageLocked(page);
> -}
> -
>  static inline bool is_shared_exec_page(struct vm_area_struct *vma,
>                                      struct page *page)
>  {
> @@ -2133,6 +2144,12 @@ int migrate_misplaced_page(struct page *page, struct 
> vm_area_struct *vma,
>       int isolated;
>       int nr_remaining;
>       LIST_HEAD(migratepages);
> +     new_page_t *new;
> +
> +     if (compound)
> +             new = alloc_misplaced_dst_page_thp;
> +     else
> +             new = alloc_misplaced_dst_page;
>  
>       /*
>        * Don't migrate file pages that are mapped in multiple processes
> @@ -2153,9 +2170,8 @@ int migrate_misplaced_page(struct page *page, struct 
> vm_area_struct *vma,
>               goto out;
>  
>       list_add(&page->lru, &migratepages);
> -     nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
> -                                  NULL, node, MIGRATE_ASYNC,
> -                                  MR_NUMA_MISPLACED);
> +     nr_remaining = migrate_pages(&migratepages, *new, NULL, node,
> +                                  MIGRATE_ASYNC, MR_NUMA_MISPLACED);
>       if (nr_remaining) {
>               if (!list_empty(&migratepages)) {
>                       list_del(&page->lru);
> @@ -2174,145 +2190,6 @@ int migrate_misplaced_page(struct page *page, struct 
> vm_area_struct *vma,
>       return 0;
>  }
>  #endif /* CONFIG_NUMA_BALANCING */
> -
> -#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
> -/*
> - * Migrates a THP to a given target node. page must be locked and is unlocked
> - * before returning.
> - */
> -int migrate_misplaced_transhuge_page(struct mm_struct *mm,
> -                             struct vm_area_struct *vma,
> -                             pmd_t *pmd, pmd_t entry,
> -                             unsigned long address,
> -                             struct page *page, int node)
> -{
> -     spinlock_t *ptl;
> -     pg_data_t *pgdat = NODE_DATA(node);
> -     int isolated = 0;
> -     struct page *new_page = NULL;
> -     int page_lru = page_is_file_lru(page);
> -     unsigned long start = address & HPAGE_PMD_MASK;
> -
> -     if (is_shared_exec_page(vma, page))
> -             goto out;
> -
> -     new_page = alloc_pages_node(node,
> -             (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE),
> -             HPAGE_PMD_ORDER);
> -     if (!new_page)
> -             goto out_fail;
> -     prep_transhuge_page(new_page);
> -
> -     isolated = numamigrate_isolate_page(pgdat, page);
> -     if (!isolated) {
> -             put_page(new_page);
> -             goto out_fail;
> -     }
> -
> -     /* Prepare a page as a migration target */
> -     __SetPageLocked(new_page);
> -     if (PageSwapBacked(page))
> -             __SetPageSwapBacked(new_page);
> -
> -     /* anon mapping, we can simply copy page->mapping to the new page: */
> -     new_page->mapping = page->mapping;
> -     new_page->index = page->index;
> -     /* flush the cache before copying using the kernel virtual address */
> -     flush_cache_range(vma, start, start + HPAGE_PMD_SIZE);
> -     migrate_page_copy(new_page, page);
> -     WARN_ON(PageLRU(new_page));
> -
> -     /* Recheck the target PMD */
> -     ptl = pmd_lock(mm, pmd);
> -     if (unlikely(!pmd_same(*pmd, entry) || !page_ref_freeze(page, 2))) {
> -             spin_unlock(ptl);
> -
> -             /* Reverse changes made by migrate_page_copy() */
> -             if (TestClearPageActive(new_page))
> -                     SetPageActive(page);
> -             if (TestClearPageUnevictable(new_page))
> -                     SetPageUnevictable(page);
> -
> -             unlock_page(new_page);
> -             put_page(new_page);             /* Free it */
> -
> -             /* Retake the callers reference and putback on LRU */
> -             get_page(page);
> -             putback_lru_page(page);
> -             mod_node_page_state(page_pgdat(page),
> -                      NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR);
> -
> -             goto out_unlock;
> -     }
> -
> -     entry = mk_huge_pmd(new_page, vma->vm_page_prot);
> -     entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
> -
> -     /*
> -      * Overwrite the old entry under pagetable lock and establish
> -      * the new PTE. Any parallel GUP will either observe the old
> -      * page blocking on the page lock, block on the page table
> -      * lock or observe the new page. The SetPageUptodate on the
> -      * new page and page_add_new_anon_rmap guarantee the copy is
> -      * visible before the pagetable update.
> -      */
> -     page_add_anon_rmap(new_page, vma, start, true);
> -     /*
> -      * At this point the pmd is numa/protnone (i.e. non present) and the TLB
> -      * has already been flushed globally.  So no TLB can be currently
> -      * caching this non present pmd mapping.  There's no need to clear the
> -      * pmd before doing set_pmd_at(), nor to flush the TLB after
> -      * set_pmd_at().  Clearing the pmd here would introduce a race
> -      * condition against MADV_DONTNEED, because MADV_DONTNEED only holds the
> -      * mmap_lock for reading.  If the pmd is set to NULL at any given time,
> -      * MADV_DONTNEED won't wait on the pmd lock and it'll skip clearing this
> -      * pmd.
> -      */
> -     set_pmd_at(mm, start, pmd, entry);
> -     update_mmu_cache_pmd(vma, address, &entry);
> -
> -     page_ref_unfreeze(page, 2);
> -     mlock_migrate_page(new_page, page);
> -     page_remove_rmap(page, true);
> -     set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED);
> -
> -     spin_unlock(ptl);
> -
> -     /* Take an "isolate" reference and put new page on the LRU. */
> -     get_page(new_page);
> -     putback_lru_page(new_page);
> -
> -     unlock_page(new_page);
> -     unlock_page(page);
> -     put_page(page);                 /* Drop the rmap reference */
> -     put_page(page);                 /* Drop the LRU isolation reference */
> -
> -     count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
> -     count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
> -
> -     mod_node_page_state(page_pgdat(page),
> -                     NR_ISOLATED_ANON + page_lru,
> -                     -HPAGE_PMD_NR);
> -     return isolated;
> -
> -out_fail:
> -     count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
> -     ptl = pmd_lock(mm, pmd);
> -     if (pmd_same(*pmd, entry)) {
> -             entry = pmd_modify(entry, vma->vm_page_prot);
> -             set_pmd_at(mm, start, pmd, entry);
> -             update_mmu_cache_pmd(vma, address, &entry);
> -     }
> -     spin_unlock(ptl);
> -
> -out_unlock:
> -     unlock_page(page);
> -out:
> -     put_page(page);
> -     return 0;
> -}
> -#endif /* CONFIG_NUMA_BALANCING */
> -
>  #endif /* CONFIG_NUMA */
>  
>  #ifdef CONFIG_DEVICE_PRIVATE

Reply via email to