On Mon 03-12-18 15:50:24, David Rientjes wrote:
> This is a full revert of ac5b2c18911f ("mm: thp: relax __GFP_THISNODE for
> MADV_HUGEPAGE mappings") and a partial revert of 89c83fb539f9 ("mm, thp:
> consolidate THP gfp handling into alloc_hugepage_direct_gfpmask").
> 
> By not setting __GFP_THISNODE, applications can allocate remote hugepages
> when the local node is fragmented or low on memory when either the thp
> defrag setting is "always" or the vma has been madvised with
> MADV_HUGEPAGE.
> 
> Remote access to hugepages often has much higher latency than local pages
> of the native page size.  On Haswell, ac5b2c18911f was shown to have a
> 13.9% access regression after this commit for binaries that remap their
> text segment to be backed by transparent hugepages.
> 
> The intent of ac5b2c18911f is to address an issue where a local node is
> low on memory or fragmented such that a hugepage cannot be allocated.  In
> every scenario where this was described as a fix, there is abundant and
> unfragmented remote memory available to allocate from, even with a greater
> access latency.
> 
> If remote memory is also low or fragmented, not setting __GFP_THISNODE was
> also measured on Haswell to have a 40% regression in allocation latency.
> 
> Restore __GFP_THISNODE for thp allocations.
> 
> Fixes: ac5b2c18911f ("mm: thp: relax __GFP_THISNODE for MADV_HUGEPAGE 
> mappings")
> Fixes: 89c83fb539f9 ("mm, thp: consolidate THP gfp handling into 
> alloc_hugepage_direct_gfpmask")

At minimum do not remove the cleanup part which consolidates the gfp
hadnling to a single place. There is no real reason to have the
__GFP_THISNODE ugliness outside of alloc_hugepage_direct_gfpmask.

I still hate the __GFP_THISNODE part as mentioned before. It is an ugly
hack but I can learn to live with it if this is indeed the only option
for the short term workaround until we find a proper solution.

> Signed-off-by: David Rientjes <[email protected]>
> ---
>  include/linux/mempolicy.h |  2 --
>  mm/huge_memory.c          | 42 +++++++++++++++------------------------
>  mm/mempolicy.c            |  7 ++++---
>  3 files changed, 20 insertions(+), 31 deletions(-)
> 
> diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
> --- a/include/linux/mempolicy.h
> +++ b/include/linux/mempolicy.h
> @@ -139,8 +139,6 @@ struct mempolicy *mpol_shared_policy_lookup(struct 
> shared_policy *sp,
>  struct mempolicy *get_task_policy(struct task_struct *p);
>  struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
>               unsigned long addr);
> -struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
> -                                             unsigned long addr);
>  bool vma_policy_mof(struct vm_area_struct *vma);
>  
>  extern void numa_default_policy(void);
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -632,37 +632,27 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct 
> vm_fault *vmf,
>  static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct 
> *vma, unsigned long addr)
>  {
>       const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
> -     gfp_t this_node = 0;
> -
> -#ifdef CONFIG_NUMA
> -     struct mempolicy *pol;
> -     /*
> -      * __GFP_THISNODE is used only when __GFP_DIRECT_RECLAIM is not
> -      * specified, to express a general desire to stay on the current
> -      * node for optimistic allocation attempts. If the defrag mode
> -      * and/or madvise hint requires the direct reclaim then we prefer
> -      * to fallback to other node rather than node reclaim because that
> -      * can lead to excessive reclaim even though there is free memory
> -      * on other nodes. We expect that NUMA preferences are specified
> -      * by memory policies.
> -      */
> -     pol = get_vma_policy(vma, addr);
> -     if (pol->mode != MPOL_BIND)
> -             this_node = __GFP_THISNODE;
> -     mpol_cond_put(pol);
> -#endif
> +     const gfp_t gfp_mask = GFP_TRANSHUGE_LIGHT | __GFP_THISNODE;
>  
> +     /* Always do synchronous compaction */
>       if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, 
> &transparent_hugepage_flags))
> -             return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
> +             return GFP_TRANSHUGE | __GFP_THISNODE |
> +                    (vma_madvised ? 0 : __GFP_NORETRY);
> +
> +     /* Kick kcompactd and fail quickly */
>       if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, 
> &transparent_hugepage_flags))
> -             return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM | this_node;
> +             return gfp_mask | __GFP_KSWAPD_RECLAIM;
> +
> +     /* Synchronous compaction if madvised, otherwise kick kcompactd */
>       if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, 
> &transparent_hugepage_flags))
> -             return GFP_TRANSHUGE_LIGHT | (vma_madvised ? 
> __GFP_DIRECT_RECLAIM :
> -                                                          
> __GFP_KSWAPD_RECLAIM | this_node);
> +             return gfp_mask | (vma_madvised ? __GFP_DIRECT_RECLAIM :
> +                                               __GFP_KSWAPD_RECLAIM);
> +
> +     /* Only do synchronous compaction if madvised */
>       if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, 
> &transparent_hugepage_flags))
> -             return GFP_TRANSHUGE_LIGHT | (vma_madvised ? 
> __GFP_DIRECT_RECLAIM :
> -                                                          this_node);
> -     return GFP_TRANSHUGE_LIGHT | this_node;
> +             return gfp_mask | (vma_madvised ? __GFP_DIRECT_RECLAIM : 0);
> +
> +     return gfp_mask;
>  }
>  
>  /* Caller must hold page table lock. */
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -1116,8 +1116,9 @@ static struct page *new_page(struct page *page, 
> unsigned long start)
>       } else if (PageTransHuge(page)) {
>               struct page *thp;
>  
> -             thp = alloc_pages_vma(GFP_TRANSHUGE, HPAGE_PMD_ORDER, vma,
> -                             address, numa_node_id());
> +             thp = alloc_pages_vma(GFP_TRANSHUGE | __GFP_THISNODE,
> +                                   HPAGE_PMD_ORDER, vma, address,
> +                                   numa_node_id());
>               if (!thp)
>                       return NULL;
>               prep_transhuge_page(thp);
> @@ -1662,7 +1663,7 @@ struct mempolicy *__get_vma_policy(struct 
> vm_area_struct *vma,
>   * freeing by another task.  It is the caller's responsibility to free the
>   * extra reference for shared policies.
>   */
> -struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
> +static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
>                                               unsigned long addr)
>  {
>       struct mempolicy *pol = __get_vma_policy(vma, addr);

-- 
Michal Hocko
SUSE Labs

Reply via email to