On Tue, Jul 31, 2012 at 09:12:08PM +0200, Peter Zijlstra wrote:
> If we marked a THP with our special PROT_NONE protections, ensure we
> don't lose them over a split.
> 
> Collapse seems to always allocate a new (huge) page which should
> already end up on the new target node so losing protections there
> isn't a problem.

This looks like an optimization too, as it reduces a few branches.
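
Roughly, a condensed before/after of the per-pte loop from the hunk
below, just to show where the branches go (a sketch, not the literal
code):

	/* before: the huge pmd is re-tested for every pte */
	entry = mk_pte(page + i, vma->vm_page_prot);
	entry = maybe_mkwrite(pte_mkdirty(entry), vma);
	if (!pmd_write(*pmd))
		entry = pte_wrprotect(entry);

	/* after: the protections (PROT_NONE marker included) are taken
	 * once from the huge pmd, so the per-pte write branch disappears */
	prot = pmd_pgprot(*pmd);	/* once, before the loop */
	entry = pte_mkdirty(mk_pte(page + i, prot));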

If you hadn't introduced an unnecessary goto, the actual change would
have been more readable and the patch much smaller. (You could have
cleaned it up with a later patch if you disliked the coding style that
tried to avoid unnecessary gotos.)

The s/barrier/ACCESS_ONCE/ change I'll merge into my tree as a separate
commit, as it's not related to sched-numa.
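
For the record, the pattern in question (a minimal sketch, assuming the
usual kernel definitions of barrier() and ACCESS_ONCE()):

	/* old: copy to the local stack, then a compiler barrier to keep
	 * gcc from re-reading the global later in the function */
	unsigned int pages = khugepaged_pages_to_scan;
	barrier();

	/* new: a single volatile load, i.e.
	 *	#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
	 * which documents the one-shot read at the access itself */
	unsigned int pages = ACCESS_ONCE(khugepaged_pages_to_scan);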

> 
> Cc: Rik van Riel <r...@redhat.com>
> Cc: Paul Turner <p...@google.com>
> Signed-off-by: Peter Zijlstra <a.p.zijls...@chello.nl>
> ---
>  arch/x86/include/asm/pgtable.h |    1 
>  mm/huge_memory.c               |  104 +++++++++++++++++++----------------------
>  2 files changed, 50 insertions(+), 55 deletions(-)
> --- a/arch/x86/include/asm/pgtable.h
> +++ b/arch/x86/include/asm/pgtable.h
> @@ -350,6 +350,7 @@ static inline pgprot_t pgprot_modify(pgp
>  }
>  
>  #define pte_pgprot(x) __pgprot(pte_flags(x) & PTE_FLAGS_MASK)
> +#define pmd_pgprot(x) __pgprot(pmd_val(x) & ~_HPAGE_CHG_MASK)
>  
>  #define canon_pgprot(p) __pgprot(massage_pgprot(p))
>  
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -1353,64 +1353,60 @@ static int __split_huge_page_map(struct 
>       int ret = 0, i;
>       pgtable_t pgtable;
>       unsigned long haddr;
> +     pgprot_t prot;
>  
>       spin_lock(&mm->page_table_lock);
>       pmd = page_check_address_pmd(page, mm, address,
>                                    PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
> -     if (pmd) {
> -             pgtable = get_pmd_huge_pte(mm);
> -             pmd_populate(mm, &_pmd, pgtable);
> -
> -             for (i = 0, haddr = address; i < HPAGE_PMD_NR;
> -                  i++, haddr += PAGE_SIZE) {
> -                     pte_t *pte, entry;
> -                     BUG_ON(PageCompound(page+i));
> -                     entry = mk_pte(page + i, vma->vm_page_prot);
> -                     entry = maybe_mkwrite(pte_mkdirty(entry), vma);
> -                     if (!pmd_write(*pmd))
> -                             entry = pte_wrprotect(entry);
> -                     else
> -                             BUG_ON(page_mapcount(page) != 1);
> -                     if (!pmd_young(*pmd))
> -                             entry = pte_mkold(entry);
> -                     pte = pte_offset_map(&_pmd, haddr);
> -                     BUG_ON(!pte_none(*pte));
> -                     set_pte_at(mm, haddr, pte, entry);
> -                     pte_unmap(pte);
> -             }
> +     if (!pmd)
> +             goto unlock;
>  
> -             smp_wmb(); /* make pte visible before pmd */
> -             /*
> -              * Up to this point the pmd is present and huge and
> -              * userland has the whole access to the hugepage
> -              * during the split (which happens in place). If we
> -              * overwrite the pmd with the not-huge version
> -              * pointing to the pte here (which of course we could
> -              * if all CPUs were bug free), userland could trigger
> -              * a small page size TLB miss on the small sized TLB
> -              * while the hugepage TLB entry is still established
> -              * in the huge TLB. Some CPU doesn't like that. See
> -              * http://support.amd.com/us/Processor_TechDocs/41322.pdf,
> -              * Erratum 383 on page 93. Intel should be safe but is
> -              * also warns that it's only safe if the permission
> -              * and cache attributes of the two entries loaded in
> -              * the two TLB is identical (which should be the case
> -              * here). But it is generally safer to never allow
> -              * small and huge TLB entries for the same virtual
> -              * address to be loaded simultaneously. So instead of
> -              * doing "pmd_populate(); flush_tlb_range();" we first
> -              * mark the current pmd notpresent (atomically because
> -              * here the pmd_trans_huge and pmd_trans_splitting
> -              * must remain set at all times on the pmd until the
> -              * split is complete for this pmd), then we flush the
> -              * SMP TLB and finally we write the non-huge version
> -              * of the pmd entry with pmd_populate.
> -              */
> -             set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd));
> -             flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
> -             pmd_populate(mm, pmd, pgtable);
> -             ret = 1;
> +     prot = pmd_pgprot(*pmd);
> +     pgtable = get_pmd_huge_pte(mm);
> +     pmd_populate(mm, &_pmd, pgtable);
> +
> +     for (i = 0, haddr = address; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
> +             pte_t *pte, entry;
> +
> +             BUG_ON(PageCompound(page+i));
> +             entry = mk_pte(page + i, prot);
> +             entry = pte_mkdirty(entry);
> +             if (!pmd_young(*pmd))
> +                     entry = pte_mkold(entry);
> +             pte = pte_offset_map(&_pmd, haddr);
> +             BUG_ON(!pte_none(*pte));
> +             set_pte_at(mm, haddr, pte, entry);
> +             pte_unmap(pte);
>       }
> +
> +     smp_wmb(); /* make ptes visible before pmd, see __pte_alloc */
> +     /*
> +      * Up to this point the pmd is present and huge.
> +      *
> +      * If we overwrite the pmd with the not-huge version, we could trigger
> +      * a small page size TLB miss on the small sized TLB while the hugepage
> +      * TLB entry is still established in the huge TLB.
> +      *
> +      * Some CPUs don't like that. See
> +      * http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum 383
> +      * on page 93.
> +      *
> +      * Thus it is generally safer to never allow small and huge TLB entries
> +      * for overlapping virtual addresses to be loaded. So we first mark the
> +      * current pmd not present, then we flush the TLB and finally we write
> +      * the non-huge version of the pmd entry with pmd_populate.
> +      *
> +      * The above needs to be done under the ptl because pmd_trans_huge and
> +      * pmd_trans_splitting must remain set on the pmd until the split is
> +      * complete. The ptl also protects against concurrent faults due to
> +      * making the pmd not-present.
> +      */
> +     set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd));
> +     flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
> +     pmd_populate(mm, pmd, pgtable);
> +     ret = 1;
> +
> +unlock:
>       spin_unlock(&mm->page_table_lock);
>  
>       return ret;
> @@ -2241,9 +2237,7 @@ static int khugepaged_wait_event(void)
>  static void khugepaged_do_scan(struct page **hpage)
>  {
>       unsigned int progress = 0, pass_through_head = 0;
> -     unsigned int pages = khugepaged_pages_to_scan;
> -
> -     barrier(); /* write khugepaged_pages_to_scan to local stack */
> +     unsigned int pages = ACCESS_ONCE(khugepaged_pages_to_scan);
>  
>       while (progress < pages) {
>               cond_resched();
> 
> 