In gather_pte_stats() a THP pmd is cast into a pte, which is wrong because the layouts may differ depending on the architecture. On s390 this will lead to inaccurate numap_maps accounting in /proc because of misguided pte_present() and pte_dirty() checks on the fake pte.
On other architectures pte_present() and pte_dirty() may work by chance, but there may be an issue with direct-access (dax) mappings w/o underlying struct pages when HAVE_PTE_SPECIAL is set and THP is available. In vm_normal_page() the fake pte will be checked with pte_special() and because there is no "special" bit in a pmd, this will always return false and the VM_PFNMAP | VM_MIXEDMAP checking will be skipped. On dax mappings w/o struct pages, an invalid struct page pointer would then be returned that can crash the kernel. This patch fixes the numa_maps THP handling by introducing new "_pmd" variants of the can_gather_numa_stats() and vm_normal_page() functions. Cc: <sta...@vger.kernel.org> # v4.3+ Signed-off-by: Gerald Schaefer <gerald.schae...@de.ibm.com> --- Changes in v2: - Fixed compile error on some archs when CONFIG_TRANSPARENT_HUGEPAGE is not defined. fs/proc/task_mmu.c | 33 ++++++++++++++++++++++++++++++--- include/linux/mm.h | 2 ++ mm/memory.c | 40 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 72 insertions(+), 3 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 9df4316..f2ded210 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1518,6 +1518,32 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma, return page; } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static struct page *can_gather_numa_stats_pmd(pmd_t pmd, + struct vm_area_struct *vma, + unsigned long addr) +{ + struct page *page; + int nid; + + if (!pmd_present(pmd)) + return NULL; + + page = vm_normal_page_pmd(vma, addr, pmd); + if (!page) + return NULL; + + if (PageReserved(page)) + return NULL; + + nid = page_to_nid(page); + if (!node_isset(nid, node_states[N_MEMORY])) + return NULL; + + return page; +} +#endif + static int gather_pte_stats(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { @@ -1527,14 +1553,14 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, pte_t *orig_pte; pte_t *pte; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { - pte_t huge_pte = *(pte_t *)pmd; struct page *page; - page = can_gather_numa_stats(huge_pte, vma, addr); + page = can_gather_numa_stats_pmd(*pmd, vma, addr); if (page) - gather_stats(page, md, pte_dirty(huge_pte), + gather_stats(page, md, pmd_dirty(*pmd), HPAGE_PMD_SIZE/PAGE_SIZE); spin_unlock(ptl); return 0; @@ -1542,6 +1568,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, if (pmd_trans_unstable(pmd)) return 0; +#endif orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); do { struct page *page = can_gather_numa_stats(*pte, vma, addr); diff --git a/include/linux/mm.h b/include/linux/mm.h index 6bff79a..c5b8efc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1121,6 +1121,8 @@ struct zap_details { struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte); +struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t pmd); int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size); diff --git a/mm/memory.c b/mm/memory.c index 288a508..b73b833 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -789,6 +789,46 @@ out: return pfn_to_page(pfn); } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t pmd) +{ + unsigned long pfn = pmd_pfn(pmd); + + /* + * There is no pmd_special() but there may be special pmds, e.g. + * in a direct-access (dax) mapping, so let's just replicate the + * !HAVE_PTE_SPECIAL case from vm_normal_page() here. + */ + if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { + if (vma->vm_flags & VM_MIXEDMAP) { + if (!pfn_valid(pfn)) + return NULL; + goto out; + } else { + unsigned long off; + off = (addr - vma->vm_start) >> PAGE_SHIFT; + if (pfn == vma->vm_pgoff + off) + return NULL; + if (!is_cow_mapping(vma->vm_flags)) + return NULL; + } + } + + if (is_zero_pfn(pfn)) + return NULL; + if (unlikely(pfn > highest_memmap_pfn)) + return NULL; + + /* + * NOTE! We still have PageReserved() pages in the page tables. + * eg. VDSO mappings can cause them to exist. + */ +out: + return pfn_to_page(pfn); +} +#endif + /* * copy one vm_area from one task to the other. Assumes the page tables * already present in the new task to be cleared in the whole range -- 2.6.6