The recent woes with some arches needing their own pgd_addr_end macro; the 4-level clear_page_range regression since 2.6.10's clear_page_tables; and its long-standing, well-known inefficiency in searching throughout the higher-level page tables for the few entries that actually need to be cleared and freed: all can be blamed on ignoring the list of vmas when we free page tables.
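To put a rough shape on that inefficiency: a typical process has page tables under only a handful of vmas, yet the old clear_page_range path made exit_mmap sweep the whole user address range looking for them. The toy userspace program below is illustration only, not kernel code (the 4MB "slot" size, the 768-slot space and the sample addresses are all invented), but it shows how many top-level entries a full sweep visits compared with a walk guided by the vma list.

/*
 * Toy illustration only, not kernel code: compare how many top-level
 * "pgd slots" a full scan of the user address space visits with how
 * many a walk guided by the vma list visits.  The 4MB slot size, the
 * 768-slot space and the addresses below are invented for the demo.
 */
#include <stdio.h>
#include <stdbool.h>

#define TOY_PGDIR_SHIFT	22			/* each slot covers 4MB */
#define TOY_USER_SLOTS	768			/* ~3GB of user space */

struct toy_vma { unsigned long start, end; };	/* [start, end) */

static bool table_present[TOY_USER_SLOTS];

/* Old way: visit every slot of the whole user address space. */
static unsigned long scan_whole_space(void)
{
	unsigned long visited = 0, slot;

	for (slot = 0; slot < TOY_USER_SLOTS; slot++) {
		visited++;
		table_present[slot] = false;	/* "free" whatever is there */
	}
	return visited;
}

/* New way: visit only the slots that some vma actually spans. */
static unsigned long scan_by_vmas(const struct toy_vma *vmas, int nr)
{
	unsigned long visited = 0, slot;
	int i;

	for (i = 0; i < nr; i++)
		for (slot = vmas[i].start >> TOY_PGDIR_SHIFT;
		     slot <= (vmas[i].end - 1) >> TOY_PGDIR_SHIFT; slot++) {
			visited++;
			table_present[slot] = false;
		}
	return visited;
}

int main(void)
{
	struct toy_vma vmas[] = {		/* a small process */
		{ 0x08048000, 0x08052000 },	/* text + data */
		{ 0x40000000, 0x40016000 },	/* shared libraries */
		{ 0xbfffd000, 0xc0000000 },	/* stack */
	};
	int i, nr = sizeof(vmas) / sizeof(vmas[0]);

	for (i = 0; i < nr; i++)
		table_present[vmas[i].start >> TOY_PGDIR_SHIFT] = true;
	printf("full sweep visited %lu slots\n", scan_whole_space());

	for (i = 0; i < nr; i++)
		table_present[vmas[i].start >> TOY_PGDIR_SHIFT] = true;
	printf("vma-guided walk visited %lu slots\n", scan_by_vmas(vmas, nr));
	return 0;
}

On that made-up layout the full sweep visits 768 slots to find three page tables; the vma-guided walk visits three.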
Replace exit_mmap's clear_page_range of the total user address space by
free_pgtables operating on the mm's vma list; unmap_region uses it in
the same way, giving floor and ceiling beyond which it may not free
tables (a standalone sketch of that clamping is appended after the
patch).  This brings lmbench fork/exec/sh numbers back to 2.6.10
(unless preempt is enabled, in which case latency fixes spoil
unmap_vmas throughput).

Beware: the do_mmap_pgoff driver failure case must now use unmap_region
instead of zap_page_range, since a page table might have been
allocated, and can only be freed while it is touched by some vma.

Move free_pgtables from mmap.c to memory.c, where its lower levels are
adapted from the clear_page_range levels.  (Most of free_pgtables' old
code was actually for a non-existent case, prev not properly set up,
dating from before hch gave us split_vma.)  Pass mmu_gather** in the
public interfaces, since we might want to add latency lockdrops later;
but no attempt to do so yet: going by vma should itself reduce latency.

But what if is_hugepage_only_range?  Those ia64 and ppc64 cases need
careful examination: put that off until a later patch of the series.

Signed-off-by: Hugh Dickins <[EMAIL PROTECTED]>
---

 arch/i386/mm/pgtable.c     |    2 
 arch/ia64/mm/hugetlbpage.c |   37 -------------
 include/linux/mm.h         |    3 -
 mm/memory.c                |  121 +++++++++++++++++++++++++++---------------
 mm/mmap.c                  |  102 +++++++------------------------------
 5 files changed, 108 insertions(+), 157 deletions(-)

--- 2.6.12-rc1-bk1/arch/i386/mm/pgtable.c	2005-03-02 07:39:18.000000000 +0000
+++ freepgt1/arch/i386/mm/pgtable.c	2005-03-21 19:06:35.000000000 +0000
@@ -255,6 +255,6 @@ void pgd_free(pgd_t *pgd)
 	if (PTRS_PER_PMD > 1)
 		for (i = 0; i < USER_PTRS_PER_PGD; ++i)
 			kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
-	/* in the non-PAE case, clear_page_range() clears user pgd entries */
+	/* in the non-PAE case, free_pgtables() clears user pgd entries */
 	kmem_cache_free(pgd_cache, pgd);
 }
--- 2.6.12-rc1-bk1/arch/ia64/mm/hugetlbpage.c	2005-03-18 10:22:18.000000000 +0000
+++ freepgt1/arch/ia64/mm/hugetlbpage.c	2005-03-21 19:06:35.000000000 +0000
@@ -187,45 +187,12 @@ follow_huge_pmd(struct mm_struct *mm, un
 }
 
 /*
- * Same as generic free_pgtables(), except constant PGDIR_* and pgd_offset
- * are hugetlb region specific.
+ * Do nothing, until we've worked out what to do!  To allow build, we
+ * must remove reference to clear_page_range since it no longer exists.
  */
 void hugetlb_free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev,
 	unsigned long start, unsigned long end)
 {
-	unsigned long first = start & HUGETLB_PGDIR_MASK;
-	unsigned long last = end + HUGETLB_PGDIR_SIZE - 1;
-	struct mm_struct *mm = tlb->mm;
-
-	if (!prev) {
-		prev = mm->mmap;
-		if (!prev)
-			goto no_mmaps;
-		if (prev->vm_end > start) {
-			if (last > prev->vm_start)
-				last = prev->vm_start;
-			goto no_mmaps;
-		}
-	}
-	for (;;) {
-		struct vm_area_struct *next = prev->vm_next;
-
-		if (next) {
-			if (next->vm_start < start) {
-				prev = next;
-				continue;
-			}
-			if (last > next->vm_start)
-				last = next->vm_start;
-		}
-		if (prev->vm_end > first)
-			first = prev->vm_end;
-		break;
-	}
-no_mmaps:
-	if (last < first)	/* for arches with discontiguous pgd indices */
-		return;
-	clear_page_range(tlb, first, last);
 }
 
 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long end)
--- 2.6.12-rc1-bk1/include/linux/mm.h	2005-03-18 10:22:43.000000000 +0000
+++ freepgt1/include/linux/mm.h	2005-03-21 19:06:35.000000000 +0000
@@ -591,7 +591,8 @@ int unmap_vmas(struct mmu_gather **tlbp,
 		struct vm_area_struct *start_vma, unsigned long start_addr,
 		unsigned long end_addr, unsigned long *nr_accounted,
 		struct zap_details *);
-void clear_page_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end);
+void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
+		unsigned long floor, unsigned long ceiling);
 int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
 			struct vm_area_struct *vma);
 int zeromap_page_range(struct vm_area_struct *vma, unsigned long from,
--- 2.6.12-rc1-bk1/mm/memory.c	2005-03-18 10:22:45.000000000 +0000
+++ freepgt1/mm/memory.c	2005-03-21 19:06:35.000000000 +0000
@@ -110,87 +110,130 @@ void pmd_clear_bad(pmd_t *pmd)
  * Note: this doesn't free the actual pages themselves. That
  * has been handled earlier when unmapping all the memory regions.
  */
-static inline void clear_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
-				unsigned long addr, unsigned long end)
+static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
 {
-	if (!((addr | end) & ~PMD_MASK)) {
-		/* Only free fully aligned ranges */
-		struct page *page = pmd_page(*pmd);
-		pmd_clear(pmd);
-		dec_page_state(nr_page_table_pages);
-		tlb->mm->nr_ptes--;
-		pte_free_tlb(tlb, page);
-	}
+	struct page *page = pmd_page(*pmd);
+	pmd_clear(pmd);
+	pte_free_tlb(tlb, page);
+	dec_page_state(nr_page_table_pages);
+	tlb->mm->nr_ptes--;
 }
 
-static inline void clear_pmd_range(struct mmu_gather *tlb, pud_t *pud,
-				unsigned long addr, unsigned long end)
+static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
+				unsigned long addr, unsigned long end,
+				unsigned long floor, unsigned long ceiling)
 {
 	pmd_t *pmd;
 	unsigned long next;
-	pmd_t *empty_pmd = NULL;
+	unsigned long start;
 
-	pmd = pmd_offset(pud, addr);
+	if (end - 1 > ceiling - 1)
+		end -= PMD_SIZE;
+	if (addr > end - 1)
+		return;
 
-	/* Only free fully aligned ranges */
-	if (!((addr | end) & ~PUD_MASK))
-		empty_pmd = pmd;
+	start = addr;
+	pmd = pmd_offset(pud, addr);
 	do {
 		next = pmd_addr_end(addr, end);
 		if (pmd_none_or_clear_bad(pmd))
 			continue;
-		clear_pte_range(tlb, pmd, addr, next);
+		free_pte_range(tlb, pmd);
 	} while (pmd++, addr = next, addr != end);
 
-	if (empty_pmd) {
-		pud_clear(pud);
-		pmd_free_tlb(tlb, empty_pmd);
-	}
+	start &= PUD_MASK;
+	if (start < floor)
+		return;
+	ceiling &= PUD_MASK;
+	if (end - 1 > ceiling - 1)
+		return;
+
+	pmd = pmd_offset(pud, start);
+	pud_clear(pud);
+	pmd_free_tlb(tlb, pmd);
 }
 
-static inline void clear_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
-				unsigned long addr, unsigned long end)
+static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
+				unsigned long addr, unsigned long end,
+				unsigned long floor, unsigned long ceiling)
 {
 	pud_t *pud;
 	unsigned long next;
-	pud_t *empty_pud = NULL;
+	unsigned long start;
 
+	start = addr;
 	pud = pud_offset(pgd, addr);
-
-	/* Only free fully aligned ranges */
-	if (!((addr | end) & ~PGDIR_MASK))
-		empty_pud = pud;
 	do {
 		next = pud_addr_end(addr, end);
 		if (pud_none_or_clear_bad(pud))
 			continue;
-		clear_pmd_range(tlb, pud, addr, next);
+		free_pmd_range(tlb, pud, addr, next, floor, ceiling);
 	} while (pud++, addr = next, addr != end);
 
-	if (empty_pud) {
-		pgd_clear(pgd);
-		pud_free_tlb(tlb, empty_pud);
-	}
+	start &= PGDIR_MASK;
+	if (start < floor)
+		return;
+	ceiling &= PGDIR_MASK;
+	if (end - 1 > ceiling - 1)
+		return;
+
+	pud = pud_offset(pgd, start);
+	pgd_clear(pgd);
+	pud_free_tlb(tlb, pud);
 }
 
 /*
- * This function clears user-level page tables of a process.
- * Unlike other pagetable walks, some memory layouts might give end 0.
+ * This function frees user-level page tables of a process.
+ *
  * Must be called with pagetable lock held.
  */
-void clear_page_range(struct mmu_gather *tlb,
-			unsigned long addr, unsigned long end)
+static inline void free_pgd_range(struct mmu_gather *tlb,
+			unsigned long addr, unsigned long end,
+			unsigned long floor, unsigned long ceiling)
 {
 	pgd_t *pgd;
 	unsigned long next;
+	unsigned long start;
 
+	addr &= PMD_MASK;
+	if (addr < floor) {
+		addr += PMD_SIZE;
+		if (!addr)
+			return;
+	}
+	ceiling &= PMD_MASK;
+	if (addr > ceiling - 1)
+		return;
+
+	start = addr;
 	pgd = pgd_offset(tlb->mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		clear_pud_range(tlb, pgd, addr, next);
+		free_pud_range(tlb, pgd, addr, next, floor, ceiling);
 	} while (pgd++, addr = next, addr != end);
+
+	if (!tlb_is_full_mm(tlb))
+		flush_tlb_pgtables(tlb->mm, start, end);
+}
+
+void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
+		unsigned long floor, unsigned long ceiling)
+{
+	while (vma) {
+		struct vm_area_struct *next = vma->vm_next;
+		unsigned long addr = vma->vm_start;
+
+		/* Optimization: gather nearby vmas into a single call down */
+		while (next && next->vm_start <= vma->vm_end + PMD_SIZE) {
+			vma = next;
+			next = vma->vm_next;
+		}
+		free_pgd_range(*tlb, addr, vma->vm_end,
+				floor, next? next->vm_start: ceiling);
+		vma = next;
+	}
 }
 
 pte_t fastcall * pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
--- 2.6.12-rc1-bk1/mm/mmap.c	2005-03-18 10:22:45.000000000 +0000
+++ freepgt1/mm/mmap.c	2005-03-21 19:06:35.000000000 +0000
@@ -29,6 +29,10 @@
 #include <asm/cacheflush.h>
 #include <asm/tlb.h>
 
+static void unmap_region(struct mm_struct *mm,
+		struct vm_area_struct *vma, struct vm_area_struct *prev,
+		unsigned long start, unsigned long end);
+
 /*
  * WARNING: the debugging will use recursive algorithms so never enable this
  * unless you know what you are doing.
@@ -1129,7 +1133,8 @@ unmap_and_free_vma:
 	fput(file);
 
 	/* Undo any partial mapping done by a device driver. */
-	zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL);
+	unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
+	charged = 0;
 free_vma:
 	kmem_cache_free(vm_area_cachep, vma);
 unacct_error:
@@ -1572,66 +1577,6 @@ find_extend_vma(struct mm_struct * mm, u
 }
 #endif
 
-/*
- * Try to free as many page directory entries as we can,
- * without having to work very hard at actually scanning
- * the page tables themselves.
- *
- * Right now we try to free page tables if we have a nice
- * PGDIR-aligned area that got free'd up. We could be more
- * granular if we want to, but this is fast and simple,
- * and covers the bad cases.
- *
- * "prev", if it exists, points to a vma before the one
- * we just free'd - but there's no telling how much before.
- */
-static void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev,
-	unsigned long start, unsigned long end)
-{
-	unsigned long first = start & PGDIR_MASK;
-	unsigned long last = end + PGDIR_SIZE - 1;
-	struct mm_struct *mm = tlb->mm;
-
-	if (last > MM_VM_SIZE(mm) || last < end)
-		last = MM_VM_SIZE(mm);
-
-	if (!prev) {
-		prev = mm->mmap;
-		if (!prev)
-			goto no_mmaps;
-		if (prev->vm_end > start) {
-			if (last > prev->vm_start)
-				last = prev->vm_start;
-			goto no_mmaps;
-		}
-	}
-	for (;;) {
-		struct vm_area_struct *next = prev->vm_next;
-
-		if (next) {
-			if (next->vm_start < start) {
-				prev = next;
-				continue;
-			}
-			if (last > next->vm_start)
-				last = next->vm_start;
-		}
-		if (prev->vm_end > first)
-			first = prev->vm_end;
-		break;
-	}
-no_mmaps:
-	if (last < first)	/* for arches with discontiguous pgd indices */
-		return;
-	if (first < FIRST_USER_PGD_NR * PGDIR_SIZE)
-		first = FIRST_USER_PGD_NR * PGDIR_SIZE;
-	/* No point trying to free anything if we're in the same pte page */
-	if ((first & PMD_MASK) < (last & PMD_MASK)) {
-		clear_page_range(tlb, first, last);
-		flush_tlb_pgtables(mm, first, last);
-	}
-}
-
 /* Normal function to fix up a mapping
  * This function is the default for when an area has no specific
  * function. This may be used as part of a more specific routine.
@@ -1674,24 +1619,22 @@ static void unmap_vma_list(struct mm_str
  * Called with the page table lock held.
  */
 static void unmap_region(struct mm_struct *mm,
-	struct vm_area_struct *vma,
-	struct vm_area_struct *prev,
-	unsigned long start,
-	unsigned long end)
+		struct vm_area_struct *vma, struct vm_area_struct *prev,
+		unsigned long start, unsigned long end)
 {
+	struct vm_area_struct *next = prev? prev->vm_next: mm->mmap;
 	struct mmu_gather *tlb;
 	unsigned long nr_accounted = 0;
 
 	lru_add_drain();
+	spin_lock(&mm->page_table_lock);
 	tlb = tlb_gather_mmu(mm, 0);
 	unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL);
 	vm_unacct_memory(nr_accounted);
-
-	if (is_hugepage_only_range(start, end - start))
-		hugetlb_free_pgtables(tlb, prev, start, end);
-	else
-		free_pgtables(tlb, prev, start, end);
+	free_pgtables(&tlb, vma, prev? prev->vm_end: 0,
+				 next? next->vm_start: 0);
 	tlb_finish_mmu(tlb, start, end);
+	spin_unlock(&mm->page_table_lock);
 }
 
 /*
@@ -1823,9 +1766,7 @@ int do_munmap(struct mm_struct *mm, unsi
 	 * Remove the vma's, and unmap the actual pages
 	 */
	detach_vmas_to_be_unmapped(mm, mpnt, prev, end);
-	spin_lock(&mm->page_table_lock);
 	unmap_region(mm, mpnt, prev, start, end);
-	spin_unlock(&mm->page_table_lock);
 
 	/* Fix up all other VM information */
 	unmap_vma_list(mm, mpnt);
@@ -1957,25 +1898,21 @@ EXPORT_SYMBOL(do_brk);
 void exit_mmap(struct mm_struct *mm)
 {
 	struct mmu_gather *tlb;
-	struct vm_area_struct *vma;
+	struct vm_area_struct *vma = mm->mmap;
 	unsigned long nr_accounted = 0;
 
 	lru_add_drain();
 
 	spin_lock(&mm->page_table_lock);
 
-	tlb = tlb_gather_mmu(mm, 1);
 	flush_cache_mm(mm);
-	/* Use ~0UL here to ensure all VMAs in the mm are unmapped */
-	mm->map_count -= unmap_vmas(&tlb, mm, mm->mmap, 0,
-					~0UL, &nr_accounted, NULL);
+	tlb = tlb_gather_mmu(mm, 1);
+	/* Use -1 here to ensure all VMAs in the mm are unmapped */
+	mm->map_count -= unmap_vmas(&tlb, mm, vma, 0, -1, &nr_accounted, NULL);
 	vm_unacct_memory(nr_accounted);
-	BUG_ON(mm->map_count);	/* This is just debugging */
-	clear_page_range(tlb, FIRST_USER_PGD_NR * PGDIR_SIZE, MM_VM_SIZE(mm));
-
+	free_pgtables(&tlb, vma, 0, 0);
 	tlb_finish_mmu(tlb, 0, MM_VM_SIZE(mm));
 
-	vma = mm->mmap;
 	mm->mmap = mm->mmap_cache = NULL;
 	mm->mm_rb = RB_ROOT;
 	mm->rss = 0;
@@ -1993,6 +1930,9 @@ void exit_mmap(struct mm_struct *mm)
 		remove_vm_struct(vma);
 		vma = next;
 	}
+
+	BUG_ON(mm->map_count);	/* This is just debugging */
+	BUG_ON(mm->nr_ptes);	/* This is just debugging */
 }
 
 /* Insert vm structure into process list sorted by address
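For anyone puzzling over the floor/ceiling clamping in free_pgd_range above, here is a standalone userspace sketch of just that arithmetic. It is not part of the patch: the 2MB PMD_SIZE and the sample addresses are invented so that it compiles on its own, and the real values differ per arch. floor is typically the previous vma's vm_end and ceiling the next vma's vm_start, with ceiling 0 meaning no upper limit (as exit_mmap passes).

/*
 * Standalone sketch, not part of the patch: the floor/ceiling clamping
 * at the top of free_pgd_range().  The 2MB PMD_SIZE and the sample
 * addresses below are invented so this compiles on its own.
 */
#include <stdio.h>

#define PMD_SHIFT	21
#define PMD_SIZE	(1UL << PMD_SHIFT)
#define PMD_MASK	(~(PMD_SIZE - 1))

static void show_clamp(unsigned long addr, unsigned long floor,
		       unsigned long ceiling)
{
	addr &= PMD_MASK;		/* table covering the vma start */
	if (addr < floor) {		/* shared with something below floor */
		addr += PMD_SIZE;	/* so start from the next table */
		if (!addr) {		/* wrapped past the top: nothing */
			printf("nothing freeable\n");
			return;
		}
	}
	ceiling &= PMD_MASK;
	if (addr > ceiling - 1) {	/* ceiling 0 wraps to ~0UL: no limit */
		printf("nothing freeable\n");
		return;
	}
	printf("first table that may be freed starts at %#lx\n", addr);
}

int main(void)
{
	/* prev vma ends below the 2MB boundary: the table is all ours */
	show_clamp(0x00801000, 0x007ff000, 0x40000000);

	/* prev vma ends inside the same 2MB table: skip that table */
	show_clamp(0x00801000, 0x00800800, 0x40000000);

	/* next vma starts inside the same 2MB table: nothing to free */
	show_clamp(0x00801000, 0, 0x00900000);

	/* ceiling 0 means no upper limit (as exit_mmap passes) */
	show_clamp(0xfff00000, 0, 0);
	return 0;
}

The same wraparound-friendly comparison (end - 1 > ceiling - 1) appears in free_pmd_range and free_pud_range, so a ceiling of 0 never truncates the range there either.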