Use mmu_gather for fork() instead of flush_tlb_mm()
This patch uses an mmu_gather for copying page tables instead of flush_tlb_mm(). This allows archs like ppc32 with a hash table to avoid walking the page tables a second time to invalidate hash entries, and to flush only the PTEs that have actually been changed from RW to RO.
Note that this contains a small change to the mmu_gather code: it must not call free_pages_and_swap_cache() if no pages have been queued up for freeing (i.e. if we are only invalidating PTEs). Calling it on fork can deadlock (I haven't dug into why, but it looks like a good idea to test anyway if we're going to use the mmu_gather for more than just removing pages). If the patch gets accepted, I will split that bit from the rest of the patch and send it separately. The main possible issue I see is with huge pages: arch code might have relied on flush_tlb_mm() and might not cope with tlb_remove_tlb_entry() being called for huge PTEs. Other possible issues could arise if archs make assumptions about flush_tlb_mm() being called in fork for different, unrelated reasons. Also, we could probably improve the tracking of start/end: in the case of lock breaking, the outside function will still finish the batch with the entire range. I don't think it matters on ppc and x86, though. Index: linux-work/include/linux/hugetlb.h =================================================================== --- linux-work.orig/include/linux/hugetlb.h 2007-07-09 16:17:04.000000000 +1000 +++ linux-work/include/linux/hugetlb.h 2007-07-09 16:26:38.000000000 +1000 @@ -15,7 +15,7 @@ static inline int is_vm_hugetlb_page(str } int hugetlb_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); -int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *); +int copy_hugetlb_page_range(struct mmu_gather **tlbp, struct mm_struct *, struct mm_struct *, struct vm_area_struct *); int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, struct vm_area_struct **, unsigned long *, int *, int); void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long); void __unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long); @@ -107,7 +107,7 @@ static inline unsigned long hugetlb_tota #define 
follow_hugetlb_page(m,v,p,vs,a,b,i) ({ BUG(); 0; }) #define follow_huge_addr(mm, addr, write) ERR_PTR(-EINVAL) -#define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; }) +#define copy_hugetlb_page_range(tlbp, src, dst, vma) ({ BUG(); 0; }) #define hugetlb_prefault(mapping, vma) ({ BUG(); 0; }) #define unmap_hugepage_range(vma, start, end) BUG() #define hugetlb_report_meminfo(buf) 0 Index: linux-work/include/linux/mm.h =================================================================== --- linux-work.orig/include/linux/mm.h 2007-07-09 16:17:04.000000000 +1000 +++ linux-work/include/linux/mm.h 2007-07-09 16:26:38.000000000 +1000 @@ -748,8 +748,8 @@ void free_pgd_range(struct mmu_gather ** unsigned long end, unsigned long floor, unsigned long ceiling); void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); -int copy_page_range(struct mm_struct *dst, struct mm_struct *src, - struct vm_area_struct *vma); +int copy_page_range(struct mmu_gather **tlbp, struct mm_struct *dst, + struct mm_struct *src, struct vm_area_struct *vma); int zeromap_page_range(struct vm_area_struct *vma, unsigned long from, unsigned long size, pgprot_t prot); void unmap_mapping_range(struct address_space *mapping, Index: linux-work/kernel/fork.c =================================================================== --- linux-work.orig/kernel/fork.c 2007-07-09 16:17:04.000000000 +1000 +++ linux-work/kernel/fork.c 2007-07-09 16:26:38.000000000 +1000 @@ -56,6 +56,7 @@ #include <asm/mmu_context.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> +#include <asm/tlb.h> /* * Protected counters by write_lock_irq(&tasklist_lock) @@ -199,8 +200,9 @@ static inline int dup_mmap(struct mm_str struct vm_area_struct *mpnt, *tmp, **pprev; struct rb_node **rb_link, *rb_parent; int retval; - unsigned long charge; + unsigned long charge, tlb_start, tlb_end; struct mempolicy *pol; + struct mmu_gather *tlb; down_write(&oldmm->mmap_sem); 
flush_cache_dup_mm(oldmm); @@ -220,6 +222,10 @@ static inline int dup_mmap(struct mm_str rb_link = &mm->mm_rb.rb_node; rb_parent = NULL; pprev = &mm->mmap; + tlb_start = -1; + tlb_end = 0; + + tlb = tlb_gather_mmu(oldmm, 1); for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { struct file *file; @@ -242,6 +248,11 @@ static inline int dup_mmap(struct mm_str if (!tmp) goto fail_nomem; *tmp = *mpnt; + + if (unlikely(tlb_start == -1)) + tlb_start = mpnt->vm_start; + tlb_end = mpnt->vm_end; + pol = mpol_copy(vma_policy(mpnt)); retval = PTR_ERR(pol); if (IS_ERR(pol)) @@ -278,7 +289,7 @@ static inline int dup_mmap(struct mm_str rb_parent = &tmp->vm_rb; mm->map_count++; - retval = copy_page_range(mm, oldmm, mpnt); + retval = copy_page_range(&tlb, mm, oldmm, mpnt); if (tmp->vm_ops && tmp->vm_ops->open) tmp->vm_ops->open(tmp); @@ -291,12 +302,15 @@ static inline int dup_mmap(struct mm_str retval = 0; out: up_write(&mm->mmap_sem); - flush_tlb_mm(oldmm); + if (tlb && tlb_start < tlb_end) + tlb_finish_mmu(tlb, tlb_start, tlb_end); up_write(&oldmm->mmap_sem); return retval; fail_nomem_policy: kmem_cache_free(vm_area_cachep, tmp); fail_nomem: + if (tlb && tlb_start < tlb_end) + tlb_finish_mmu(tlb, tlb_start, tlb_end); retval = -ENOMEM; vm_unacct_memory(charge); goto out; Index: linux-work/mm/hugetlb.c =================================================================== --- linux-work.orig/mm/hugetlb.c 2007-07-09 16:17:04.000000000 +1000 +++ linux-work/mm/hugetlb.c 2007-07-09 16:26:38.000000000 +1000 @@ -333,8 +333,8 @@ static void set_huge_ptep_writable(struc } -int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, - struct vm_area_struct *vma) +int copy_hugetlb_page_range(struct mmu_gather **tlbp, struct mm_struct *dst, + struct mm_struct *src, struct vm_area_struct *vma) { pte_t *src_pte, *dst_pte, entry; struct page *ptepage; @@ -353,8 +353,10 @@ int copy_hugetlb_page_range(struct mm_st spin_lock(&dst->page_table_lock); spin_lock(&src->page_table_lock); if 
(!pte_none(*src_pte)) { - if (cow) + if (cow) { ptep_set_wrprotect(src, addr, src_pte); + tlb_remove_tlb_entry((*tlbp), src_pte, addr); + } entry = *src_pte; ptepage = pte_page(entry); get_page(ptepage); @@ -363,6 +365,7 @@ int copy_hugetlb_page_range(struct mm_st spin_unlock(&src->page_table_lock); spin_unlock(&dst->page_table_lock); } + return 0; nomem: Index: linux-work/mm/memory.c =================================================================== --- linux-work.orig/mm/memory.c 2007-07-09 16:17:04.000000000 +1000 +++ linux-work/mm/memory.c 2007-07-09 16:34:54.000000000 +1000 @@ -425,9 +425,9 @@ struct page *vm_normal_page(struct vm_ar */ static inline void -copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, - unsigned long addr, int *rss) +copy_one_pte(struct mmu_gather *tlb, struct mm_struct *dst_mm, + struct mm_struct *src_mm, pte_t *dst_pte, pte_t *src_pte, + struct vm_area_struct *vma, unsigned long addr, int *rss) { unsigned long vm_flags = vma->vm_flags; pte_t pte = *src_pte; @@ -466,8 +466,11 @@ copy_one_pte(struct mm_struct *dst_mm, s * in the parent and the child */ if (is_cow_mapping(vm_flags)) { + pte_t old = *src_pte; ptep_set_wrprotect(src_mm, addr, src_pte); pte = pte_wrprotect(pte); + if (tlb && !pte_same(old, *src_pte)) + tlb_remove_tlb_entry(tlb, src_pte, addr); } /* @@ -489,13 +492,15 @@ out_set_pte: set_pte_at(dst_mm, addr, dst_pte, pte); } -static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, +static int copy_pte_range(struct mmu_gather **tlbp, + struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, unsigned long addr, unsigned long end) { pte_t *src_pte, *dst_pte; spinlock_t *src_ptl, *dst_ptl; - int progress = 0; + unsigned long start_addr = addr; + int fullmm, progress = 0; int rss[2]; again: @@ -524,7 +529,8 @@ again: progress++; continue; } - copy_one_pte(dst_mm, src_mm, dst_pte, 
src_pte, vma, addr, rss); + copy_one_pte(*tlbp, dst_mm, src_mm, dst_pte, src_pte, + vma, addr, rss); progress += 8; } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); @@ -533,13 +539,19 @@ again: pte_unmap_nested(src_pte - 1); add_mm_rss(dst_mm, rss[0], rss[1]); pte_unmap_unlock(dst_pte - 1, dst_ptl); + fullmm = (*tlbp)->fullmm; + tlb_finish_mmu(*tlbp, start_addr, addr); cond_resched(); - if (addr != end) + if (addr != end) { + *tlbp = tlb_gather_mmu(src_mm, fullmm); + start_addr = addr; goto again; + } return 0; } -static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, +static inline int copy_pmd_range(struct mmu_gather **tlbp, + struct mm_struct *dst_mm, struct mm_struct *src_mm, pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma, unsigned long addr, unsigned long end) { @@ -554,14 +566,15 @@ static inline int copy_pmd_range(struct next = pmd_addr_end(addr, end); if (pmd_none_or_clear_bad(src_pmd)) continue; - if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, - vma, addr, next)) + if (copy_pte_range(tlbp, dst_mm, src_mm, dst_pmd, src_pmd, + vma, addr, next)) return -ENOMEM; } while (dst_pmd++, src_pmd++, addr = next, addr != end); return 0; } -static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, +static inline int copy_pud_range(struct mmu_gather **tlbp, + struct mm_struct *dst_mm, struct mm_struct *src_mm, pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma, unsigned long addr, unsigned long end) { @@ -576,15 +589,15 @@ static inline int copy_pud_range(struct next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(src_pud)) continue; - if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud, - vma, addr, next)) + if (copy_pmd_range(tlbp, dst_mm, src_mm, dst_pud, src_pud, + vma, addr, next)) return -ENOMEM; } while (dst_pud++, src_pud++, addr = next, addr != end); return 0; } -int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - struct vm_area_struct 
*vma) +int copy_page_range(struct mmu_gather **tlbp, struct mm_struct *dst_mm, + struct mm_struct *src_mm, struct vm_area_struct *vma) { pgd_t *src_pgd, *dst_pgd; unsigned long next; @@ -603,7 +616,7 @@ int copy_page_range(struct mm_struct *ds } if (is_vm_hugetlb_page(vma)) - return copy_hugetlb_page_range(dst_mm, src_mm, vma); + return copy_hugetlb_page_range(tlbp, dst_mm, src_mm, vma); dst_pgd = pgd_offset(dst_mm, addr); src_pgd = pgd_offset(src_mm, addr); @@ -611,8 +624,8 @@ int copy_page_range(struct mm_struct *ds next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(src_pgd)) continue; - if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd, - vma, addr, next)) + if (copy_pud_range(tlbp, dst_mm, src_mm, dst_pgd, src_pgd, + vma, addr, next)) return -ENOMEM; } while (dst_pgd++, src_pgd++, addr = next, addr != end); return 0; Index: linux-work/include/asm-generic/tlb.h =================================================================== --- linux-work.orig/include/asm-generic/tlb.h 2007-07-09 16:17:04.000000000 +1000 +++ linux-work/include/asm-generic/tlb.h 2007-07-09 16:26:38.000000000 +1000 @@ -72,7 +72,7 @@ tlb_flush_mmu(struct mmu_gather *tlb, un return; tlb->need_flush = 0; tlb_flush(tlb); - if (!tlb_fast_mode(tlb)) { + if (!tlb_fast_mode(tlb) && tlb->nr) { free_pages_and_swap_cache(tlb->pages, tlb->nr); tlb->nr = 0; } - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/