From: Ingo Molnar <[EMAIL PROTECTED]>, Paolo 'Blaisorblade' Giarrusso <[EMAIL PROTECTED]>
This is the "main" patch for the syscall code, containing the core of what
was sent by Ingo Molnar, variously reworked.  Unlike his patch, I have *not*
added a new syscall; instead I added a new flag (MAP_NOINHERIT), which the
application must specify to get the new behaviour: with it, prot != 0 is
accepted, and prot == 0 means PROT_NONE.  The changes to the page fault
handler have been split out into a separate patch, not least because they
required a considerable amount of effort.

Various places must now handle the possibility that remap_file_pages
changes protections:

* Enable the 'prot' parameter for shared-writable mappings (the primary
  target of remap_file_pages), without breaking up the vma.

* Use pte_file PTEs also when the protections don't match, not only when
  the offset doesn't match; add set_nonlinear_pte() to do this test.

* Save the current protection too when clearing a nonlinear PTE, by
  replacing pgoff_to_pte() uses with pgoff_prot_to_pte().

* Use the supplied protections on restore and on populate (partially
  incomplete, fixed in subsequent patches).
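To illustrate the intended usage from userspace (this sketch is not part
of the patch; MAP_NOINHERIT itself is defined by another patch in this
series, so the fallback #define below is only a placeholder value):

#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MAP_NOINHERIT
#define MAP_NOINHERIT 0x20000   /* placeholder; real value comes from the series */
#endif

int main(void)
{
        long psz = sysconf(_SC_PAGESIZE);
        int fd = open("datafile", O_RDWR);      /* assumed large enough */
        char *win;

        if (fd < 0)
                return 1;

        /* A shared-writable window over the first 4 pages of the file. */
        win = mmap(NULL, 4 * psz, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (win == MAP_FAILED)
                return 1;

        /* Old interface: prot must be 0, the vma's protections are inherited. */
        remap_file_pages(win, psz, 0, 42, 0);

        /*
         * New interface: with MAP_NOINHERIT a nonzero prot is accepted and
         * applies only to the remapped range (prot == 0 would mean
         * PROT_NONE); here file page 7 becomes a read-only view.
         */
        remap_file_pages(win + psz, psz, PROT_READ, 7, MAP_NOINHERIT);

        munmap(win, 4 * psz);
        close(fd);
        return 0;
}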
Signed-off-by: Paolo 'Blaisorblade' Giarrusso <[EMAIL PROTECTED]>
---

 linux-2.6.git-paolo/include/linux/pagemap.h |   19 ++++++++++
 linux-2.6.git-paolo/mm/fremap.c             |   50 +++++++++++++++++-----------
 linux-2.6.git-paolo/mm/memory.c             |   14 ++++---
 linux-2.6.git-paolo/mm/rmap.c               |    3 -
 4 files changed, 60 insertions(+), 26 deletions(-)

diff -puN include/linux/pagemap.h~rfp-enhance-syscall-and-swapout-code include/linux/pagemap.h
--- linux-2.6.git/include/linux/pagemap.h~rfp-enhance-syscall-and-swapout-code 2005-08-11 22:59:47.000000000 +0200
+++ linux-2.6.git-paolo/include/linux/pagemap.h 2005-08-11 22:59:47.000000000 +0200
@@ -159,6 +159,25 @@ static inline pgoff_t linear_page_index(
 	return pgoff >> (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 }

+/***
+ * Checks whether the PTE is nonlinear, and if yes sets it.
+ * @vma: the VMA in which @addr is; we don't check if it's VM_NONLINEAR, just
+ * if this PTE is nonlinear.
+ * @addr: the addr which @pte refers to.
+ * @pte: the old PTE value (to read its protections).
+ * @ptep: the PTE pointer (for setting it).
+ * @mm: passed to set_pte_at.
+ * @page: the page which was installed (to read its ->index, i.e. the old
+ * offset inside the file).
+ */
+static inline void set_nonlinear_pte(pte_t pte, pte_t *ptep, struct vm_area_struct *vma, struct mm_struct *mm, struct page *page, unsigned long addr)
+{
+	pgprot_t pgprot = pte_to_pgprot(pte);
+	if (linear_page_index(vma, addr) != page->index ||
+	    pgprot_val(pgprot) != pgprot_val(vma->vm_page_prot))
+		set_pte_at(mm, addr, ptep, pgoff_prot_to_pte(page->index, pgprot));
+}
+
 extern void FASTCALL(__lock_page(struct page *page));
 extern void FASTCALL(unlock_page(struct page *page));

diff -puN mm/fremap.c~rfp-enhance-syscall-and-swapout-code mm/fremap.c
--- linux-2.6.git/mm/fremap.c~rfp-enhance-syscall-and-swapout-code 2005-08-11 22:59:47.000000000 +0200
+++ linux-2.6.git-paolo/mm/fremap.c 2005-08-11 23:01:14.000000000 +0200
@@ -54,7 +54,7 @@ static inline void zap_pte(struct mm_str
  * previously existing mapping.
  */
 int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
-		unsigned long addr, struct page *page, pgprot_t prot)
+		unsigned long addr, struct page *page, pgprot_t pgprot)
 {
 	struct inode *inode;
 	pgoff_t size;
@@ -94,7 +94,7 @@ int install_page(struct mm_struct *mm, s

 	inc_mm_counter(mm,rss);
 	flush_icache_page(vma, page);
-	set_pte_at(mm, addr, pte, mk_pte(page, prot));
+	set_pte_at(mm, addr, pte, mk_pte(page, pgprot));
 	page_add_file_rmap(page);
 	pte_val = *pte;
 	pte_unmap(pte);
@@ -113,7 +113,7 @@ EXPORT_SYMBOL(install_page);
  * previously existing mapping.
  */
 int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
-		unsigned long addr, unsigned long pgoff, pgprot_t prot)
+		unsigned long addr, unsigned long pgoff, pgprot_t pgprot)
 {
 	int err = -ENOMEM;
 	pte_t *pte;
@@ -139,7 +139,7 @@ int install_file_pte(struct mm_struct *m

 	zap_pte(mm, vma, addr, pte);

-	set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff));
+	set_pte_at(mm, addr, pte, pgoff_prot_to_pte(pgoff, pgprot));
 	pte_val = *pte;
 	pte_unmap(pte);
 	update_mmu_cache(vma, addr, pte_val);
@@ -157,31 +157,28 @@ err_unlock:
  * file within an existing vma.
  * @start: start of the remapped virtual memory range
  * @size: size of the remapped virtual memory range
- * @prot: new protection bits of the range
+ * @prot: new protection bits of the range; must be 0 unless MAP_NOINHERIT is used
  * @pgoff: to be mapped page of the backing store file
- * @flags: 0 or MAP_NONBLOCKED - the later will cause no IO.
+ * @flags: bits MAP_NOINHERIT or MAP_NONBLOCK - the latter will cause no IO.
  *
  * this syscall works purely via pagetables, so it's the most efficient
 * way to map the same (large) file into a given virtual window. Unlike
 * mmap()/mremap() it does not create any new vmas. The new mappings are
 * also safe across swapout.
- *
- * NOTE: the 'prot' parameter right now is ignored, and the vma's default
- * protection is used. Arbitrary protections might be implemented in the
- * future.
  */
 asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
-	unsigned long __prot, unsigned long pgoff, unsigned long flags)
+	unsigned long prot, unsigned long pgoff, unsigned long flags)
 {
 	struct mm_struct *mm = current->mm;
+	pgprot_t pgprot;
 	struct address_space *mapping;
 	unsigned long end = start + size;
 	struct vm_area_struct *vma;
 	int err = -EINVAL;
 	int has_write_lock = 0;

-	if (__prot)
-		return err;
+	if (prot && !(flags & MAP_NOINHERIT))
+		goto out;
 	/*
 	 * Sanitize the syscall parameters:
 	 */
@@ -200,7 +197,7 @@ asmlinkage long sys_remap_file_pages(uns

 	/* We need down_write() to change vma->vm_flags. */
 	down_read(&mm->mmap_sem);
- retry:
+retry:
 	vma = find_vma(mm, start);

 	/*
@@ -210,7 +207,22 @@ asmlinkage long sys_remap_file_pages(uns
 	 * swapout cursor in a VM_NONLINEAR vma (unless VM_RESERVED
 	 * or VM_LOCKED, but VM_LOCKED could be revoked later on).
 	 */
-	if (vma && (vma->vm_flags & VM_SHARED) &&
+	if (!vma)
+		goto out_unlock;
+
+	if (flags & MAP_NOINHERIT) {
+		err = -EPERM;
+		if (((prot & PROT_READ) && !(vma->vm_flags & VM_MAYREAD)))
+			goto out_unlock;
+		if (((prot & PROT_WRITE) && !(vma->vm_flags & VM_MAYWRITE)))
+			goto out_unlock;
+		if (((prot & PROT_EXEC) && !(vma->vm_flags & VM_MAYEXEC)))
+			goto out_unlock;
+		pgprot = protection_map[calc_vm_prot_bits(prot) | VM_SHARED];
+	} else
+		pgprot = vma->vm_page_prot;
+
+	if ((vma->vm_flags & VM_SHARED) &&
 		(!vma->vm_private_data ||
 			(vma->vm_flags & (VM_NONLINEAR|VM_RESERVED))) &&
 		vma->vm_ops && vma->vm_ops->populate &&
@@ -236,9 +248,8 @@ asmlinkage long sys_remap_file_pages(uns
 			spin_unlock(&mapping->i_mmap_lock);
 		}

-		err = vma->vm_ops->populate(vma, start, size,
-					    vma->vm_page_prot,
-					    pgoff, flags & MAP_NONBLOCK);
+		err = vma->vm_ops->populate(vma, start, size, pgprot, pgoff,
+					    flags & MAP_NONBLOCK);

 		/*
 		 * We can't clear VM_NONLINEAR because we'd have to do
@@ -246,11 +257,14 @@ asmlinkage long sys_remap_file_pages(uns
 		 * downgrading the lock.  (Locks can't be upgraded).
 		 */
 	}
+
+out_unlock:
 	if (likely(!has_write_lock))
 		up_read(&mm->mmap_sem);
 	else
 		up_write(&mm->mmap_sem);

+out:
 	return err;
 }
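Note that the patch relies on three per-architecture helpers,
pgoff_prot_to_pte(), pte_to_pgoff() and pte_to_pgprot(): together they
must pack both the file offset and the protection bits into a single
not-present pte_file entry, and get them back unchanged.  They are
provided by the architecture patches elsewhere in this series; what
follows is only a toy model of that round-trip contract, with made-up
bit positions (this is not the real i386 layout):

#include <assert.h>

typedef unsigned long pte_t;
typedef unsigned long pgprot_t;

#define TOY_FILE_BIT    0x1UL                   /* marks a pte_file entry */
#define TOY_PROT_SHIFT  1                       /* made up: prot in bits 1-3 */
#define TOY_PROT_MASK   (0x7UL << TOY_PROT_SHIFT)
#define TOY_PGOFF_SHIFT 4                       /* made up: pgoff in the rest */

static pte_t pgoff_prot_to_pte(unsigned long pgoff, pgprot_t prot)
{
        return (pgoff << TOY_PGOFF_SHIFT) |
               ((prot << TOY_PROT_SHIFT) & TOY_PROT_MASK) | TOY_FILE_BIT;
}

static unsigned long pte_to_pgoff(pte_t pte)
{
        return pte >> TOY_PGOFF_SHIFT;
}

static pgprot_t pte_to_pgprot(pte_t pte)
{
        return (pte & TOY_PROT_MASK) >> TOY_PROT_SHIFT;
}

int main(void)
{
        pte_t pte = pgoff_prot_to_pte(42, 0x5);

        /* Both the offset and the protections survive the round trip. */
        assert(pte_to_pgoff(pte) == 42);
        assert(pte_to_pgprot(pte) == 0x5);
        return 0;
}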
diff -puN mm/memory.c~rfp-enhance-syscall-and-swapout-code mm/memory.c
--- linux-2.6.git/mm/memory.c~rfp-enhance-syscall-and-swapout-code 2005-08-11 22:59:47.000000000 +0200
+++ linux-2.6.git-paolo/mm/memory.c 2005-08-11 22:59:47.000000000 +0200
@@ -555,11 +555,11 @@ static void zap_pte_range(struct mmu_gat
 			tlb_remove_tlb_entry(tlb, pte, addr);
 			if (unlikely(!page))
 				continue;
-			if (unlikely(details) && details->nonlinear_vma
-			    && linear_page_index(details->nonlinear_vma,
-						addr) != page->index)
-				set_pte_at(tlb->mm, addr, pte,
-					   pgoff_to_pte(page->index));
+			if (unlikely(details) && details->nonlinear_vma) {
+				set_nonlinear_pte(ptent, pte,
+						details->nonlinear_vma,
+						tlb->mm, page, addr);
+			}
 			if (pte_dirty(ptent))
 				set_page_dirty(page);
 			if (PageAnon(page))
@@ -1926,6 +1926,7 @@ static int do_file_page(struct mm_struct
 		unsigned long address, int write_access, pte_t *pte, pmd_t *pmd)
 {
 	unsigned long pgoff;
+	pgprot_t pgprot;
 	int err;

 	BUG_ON(!vma->vm_ops || !vma->vm_ops->nopage);
@@ -1940,11 +1941,12 @@ static int do_file_page(struct mm_struct
 	}

 	pgoff = pte_to_pgoff(*pte);
+	pgprot = pte_to_pgprot(*pte);

 	pte_unmap(pte);
 	spin_unlock(&mm->page_table_lock);

-	err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, vma->vm_page_prot, pgoff, 0);
+	err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, pgprot, pgoff, 0);
 	if (err == -ENOMEM)
 		return VM_FAULT_OOM;
 	if (err)

diff -puN mm/rmap.c~rfp-enhance-syscall-and-swapout-code mm/rmap.c
--- linux-2.6.git/mm/rmap.c~rfp-enhance-syscall-and-swapout-code 2005-08-11 22:59:47.000000000 +0200
+++ linux-2.6.git-paolo/mm/rmap.c 2005-08-11 22:59:47.000000000 +0200
@@ -660,8 +660,7 @@ static void try_to_unmap_cluster(unsigne
 		pteval = ptep_clear_flush(vma, address, pte);

 		/* If nonlinear, store the file page offset in the pte. */
-		if (page->index != linear_page_index(vma, address))
-			set_pte_at(mm, address, pte, pgoff_to_pte(page->index));
+		set_nonlinear_pte(pteval, pte, vma, mm, page, address);

 		/* Move the dirty bit to the physical page now the pte is gone. */
 		if (pte_dirty(pteval))
_
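The rule that set_nonlinear_pte() implements in the zap_pte_range() and
try_to_unmap_cluster() hunks above can be restated: a cleared PTE must be
preserved as a pte_file entry if either the page sits at a non-linear
offset *or* its protections differ from the vma default.  Before this
patch only the offset was checked, so the protections of a linearly
placed page were lost at swapout.  A toy userspace model of that
condition (a sketch, not kernel code; the 4096-byte page size and the
prot values are illustrative):

#include <stdbool.h>
#include <stdio.h>

struct toy_vma {
        unsigned long vm_start;
        unsigned long vm_pgoff;         /* file page mapped at vm_start */
        unsigned long vm_page_prot;     /* the vma's default protections */
};

/* The file page a purely linear mapping would put at addr. */
static unsigned long linear_page_index(struct toy_vma *vma, unsigned long addr)
{
        return vma->vm_pgoff + ((addr - vma->vm_start) >> 12);
}

static bool needs_file_pte(struct toy_vma *vma, unsigned long addr,
                           unsigned long page_index, unsigned long pte_prot)
{
        return linear_page_index(vma, addr) != page_index ||
               pte_prot != vma->vm_page_prot;
}

int main(void)
{
        struct toy_vma vma = { .vm_start = 0x1000, .vm_pgoff = 0,
                               .vm_page_prot = 0x5 };

        /* Linear offset, default protections: nothing to remember. */
        printf("%d\n", needs_file_pte(&vma, 0x2000, 1, 0x5));   /* 0 */
        /* Rebound to another file page: the offset must be kept. */
        printf("%d\n", needs_file_pte(&vma, 0x2000, 42, 0x5));  /* 1 */
        /* Linear offset but non-default prot: now also kept. */
        printf("%d\n", needs_file_pte(&vma, 0x2000, 1, 0x1));   /* 1 */
        return 0;
}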