Module Name:	src
Committed By:	rmind
Date:		Sun Jun 28 15:18:51 UTC 2009
Modified Files:
	src/sys/arch/amd64/amd64: vector.S
	src/sys/arch/i386/i386: vector.S
	src/sys/arch/x86/include: pmap.h
	src/sys/arch/x86/x86: pmap.c
	src/sys/kern: kern_synch.c sys_pipe.c
	src/sys/sys: lwp.h pipe.h
	src/sys/uvm: files.uvm uvm.h uvm_extern.h uvm_glue.c uvm_init.c

Added Files:
	src/sys/uvm: uvm_emap.c

Log Message:
Ephemeral mapping (emap) implementation.  The concept is based on the idea
that the activity of other threads will, as a side effect, perform the TLB
flush for processes using emap.  To track this, global and per-CPU
generation numbers are used.  This idea was suggested by Andrew Doran;
various improvements to it by me.

Notes:
- For now, zero-copy on pipe is not yet enabled.
- The TCP socket code would likely need more work.
- Additional UVM loaning improvements are needed.

Proposed on <tech-kern>, no objections there.  Quickly reviewed by <ad>.


To generate a diff of this commit:
cvs rdiff -u -r1.29 -r1.30 src/sys/arch/amd64/amd64/vector.S
cvs rdiff -u -r1.45 -r1.46 src/sys/arch/i386/i386/vector.S
cvs rdiff -u -r1.24 -r1.25 src/sys/arch/x86/include/pmap.h
cvs rdiff -u -r1.85 -r1.86 src/sys/arch/x86/x86/pmap.c
cvs rdiff -u -r1.264 -r1.265 src/sys/kern/kern_synch.c
cvs rdiff -u -r1.114 -r1.115 src/sys/kern/sys_pipe.c
cvs rdiff -u -r1.119 -r1.120 src/sys/sys/lwp.h
cvs rdiff -u -r1.27 -r1.28 src/sys/sys/pipe.h
cvs rdiff -u -r1.13 -r1.14 src/sys/uvm/files.uvm
cvs rdiff -u -r1.55 -r1.56 src/sys/uvm/uvm.h
cvs rdiff -u -r0 -r1.1 src/sys/uvm/uvm_emap.c
cvs rdiff -u -r1.154 -r1.155 src/sys/uvm/uvm_extern.h
cvs rdiff -u -r1.137 -r1.138 src/sys/uvm/uvm_glue.c
cvs rdiff -u -r1.34 -r1.35 src/sys/uvm/uvm_init.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
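[Editor's illustration, not part of the commit]  The generation-number
handshake described in the log message is easiest to follow outside the
kernel.  Below is a minimal, single-CPU user-space model of the logic in
the new uvm_emap.c: a producer bumps the global generation after entering
mappings, a consumer flushes only if its CPU has not yet seen that
generation, and any TLB flush done for other reasons advances the per-CPU
view as a side effect.  All names, the single pair of counters, and the
lack of atomics/IPIs are hypothetical simplifications, not the NetBSD API.

/*
 * User-space sketch of the emap generation-number scheme.
 */
#include <stdio.h>

#define EMAP_INACTIVE	0		/* generation 0 means "no emap in use" */

static unsigned int global_gen = 1;	/* models uvm_emap_gen (global)   */
static unsigned int cpu_gen = 1;	/* models per-CPU uvm_cpu.emap_gen */

/* Model of uvm_emap_produce(): bump the global generation, skipping 0. */
static unsigned int
emap_produce(void)
{
	unsigned int gen;

	do {
		gen = ++global_gen;
	} while (gen == EMAP_INACTIVE);
	return gen;
}

/* Stand-in for a user TLB flush that is going to happen anyway. */
static void
tlb_flush(void)
{
	/* In the kernel: invalidate user TLB entries here. */
}

/* Model of uvm_emap_update(): record that this CPU's TLB is now fresh. */
static void
emap_update(unsigned int gen)
{
	cpu_gen = gen;
}

/*
 * Model of uvm_emap_consume(): flush only if this CPU has not yet seen
 * generation 'gen'; the signed difference tolerates counter wrap-around.
 */
static void
emap_consume(unsigned int gen)
{
	if (gen == EMAP_INACTIVE || (int)(cpu_gen - gen) >= 0)
		return;			/* mapping already visible here */
	gen = global_gen;		/* snapshot before the flush */
	tlb_flush();
	emap_update(gen);
}

int
main(void)
{
	unsigned int gen = emap_produce();	/* writer side (e.g. pipe write) */
	emap_consume(gen);			/* reader side (e.g. pipe read)  */
	printf("cpu_gen=%u global_gen=%u\n", cpu_gen, global_gen);
	return 0;
}

The point of the design is visible in emap_consume(): when other activity
has already flushed the TLB (and called emap_update()), the consumer's
comparison succeeds and no extra flush is issued.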
Modified files: Index: src/sys/arch/amd64/amd64/vector.S diff -u src/sys/arch/amd64/amd64/vector.S:1.29 src/sys/arch/amd64/amd64/vector.S:1.30 --- src/sys/arch/amd64/amd64/vector.S:1.29 Tue Nov 25 16:25:29 2008 +++ src/sys/arch/amd64/amd64/vector.S Sun Jun 28 15:18:50 2009 @@ -1,4 +1,4 @@ -/* $NetBSD: vector.S,v 1.29 2008/11/25 16:25:29 ad Exp $ */ +/* $NetBSD: vector.S,v 1.30 2009/06/28 15:18:50 rmind Exp $ */ /*- * Copyright (c) 1998, 2007, 2008 The NetBSD Foundation, Inc. @@ -452,8 +452,12 @@ 0: pushq %rax pushq %rbx - pushq %rcx + pushq %rdi + pushq %rsi pushq %rdx + pushq %rcx + pushq %r8 + pushq %r9 incq CPUVAR(TLB_EVCNT)+EV_COUNT /* Find out what needs to be invalidated and unlock the mailbox. */ movq CPUVAR(PMAP_CPU),%rcx @@ -489,8 +493,12 @@ movl $TLBSTATE_STALE, CPUVAR(TLBSTATE) 3: /* Restore state and return. */ - popq %rdx + popq %r9 + popq %r8 popq %rcx + popq %rdx + popq %rsi + popq %rdi popq %rbx popq %rax testq $SEL_UPL, 8(%rsp) @@ -499,9 +507,17 @@ 4: iretq 5: - /* Invalidate all user pages. */ + /* + * Get the emap generation number. Invalidate user TLB entries. + * Perform emap update, pass the generation number. Note that + * caller-save registers might be modified (all saved in the + * beginning). Only %rbx value is used by 2b context. + */ + callq _C_LABEL(uvm_emap_gen_return) + movq %rax, %rdi movq %cr3, %rax movq %rax, %cr3 + callq _C_LABEL(uvm_emap_update) jmp 2b /* @@ -510,12 +526,16 @@ IDTVEC(intr_lapic_tlb_bcast) /* Save state. */ pushq %rax - pushq %rbx + pushq %rdi + pushq %rsi pushq %rdx + pushq %rcx + pushq %r8 + pushq %r9 /* Find out what needs to be invalidated. */ movq _C_LABEL(pmap_mbox)+MB_ADDR1, %rax movq _C_LABEL(pmap_mbox)+MB_ADDR2, %rdx - movq _C_LABEL(pmap_mbox)+MB_GLOBAL, %rbx + movq _C_LABEL(pmap_mbox)+MB_GLOBAL, %rdi movl $0, _C_LABEL(local_apic)+LAPIC_EOI cmpq $-1, %rax je,pn 3f @@ -529,27 +549,44 @@ /* Notify waiter of completion, restore state & return */ lock incq _C_LABEL(pmap_mbox)+MB_TAIL + popq %r9 + popq %r8 + popq %rcx popq %rdx - popq %rbx + popq %rsi + popq %rdi popq %rax iretq 3: - testq %rbx, %rbx + testq %rdi, %rdi jz 4f /* - * If we have been asked to invalidate the entire TLB - * we arrive here. + * If we have been asked to invalidate the entire TLB we arrive here. + * Get the emap generation before flush, and use it after for update. + * Note that caller-save registers might be modified, though no + * registers need to be preserved for 2b context. */ + callq _C_LABEL(uvm_emap_gen_return) + movq %rax, %rdi movq %cr4, %rax movq %rax, %rdx andq $~CR4_PGE, %rdx movq %rdx, %cr4 movq %rax, %cr4 + callq _C_LABEL(uvm_emap_update) jmp 2b 4: - /* Invalidate user TLB entries. */ + /* + * Get the emap generation number. Invalidate user TLB entries. + * Perform emap update, pass the generation number. Note that + * caller-save registers might be modified, though no registers + * need to be preserved for 2b context. + */ + callq _C_LABEL(uvm_emap_gen_return) + movq %rax, %rdi movq %cr3, %rax movq %rax, %cr3 + callq _C_LABEL(uvm_emap_update) jmp 2b #endif /* !XEN */ Index: src/sys/arch/i386/i386/vector.S diff -u src/sys/arch/i386/i386/vector.S:1.45 src/sys/arch/i386/i386/vector.S:1.46 --- src/sys/arch/i386/i386/vector.S:1.45 Sat Mar 21 14:41:29 2009 +++ src/sys/arch/i386/i386/vector.S Sun Jun 28 15:18:50 2009 @@ -1,4 +1,4 @@ -/* $NetBSD: vector.S,v 1.45 2009/03/21 14:41:29 ad Exp $ */ +/* $NetBSD: vector.S,v 1.46 2009/06/28 15:18:50 rmind Exp $ */ /* * Copyright 2002 (c) Wasabi Systems, Inc. 
@@ -65,7 +65,7 @@ */ #include <machine/asm.h> -__KERNEL_RCSID(0, "$NetBSD: vector.S,v 1.45 2009/03/21 14:41:29 ad Exp $"); +__KERNEL_RCSID(0, "$NetBSD: vector.S,v 1.46 2009/06/28 15:18:50 rmind Exp $"); #include "opt_ddb.h" #include "opt_multiprocessor.h" @@ -227,9 +227,19 @@ popl %eax iret 4: - /* Invalidate all user pages. */ + /* + * Get the emap generation number. Invalidate user TLB entries. + * Perform emap update, pass the generation number. Note that + * caller-save registers might be modified (all saved in the + * beginning). Only %ebx value is used by 2b context. + */ + call _C_LABEL(uvm_emap_gen_return) + movl %eax, %edx movl %cr3, %eax movl %eax, %cr3 + pushl %edx + call _C_LABEL(uvm_emap_update) + addl $4, %esp jmp 2b IDTVEC_END(intr_lapic_tlb_mcast) @@ -240,6 +250,7 @@ /* Save state and ack the interrupt. */ pushl %eax pushl %ebx + pushl %ecx pushl %edx /* Find out what we need to invalidate. */ movl %ss:_C_LABEL(pmap_mbox)+MB_ADDR1, %eax @@ -259,6 +270,7 @@ lock incl %ss:_C_LABEL(pmap_mbox)+MB_TAIL popl %edx + popl %ecx popl %ebx popl %eax iret @@ -266,19 +278,36 @@ testl %ebx, %ebx jz 4f /* - * If the CPU understands global pages and we have been asked - * to invalidate the entire TLB we arrive here. + * If we have been asked to invalidate the entire TLB we arrive here. + * Get the emap generation before flush, and use it after for update. + * Note that caller-save registers might be modified, though no + * registers need to be preserved for 2b context. */ + call _C_LABEL(uvm_emap_gen_return) + movl %eax, %ebx movl %cr4, %eax movl %eax, %edx andl $~CR4_PGE, %edx movl %edx, %cr4 movl %eax, %cr4 + pushl %ebx + call _C_LABEL(uvm_emap_update) + addl $4, %esp jmp 2b 4: - /* Invalidate user TLB entries. */ + /* + * Get the emap generation number. Invalidate user TLB entries. + * Perform emap update, pass the generation number. Note that + * caller-save registers might be modified, though no registers + * need to be preserved for 2b context. + */ + call _C_LABEL(uvm_emap_gen_return) + movl %eax, %ebx movl %cr3, %eax movl %eax, %cr3 + pushl %ebx + call _C_LABEL(uvm_emap_update) + addl $4, %esp jmp 2b IDTVEC_END(intr_lapic_tlb_bcast) Index: src/sys/arch/x86/include/pmap.h diff -u src/sys/arch/x86/include/pmap.h:1.24 src/sys/arch/x86/include/pmap.h:1.25 --- src/sys/arch/x86/include/pmap.h:1.24 Wed Apr 22 10:17:48 2009 +++ src/sys/arch/x86/include/pmap.h Sun Jun 28 15:18:50 2009 @@ -1,4 +1,4 @@ -/* $NetBSD: pmap.h,v 1.24 2009/04/22 10:17:48 cegger Exp $ */ +/* $NetBSD: pmap.h,v 1.25 2009/06/28 15:18:50 rmind Exp $ */ /* * @@ -224,11 +224,17 @@ void pmap_remove_all(struct pmap *); void pmap_ldt_sync(struct pmap *); +void pmap_emap_enter(vaddr_t, paddr_t, vm_prot_t); +void pmap_emap_remove(vaddr_t, vsize_t); +void pmap_emap_sync(void); + vaddr_t reserve_dumppages(vaddr_t); /* XXX: not a pmap fn */ void pmap_tlb_shootdown(pmap_t, vaddr_t, vaddr_t, pt_entry_t); void pmap_tlb_shootwait(void); +#define __HAVE_PMAP_EMAP + #define PMAP_GROWKERNEL /* turn on pmap_growkernel interface */ #define PMAP_FORK /* turn on pmap_fork interface */ Index: src/sys/arch/x86/x86/pmap.c diff -u src/sys/arch/x86/x86/pmap.c:1.85 src/sys/arch/x86/x86/pmap.c:1.86 --- src/sys/arch/x86/x86/pmap.c:1.85 Thu Apr 23 12:18:41 2009 +++ src/sys/arch/x86/x86/pmap.c Sun Jun 28 15:18:50 2009 @@ -1,4 +1,4 @@ -/* $NetBSD: pmap.c,v 1.85 2009/04/23 12:18:41 cegger Exp $ */ +/* $NetBSD: pmap.c,v 1.86 2009/06/28 15:18:50 rmind Exp $ */ /* * Copyright (c) 2007 Manuel Bouyer. 
@@ -154,7 +154,7 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.85 2009/04/23 12:18:41 cegger Exp $"); +__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.86 2009/06/28 15:18:50 rmind Exp $"); #include "opt_user_ldt.h" #include "opt_lockdebug.h" @@ -1056,6 +1056,66 @@ } } +void +pmap_emap_enter(vaddr_t va, paddr_t pa, vm_prot_t prot) +{ + pt_entry_t *pte, opte, npte; + + KASSERT((prot & ~VM_PROT_ALL) == 0); + pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va); + +#ifdef DOM0OPS + if (pa < pmap_pa_start || pa >= pmap_pa_end) { + npte = pa; + } else +#endif + npte = pmap_pa2pte(pa); + + npte = pmap_pa2pte(pa); + npte |= protection_codes[prot] | PG_k | PG_V; + opte = pmap_pte_testset(pte, npte); +} + +/* + * pmap_emap_sync: perform TLB flush or pmap load, if it was deferred. + */ +void +pmap_emap_sync(void) +{ + struct cpu_info *ci = curcpu(); + struct pmap *pmap; + + KASSERT(kpreempt_disabled()); + if (__predict_true(ci->ci_want_pmapload)) { + /* + * XXX: Hint for pmap_reactivate(), which might suggest to + * not perform TLB flush, if state has not changed. + */ + pmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map); + if (__predict_false(pmap == ci->ci_pmap)) { + const uint32_t cpumask = ci->ci_cpumask; + atomic_and_32(&pmap->pm_cpus, ~cpumask); + } + pmap_load(); + KASSERT(ci->ci_want_pmapload == 0); + } else { + tlbflush(); + } + +} + +void +pmap_emap_remove(vaddr_t sva, vsize_t len) +{ + pt_entry_t *pte, xpte; + vaddr_t va, eva = sva + len; + + for (va = sva; va < eva; va += PAGE_SIZE) { + pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va); + xpte |= pmap_pte_testset(pte, 0); + } +} + #ifdef XEN /* * pmap_kenter_ma: enter a kernel mapping without R/M (pv_entry) tracking @@ -2609,6 +2669,7 @@ if (pmap == oldpmap) { if (!pmap_reactivate(pmap)) { + u_int gen = uvm_emap_gen_return(); /* * pmap has been changed during deactivated. @@ -2616,6 +2677,7 @@ */ tlbflush(); + uvm_emap_update(gen); } ci->ci_want_pmapload = 0; @@ -2732,7 +2794,11 @@ splx(s); } #else /* PAE */ + { + u_int gen = uvm_emap_gen_return(); lcr3(pcb->pcb_cr3); + uvm_emap_update(gen); + } #endif /* PAE */ #endif /* XEN && x86_64 */ @@ -4648,10 +4714,13 @@ return; if (sva == (vaddr_t)-1LL) { - if (pte != 0) + u_int gen = uvm_emap_gen_return(); + if (pte != 0) { tlbflushg(); - else + } else { tlbflush(); + } + uvm_emap_update(gen); } else { do { pmap_update_pg(sva); Index: src/sys/kern/kern_synch.c diff -u src/sys/kern/kern_synch.c:1.264 src/sys/kern/kern_synch.c:1.265 --- src/sys/kern/kern_synch.c:1.264 Thu Apr 16 21:19:23 2009 +++ src/sys/kern/kern_synch.c Sun Jun 28 15:18:50 2009 @@ -1,4 +1,4 @@ -/* $NetBSD: kern_synch.c,v 1.264 2009/04/16 21:19:23 ad Exp $ */ +/* $NetBSD: kern_synch.c,v 1.265 2009/06/28 15:18:50 rmind Exp $ */ /*- * Copyright (c) 1999, 2000, 2004, 2006, 2007, 2008, 2009 @@ -69,7 +69,7 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.264 2009/04/16 21:19:23 ad Exp $"); +__KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.265 2009/06/28 15:18:50 rmind Exp $"); #include "opt_kstack.h" #include "opt_perfctrs.h" @@ -767,6 +767,8 @@ * Restore VM context and IPL. 
*/ pmap_activate(l); + uvm_emap_switch(l); + if (prevlwp != NULL) { /* Normalize the count of the spin-mutexes */ ci->ci_mtx_count++; Index: src/sys/kern/sys_pipe.c diff -u src/sys/kern/sys_pipe.c:1.114 src/sys/kern/sys_pipe.c:1.115 --- src/sys/kern/sys_pipe.c:1.114 Sun Jun 28 14:34:48 2009 +++ src/sys/kern/sys_pipe.c Sun Jun 28 15:18:50 2009 @@ -1,4 +1,4 @@ -/* $NetBSD: sys_pipe.c,v 1.114 2009/06/28 14:34:48 rmind Exp $ */ +/* $NetBSD: sys_pipe.c,v 1.115 2009/06/28 15:18:50 rmind Exp $ */ /*- * Copyright (c) 2003, 2007, 2008, 2009 The NetBSD Foundation, Inc. @@ -68,7 +68,7 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: sys_pipe.c,v 1.114 2009/06/28 14:34:48 rmind Exp $"); +__KERNEL_RCSID(0, "$NetBSD: sys_pipe.c,v 1.115 2009/06/28 15:18:50 rmind Exp $"); #include <sys/param.h> #include <sys/systm.h> @@ -94,7 +94,11 @@ #include <uvm/uvm.h> -/* Use this define if you want to disable *fancy* VM things. */ +/* + * Use this to disable direct I/O and decrease the code size: + * #define PIPE_NODIRECT + */ + /* XXX Disabled for now; rare hangs switching between direct/buffered */ #define PIPE_NODIRECT @@ -500,6 +504,7 @@ * Direct copy, bypassing a kernel buffer. */ void *va; + u_int gen; KASSERT(rpipe->pipe_state & PIPE_DIRECTW); @@ -508,8 +513,15 @@ size = uio->uio_resid; va = (char *)rpipe->pipe_map.kva + rpipe->pipe_map.pos; + gen = rpipe->pipe_map.egen; mutex_exit(lock); + + /* + * Consume emap and read the data from loaned pages. + */ + uvm_emap_consume(gen); error = uiomove(va, size, uio); + mutex_enter(lock); if (error) break; @@ -635,6 +647,7 @@ vsize_t len; len = (vsize_t)wpipe->pipe_map.npages << PAGE_SHIFT; + uvm_emap_remove(wpipe->pipe_map.kva, len); /* XXX */ uvm_km_free(kernel_map, wpipe->pipe_map.kva, len, UVM_KMF_VAONLY); wpipe->pipe_map.kva = 0; atomic_add_int(&amountpipekva, -len); @@ -656,10 +669,10 @@ static int pipe_direct_write(file_t *fp, struct pipe *wpipe, struct uio *uio) { - int error, npages, j; struct vm_page **pgs; - vaddr_t bbase, kva, base, bend; + vaddr_t bbase, base, bend; vsize_t blen, bcnt; + int error, npages; voff_t bpos; kmutex_t *lock = wpipe->pipe_lock; @@ -713,12 +726,9 @@ return (ENOMEM); /* so that caller fallback to ordinary write */ } - /* Enter the loaned pages to kva */ - kva = wpipe->pipe_map.kva; - for (j = 0; j < npages; j++, kva += PAGE_SIZE) { - pmap_kenter_pa(kva, VM_PAGE_TO_PHYS(pgs[j]), VM_PROT_READ); - } - pmap_update(pmap_kernel()); + /* Enter the loaned pages to KVA, produce new emap generation number. */ + uvm_emap_enter(wpipe->pipe_map.kva, pgs, npages); + wpipe->pipe_map.egen = uvm_emap_produce(); /* Now we can put the pipe in direct write mode */ wpipe->pipe_map.pos = bpos; @@ -760,8 +770,7 @@ mutex_exit(lock); if (pgs != NULL) { - pmap_kremove(wpipe->pipe_map.kva, blen); - pmap_update(pmap_kernel()); + /* XXX: uvm_emap_remove */ uvm_unloan(pgs, npages, UVM_LOAN_TOPAGE); } if (error || amountpipekva > maxpipekva) Index: src/sys/sys/lwp.h diff -u src/sys/sys/lwp.h:1.119 src/sys/sys/lwp.h:1.120 --- src/sys/sys/lwp.h:1.119 Wed May 27 12:08:35 2009 +++ src/sys/sys/lwp.h Sun Jun 28 15:18:50 2009 @@ -1,4 +1,4 @@ -/* $NetBSD: lwp.h,v 1.119 2009/05/27 12:08:35 yamt Exp $ */ +/* $NetBSD: lwp.h,v 1.120 2009/06/28 15:18:50 rmind Exp $ */ /*- * Copyright (c) 2001, 2006, 2007, 2008, 2009 The NetBSD Foundation, Inc. 
@@ -121,6 +121,7 @@ int l_sleeperr; /* !: error before unblock */ u_int l_slptime; /* l: time since last blocked */ callout_t l_timeout_ch; /* !: callout for tsleep */ + u_int l_emap_gen; /* !: emap generation number */ /* Process level and global state, misc. */ LIST_ENTRY(lwp) l_list; /* a: entry on list of all LWPs */ Index: src/sys/sys/pipe.h diff -u src/sys/sys/pipe.h:1.27 src/sys/sys/pipe.h:1.28 --- src/sys/sys/pipe.h:1.27 Sat Apr 11 15:46:18 2009 +++ src/sys/sys/pipe.h Sun Jun 28 15:18:50 2009 @@ -1,4 +1,4 @@ -/* $NetBSD: pipe.h,v 1.27 2009/04/11 15:46:18 christos Exp $ */ +/* $NetBSD: pipe.h,v 1.28 2009/06/28 15:18:50 rmind Exp $ */ /* * Copyright (c) 1996 John S. Dyson @@ -81,6 +81,7 @@ voff_t pos; /* current position within page */ int npages; /* how many pages allocated */ struct vm_page **pgs; /* pointers to the pages */ + u_int egen; /* emap generation number */ }; /* Index: src/sys/uvm/files.uvm diff -u src/sys/uvm/files.uvm:1.13 src/sys/uvm/files.uvm:1.14 --- src/sys/uvm/files.uvm:1.13 Sun Mar 29 10:51:53 2009 +++ src/sys/uvm/files.uvm Sun Jun 28 15:18:50 2009 @@ -1,4 +1,4 @@ -# $NetBSD: files.uvm,v 1.13 2009/03/29 10:51:53 ad Exp $ +# $NetBSD: files.uvm,v 1.14 2009/06/28 15:18:50 rmind Exp $ # # UVM options @@ -17,6 +17,7 @@ file uvm/uvm_bio.c file uvm/uvm_coredump.c coredump file uvm/uvm_device.c +file uvm/uvm_emap.c file uvm/uvm_fault.c file uvm/uvm_glue.c file uvm/uvm_init.c Index: src/sys/uvm/uvm.h diff -u src/sys/uvm/uvm.h:1.55 src/sys/uvm/uvm.h:1.56 --- src/sys/uvm/uvm.h:1.55 Wed Jun 4 15:06:04 2008 +++ src/sys/uvm/uvm.h Sun Jun 28 15:18:50 2009 @@ -1,4 +1,4 @@ -/* $NetBSD: uvm.h,v 1.55 2008/06/04 15:06:04 ad Exp $ */ +/* $NetBSD: uvm.h,v 1.56 2009/06/28 15:18:50 rmind Exp $ */ /* * @@ -84,6 +84,7 @@ bool page_idle_zero; /* TRUE if we should try to zero pages in the idle loop */ int pages[PGFL_NQUEUES]; /* total of pages in page_free */ + u_int emap_gen; /* emap generation number */ }; /* Index: src/sys/uvm/uvm_extern.h diff -u src/sys/uvm/uvm_extern.h:1.154 src/sys/uvm/uvm_extern.h:1.155 --- src/sys/uvm/uvm_extern.h:1.154 Mon Mar 30 16:36:36 2009 +++ src/sys/uvm/uvm_extern.h Sun Jun 28 15:18:50 2009 @@ -1,4 +1,4 @@ -/* $NetBSD: uvm_extern.h,v 1.154 2009/03/30 16:36:36 yamt Exp $ */ +/* $NetBSD: uvm_extern.h,v 1.155 2009/06/28 15:18:50 rmind Exp $ */ /* * @@ -226,6 +226,11 @@ #define UBC_MAX_PAGES 8 /* + * Value representing inactive emap. 
+ */ +#define UVM_EMAP_INACTIVE (0) + +/* * structures */ @@ -567,6 +572,31 @@ int ubc_uiomove(struct uvm_object *, struct uio *, vsize_t, int, int); +/* uvm_emap.c */ +void uvm_emap_sysinit(void); +#ifdef __HAVE_PMAP_EMAP +void uvm_emap_switch(lwp_t *); +#else +#define uvm_emap_switch(l) +#endif + +u_int uvm_emap_gen_return(void); +void uvm_emap_update(u_int); + +vaddr_t uvm_emap_alloc(vsize_t, bool); +void uvm_emap_free(vaddr_t, size_t); + +void uvm_emap_enter(vaddr_t, struct vm_page **, u_int); +void uvm_emap_remove(vaddr_t, vsize_t); + +#ifdef __HAVE_PMAP_EMAP +void uvm_emap_consume(u_int); +u_int uvm_emap_produce(void); +#else +#define uvm_emap_consume(x) +#define uvm_emap_produce() UVM_EMAP_INACTIVE +#endif + /* uvm_fault.c */ #define uvm_fault(m, a, p) uvm_fault_internal(m, a, p, 0) int uvm_fault_internal(struct vm_map *, vaddr_t, vm_prot_t, int); Index: src/sys/uvm/uvm_glue.c diff -u src/sys/uvm/uvm_glue.c:1.137 src/sys/uvm/uvm_glue.c:1.138 --- src/sys/uvm/uvm_glue.c:1.137 Thu Apr 16 00:17:19 2009 +++ src/sys/uvm/uvm_glue.c Sun Jun 28 15:18:51 2009 @@ -1,4 +1,4 @@ -/* $NetBSD: uvm_glue.c,v 1.137 2009/04/16 00:17:19 rmind Exp $ */ +/* $NetBSD: uvm_glue.c,v 1.138 2009/06/28 15:18:51 rmind Exp $ */ /* * Copyright (c) 1997 Charles D. Cranor and Washington University. @@ -67,7 +67,7 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: uvm_glue.c,v 1.137 2009/04/16 00:17:19 rmind Exp $"); +__KERNEL_RCSID(0, "$NetBSD: uvm_glue.c,v 1.138 2009/06/28 15:18:51 rmind Exp $"); #include "opt_kgdb.h" #include "opt_kstack.h" @@ -264,6 +264,9 @@ * the specified entry point will be executed. */ cpu_lwp_fork(l1, l2, stack, stacksize, func, arg); + + /* Inactive emap for new LWP. */ + l2->l_emap_gen = UVM_EMAP_INACTIVE; } static int Index: src/sys/uvm/uvm_init.c diff -u src/sys/uvm/uvm_init.c:1.34 src/sys/uvm/uvm_init.c:1.35 --- src/sys/uvm/uvm_init.c:1.34 Sat Oct 18 03:46:22 2008 +++ src/sys/uvm/uvm_init.c Sun Jun 28 15:18:51 2009 @@ -1,4 +1,4 @@ -/* $NetBSD: uvm_init.c,v 1.34 2008/10/18 03:46:22 rmind Exp $ */ +/* $NetBSD: uvm_init.c,v 1.35 2009/06/28 15:18:51 rmind Exp $ */ /* * @@ -39,7 +39,7 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: uvm_init.c,v 1.34 2008/10/18 03:46:22 rmind Exp $"); +__KERNEL_RCSID(0, "$NetBSD: uvm_init.c,v 1.35 2009/06/28 15:18:51 rmind Exp $"); #include <sys/param.h> #include <sys/systm.h> @@ -167,6 +167,12 @@ kmem_init(); /* + * init emap subsystem. + */ + + uvm_emap_sysinit(); + + /* * the VM system is now up! now that kmem is up we can resize the * <obj,off> => <page> hash table for general use and enable paging * of kernel objects. Added files: Index: src/sys/uvm/uvm_emap.c diff -u /dev/null src/sys/uvm/uvm_emap.c:1.1 --- /dev/null Sun Jun 28 15:18:51 2009 +++ src/sys/uvm/uvm_emap.c Sun Jun 28 15:18:50 2009 @@ -0,0 +1,360 @@ +/* $NetBSD: uvm_emap.c,v 1.1 2009/06/28 15:18:50 rmind Exp $ */ + +/*- + * Copyright (c) 2009 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Mindaugas Rasiukevicius and Andrew Doran. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * UVM ephemeral mapping interface. + * + * Generic (more expensive) stubs are implemented for architectures which + * do not support pmap. + * + * Note that uvm_emap_update() is called from lower pmap(9) layer, while + * other functions call to pmap(9). Typical pattern of update in pmap: + * + * u_int gen = uvm_emap_gen_return(); + * tlbflush(); + * uvm_emap_update(); + * + * It is also used from IPI context, therefore functions must safe. + */ + +#include <sys/cdefs.h> +__KERNEL_RCSID(0, "$NetBSD: uvm_emap.c,v 1.1 2009/06/28 15:18:50 rmind Exp $"); + +#include <sys/param.h> +#include <sys/kernel.h> + +#include <sys/atomic.h> +#include <sys/lwp.h> +#include <sys/vmem.h> +#include <sys/types.h> + +#include <uvm/uvm.h> +#include <uvm/uvm_extern.h> + +/* XXX: Arbitrary. */ +#ifdef _LP64 +#define UVM_EMAP_SIZE (128 * 1024 * 1024) /* 128 MB */ +#else +#define UVM_EMAP_SIZE (32 * 1024 * 1024) /* 32 MB */ +#endif + +static u_int _uvm_emap_gen[COHERENCY_UNIT - sizeof(u_int)] + __aligned(COHERENCY_UNIT); + +#define uvm_emap_gen (_uvm_emap_gen[0]) + +static u_int uvm_emap_size = UVM_EMAP_SIZE; +static vaddr_t uvm_emap_va; +static vmem_t * uvm_emap_vmem; + +/* + * uvm_emap_init: initialize subsystem. + */ +void +uvm_emap_sysinit(void) +{ + size_t qmax; + + uvm_emap_size = roundup(uvm_emap_size, PAGE_SIZE); + qmax = 16 * PAGE_SIZE; + + uvm_emap_va = uvm_km_alloc(kernel_map, uvm_emap_size, 0, + UVM_KMF_VAONLY | UVM_KMF_WAITVA); + if (uvm_emap_va == 0) { + panic("uvm_emap_init: KVA allocation failed"); + } + + uvm_emap_vmem = vmem_create("emap", uvm_emap_va, uvm_emap_size, + PAGE_SIZE, NULL, NULL, NULL, qmax, VM_SLEEP, IPL_NONE); + if (uvm_emap_vmem == NULL) { + panic("uvm_emap_init: vmem creation failed"); + } + + uvm_emap_gen = 1; +} + +/* + * uvm_emap_alloc: allocate a window. + */ +vaddr_t +uvm_emap_alloc(vsize_t size, bool waitok) +{ + + KASSERT(size > 0); + KASSERT(round_page(size) == size); + + return vmem_alloc(uvm_emap_vmem, size, + VM_INSTANTFIT | (waitok ? VM_SLEEP : VM_NOSLEEP)); +} + +/* + * uvm_emap_free: free a window. + */ +void +uvm_emap_free(vaddr_t va, size_t size) +{ + + KASSERT(va >= uvm_emap_va); + KASSERT(size <= uvm_emap_size); + KASSERT(va + size <= uvm_emap_va + uvm_emap_size); + + vmem_free(uvm_emap_vmem, va, size); +} + +#ifdef __HAVE_PMAP_EMAP + +/* + * uvm_emap_enter: enter a new mapping, without TLB flush. 
+ */ +void +uvm_emap_enter(vaddr_t va, struct vm_page **pgs, u_int npages) +{ + paddr_t pa; + u_int n; + + for (n = 0; n < npages; n++, va += PAGE_SIZE) { + pa = VM_PAGE_TO_PHYS(pgs[n]); + pmap_emap_enter(va, pa, VM_PROT_READ); + } +} + +/* + * uvm_emap_remove: remove a mapping. + */ +void +uvm_emap_remove(vaddr_t sva, vsize_t len) +{ + + pmap_emap_remove(sva, len); +} + +/* + * uvm_emap_gen_return: get the global generation number. + * + * => can be called from IPI handler, therefore function must be safe. + */ +u_int +uvm_emap_gen_return(void) +{ + u_int gen; + + gen = uvm_emap_gen; + if (__predict_false(gen == UVM_EMAP_INACTIVE)) { + /* + * Instead of looping, just increase in our side. + * Other thread could race and increase it again, + * but without any negative effect. + */ + gen = atomic_inc_uint_nv(&uvm_emap_gen); + } + KASSERT(gen != UVM_EMAP_INACTIVE); + return gen; +} + +/* + * uvm_emap_switch: if the CPU is 'behind' the LWP in emap visibility, + * perform TLB flush and thus update the local view. Main purpose is + * to handle kernel preemption, while emap is in use. + * + * => called from mi_switch(), when LWP returns after block or preempt. + */ +void +uvm_emap_switch(lwp_t *l) +{ + struct uvm_cpu *ucpu; + u_int curgen, gen; + + KASSERT(kpreempt_disabled()); + + /* If LWP did not use emap, then nothing to do. */ + if (__predict_true(l->l_emap_gen == UVM_EMAP_INACTIVE)) { + return; + } + + /* + * No need to synchronise if generation number of current CPU is + * newer than the number of this LWP. + * + * This test assumes two's complement arithmetic and allows + * ~2B missed updates before it will produce bad results. + */ + ucpu = curcpu()->ci_data.cpu_uvm; + curgen = ucpu->emap_gen; + gen = l->l_emap_gen; + if (__predict_true((signed int)(curgen - gen) >= 0)) { + return; + } + + /* + * See comments in uvm_emap_consume() about memory + * barriers and race conditions. + */ + curgen = uvm_emap_gen_return(); + pmap_emap_sync(); + ucpu->emap_gen = curgen; +} + +/* + * uvm_emap_consume: update the current CPU and LWP to the given generation + * of the emap. In a case of LWP migration to a different CPU after block + * or preempt, uvm_emap_switch() will synchronise. + * + * => may be called from both interrupt and thread context. + */ +void +uvm_emap_consume(u_int gen) +{ + struct cpu_info *ci; + struct uvm_cpu *ucpu; + lwp_t *l = curlwp; + u_int curgen; + + if (gen == UVM_EMAP_INACTIVE) { + return; + } + + /* + * No need to synchronise if generation number of current CPU is + * newer than the number of this LWP. + * + * This test assumes two's complement arithmetic and allows + * ~2B missed updates before it will produce bad results. + */ + KPREEMPT_DISABLE(l); + ci = l->l_cpu; + ucpu = ci->ci_data.cpu_uvm; + if (__predict_true((signed int)(ucpu->emap_gen - gen) >= 0)) { + l->l_emap_gen = ucpu->emap_gen; + KPREEMPT_ENABLE(l); + return; + } + + /* + * Record the current generation _before_ issuing the TLB flush. + * No need for a memory barrier before, as reading a stale value + * for uvm_emap_gen is not a problem. + * + * pmap_emap_sync() must implicitly perform a full memory barrier, + * which prevents us from fetching a value from after the TLB flush + * has occurred (which would be bad). + * + * We can race with an interrupt on the current CPU updating the + * counter to a newer value. This could cause us to set a stale + * value into ucpu->emap_gen, overwriting a newer update from the + * interrupt. 
However, it does not matter since: + * (1) Interrupts always run to completion or block. + * (2) Interrupts will only ever install a newer value and, + * (3) We will roll the value forward later. + */ + curgen = uvm_emap_gen_return(); + pmap_emap_sync(); + ucpu->emap_gen = curgen; + l->l_emap_gen = curgen; + KASSERT((signed int)(curgen - gen) >= 0); + KPREEMPT_ENABLE(l); +} + +/* + * uvm_emap_produce: increment emap generation counter. + * + * => pmap updates must be globally visible. + * => caller must have already entered mappings. + * => may be called from both interrupt and thread context. + */ +u_int +uvm_emap_produce(void) +{ + u_int gen; +again: + gen = atomic_inc_uint_nv(&uvm_emap_gen); + if (__predict_false(gen == UVM_EMAP_INACTIVE)) { + goto again; + } + return gen; +} + +/* + * uvm_emap_update: update global emap generation number for current CPU. + * + * Function is called by MD code (eg. pmap) to take advantage of TLB flushes + * initiated for other reasons, that sync the emap as a side effect. Note + * update should be performed before the actual TLB flush, to avoid race + * with newly generated number. + * + * => can be called from IPI handler, therefore function must be safe. + * => should be called _after_ TLB flush. + * => emap generation number should be taken _before_ TLB flush. + * => must be called with preemption disabled. + */ +void +uvm_emap_update(u_int gen) +{ + struct uvm_cpu *ucpu; + + /* + * See comments in uvm_emap_consume() about memory barriers and + * race conditions. Store is atomic if emap_gen size is word. + */ + CTASSERT(sizeof(ucpu->emap_gen) == sizeof(int)); + /* XXX: KASSERT(kpreempt_disabled()); */ + + ucpu = curcpu()->ci_data.cpu_uvm; + ucpu->emap_gen = gen; +} + +#else + +/* + * Stubs for architectures which do not support emap. + */ + +void +uvm_emap_enter(vaddr_t va, struct vm_page **pgs, u_int npages) +{ + paddr_t pa; + u_int n; + + for (n = 0; n < npages; n++, va += PAGE_SIZE) { + pa = VM_PAGE_TO_PHYS(pgs[n]); + pmap_kenter_pa(va, pa, VM_PROT_READ); + } + pmap_update(pmap_kernel()); +} + +void +uvm_emap_remove(vaddr_t sva, vsize_t len) +{ + + pmap_kremove(sva, len); + pmap_update(pmap_kernel()); +} + +#endif