Module Name: src
Committed By: rmind
Date: Sun Jun 28 15:18:51 UTC 2009
Modified Files:
src/sys/arch/amd64/amd64: vector.S
src/sys/arch/i386/i386: vector.S
src/sys/arch/x86/include: pmap.h
src/sys/arch/x86/x86: pmap.c
src/sys/kern: kern_synch.c sys_pipe.c
src/sys/sys: lwp.h pipe.h
src/sys/uvm: files.uvm uvm.h uvm_extern.h uvm_glue.c uvm_init.c
Added Files:
src/sys/uvm: uvm_emap.c
Log Message:
Ephemeral mapping (emap) implementation. Concept is based on the idea that
activity of other threads will perform the TLB flush for the processes using
emap as a side effect. To track that, global and per-CPU generation numbers
are used. This idea was suggested by Andrew Doran; various improvements to
it by me. Notes:
- For now, zero-copy on pipe is not yet enabled.
- TCP socket code would likely need more work.
- Additional UVM loaning improvements are needed.
Proposed on <tech-kern>, silence there.
Quickly reviewed by <ad>.
To generate a diff of this commit:
cvs rdiff -u -r1.29 -r1.30 src/sys/arch/amd64/amd64/vector.S
cvs rdiff -u -r1.45 -r1.46 src/sys/arch/i386/i386/vector.S
cvs rdiff -u -r1.24 -r1.25 src/sys/arch/x86/include/pmap.h
cvs rdiff -u -r1.85 -r1.86 src/sys/arch/x86/x86/pmap.c
cvs rdiff -u -r1.264 -r1.265 src/sys/kern/kern_synch.c
cvs rdiff -u -r1.114 -r1.115 src/sys/kern/sys_pipe.c
cvs rdiff -u -r1.119 -r1.120 src/sys/sys/lwp.h
cvs rdiff -u -r1.27 -r1.28 src/sys/sys/pipe.h
cvs rdiff -u -r1.13 -r1.14 src/sys/uvm/files.uvm
cvs rdiff -u -r1.55 -r1.56 src/sys/uvm/uvm.h
cvs rdiff -u -r0 -r1.1 src/sys/uvm/uvm_emap.c
cvs rdiff -u -r1.154 -r1.155 src/sys/uvm/uvm_extern.h
cvs rdiff -u -r1.137 -r1.138 src/sys/uvm/uvm_glue.c
cvs rdiff -u -r1.34 -r1.35 src/sys/uvm/uvm_init.c
Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
Modified files:
Index: src/sys/arch/amd64/amd64/vector.S
diff -u src/sys/arch/amd64/amd64/vector.S:1.29 src/sys/arch/amd64/amd64/vector.S:1.30
--- src/sys/arch/amd64/amd64/vector.S:1.29 Tue Nov 25 16:25:29 2008
+++ src/sys/arch/amd64/amd64/vector.S Sun Jun 28 15:18:50 2009
@@ -1,4 +1,4 @@
-/* $NetBSD: vector.S,v 1.29 2008/11/25 16:25:29 ad Exp $ */
+/* $NetBSD: vector.S,v 1.30 2009/06/28 15:18:50 rmind Exp $ */
/*-
* Copyright (c) 1998, 2007, 2008 The NetBSD Foundation, Inc.
@@ -452,8 +452,12 @@
0:
pushq %rax
pushq %rbx
- pushq %rcx
+ pushq %rdi
+ pushq %rsi
pushq %rdx
+ pushq %rcx
+ pushq %r8
+ pushq %r9
incq CPUVAR(TLB_EVCNT)+EV_COUNT
/* Find out what needs to be invalidated and unlock the mailbox. */
movq CPUVAR(PMAP_CPU),%rcx
@@ -489,8 +493,12 @@
movl $TLBSTATE_STALE, CPUVAR(TLBSTATE)
3:
/* Restore state and return. */
- popq %rdx
+ popq %r9
+ popq %r8
popq %rcx
+ popq %rdx
+ popq %rsi
+ popq %rdi
popq %rbx
popq %rax
testq $SEL_UPL, 8(%rsp)
@@ -499,9 +507,17 @@
4:
iretq
5:
- /* Invalidate all user pages. */
+ /*
+ * Get the emap generation number. Invalidate user TLB entries.
+ * Perform emap update, pass the generation number. Note that
+ * caller-save registers might be modified (all saved in the
+ * beginning). Only %rbx value is used by 2b context.
+ */
+ callq _C_LABEL(uvm_emap_gen_return)
+ movq %rax, %rdi
movq %cr3, %rax
movq %rax, %cr3
+ callq _C_LABEL(uvm_emap_update)
jmp 2b
/*
@@ -510,12 +526,16 @@
IDTVEC(intr_lapic_tlb_bcast)
/* Save state. */
pushq %rax
- pushq %rbx
+ pushq %rdi
+ pushq %rsi
pushq %rdx
+ pushq %rcx
+ pushq %r8
+ pushq %r9
/* Find out what needs to be invalidated. */
movq _C_LABEL(pmap_mbox)+MB_ADDR1, %rax
movq _C_LABEL(pmap_mbox)+MB_ADDR2, %rdx
- movq _C_LABEL(pmap_mbox)+MB_GLOBAL, %rbx
+ movq _C_LABEL(pmap_mbox)+MB_GLOBAL, %rdi
movl $0, _C_LABEL(local_apic)+LAPIC_EOI
cmpq $-1, %rax
je,pn 3f
@@ -529,27 +549,44 @@
/* Notify waiter of completion, restore state & return */
lock
incq _C_LABEL(pmap_mbox)+MB_TAIL
+ popq %r9
+ popq %r8
+ popq %rcx
popq %rdx
- popq %rbx
+ popq %rsi
+ popq %rdi
popq %rax
iretq
3:
- testq %rbx, %rbx
+ testq %rdi, %rdi
jz 4f
/*
- * If we have been asked to invalidate the entire TLB
- * we arrive here.
+ * If we have been asked to invalidate the entire TLB we arrive here.
+ * Get the emap generation before flush, and use it after for update.
+ * Note that caller-save registers might be modified, though no
+ * registers need to be preserved for 2b context.
*/
+ callq _C_LABEL(uvm_emap_gen_return)
+ movq %rax, %rdi
movq %cr4, %rax
movq %rax, %rdx
andq $~CR4_PGE, %rdx
movq %rdx, %cr4
movq %rax, %cr4
+ callq _C_LABEL(uvm_emap_update)
jmp 2b
4:
- /* Invalidate user TLB entries. */
+ /*
+ * Get the emap generation number. Invalidate user TLB entries.
+ * Perform emap update, pass the generation number. Note that
+ * caller-save registers might be modified, though no registers
+ * need to be preserved for 2b context.
+ */
+ callq _C_LABEL(uvm_emap_gen_return)
+ movq %rax, %rdi
movq %cr3, %rax
movq %rax, %cr3
+ callq _C_LABEL(uvm_emap_update)
jmp 2b
#endif /* !XEN */
Index: src/sys/arch/i386/i386/vector.S
diff -u src/sys/arch/i386/i386/vector.S:1.45 src/sys/arch/i386/i386/vector.S:1.46
--- src/sys/arch/i386/i386/vector.S:1.45 Sat Mar 21 14:41:29 2009
+++ src/sys/arch/i386/i386/vector.S Sun Jun 28 15:18:50 2009
@@ -1,4 +1,4 @@
-/* $NetBSD: vector.S,v 1.45 2009/03/21 14:41:29 ad Exp $ */
+/* $NetBSD: vector.S,v 1.46 2009/06/28 15:18:50 rmind Exp $ */
/*
* Copyright 2002 (c) Wasabi Systems, Inc.
@@ -65,7 +65,7 @@
*/
#include <machine/asm.h>
-__KERNEL_RCSID(0, "$NetBSD: vector.S,v 1.45 2009/03/21 14:41:29 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: vector.S,v 1.46 2009/06/28 15:18:50 rmind Exp $");
#include "opt_ddb.h"
#include "opt_multiprocessor.h"
@@ -227,9 +227,19 @@
popl %eax
iret
4:
- /* Invalidate all user pages. */
+ /*
+ * Get the emap generation number. Invalidate user TLB entries.
+ * Perform emap update, pass the generation number. Note that
+ * caller-save registers might be modified (all saved in the
+ * beginning). Only %ebx value is used by 2b context.
+ */
+ call _C_LABEL(uvm_emap_gen_return)
+ movl %eax, %edx
movl %cr3, %eax
movl %eax, %cr3
+ pushl %edx
+ call _C_LABEL(uvm_emap_update)
+ addl $4, %esp
jmp 2b
IDTVEC_END(intr_lapic_tlb_mcast)
@@ -240,6 +250,7 @@
/* Save state and ack the interrupt. */
pushl %eax
pushl %ebx
+ pushl %ecx
pushl %edx
/* Find out what we need to invalidate. */
movl %ss:_C_LABEL(pmap_mbox)+MB_ADDR1, %eax
@@ -259,6 +270,7 @@
lock
incl %ss:_C_LABEL(pmap_mbox)+MB_TAIL
popl %edx
+ popl %ecx
popl %ebx
popl %eax
iret
@@ -266,19 +278,36 @@
testl %ebx, %ebx
jz 4f
/*
- * If the CPU understands global pages and we have been asked
- * to invalidate the entire TLB we arrive here.
+ * If we have been asked to invalidate the entire TLB we arrive here.
+ * Get the emap generation before flush, and use it after for update.
+ * Note that caller-save registers might be modified, though no
+ * registers need to be preserved for 2b context.
*/
+ call _C_LABEL(uvm_emap_gen_return)
+ movl %eax, %ebx
movl %cr4, %eax
movl %eax, %edx
andl $~CR4_PGE, %edx
movl %edx, %cr4
movl %eax, %cr4
+ pushl %ebx
+ call _C_LABEL(uvm_emap_update)
+ addl $4, %esp
jmp 2b
4:
- /* Invalidate user TLB entries. */
+ /*
+ * Get the emap generation number. Invalidate user TLB entries.
+ * Perform emap update, pass the generation number. Note that
+ * caller-save registers might be modified, though no registers
+ * need to be preserved for 2b context.
+ */
+ call _C_LABEL(uvm_emap_gen_return)
+ movl %eax, %ebx
movl %cr3, %eax
movl %eax, %cr3
+ pushl %ebx
+ call _C_LABEL(uvm_emap_update)
+ addl $4, %esp
jmp 2b
IDTVEC_END(intr_lapic_tlb_bcast)
Index: src/sys/arch/x86/include/pmap.h
diff -u src/sys/arch/x86/include/pmap.h:1.24 src/sys/arch/x86/include/pmap.h:1.25
--- src/sys/arch/x86/include/pmap.h:1.24 Wed Apr 22 10:17:48 2009
+++ src/sys/arch/x86/include/pmap.h Sun Jun 28 15:18:50 2009
@@ -1,4 +1,4 @@
-/* $NetBSD: pmap.h,v 1.24 2009/04/22 10:17:48 cegger Exp $ */
+/* $NetBSD: pmap.h,v 1.25 2009/06/28 15:18:50 rmind Exp $ */
/*
*
@@ -224,11 +224,17 @@
void pmap_remove_all(struct pmap *);
void pmap_ldt_sync(struct pmap *);
+void pmap_emap_enter(vaddr_t, paddr_t, vm_prot_t);
+void pmap_emap_remove(vaddr_t, vsize_t);
+void pmap_emap_sync(void);
+
vaddr_t reserve_dumppages(vaddr_t); /* XXX: not a pmap fn */
void pmap_tlb_shootdown(pmap_t, vaddr_t, vaddr_t, pt_entry_t);
void pmap_tlb_shootwait(void);
+#define __HAVE_PMAP_EMAP
+
#define PMAP_GROWKERNEL /* turn on pmap_growkernel interface */
#define PMAP_FORK /* turn on pmap_fork interface */
Index: src/sys/arch/x86/x86/pmap.c
diff -u src/sys/arch/x86/x86/pmap.c:1.85 src/sys/arch/x86/x86/pmap.c:1.86
--- src/sys/arch/x86/x86/pmap.c:1.85 Thu Apr 23 12:18:41 2009
+++ src/sys/arch/x86/x86/pmap.c Sun Jun 28 15:18:50 2009
@@ -1,4 +1,4 @@
-/* $NetBSD: pmap.c,v 1.85 2009/04/23 12:18:41 cegger Exp $ */
+/* $NetBSD: pmap.c,v 1.86 2009/06/28 15:18:50 rmind Exp $ */
/*
* Copyright (c) 2007 Manuel Bouyer.
@@ -154,7 +154,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.85 2009/04/23 12:18:41 cegger Exp $");
+__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.86 2009/06/28 15:18:50 rmind Exp $");
#include "opt_user_ldt.h"
#include "opt_lockdebug.h"
@@ -1056,6 +1056,66 @@
}
}
+/*
+ * pmap_emap_enter: enter an ephemeral mapping of physical address 'pa'
+ * at virtual address 'va' with protection 'prot'.
+ *
+ * => only the PTE is written; no TLB invalidation is issued here.
+ * => 'prot' must not contain bits outside VM_PROT_ALL (asserted).
+ */
+void
+pmap_emap_enter(vaddr_t va, paddr_t pa, vm_prot_t prot)
+{
+	pt_entry_t *pte, npte;
+
+	KASSERT((prot & ~VM_PROT_ALL) == 0);
+	pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va);
+
+#ifdef DOM0OPS
+	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
+		/*
+		 * Address lies outside [pmap_pa_start, pmap_pa_end): use
+		 * it as is (presumably it is already a machine address).
+		 */
+		npte = pa;
+	} else
+#endif
+		npte = pmap_pa2pte(pa);
+
+	/*
+	 * Do not recompute npte from 'pa' here: that would discard the
+	 * value selected by the DOM0OPS branch above.
+	 */
+	npte |= protection_codes[prot] | PG_k | PG_V;
+
+	/* The previous PTE contents are not needed. */
+	(void)pmap_pte_testset(pte, npte);
+}
+
+/*
+ * pmap_emap_sync: perform TLB flush or pmap load, if it was deferred.
+ *
+ * If a pmap load is pending on this CPU (ci_want_pmapload), perform it
+ * via pmap_load(); otherwise flush the TLB directly with tlbflush().
+ *
+ * => must be called with kernel preemption disabled (asserted).
+ */
+void
+pmap_emap_sync(void)
+{
+ struct cpu_info *ci = curcpu();
+ struct pmap *pmap;
+
+ KASSERT(kpreempt_disabled());
+ if (__predict_true(ci->ci_want_pmapload)) {
+ /*
+ * XXX: Hint for pmap_reactivate(), which might suggest to
+ * not perform TLB flush, if state has not changed.
+ *
+ * When the current LWP's pmap is the one already recorded in
+ * ci_pmap, clear this CPU's bit in pm_cpus before loading --
+ * presumably so that the load does not skip the flush.
+ */
+ pmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map);
+ if (__predict_false(pmap == ci->ci_pmap)) {
+ const uint32_t cpumask = ci->ci_cpumask;
+ atomic_and_32(&pmap->pm_cpus, ~cpumask);
+ }
+ pmap_load();
+ /* pmap_load() is expected to have cleared the pending flag. */
+ KASSERT(ci->ci_want_pmapload == 0);
+ } else {
+ /* No pmap load pending: plain TLB flush. */
+ tlbflush();
+ }
+
+}
+
+/*
+ * pmap_emap_remove: clear the PTEs of the ephemeral mappings covering
+ * [sva, sva + len), one page at a time.
+ *
+ * => only the PTEs are zeroed; no TLB invalidation is issued here.
+ */
+void
+pmap_emap_remove(vaddr_t sva, vsize_t len)
+{
+	pt_entry_t *pte;
+	vaddr_t va, eva = sva + len;
+
+	for (va = sva; va < eva; va += PAGE_SIZE) {
+		pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va);
+		/*
+		 * The old PTE value is discarded.  (It used to be OR-ed
+		 * into a local that was neither initialised nor read.)
+		 */
+		(void)pmap_pte_testset(pte, 0);
+	}
+}
+
#ifdef XEN
/*
* pmap_kenter_ma: enter a kernel mapping without R/M (pv_entry) tracking
@@ -2609,6 +2669,7 @@
if (pmap == oldpmap) {
if (!pmap_reactivate(pmap)) {
+ u_int gen = uvm_emap_gen_return();
/*
* pmap has been changed during deactivated.
@@ -2616,6 +2677,7 @@
*/
tlbflush();
+ uvm_emap_update(gen);
}
ci->ci_want_pmapload = 0;
@@ -2732,7 +2794,11 @@
splx(s);
}
#else /* PAE */
+ {
+ u_int gen = uvm_emap_gen_return();
lcr3(pcb->pcb_cr3);
+ uvm_emap_update(gen);
+ }
#endif /* PAE */
#endif /* XEN && x86_64 */
@@ -4648,10 +4714,13 @@
return;
if (sva == (vaddr_t)-1LL) {
- if (pte != 0)
+ u_int gen = uvm_emap_gen_return();
+ if (pte != 0) {
tlbflushg();
- else
+ } else {
tlbflush();
+ }
+ uvm_emap_update(gen);
} else {
do {
pmap_update_pg(sva);
Index: src/sys/kern/kern_synch.c
diff -u src/sys/kern/kern_synch.c:1.264 src/sys/kern/kern_synch.c:1.265
--- src/sys/kern/kern_synch.c:1.264 Thu Apr 16 21:19:23 2009
+++ src/sys/kern/kern_synch.c Sun Jun 28 15:18:50 2009
@@ -1,4 +1,4 @@
-/* $NetBSD: kern_synch.c,v 1.264 2009/04/16 21:19:23 ad Exp $ */
+/* $NetBSD: kern_synch.c,v 1.265 2009/06/28 15:18:50 rmind Exp $ */
/*-
* Copyright (c) 1999, 2000, 2004, 2006, 2007, 2008, 2009
@@ -69,7 +69,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.264 2009/04/16 21:19:23 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.265 2009/06/28 15:18:50 rmind Exp $");
#include "opt_kstack.h"
#include "opt_perfctrs.h"
@@ -767,6 +767,8 @@
* Restore VM context and IPL.
*/
pmap_activate(l);
+ uvm_emap_switch(l);
+
if (prevlwp != NULL) {
/* Normalize the count of the spin-mutexes */
ci->ci_mtx_count++;
Index: src/sys/kern/sys_pipe.c
diff -u src/sys/kern/sys_pipe.c:1.114 src/sys/kern/sys_pipe.c:1.115
--- src/sys/kern/sys_pipe.c:1.114 Sun Jun 28 14:34:48 2009
+++ src/sys/kern/sys_pipe.c Sun Jun 28 15:18:50 2009
@@ -1,4 +1,4 @@
-/* $NetBSD: sys_pipe.c,v 1.114 2009/06/28 14:34:48 rmind Exp $ */
+/* $NetBSD: sys_pipe.c,v 1.115 2009/06/28 15:18:50 rmind Exp $ */
/*-
* Copyright (c) 2003, 2007, 2008, 2009 The NetBSD Foundation, Inc.
@@ -68,7 +68,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: sys_pipe.c,v 1.114 2009/06/28 14:34:48 rmind Exp $");
+__KERNEL_RCSID(0, "$NetBSD: sys_pipe.c,v 1.115 2009/06/28 15:18:50 rmind Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -94,7 +94,11 @@
#include <uvm/uvm.h>
-/* Use this define if you want to disable *fancy* VM things. */
+/*
+ * Use this to disable direct I/O and decrease the code size:
+ * #define PIPE_NODIRECT
+ */
+
/* XXX Disabled for now; rare hangs switching between direct/buffered */
#define PIPE_NODIRECT
@@ -500,6 +504,7 @@
* Direct copy, bypassing a kernel buffer.
*/
void *va;
+ u_int gen;
KASSERT(rpipe->pipe_state & PIPE_DIRECTW);
@@ -508,8 +513,15 @@
size = uio->uio_resid;
va = (char *)rpipe->pipe_map.kva + rpipe->pipe_map.pos;
+ gen = rpipe->pipe_map.egen;
mutex_exit(lock);
+
+ /*
+ * Consume emap and read the data from loaned pages.
+ */
+ uvm_emap_consume(gen);
error = uiomove(va, size, uio);
+
mutex_enter(lock);
if (error)
break;
@@ -635,6 +647,7 @@
vsize_t len;
len = (vsize_t)wpipe->pipe_map.npages << PAGE_SHIFT;
+ uvm_emap_remove(wpipe->pipe_map.kva, len); /* XXX */
uvm_km_free(kernel_map, wpipe->pipe_map.kva, len, UVM_KMF_VAONLY);
wpipe->pipe_map.kva = 0;
atomic_add_int(&amountpipekva, -len);
@@ -656,10 +669,10 @@
static int
pipe_direct_write(file_t *fp, struct pipe *wpipe, struct uio *uio)
{
- int error, npages, j;
struct vm_page **pgs;
- vaddr_t bbase, kva, base, bend;
+ vaddr_t bbase, base, bend;
vsize_t blen, bcnt;
+ int error, npages;
voff_t bpos;
kmutex_t *lock = wpipe->pipe_lock;
@@ -713,12 +726,9 @@
return (ENOMEM); /* so that caller fallback to ordinary write */
}
- /* Enter the loaned pages to kva */
- kva = wpipe->pipe_map.kva;
- for (j = 0; j < npages; j++, kva += PAGE_SIZE) {
- pmap_kenter_pa(kva, VM_PAGE_TO_PHYS(pgs[j]), VM_PROT_READ);
- }
- pmap_update(pmap_kernel());
+ /* Enter the loaned pages to KVA, produce new emap generation number. */
+ uvm_emap_enter(wpipe->pipe_map.kva, pgs, npages);
+ wpipe->pipe_map.egen = uvm_emap_produce();
/* Now we can put the pipe in direct write mode */
wpipe->pipe_map.pos = bpos;
@@ -760,8 +770,7 @@
mutex_exit(lock);
if (pgs != NULL) {
- pmap_kremove(wpipe->pipe_map.kva, blen);
- pmap_update(pmap_kernel());
+ /* XXX: uvm_emap_remove */
uvm_unloan(pgs, npages, UVM_LOAN_TOPAGE);
}
if (error || amountpipekva > maxpipekva)
Index: src/sys/sys/lwp.h
diff -u src/sys/sys/lwp.h:1.119 src/sys/sys/lwp.h:1.120
--- src/sys/sys/lwp.h:1.119 Wed May 27 12:08:35 2009
+++ src/sys/sys/lwp.h Sun Jun 28 15:18:50 2009
@@ -1,4 +1,4 @@
-/* $NetBSD: lwp.h,v 1.119 2009/05/27 12:08:35 yamt Exp $ */
+/* $NetBSD: lwp.h,v 1.120 2009/06/28 15:18:50 rmind Exp $ */
/*-
* Copyright (c) 2001, 2006, 2007, 2008, 2009 The NetBSD Foundation, Inc.
@@ -121,6 +121,7 @@
int l_sleeperr; /* !: error before unblock */
u_int l_slptime; /* l: time since last blocked */
callout_t l_timeout_ch; /* !: callout for tsleep */
+ u_int l_emap_gen; /* !: emap generation number */
/* Process level and global state, misc. */
LIST_ENTRY(lwp) l_list; /* a: entry on list of all LWPs */
Index: src/sys/sys/pipe.h
diff -u src/sys/sys/pipe.h:1.27 src/sys/sys/pipe.h:1.28
--- src/sys/sys/pipe.h:1.27 Sat Apr 11 15:46:18 2009
+++ src/sys/sys/pipe.h Sun Jun 28 15:18:50 2009
@@ -1,4 +1,4 @@
-/* $NetBSD: pipe.h,v 1.27 2009/04/11 15:46:18 christos Exp $ */
+/* $NetBSD: pipe.h,v 1.28 2009/06/28 15:18:50 rmind Exp $ */
/*
* Copyright (c) 1996 John S. Dyson
@@ -81,6 +81,7 @@
voff_t pos; /* current position within page */
int npages; /* how many pages allocated */
struct vm_page **pgs; /* pointers to the pages */
+ u_int egen; /* emap generation number */
};
/*
Index: src/sys/uvm/files.uvm
diff -u src/sys/uvm/files.uvm:1.13 src/sys/uvm/files.uvm:1.14
--- src/sys/uvm/files.uvm:1.13 Sun Mar 29 10:51:53 2009
+++ src/sys/uvm/files.uvm Sun Jun 28 15:18:50 2009
@@ -1,4 +1,4 @@
-# $NetBSD: files.uvm,v 1.13 2009/03/29 10:51:53 ad Exp $
+# $NetBSD: files.uvm,v 1.14 2009/06/28 15:18:50 rmind Exp $
#
# UVM options
@@ -17,6 +17,7 @@
file uvm/uvm_bio.c
file uvm/uvm_coredump.c coredump
file uvm/uvm_device.c
+file uvm/uvm_emap.c
file uvm/uvm_fault.c
file uvm/uvm_glue.c
file uvm/uvm_init.c
Index: src/sys/uvm/uvm.h
diff -u src/sys/uvm/uvm.h:1.55 src/sys/uvm/uvm.h:1.56
--- src/sys/uvm/uvm.h:1.55 Wed Jun 4 15:06:04 2008
+++ src/sys/uvm/uvm.h Sun Jun 28 15:18:50 2009
@@ -1,4 +1,4 @@
-/* $NetBSD: uvm.h,v 1.55 2008/06/04 15:06:04 ad Exp $ */
+/* $NetBSD: uvm.h,v 1.56 2009/06/28 15:18:50 rmind Exp $ */
/*
*
@@ -84,6 +84,7 @@
bool page_idle_zero; /* TRUE if we should try to zero
pages in the idle loop */
int pages[PGFL_NQUEUES]; /* total of pages in page_free */
+ u_int emap_gen; /* emap generation number */
};
/*
Index: src/sys/uvm/uvm_extern.h
diff -u src/sys/uvm/uvm_extern.h:1.154 src/sys/uvm/uvm_extern.h:1.155
--- src/sys/uvm/uvm_extern.h:1.154 Mon Mar 30 16:36:36 2009
+++ src/sys/uvm/uvm_extern.h Sun Jun 28 15:18:50 2009
@@ -1,4 +1,4 @@
-/* $NetBSD: uvm_extern.h,v 1.154 2009/03/30 16:36:36 yamt Exp $ */
+/* $NetBSD: uvm_extern.h,v 1.155 2009/06/28 15:18:50 rmind Exp $ */
/*
*
@@ -226,6 +226,11 @@
#define UBC_MAX_PAGES 8
/*
+ * Value representing inactive emap.
+ */
+#define UVM_EMAP_INACTIVE (0)
+
+/*
* structures
*/
@@ -567,6 +572,31 @@
int ubc_uiomove(struct uvm_object *, struct uio *, vsize_t,
int, int);
+/* uvm_emap.c */
+void uvm_emap_sysinit(void);
+#ifdef __HAVE_PMAP_EMAP
+void uvm_emap_switch(lwp_t *);
+#else
+#define uvm_emap_switch(l)
+#endif
+
+u_int uvm_emap_gen_return(void);
+void uvm_emap_update(u_int);
+
+vaddr_t uvm_emap_alloc(vsize_t, bool);
+void uvm_emap_free(vaddr_t, size_t);
+
+void uvm_emap_enter(vaddr_t, struct vm_page **, u_int);
+void uvm_emap_remove(vaddr_t, vsize_t);
+
+#ifdef __HAVE_PMAP_EMAP
+void uvm_emap_consume(u_int);
+u_int uvm_emap_produce(void);
+#else
+#define uvm_emap_consume(x)
+#define uvm_emap_produce() UVM_EMAP_INACTIVE
+#endif
+
/* uvm_fault.c */
#define uvm_fault(m, a, p) uvm_fault_internal(m, a, p, 0)
int uvm_fault_internal(struct vm_map *, vaddr_t, vm_prot_t, int);
Index: src/sys/uvm/uvm_glue.c
diff -u src/sys/uvm/uvm_glue.c:1.137 src/sys/uvm/uvm_glue.c:1.138
--- src/sys/uvm/uvm_glue.c:1.137 Thu Apr 16 00:17:19 2009
+++ src/sys/uvm/uvm_glue.c Sun Jun 28 15:18:51 2009
@@ -1,4 +1,4 @@
-/* $NetBSD: uvm_glue.c,v 1.137 2009/04/16 00:17:19 rmind Exp $ */
+/* $NetBSD: uvm_glue.c,v 1.138 2009/06/28 15:18:51 rmind Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
@@ -67,7 +67,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: uvm_glue.c,v 1.137 2009/04/16 00:17:19 rmind Exp $");
+__KERNEL_RCSID(0, "$NetBSD: uvm_glue.c,v 1.138 2009/06/28 15:18:51 rmind Exp $");
#include "opt_kgdb.h"
#include "opt_kstack.h"
@@ -264,6 +264,9 @@
* the specified entry point will be executed.
*/
cpu_lwp_fork(l1, l2, stack, stacksize, func, arg);
+
+ /* Inactive emap for new LWP. */
+ l2->l_emap_gen = UVM_EMAP_INACTIVE;
}
static int
Index: src/sys/uvm/uvm_init.c
diff -u src/sys/uvm/uvm_init.c:1.34 src/sys/uvm/uvm_init.c:1.35
--- src/sys/uvm/uvm_init.c:1.34 Sat Oct 18 03:46:22 2008
+++ src/sys/uvm/uvm_init.c Sun Jun 28 15:18:51 2009
@@ -1,4 +1,4 @@
-/* $NetBSD: uvm_init.c,v 1.34 2008/10/18 03:46:22 rmind Exp $ */
+/* $NetBSD: uvm_init.c,v 1.35 2009/06/28 15:18:51 rmind Exp $ */
/*
*
@@ -39,7 +39,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: uvm_init.c,v 1.34 2008/10/18 03:46:22 rmind Exp $");
+__KERNEL_RCSID(0, "$NetBSD: uvm_init.c,v 1.35 2009/06/28 15:18:51 rmind Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -167,6 +167,12 @@
kmem_init();
/*
+ * init emap subsystem.
+ */
+
+ uvm_emap_sysinit();
+
+ /*
* the VM system is now up! now that kmem is up we can resize the
* <obj,off> => <page> hash table for general use and enable paging
* of kernel objects.
Added files:
Index: src/sys/uvm/uvm_emap.c
diff -u /dev/null src/sys/uvm/uvm_emap.c:1.1
--- /dev/null Sun Jun 28 15:18:51 2009
+++ src/sys/uvm/uvm_emap.c Sun Jun 28 15:18:50 2009
@@ -0,0 +1,360 @@
+/* $NetBSD: uvm_emap.c,v 1.1 2009/06/28 15:18:50 rmind Exp $ */
+
+/*-
+ * Copyright (c) 2009 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Mindaugas Rasiukevicius and Andrew Doran.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * UVM ephemeral mapping interface.
+ *
+ * Generic (more expensive) stubs are implemented for architectures which
+ * do not support emap.
+ *
+ * Note that uvm_emap_update() is called from lower pmap(9) layer, while
+ * other functions call to pmap(9). Typical pattern of update in pmap:
+ *
+ * u_int gen = uvm_emap_gen_return();
+ * tlbflush();
+ * uvm_emap_update(gen);
+ *
+ * It is also used from IPI context, therefore functions must be safe.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: uvm_emap.c,v 1.1 2009/06/28 15:18:50 rmind Exp $");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+
+#include <sys/atomic.h>
+#include <sys/lwp.h>
+#include <sys/vmem.h>
+#include <sys/types.h>
+
+#include <uvm/uvm.h>
+#include <uvm/uvm_extern.h>
+
+/* XXX: Arbitrary. */
+#ifdef _LP64
+#define UVM_EMAP_SIZE (128 * 1024 * 1024) /* 128 MB */
+#else
+#define UVM_EMAP_SIZE (32 * 1024 * 1024) /* 32 MB */
+#endif
+
+/*
+ * Global emap generation number. Only element 0 of the array is used
+ * (through the uvm_emap_gen macro below); the remaining elements
+ * presumably act as padding so that the counter does not share its
+ * coherency unit with other data. Note that the array length is counted
+ * in u_int elements, not in bytes.
+ */
+static u_int _uvm_emap_gen[COHERENCY_UNIT - sizeof(u_int)]
+ __aligned(COHERENCY_UNIT);
+
+#define uvm_emap_gen (_uvm_emap_gen[0])
+
+/*
+ * Size and base address of the KVA range used for emap windows, and the
+ * vmem arena managing it; the latter two are set in uvm_emap_sysinit().
+ */
+static u_int uvm_emap_size = UVM_EMAP_SIZE;
+static vaddr_t uvm_emap_va;
+static vmem_t * uvm_emap_vmem;
+
+/*
+ * uvm_emap_sysinit: set up the emap subsystem.
+ *
+ * Rounds the configured size up to a page multiple, allocates that much
+ * kernel virtual address space, creates the "emap" vmem arena on top of
+ * it and starts the global generation counter at 1 (the first value
+ * other than UVM_EMAP_INACTIVE).  Panics if either allocation fails.
+ */
+void
+uvm_emap_sysinit(void)
+{
+	const size_t qmax = 16 * PAGE_SIZE;
+
+	uvm_emap_size = roundup(uvm_emap_size, PAGE_SIZE);
+
+	/* Reserve the address range backing all emap windows. */
+	uvm_emap_va = uvm_km_alloc(kernel_map, uvm_emap_size, 0,
+	    UVM_KMF_VAONLY | UVM_KMF_WAITVA);
+	if (uvm_emap_va == 0) {
+		panic("uvm_emap_init: KVA allocation failed");
+	}
+
+	/* Arena handing out page-granular windows from that range. */
+	uvm_emap_vmem = vmem_create("emap", uvm_emap_va, uvm_emap_size,
+	    PAGE_SIZE, NULL, NULL, NULL, qmax, VM_SLEEP, IPL_NONE);
+	if (uvm_emap_vmem == NULL) {
+		panic("uvm_emap_init: vmem creation failed");
+	}
+
+	uvm_emap_gen = 1;
+}
+
+/*
+ * uvm_emap_alloc: carve a window of 'size' bytes out of the emap arena.
+ *
+ * => 'size' must be non-zero and page-rounded (asserted).
+ * => the request is made with VM_SLEEP if 'waitok' is true, and with
+ *    VM_NOSLEEP otherwise.
+ * => returns whatever vmem_alloc() returns.
+ */
+vaddr_t
+uvm_emap_alloc(vsize_t size, bool waitok)
+{
+
+	KASSERT(size > 0);
+	KASSERT(round_page(size) == size);
+
+	if (waitok) {
+		return vmem_alloc(uvm_emap_vmem, size,
+		    VM_INSTANTFIT | VM_SLEEP);
+	}
+	return vmem_alloc(uvm_emap_vmem, size, VM_INSTANTFIT | VM_NOSLEEP);
+}
+
+/*
+ * uvm_emap_free: free a window.
+ *
+ * Returns the range [va, va + size) to the emap vmem arena.
+ *
+ * => the range must lie entirely within the emap KVA region, i.e.
+ *    [uvm_emap_va, uvm_emap_va + uvm_emap_size) (asserted).
+ */
+void
+uvm_emap_free(vaddr_t va, size_t size)
+{
+
+ KASSERT(va >= uvm_emap_va);
+ KASSERT(size <= uvm_emap_size);
+ KASSERT(va + size <= uvm_emap_va + uvm_emap_size);
+
+ vmem_free(uvm_emap_vmem, va, size);
+}
+
+#ifdef __HAVE_PMAP_EMAP
+
+/*
+ * uvm_emap_enter: map 'npages' pages from 'pgs' read-only at consecutive
+ * page-sized steps starting at 'va', without TLB flush.
+ */
+void
+uvm_emap_enter(vaddr_t va, struct vm_page **pgs, u_int npages)
+{
+	u_int i = 0;
+
+	while (i < npages) {
+		struct vm_page *pg = pgs[i];
+
+		pmap_emap_enter(va, VM_PAGE_TO_PHYS(pg), VM_PROT_READ);
+		va += PAGE_SIZE;
+		i++;
+	}
+}
+
+/*
+ * uvm_emap_remove: remove a mapping.
+ *
+ * Thin wrapper: hands the range starting at 'sva', 'len' bytes long,
+ * to pmap_emap_remove().
+ */
+void
+uvm_emap_remove(vaddr_t sva, vsize_t len)
+{
+
+ pmap_emap_remove(sva, len);
+}
+
+/*
+ * uvm_emap_gen_return: get the global generation number.
+ *
+ * The returned value is never UVM_EMAP_INACTIVE: if the counter is read
+ * as UVM_EMAP_INACTIVE, it is atomically incremented and the incremented
+ * value is returned instead.
+ *
+ * => can be called from IPI handler, therefore function must be safe.
+ */
+u_int
+uvm_emap_gen_return(void)
+{
+ u_int gen;
+
+ /* Plain read of the global counter. */
+ gen = uvm_emap_gen;
+ if (__predict_false(gen == UVM_EMAP_INACTIVE)) {
+ /*
+ * Instead of looping, just increase in our side.
+ * Other thread could race and increase it again,
+ * but without any negative effect.
+ */
+ gen = atomic_inc_uint_nv(&uvm_emap_gen);
+ }
+ KASSERT(gen != UVM_EMAP_INACTIVE);
+ return gen;
+}
+
+/*
+ * uvm_emap_switch: if the CPU is 'behind' the LWP in emap visibility,
+ * perform TLB flush and thus update the local view. Main purpose is
+ * to handle kernel preemption, while emap is in use.
+ *
+ * => called from mi_switch(), when LWP returns after block or preempt.
+ * => must be called with kernel preemption disabled (asserted).
+ */
+void
+uvm_emap_switch(lwp_t *l)
+{
+ struct uvm_cpu *ucpu;
+ u_int curgen, gen;
+
+ KASSERT(kpreempt_disabled());
+
+ /* If LWP did not use emap, then nothing to do. */
+ if (__predict_true(l->l_emap_gen == UVM_EMAP_INACTIVE)) {
+ return;
+ }
+
+ /*
+ * No need to synchronise if generation number of current CPU is
+ * not older than the number recorded in this LWP.
+ *
+ * This test assumes two's complement arithmetic and allows
+ * ~2B missed updates before it will produce bad results.
+ */
+ ucpu = curcpu()->ci_data.cpu_uvm;
+ curgen = ucpu->emap_gen;
+ gen = l->l_emap_gen;
+ if (__predict_true((signed int)(curgen - gen) >= 0)) {
+ return;
+ }
+
+ /*
+ * The CPU is behind: read the global generation first, then
+ * synchronise, then record it as this CPU's generation.
+ *
+ * See comments in uvm_emap_consume() about memory
+ * barriers and race conditions.
+ */
+ curgen = uvm_emap_gen_return();
+ pmap_emap_sync();
+ ucpu->emap_gen = curgen;
+}
+
+/*
+ * uvm_emap_consume: update the current CPU and LWP to the given generation
+ * of the emap. In a case of LWP migration to a different CPU after block
+ * or preempt, uvm_emap_switch() will synchronise.
+ *
+ * => 'gen' is a generation number, e.g. as returned by uvm_emap_produce();
+ *    UVM_EMAP_INACTIVE makes this a no-op.
+ * => may be called from both interrupt and thread context.
+ */
+void
+uvm_emap_consume(u_int gen)
+{
+ struct cpu_info *ci;
+ struct uvm_cpu *ucpu;
+ lwp_t *l = curlwp;
+ u_int curgen;
+
+ if (gen == UVM_EMAP_INACTIVE) {
+ return;
+ }
+
+ /*
+ * No need to synchronise if generation number of current CPU is
+ * not older than the requested generation; in that case only the
+ * LWP's recorded generation is brought up to the CPU's value.
+ *
+ * This test assumes two's complement arithmetic and allows
+ * ~2B missed updates before it will produce bad results.
+ */
+ KPREEMPT_DISABLE(l);
+ ci = l->l_cpu;
+ ucpu = ci->ci_data.cpu_uvm;
+ if (__predict_true((signed int)(ucpu->emap_gen - gen) >= 0)) {
+ l->l_emap_gen = ucpu->emap_gen;
+ KPREEMPT_ENABLE(l);
+ return;
+ }
+
+ /*
+ * Record the current generation _before_ issuing the TLB flush.
+ * No need for a memory barrier before, as reading a stale value
+ * for uvm_emap_gen is not a problem.
+ *
+ * pmap_emap_sync() must implicitly perform a full memory barrier,
+ * which prevents us from fetching a value from after the TLB flush
+ * has occurred (which would be bad).
+ *
+ * We can race with an interrupt on the current CPU updating the
+ * counter to a newer value. This could cause us to set a stale
+ * value into ucpu->emap_gen, overwriting a newer update from the
+ * interrupt. However, it does not matter since:
+ * (1) Interrupts always run to completion or block.
+ * (2) Interrupts will only ever install a newer value and,
+ * (3) We will roll the value forward later.
+ */
+ curgen = uvm_emap_gen_return();
+ pmap_emap_sync();
+ ucpu->emap_gen = curgen;
+ l->l_emap_gen = curgen;
+ /* The generation just installed must cover the one requested. */
+ KASSERT((signed int)(curgen - gen) >= 0);
+ KPREEMPT_ENABLE(l);
+}
+
+/*
+ * uvm_emap_produce: advance the global emap generation counter and
+ * return the new value, which is never UVM_EMAP_INACTIVE.
+ *
+ * => pmap updates must be globally visible.
+ * => caller must have already entered mappings.
+ * => may be called from both interrupt and thread context.
+ */
+u_int
+uvm_emap_produce(void)
+{
+	u_int gen;
+
+	/* Skip over the reserved "inactive" value. */
+	do {
+		gen = atomic_inc_uint_nv(&uvm_emap_gen);
+	} while (__predict_false(gen == UVM_EMAP_INACTIVE));
+
+	return gen;
+}
+
+/*
+ * uvm_emap_update: record the given emap generation number as the one
+ * seen by the current CPU.
+ *
+ * Function is called by MD code (eg. pmap) to take advantage of TLB flushes
+ * initiated for other reasons, that sync the emap as a side effect. Note
+ * that the generation number passed in should be read before the actual
+ * TLB flush, to avoid race with newly generated number.
+ *
+ * => can be called from IPI handler, therefore function must be safe.
+ * => should be called _after_ TLB flush.
+ * => emap generation number should be taken _before_ TLB flush.
+ * => must be called with preemption disabled.
+ */
+void
+uvm_emap_update(u_int gen)
+{
+ struct uvm_cpu *ucpu;
+
+ /*
+ * See comments in uvm_emap_consume() about memory barriers and
+ * race conditions. Store is atomic if emap_gen size is word;
+ * the compile-time assertion below checks that size.
+ */
+ CTASSERT(sizeof(ucpu->emap_gen) == sizeof(int));
+ /* XXX: KASSERT(kpreempt_disabled()); */
+
+ ucpu = curcpu()->ci_data.cpu_uvm;
+ ucpu->emap_gen = gen;
+}
+
+#else
+
+/*
+ * Stubs for architectures which do not support emap.
+ */
+
+/*
+ * uvm_emap_enter: stub for architectures without emap support.  Enters
+ * the pages read-only with pmap_kenter_pa() and then calls pmap_update()
+ * on the kernel pmap.
+ */
+void
+uvm_emap_enter(vaddr_t va, struct vm_page **pgs, u_int npages)
+{
+	u_int i = 0;
+
+	while (i < npages) {
+		struct vm_page *pg = pgs[i];
+
+		pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pg), VM_PROT_READ);
+		va += PAGE_SIZE;
+		i++;
+	}
+	pmap_update(pmap_kernel());
+}
+
+/*
+ * uvm_emap_remove: stub for architectures without emap support. Removes
+ * the kernel mappings of the range starting at 'sva', 'len' bytes long,
+ * with pmap_kremove() and then calls pmap_update() on the kernel pmap.
+ */
+void
+uvm_emap_remove(vaddr_t sva, vsize_t len)
+{
+
+ pmap_kremove(sva, len);
+ pmap_update(pmap_kernel());
+}
+
+#endif