Module Name:    src
Committed By:   rmind
Date:           Sun Jun 28 15:18:51 UTC 2009

Modified Files:
        src/sys/arch/amd64/amd64: vector.S
        src/sys/arch/i386/i386: vector.S
        src/sys/arch/x86/include: pmap.h
        src/sys/arch/x86/x86: pmap.c
        src/sys/kern: kern_synch.c sys_pipe.c
        src/sys/sys: lwp.h pipe.h
        src/sys/uvm: files.uvm uvm.h uvm_extern.h uvm_glue.c uvm_init.c
Added Files:
        src/sys/uvm: uvm_emap.c

Log Message:
Ephemeral mapping (emap) implementation.  The concept is based on the idea
that the activity of other threads will, as a side effect, perform the TLB
flush for processes using emap.  To track this, global and per-CPU generation
numbers are used.  This idea was suggested by Andrew Doran; various
improvements to it by me.  Notes:

- Zero-copy on pipes is not yet enabled.
- TCP socket code would likely need more work.
- Additional UVM loaning improvements are needed.

Proposed on <tech-kern>, silence there.
Quickly reviewed by <ad>.
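
Roughly, the intended usage pattern is as follows.  This is a minimal,
illustrative sketch (not part of the commit), modelled on the sys_pipe.c
direct-write path in the diff below; the function name emap_copy_example()
is invented for the example, error handling is omitted, and the producer
and consumer (separate threads in the pipe code) are shown back-to-back:

	/* Assumes kernel context; prototypes come from <uvm/uvm_extern.h>. */
	static int
	emap_copy_example(struct vm_page **pgs, u_int npages, struct uio *uio)
	{
		vsize_t len = (vsize_t)npages << PAGE_SHIFT;
		vaddr_t va;
		u_int gen;
		int error;

		/* Producer: allocate a window, enter mappings (no TLB flush yet). */
		va = uvm_emap_alloc(len, true);
		uvm_emap_enter(va, pgs, npages);
		gen = uvm_emap_produce();	/* publish a new generation number */

		/* Consumer: catch the local TLB view up to 'gen', then read. */
		uvm_emap_consume(gen);
		error = uiomove((void *)va, len, uio);

		/* Teardown; stale TLB entries are flushed lazily by other activity. */
		uvm_emap_remove(va, len);
		uvm_emap_free(va, len);
		return error;
	}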


To generate a diff of this commit:
cvs rdiff -u -r1.29 -r1.30 src/sys/arch/amd64/amd64/vector.S
cvs rdiff -u -r1.45 -r1.46 src/sys/arch/i386/i386/vector.S
cvs rdiff -u -r1.24 -r1.25 src/sys/arch/x86/include/pmap.h
cvs rdiff -u -r1.85 -r1.86 src/sys/arch/x86/x86/pmap.c
cvs rdiff -u -r1.264 -r1.265 src/sys/kern/kern_synch.c
cvs rdiff -u -r1.114 -r1.115 src/sys/kern/sys_pipe.c
cvs rdiff -u -r1.119 -r1.120 src/sys/sys/lwp.h
cvs rdiff -u -r1.27 -r1.28 src/sys/sys/pipe.h
cvs rdiff -u -r1.13 -r1.14 src/sys/uvm/files.uvm
cvs rdiff -u -r1.55 -r1.56 src/sys/uvm/uvm.h
cvs rdiff -u -r0 -r1.1 src/sys/uvm/uvm_emap.c
cvs rdiff -u -r1.154 -r1.155 src/sys/uvm/uvm_extern.h
cvs rdiff -u -r1.137 -r1.138 src/sys/uvm/uvm_glue.c
cvs rdiff -u -r1.34 -r1.35 src/sys/uvm/uvm_init.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/arch/amd64/amd64/vector.S
diff -u src/sys/arch/amd64/amd64/vector.S:1.29 src/sys/arch/amd64/amd64/vector.S:1.30
--- src/sys/arch/amd64/amd64/vector.S:1.29	Tue Nov 25 16:25:29 2008
+++ src/sys/arch/amd64/amd64/vector.S	Sun Jun 28 15:18:50 2009
@@ -1,4 +1,4 @@
-/*	$NetBSD: vector.S,v 1.29 2008/11/25 16:25:29 ad Exp $	*/
+/*	$NetBSD: vector.S,v 1.30 2009/06/28 15:18:50 rmind Exp $	*/
 
 /*-
  * Copyright (c) 1998, 2007, 2008 The NetBSD Foundation, Inc.
@@ -452,8 +452,12 @@
 0:
 	pushq	%rax
 	pushq	%rbx
-	pushq	%rcx
+	pushq	%rdi
+	pushq	%rsi
 	pushq	%rdx
+	pushq	%rcx
+	pushq	%r8
+	pushq	%r9
 	incq	CPUVAR(TLB_EVCNT)+EV_COUNT
 	/* Find out what needs to be invalidated and unlock the mailbox. */
 	movq	CPUVAR(PMAP_CPU),%rcx
@@ -489,8 +493,12 @@
 	movl	$TLBSTATE_STALE, CPUVAR(TLBSTATE)
 3:
 	/* Restore state and return. */
-	popq	%rdx
+	popq	%r9
+	popq	%r8
 	popq	%rcx
+	popq	%rdx
+	popq	%rsi
+	popq	%rdi
 	popq	%rbx
 	popq	%rax
 	testq	$SEL_UPL, 8(%rsp)
@@ -499,9 +507,17 @@
 4:
 	iretq
 5:
-	/* Invalidate all user pages. */
+	/*
+	 * Get the emap generation number.  Invalidate the user TLB entries.
+	 * Perform the emap update, passing the generation number.  Note that
+	 * caller-save registers might be modified (all are saved at the
+	 * beginning).  Only the %rbx value is used by the 2b context.
+	 */
+	callq	_C_LABEL(uvm_emap_gen_return)
+	movq	%rax, %rdi
 	movq	%cr3, %rax
 	movq	%rax, %cr3
+	callq	_C_LABEL(uvm_emap_update)
 	jmp	2b
 
 /*
@@ -510,12 +526,16 @@
 IDTVEC(intr_lapic_tlb_bcast)
 	/* Save state. */
 	pushq	%rax
-	pushq	%rbx
+	pushq	%rdi
+	pushq	%rsi
 	pushq	%rdx
+	pushq	%rcx
+	pushq	%r8
+	pushq	%r9
 	/* Find out what needs to be invalidated. */
 	movq	_C_LABEL(pmap_mbox)+MB_ADDR1, %rax
 	movq	_C_LABEL(pmap_mbox)+MB_ADDR2, %rdx
-	movq	_C_LABEL(pmap_mbox)+MB_GLOBAL, %rbx
+	movq	_C_LABEL(pmap_mbox)+MB_GLOBAL, %rdi
 	movl	$0, _C_LABEL(local_apic)+LAPIC_EOI
 	cmpq	$-1, %rax
 	je,pn	3f
@@ -529,27 +549,44 @@
 	/* Notify waiter of completion, restore state & return */
 	lock
 	incq	_C_LABEL(pmap_mbox)+MB_TAIL
+	popq	%r9
+	popq	%r8
+	popq	%rcx
 	popq	%rdx
-	popq	%rbx
+	popq	%rsi
+	popq	%rdi
 	popq	%rax
 	iretq
 3:
-	testq	%rbx, %rbx
+	testq	%rdi, %rdi
 	jz	4f
 	/*
-	 * If we have been asked to invalidate the entire TLB
-	 * we arrive here.
+	 * If we have been asked to invalidate the entire TLB we arrive here.
+	 * Get the emap generation before the flush, and use it afterwards
+	 * for the update.  Note that caller-save registers might be modified,
+	 * though none need to be preserved for the 2b context.
 	 */
+	callq	_C_LABEL(uvm_emap_gen_return)
+	movq	%rax, %rdi
 	movq	%cr4, %rax
 	movq	%rax, %rdx
 	andq	$~CR4_PGE, %rdx
 	movq	%rdx, %cr4
 	movq	%rax, %cr4
+	callq	_C_LABEL(uvm_emap_update)
 	jmp	2b
 4:
-	/* Invalidate user TLB entries. */
+	/*
+	 * Get the emap generation number.  Invalidate the user TLB entries.
+	 * Perform the emap update, passing the generation number.  Note that
+	 * caller-save registers might be modified, though none need to be
+	 * preserved for the 2b context.
+	 */
+	callq	_C_LABEL(uvm_emap_gen_return)
+	movq	%rax, %rdi
 	movq	%cr3, %rax
 	movq	%rax, %cr3
+	callq	_C_LABEL(uvm_emap_update)
 	jmp	2b
 
 #endif /* !XEN */

Index: src/sys/arch/i386/i386/vector.S
diff -u src/sys/arch/i386/i386/vector.S:1.45 src/sys/arch/i386/i386/vector.S:1.46
--- src/sys/arch/i386/i386/vector.S:1.45	Sat Mar 21 14:41:29 2009
+++ src/sys/arch/i386/i386/vector.S	Sun Jun 28 15:18:50 2009
@@ -1,4 +1,4 @@
-/*	$NetBSD: vector.S,v 1.45 2009/03/21 14:41:29 ad Exp $	*/
+/*	$NetBSD: vector.S,v 1.46 2009/06/28 15:18:50 rmind Exp $	*/
 
 /*
  * Copyright 2002 (c) Wasabi Systems, Inc.
@@ -65,7 +65,7 @@
  */
 
 #include <machine/asm.h>
-__KERNEL_RCSID(0, "$NetBSD: vector.S,v 1.45 2009/03/21 14:41:29 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: vector.S,v 1.46 2009/06/28 15:18:50 rmind Exp $");
 
 #include "opt_ddb.h"
 #include "opt_multiprocessor.h"
@@ -227,9 +227,19 @@
 	popl	%eax
 	iret
 4:
-	/* Invalidate all user pages. */
+	/*
+	 * Get the emap generation number.  Invalidate the user TLB entries.
+	 * Perform the emap update, passing the generation number.  Note that
+	 * caller-save registers might be modified (all are saved at the
+	 * beginning).  Only the %ebx value is used by the 2b context.
+	 */
+	call	_C_LABEL(uvm_emap_gen_return)
+	movl	%eax, %edx
 	movl	%cr3, %eax
 	movl	%eax, %cr3
+	pushl	%edx
+	call	_C_LABEL(uvm_emap_update)
+	addl	$4, %esp
 	jmp	2b
 IDTVEC_END(intr_lapic_tlb_mcast)
 
@@ -240,6 +250,7 @@
 	/* Save state and ack the interrupt. */
 	pushl	%eax
 	pushl	%ebx
+	pushl	%ecx
 	pushl	%edx
 	/* Find out what we need to invalidate. */
 	movl	%ss:_C_LABEL(pmap_mbox)+MB_ADDR1, %eax
@@ -259,6 +270,7 @@
 	lock
 	incl	%ss:_C_LABEL(pmap_mbox)+MB_TAIL
 	popl	%edx
+	popl	%ecx
 	popl	%ebx
 	popl	%eax
 	iret
@@ -266,19 +278,36 @@
 	testl	%ebx, %ebx
 	jz	4f
 	/*
-	 * If the CPU understands global pages and we have been asked
-	 * to invalidate the entire TLB we arrive here.
+	 * If we have been asked to invalidate the entire TLB we arrive here.
+	 * Get the emap generation before the flush, and use it afterwards
+	 * for the update.  Note that caller-save registers might be modified,
+	 * though none need to be preserved for the 2b context.
 	 */
+	call	_C_LABEL(uvm_emap_gen_return)
+	movl	%eax, %ebx
 	movl	%cr4, %eax
 	movl	%eax, %edx
 	andl	$~CR4_PGE, %edx
 	movl	%edx, %cr4
 	movl	%eax, %cr4
+	pushl	%ebx
+	call	_C_LABEL(uvm_emap_update)
+	addl	$4, %esp
 	jmp	2b
 4:
-	/* Invalidate user TLB entries. */
+	/*
+	 * Get the emap generation number.  Invalidate the user TLB entries.
+	 * Perform the emap update, passing the generation number.  Note that
+	 * caller-save registers might be modified, though none need to be
+	 * preserved for the 2b context.
+	 */
+	call	_C_LABEL(uvm_emap_gen_return)
+	movl	%eax, %ebx
 	movl	%cr3, %eax
 	movl	%eax, %cr3
+	pushl	%ebx
+	call	_C_LABEL(uvm_emap_update)
+	addl	$4, %esp
 	jmp	2b
 IDTVEC_END(intr_lapic_tlb_bcast)
 

Index: src/sys/arch/x86/include/pmap.h
diff -u src/sys/arch/x86/include/pmap.h:1.24 src/sys/arch/x86/include/pmap.h:1.25
--- src/sys/arch/x86/include/pmap.h:1.24	Wed Apr 22 10:17:48 2009
+++ src/sys/arch/x86/include/pmap.h	Sun Jun 28 15:18:50 2009
@@ -1,4 +1,4 @@
-/*	$NetBSD: pmap.h,v 1.24 2009/04/22 10:17:48 cegger Exp $	*/
+/*	$NetBSD: pmap.h,v 1.25 2009/06/28 15:18:50 rmind Exp $	*/
 
 /*
  *
@@ -224,11 +224,17 @@
 void		pmap_remove_all(struct pmap *);
 void		pmap_ldt_sync(struct pmap *);
 
+void		pmap_emap_enter(vaddr_t, paddr_t, vm_prot_t);
+void		pmap_emap_remove(vaddr_t, vsize_t);
+void		pmap_emap_sync(void);
+
 vaddr_t reserve_dumppages(vaddr_t); /* XXX: not a pmap fn */
 
 void	pmap_tlb_shootdown(pmap_t, vaddr_t, vaddr_t, pt_entry_t);
 void	pmap_tlb_shootwait(void);
 
+#define	__HAVE_PMAP_EMAP
+
 #define PMAP_GROWKERNEL		/* turn on pmap_growkernel interface */
 #define PMAP_FORK		/* turn on pmap_fork interface */
 

Index: src/sys/arch/x86/x86/pmap.c
diff -u src/sys/arch/x86/x86/pmap.c:1.85 src/sys/arch/x86/x86/pmap.c:1.86
--- src/sys/arch/x86/x86/pmap.c:1.85	Thu Apr 23 12:18:41 2009
+++ src/sys/arch/x86/x86/pmap.c	Sun Jun 28 15:18:50 2009
@@ -1,4 +1,4 @@
-/*	$NetBSD: pmap.c,v 1.85 2009/04/23 12:18:41 cegger Exp $	*/
+/*	$NetBSD: pmap.c,v 1.86 2009/06/28 15:18:50 rmind Exp $	*/
 
 /*
  * Copyright (c) 2007 Manuel Bouyer.
@@ -154,7 +154,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.85 2009/04/23 12:18:41 cegger Exp $");
+__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.86 2009/06/28 15:18:50 rmind Exp $");
 
 #include "opt_user_ldt.h"
 #include "opt_lockdebug.h"
@@ -1056,6 +1056,66 @@
 	}
 }
 
+void
+pmap_emap_enter(vaddr_t va, paddr_t pa, vm_prot_t prot)
+{
+	pt_entry_t *pte, opte, npte;
+
+	KASSERT((prot & ~VM_PROT_ALL) == 0);
+	pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va);
+
+#ifdef DOM0OPS
+	if (pa < pmap_pa_start || pa >= pmap_pa_end) {
+		npte = pa;
+	} else
+#endif
+		npte = pmap_pa2pte(pa);
+
+	npte |= protection_codes[prot] | PG_k | PG_V;
+	opte = pmap_pte_testset(pte, npte);
+}
+
+/*
+ * pmap_emap_sync: perform TLB flush or pmap load, if it was deferred.
+ */
+void
+pmap_emap_sync(void)
+{
+	struct cpu_info *ci = curcpu();
+	struct pmap *pmap;
+
+	KASSERT(kpreempt_disabled());
+	if (__predict_true(ci->ci_want_pmapload)) {
+		/*
+		 * XXX: Hint for pmap_reactivate(), which might suggest to
+		 * not perform TLB flush, if state has not changed.
+		 */
+		pmap = vm_map_pmap(&curlwp->l_proc->p_vmspace->vm_map);
+		if (__predict_false(pmap == ci->ci_pmap)) {
+			const uint32_t cpumask = ci->ci_cpumask;
+			atomic_and_32(&pmap->pm_cpus, ~cpumask);
+		}
+		pmap_load();
+		KASSERT(ci->ci_want_pmapload == 0);
+	} else {
+		tlbflush();
+	}
+
+}
+
+void
+pmap_emap_remove(vaddr_t sva, vsize_t len)
+{
+	pt_entry_t *pte, xpte;
+	vaddr_t va, eva = sva + len;
+
+	for (va = sva; va < eva; va += PAGE_SIZE) {
+		pte = (va < VM_MIN_KERNEL_ADDRESS) ? vtopte(va) : kvtopte(va);
+		xpte |= pmap_pte_testset(pte, 0);
+	}
+}
+
 #ifdef XEN
 /*
  * pmap_kenter_ma: enter a kernel mapping without R/M (pv_entry) tracking
@@ -2609,6 +2669,7 @@
 
 	if (pmap == oldpmap) {
 		if (!pmap_reactivate(pmap)) {
+			u_int gen = uvm_emap_gen_return();
 
 			/*
 			 * pmap has been changed during deactivated.
@@ -2616,6 +2677,7 @@
 			 */
 
 			tlbflush();
+			uvm_emap_update(gen);
 		}
 
 		ci->ci_want_pmapload = 0;
@@ -2732,7 +2794,11 @@
 	splx(s);
 	}
 #else /* PAE */
+	{
+	u_int gen = uvm_emap_gen_return();
 	lcr3(pcb->pcb_cr3);
+	uvm_emap_update(gen);
+	}
 #endif /* PAE */
 #endif /* XEN && x86_64 */
 
@@ -4648,10 +4714,13 @@
 		return;
 
 	if (sva == (vaddr_t)-1LL) {
-		if (pte != 0)
+		u_int gen = uvm_emap_gen_return();
+		if (pte != 0) {
 			tlbflushg();
-		else
+		} else {
 			tlbflush();
+		}
+		uvm_emap_update(gen);
 	} else {
 		do {
 			pmap_update_pg(sva);

Index: src/sys/kern/kern_synch.c
diff -u src/sys/kern/kern_synch.c:1.264 src/sys/kern/kern_synch.c:1.265
--- src/sys/kern/kern_synch.c:1.264	Thu Apr 16 21:19:23 2009
+++ src/sys/kern/kern_synch.c	Sun Jun 28 15:18:50 2009
@@ -1,4 +1,4 @@
-/*	$NetBSD: kern_synch.c,v 1.264 2009/04/16 21:19:23 ad Exp $	*/
+/*	$NetBSD: kern_synch.c,v 1.265 2009/06/28 15:18:50 rmind Exp $	*/
 
 /*-
  * Copyright (c) 1999, 2000, 2004, 2006, 2007, 2008, 2009
@@ -69,7 +69,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.264 2009/04/16 21:19:23 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.265 2009/06/28 15:18:50 rmind Exp $");
 
 #include "opt_kstack.h"
 #include "opt_perfctrs.h"
@@ -767,6 +767,8 @@
 		 * Restore VM context and IPL.
 		 */
 		pmap_activate(l);
+		uvm_emap_switch(l);
+
 		if (prevlwp != NULL) {
 			/* Normalize the count of the spin-mutexes */
 			ci->ci_mtx_count++;

Index: src/sys/kern/sys_pipe.c
diff -u src/sys/kern/sys_pipe.c:1.114 src/sys/kern/sys_pipe.c:1.115
--- src/sys/kern/sys_pipe.c:1.114	Sun Jun 28 14:34:48 2009
+++ src/sys/kern/sys_pipe.c	Sun Jun 28 15:18:50 2009
@@ -1,4 +1,4 @@
-/*	$NetBSD: sys_pipe.c,v 1.114 2009/06/28 14:34:48 rmind Exp $	*/
+/*	$NetBSD: sys_pipe.c,v 1.115 2009/06/28 15:18:50 rmind Exp $	*/
 
 /*-
  * Copyright (c) 2003, 2007, 2008, 2009 The NetBSD Foundation, Inc.
@@ -68,7 +68,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: sys_pipe.c,v 1.114 2009/06/28 14:34:48 rmind Exp $");
+__KERNEL_RCSID(0, "$NetBSD: sys_pipe.c,v 1.115 2009/06/28 15:18:50 rmind Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -94,7 +94,11 @@
 
 #include <uvm/uvm.h>
 
-/* Use this define if you want to disable *fancy* VM things. */
+/*
+ * Use this to disable direct I/O and decrease the code size:
+ * #define PIPE_NODIRECT
+ */
+
 /* XXX Disabled for now; rare hangs switching between direct/buffered */
 #define PIPE_NODIRECT
 
@@ -500,6 +504,7 @@
 			 * Direct copy, bypassing a kernel buffer.
 			 */
 			void *va;
+			u_int gen;
 
 			KASSERT(rpipe->pipe_state & PIPE_DIRECTW);
 
@@ -508,8 +513,15 @@
 				size = uio->uio_resid;
 
 			va = (char *)rpipe->pipe_map.kva + rpipe->pipe_map.pos;
+			gen = rpipe->pipe_map.egen;
 			mutex_exit(lock);
+
+			/*
+			 * Consume emap and read the data from loaned pages.
+			 */
+			uvm_emap_consume(gen);
 			error = uiomove(va, size, uio);
+
 			mutex_enter(lock);
 			if (error)
 				break;
@@ -635,6 +647,7 @@
 	vsize_t len;
 
 	len = (vsize_t)wpipe->pipe_map.npages << PAGE_SHIFT;
+	uvm_emap_remove(wpipe->pipe_map.kva, len);	/* XXX */
 	uvm_km_free(kernel_map, wpipe->pipe_map.kva, len, UVM_KMF_VAONLY);
 	wpipe->pipe_map.kva = 0;
 	atomic_add_int(&amountpipekva, -len);
@@ -656,10 +669,10 @@
 static int
 pipe_direct_write(file_t *fp, struct pipe *wpipe, struct uio *uio)
 {
-	int error, npages, j;
 	struct vm_page **pgs;
-	vaddr_t bbase, kva, base, bend;
+	vaddr_t bbase, base, bend;
 	vsize_t blen, bcnt;
+	int error, npages;
 	voff_t bpos;
 	kmutex_t *lock = wpipe->pipe_lock;
 
@@ -713,12 +726,9 @@
 		return (ENOMEM); /* so that caller fallback to ordinary write */
 	}
 
-	/* Enter the loaned pages to kva */
-	kva = wpipe->pipe_map.kva;
-	for (j = 0; j < npages; j++, kva += PAGE_SIZE) {
-		pmap_kenter_pa(kva, VM_PAGE_TO_PHYS(pgs[j]), VM_PROT_READ);
-	}
-	pmap_update(pmap_kernel());
+	/* Enter the loaned pages to KVA, produce new emap generation number. */
+	uvm_emap_enter(wpipe->pipe_map.kva, pgs, npages);
+	wpipe->pipe_map.egen = uvm_emap_produce();
 
 	/* Now we can put the pipe in direct write mode */
 	wpipe->pipe_map.pos = bpos;
@@ -760,8 +770,7 @@
 	mutex_exit(lock);
 
 	if (pgs != NULL) {
-		pmap_kremove(wpipe->pipe_map.kva, blen);
-		pmap_update(pmap_kernel());
+		/* XXX: uvm_emap_remove */
 		uvm_unloan(pgs, npages, UVM_LOAN_TOPAGE);
 	}
 	if (error || amountpipekva > maxpipekva)

Index: src/sys/sys/lwp.h
diff -u src/sys/sys/lwp.h:1.119 src/sys/sys/lwp.h:1.120
--- src/sys/sys/lwp.h:1.119	Wed May 27 12:08:35 2009
+++ src/sys/sys/lwp.h	Sun Jun 28 15:18:50 2009
@@ -1,4 +1,4 @@
-/*	$NetBSD: lwp.h,v 1.119 2009/05/27 12:08:35 yamt Exp $	*/
+/*	$NetBSD: lwp.h,v 1.120 2009/06/28 15:18:50 rmind Exp $	*/
 
 /*-
  * Copyright (c) 2001, 2006, 2007, 2008, 2009 The NetBSD Foundation, Inc.
@@ -121,6 +121,7 @@
 	int		l_sleeperr;	/* !: error before unblock */
 	u_int		l_slptime;	/* l: time since last blocked */
 	callout_t	l_timeout_ch;	/* !: callout for tsleep */
+	u_int		l_emap_gen;	/* !: emap generation number */
 
 	/* Process level and global state, misc. */
 	LIST_ENTRY(lwp)	l_list;		/* a: entry on list of all LWPs */

Index: src/sys/sys/pipe.h
diff -u src/sys/sys/pipe.h:1.27 src/sys/sys/pipe.h:1.28
--- src/sys/sys/pipe.h:1.27	Sat Apr 11 15:46:18 2009
+++ src/sys/sys/pipe.h	Sun Jun 28 15:18:50 2009
@@ -1,4 +1,4 @@
-/* $NetBSD: pipe.h,v 1.27 2009/04/11 15:46:18 christos Exp $ */
+/* $NetBSD: pipe.h,v 1.28 2009/06/28 15:18:50 rmind Exp $ */
 
 /*
  * Copyright (c) 1996 John S. Dyson
@@ -81,6 +81,7 @@
 	voff_t		pos;		/* current position within page */
 	int		npages;		/* how many pages allocated */
 	struct vm_page	**pgs;		/* pointers to the pages */
+	u_int		egen;		/* emap generation number */
 };
 
 /*

Index: src/sys/uvm/files.uvm
diff -u src/sys/uvm/files.uvm:1.13 src/sys/uvm/files.uvm:1.14
--- src/sys/uvm/files.uvm:1.13	Sun Mar 29 10:51:53 2009
+++ src/sys/uvm/files.uvm	Sun Jun 28 15:18:50 2009
@@ -1,4 +1,4 @@
-#	$NetBSD: files.uvm,v 1.13 2009/03/29 10:51:53 ad Exp $
+#	$NetBSD: files.uvm,v 1.14 2009/06/28 15:18:50 rmind Exp $
 
 #
 # UVM options
@@ -17,6 +17,7 @@
 file	uvm/uvm_bio.c
 file	uvm/uvm_coredump.c		coredump
 file	uvm/uvm_device.c
+file	uvm/uvm_emap.c
 file	uvm/uvm_fault.c
 file	uvm/uvm_glue.c
 file	uvm/uvm_init.c

Index: src/sys/uvm/uvm.h
diff -u src/sys/uvm/uvm.h:1.55 src/sys/uvm/uvm.h:1.56
--- src/sys/uvm/uvm.h:1.55	Wed Jun  4 15:06:04 2008
+++ src/sys/uvm/uvm.h	Sun Jun 28 15:18:50 2009
@@ -1,4 +1,4 @@
-/*	$NetBSD: uvm.h,v 1.55 2008/06/04 15:06:04 ad Exp $	*/
+/*	$NetBSD: uvm.h,v 1.56 2009/06/28 15:18:50 rmind Exp $	*/
 
 /*
  *
@@ -84,6 +84,7 @@
 	bool page_idle_zero;		/* TRUE if we should try to zero
 					   pages in the idle loop */
 	int pages[PGFL_NQUEUES];	/* total of pages in page_free */
+	u_int emap_gen;			/* emap generation number */
 };
 
 /*

Index: src/sys/uvm/uvm_extern.h
diff -u src/sys/uvm/uvm_extern.h:1.154 src/sys/uvm/uvm_extern.h:1.155
--- src/sys/uvm/uvm_extern.h:1.154	Mon Mar 30 16:36:36 2009
+++ src/sys/uvm/uvm_extern.h	Sun Jun 28 15:18:50 2009
@@ -1,4 +1,4 @@
-/*	$NetBSD: uvm_extern.h,v 1.154 2009/03/30 16:36:36 yamt Exp $	*/
+/*	$NetBSD: uvm_extern.h,v 1.155 2009/06/28 15:18:50 rmind Exp $	*/
 
 /*
  *
@@ -226,6 +226,11 @@
 #define	UBC_MAX_PAGES	8
 
 /*
+ * Value representing inactive emap.
+ */
+#define	UVM_EMAP_INACTIVE	(0)
+
+/*
  * structures
  */
 
@@ -567,6 +572,31 @@
 int			ubc_uiomove(struct uvm_object *, struct uio *, vsize_t,
 			    int, int);
 
+/* uvm_emap.c */
+void			uvm_emap_sysinit(void);
+#ifdef __HAVE_PMAP_EMAP
+void			uvm_emap_switch(lwp_t *);
+#else
+#define			uvm_emap_switch(l)
+#endif
+
+u_int			uvm_emap_gen_return(void);
+void			uvm_emap_update(u_int);
+
+vaddr_t			uvm_emap_alloc(vsize_t, bool);
+void			uvm_emap_free(vaddr_t, size_t);
+
+void			uvm_emap_enter(vaddr_t, struct vm_page **, u_int);
+void			uvm_emap_remove(vaddr_t, vsize_t);
+
+#ifdef __HAVE_PMAP_EMAP
+void			uvm_emap_consume(u_int);
+u_int			uvm_emap_produce(void);
+#else
+#define			uvm_emap_consume(x)
+#define			uvm_emap_produce()	UVM_EMAP_INACTIVE
+#endif
+
 /* uvm_fault.c */
 #define uvm_fault(m, a, p) uvm_fault_internal(m, a, p, 0)
 int		uvm_fault_internal(struct vm_map *, vaddr_t, vm_prot_t, int);

Index: src/sys/uvm/uvm_glue.c
diff -u src/sys/uvm/uvm_glue.c:1.137 src/sys/uvm/uvm_glue.c:1.138
--- src/sys/uvm/uvm_glue.c:1.137	Thu Apr 16 00:17:19 2009
+++ src/sys/uvm/uvm_glue.c	Sun Jun 28 15:18:51 2009
@@ -1,4 +1,4 @@
-/*	$NetBSD: uvm_glue.c,v 1.137 2009/04/16 00:17:19 rmind Exp $	*/
+/*	$NetBSD: uvm_glue.c,v 1.138 2009/06/28 15:18:51 rmind Exp $	*/
 
 /*
  * Copyright (c) 1997 Charles D. Cranor and Washington University.
@@ -67,7 +67,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: uvm_glue.c,v 1.137 2009/04/16 00:17:19 rmind Exp $");
+__KERNEL_RCSID(0, "$NetBSD: uvm_glue.c,v 1.138 2009/06/28 15:18:51 rmind Exp $");
 
 #include "opt_kgdb.h"
 #include "opt_kstack.h"
@@ -264,6 +264,9 @@
 	 * the specified entry point will be executed.
 	 */
 	cpu_lwp_fork(l1, l2, stack, stacksize, func, arg);
+
+	/* Inactive emap for new LWP. */
+	l2->l_emap_gen = UVM_EMAP_INACTIVE;
 }
 
 static int

Index: src/sys/uvm/uvm_init.c
diff -u src/sys/uvm/uvm_init.c:1.34 src/sys/uvm/uvm_init.c:1.35
--- src/sys/uvm/uvm_init.c:1.34	Sat Oct 18 03:46:22 2008
+++ src/sys/uvm/uvm_init.c	Sun Jun 28 15:18:51 2009
@@ -1,4 +1,4 @@
-/*	$NetBSD: uvm_init.c,v 1.34 2008/10/18 03:46:22 rmind Exp $	*/
+/*	$NetBSD: uvm_init.c,v 1.35 2009/06/28 15:18:51 rmind Exp $	*/
 
 /*
  *
@@ -39,7 +39,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: uvm_init.c,v 1.34 2008/10/18 03:46:22 rmind Exp $");
+__KERNEL_RCSID(0, "$NetBSD: uvm_init.c,v 1.35 2009/06/28 15:18:51 rmind Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -167,6 +167,12 @@
 	kmem_init();
 
 	/*
+	 * init emap subsystem.
+	 */
+
+	uvm_emap_sysinit();
+
+	/*
 	 * the VM system is now up!  now that kmem is up we can resize the
 	 * <obj,off> => <page> hash table for general use and enable paging
 	 * of kernel objects.

Added files:

Index: src/sys/uvm/uvm_emap.c
diff -u /dev/null src/sys/uvm/uvm_emap.c:1.1
--- /dev/null	Sun Jun 28 15:18:51 2009
+++ src/sys/uvm/uvm_emap.c	Sun Jun 28 15:18:50 2009
@@ -0,0 +1,360 @@
+/*	$NetBSD: uvm_emap.c,v 1.1 2009/06/28 15:18:50 rmind Exp $	*/
+
+/*-
+ * Copyright (c) 2009 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Mindaugas Rasiukevicius and Andrew Doran.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * UVM ephemeral mapping interface.
+ *
+ * Generic (more expensive) stubs are implemented for architectures whose
+ * pmap(9) does not provide emap support (__HAVE_PMAP_EMAP).
+ *
+ * Note that uvm_emap_update() is called from the lower pmap(9) layer, while
+ * the other functions call into pmap(9).  Typical update pattern in pmap:
+ *
+ *	u_int gen = uvm_emap_gen_return();
+ *	tlbflush();
+ *	uvm_emap_update(gen);
+ *
+ * It is also used from IPI context, therefore the functions must be safe.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: uvm_emap.c,v 1.1 2009/06/28 15:18:50 rmind Exp $");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+
+#include <sys/atomic.h>
+#include <sys/lwp.h>
+#include <sys/vmem.h>
+#include <sys/types.h>
+
+#include <uvm/uvm.h>
+#include <uvm/uvm_extern.h>
+
+/* XXX: Arbitrary. */
+#ifdef _LP64
+#define	UVM_EMAP_SIZE		(128 * 1024 * 1024)	/* 128 MB */
+#else
+#define	UVM_EMAP_SIZE		(32 * 1024 * 1024)	/*  32 MB */
+#endif
+
+static u_int		_uvm_emap_gen[COHERENCY_UNIT - sizeof(u_int)]
+    __aligned(COHERENCY_UNIT);
+
+#define	uvm_emap_gen	(_uvm_emap_gen[0])
+
+static u_int		uvm_emap_size = UVM_EMAP_SIZE;
+static vaddr_t		uvm_emap_va;
+static vmem_t *		uvm_emap_vmem;
+
+/*
+ * uvm_emap_sysinit: initialize the emap subsystem.
+ */
+void
+uvm_emap_sysinit(void)
+{
+	size_t qmax;
+
+	uvm_emap_size = roundup(uvm_emap_size, PAGE_SIZE);
+	qmax = 16 * PAGE_SIZE;
+
+	uvm_emap_va = uvm_km_alloc(kernel_map, uvm_emap_size, 0,
+	    UVM_KMF_VAONLY | UVM_KMF_WAITVA);
+	if (uvm_emap_va == 0) {
+		panic("uvm_emap_init: KVA allocation failed");
+	}
+
+	uvm_emap_vmem = vmem_create("emap", uvm_emap_va, uvm_emap_size,
+	    PAGE_SIZE, NULL, NULL, NULL, qmax, VM_SLEEP, IPL_NONE);
+	if (uvm_emap_vmem == NULL) {
+		panic("uvm_emap_init: vmem creation failed");
+	}
+
+	uvm_emap_gen = 1;
+}
+
+/*
+ * uvm_emap_alloc: allocate a window.
+ */
+vaddr_t
+uvm_emap_alloc(vsize_t size, bool waitok)
+{
+
+	KASSERT(size > 0);
+	KASSERT(round_page(size) == size);
+
+	return vmem_alloc(uvm_emap_vmem, size,
+	    VM_INSTANTFIT | (waitok ? VM_SLEEP : VM_NOSLEEP));
+}
+
+/*
+ * uvm_emap_free: free a window.
+ */
+void
+uvm_emap_free(vaddr_t va, size_t size)
+{
+
+	KASSERT(va >= uvm_emap_va);
+	KASSERT(size <= uvm_emap_size);
+	KASSERT(va + size <= uvm_emap_va + uvm_emap_size);
+
+	vmem_free(uvm_emap_vmem, va, size);
+}
+
+#ifdef __HAVE_PMAP_EMAP
+
+/*
+ * uvm_emap_enter: enter a new mapping, without TLB flush.
+ */
+void
+uvm_emap_enter(vaddr_t va, struct vm_page **pgs, u_int npages)
+{
+	paddr_t pa;
+	u_int n;
+
+	for (n = 0; n < npages; n++, va += PAGE_SIZE) {
+		pa = VM_PAGE_TO_PHYS(pgs[n]);
+		pmap_emap_enter(va, pa, VM_PROT_READ);
+	}
+}
+
+/*
+ * uvm_emap_remove: remove a mapping.
+ */
+void
+uvm_emap_remove(vaddr_t sva, vsize_t len)
+{
+
+	pmap_emap_remove(sva, len);
+}
+
+/*
+ * uvm_emap_gen_return: get the global generation number.
+ *
+ * => can be called from IPI handler, therefore function must be safe.
+ */
+u_int
+uvm_emap_gen_return(void)
+{
+	u_int gen;
+
+	gen = uvm_emap_gen;
+	if (__predict_false(gen == UVM_EMAP_INACTIVE)) {
+		/*
+		 * Instead of looping, just increment it on our side.
+		 * Another thread could race and increment it again,
+		 * but without any negative effect.
+		 */
+		gen = atomic_inc_uint_nv(&uvm_emap_gen);
+	}
+	KASSERT(gen != UVM_EMAP_INACTIVE);
+	return gen;
+}
+
+/*
+ * uvm_emap_switch: if the CPU is 'behind' the LWP in emap visibility,
+ * perform TLB flush and thus update the local view.  Main purpose is
+ * to handle kernel preemption, while emap is in use.
+ *
+ * => called from mi_switch(), when LWP returns after block or preempt.
+ */
+void
+uvm_emap_switch(lwp_t *l)
+{
+	struct uvm_cpu *ucpu;
+	u_int curgen, gen;
+
+	KASSERT(kpreempt_disabled());
+
+	/* If LWP did not use emap, then nothing to do. */
+	if (__predict_true(l->l_emap_gen == UVM_EMAP_INACTIVE)) {
+		return;
+	}
+
+	/*
+	 * No need to synchronise if generation number of current CPU is
+	 * newer than the number of this LWP.
+	 *
+	 * This test assumes two's complement arithmetic and allows
+	 * ~2B missed updates before it will produce bad results.
+	 */
+	ucpu = curcpu()->ci_data.cpu_uvm;
+	curgen = ucpu->emap_gen;
+	gen = l->l_emap_gen;
+	if (__predict_true((signed int)(curgen - gen) >= 0)) {
+		return;
+	}
+
+	/*
+	 * See comments in uvm_emap_consume() about memory
+	 * barriers and race conditions.
+	 */
+	curgen = uvm_emap_gen_return();
+	pmap_emap_sync();
+	ucpu->emap_gen = curgen;
+}
+
+/*
+ * uvm_emap_consume: update the current CPU and LWP to the given generation
+ * of the emap.  In the case of LWP migration to a different CPU after a block
+ * or preemption, uvm_emap_switch() will synchronise.
+ *
+ * => may be called from both interrupt and thread context.
+ */
+void
+uvm_emap_consume(u_int gen)
+{
+	struct cpu_info *ci;
+	struct uvm_cpu *ucpu;
+	lwp_t *l = curlwp;
+	u_int curgen;
+
+	if (gen == UVM_EMAP_INACTIVE) {
+		return;
+	}
+
+	/*
+	 * No need to synchronise if generation number of current CPU is
+	 * newer than the number of this LWP.
+	 *
+	 * This test assumes two's complement arithmetic and allows
+	 * ~2B missed updates before it will produce bad results.
+	 */
+	KPREEMPT_DISABLE(l);
+	ci = l->l_cpu;
+	ucpu = ci->ci_data.cpu_uvm;
+	if (__predict_true((signed int)(ucpu->emap_gen - gen) >= 0)) {
+		l->l_emap_gen = ucpu->emap_gen;
+		KPREEMPT_ENABLE(l);
+		return;
+	}
+
+	/*
+	 * Record the current generation _before_ issuing the TLB flush.
+	 * No need for a memory barrier before, as reading a stale value
+	 * for uvm_emap_gen is not a problem.
+	 *
+	 * pmap_emap_sync() must implicitly perform a full memory barrier,
+	 * which prevents us from fetching a value from after the TLB flush
+	 * has occurred (which would be bad).
+	 *
+	 * We can race with an interrupt on the current CPU updating the
+	 * counter to a newer value.  This could cause us to set a stale
+	 * value into ucpu->emap_gen, overwriting a newer update from the
+	 * interrupt.  However, it does not matter since:
+	 *  (1) Interrupts always run to completion or block.
+	 *  (2) Interrupts will only ever install a newer value and,
+	 *  (3) We will roll the value forward later.
+	 */
+	curgen = uvm_emap_gen_return();
+	pmap_emap_sync();
+	ucpu->emap_gen = curgen;
+	l->l_emap_gen = curgen;
+	KASSERT((signed int)(curgen - gen) >= 0);
+	KPREEMPT_ENABLE(l);
+}
+
+/*
+ * uvm_emap_produce: increment emap generation counter.
+ *
+ * => pmap updates must be globally visible.
+ * => caller must have already entered mappings.
+ * => may be called from both interrupt and thread context.
+ */
+u_int
+uvm_emap_produce(void)
+{
+	u_int gen;
+again:
+	gen = atomic_inc_uint_nv(&uvm_emap_gen);
+	if (__predict_false(gen == UVM_EMAP_INACTIVE)) {
+		goto again;
+	}
+	return gen;
+}
+
+/*
+ * uvm_emap_update: update global emap generation number for current CPU.
+ *
+ * This function is called by MD code (e.g. pmap) to take advantage of TLB
+ * flushes initiated for other reasons, which sync the emap as a side effect.
+ * Note that the generation number must be taken before the actual TLB flush,
+ * to avoid racing with a newly produced number.
+ *
+ * => can be called from IPI handler, therefore function must be safe.
+ * => should be called _after_ TLB flush.
+ * => emap generation number should be taken _before_ TLB flush.
+ * => must be called with preemption disabled.
+ */
+void
+uvm_emap_update(u_int gen)
+{
+	struct uvm_cpu *ucpu;
+
+	/*
+	 * See comments in uvm_emap_consume() about memory barriers and
+	 * race conditions.  Store is atomic if emap_gen size is word.
+	 */
+	CTASSERT(sizeof(ucpu->emap_gen) == sizeof(int));
+	/* XXX: KASSERT(kpreempt_disabled()); */
+
+	ucpu = curcpu()->ci_data.cpu_uvm;
+	ucpu->emap_gen = gen;
+}
+
+#else
+
+/*
+ * Stubs for architectures which do not support emap.
+ */
+
+void
+uvm_emap_enter(vaddr_t va, struct vm_page **pgs, u_int npages)
+{
+	paddr_t pa;
+	u_int n;
+
+	for (n = 0; n < npages; n++, va += PAGE_SIZE) {
+		pa = VM_PAGE_TO_PHYS(pgs[n]);
+		pmap_kenter_pa(va, pa, VM_PROT_READ);
+	}
+	pmap_update(pmap_kernel());
+}
+
+void
+uvm_emap_remove(vaddr_t sva, vsize_t len)
+{
+
+	pmap_kremove(sva, len);
+	pmap_update(pmap_kernel());
+}
+
+#endif

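For reference, the machine-dependent half of the protocol used in the pmap.c
and vector.S changes above can be summarised by the following sketch.  It is
illustrative only: md_tlb_flush_all() is a hypothetical stand-in for the
architecture's actual TLB flush primitive (e.g. reloading %cr3):

	/*
	 * Take the generation number before the TLB flush, perform the
	 * flush, then report it to emap.  Runs with preemption disabled,
	 * e.g. from a TLB shootdown IPI handler.
	 */
	static void
	md_tlb_shootdown_example(void)
	{
		u_int gen;

		gen = uvm_emap_gen_return();	/* taken before the flush */
		md_tlb_flush_all();		/* hypothetical MD flush primitive */
		uvm_emap_update(gen);		/* reported after the flush */
	}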