Module Name:    src
Committed By:   jym
Date:           Sat Jul 24 00:45:57 UTC 2010

Modified Files:
        src/sys/arch/i386/conf: GENERIC
        src/sys/arch/i386/i386: bioscall.S kvm86call.S locore.S machdep.c
            mptramp.S multiboot.c
        src/sys/arch/i386/include: pmap.h
        src/sys/arch/x86/include: cpu.h pmap.h
        src/sys/arch/x86/x86: cpu.c pmap.c
        src/sys/arch/xen/x86: cpu.c x86_xpmap.c xenfunc.c

Log Message:
Welcome PAE inside i386 current.

This patch is inspired by work previously done by Jeremy Morse, ported by me
to -current, merged with the work previously done for port-xen, together with
additional fixes and improvements.

PAE option is disabled by default in GENERIC (but will be enabled in ALL in
the next few days).

In short, PAE switches the CPU to a mode where physical addresses become
36 bits wide (64 GiB). Virtual address space remains at 32 bits (4 GiB). To
cope with the increased size of physical addresses, they are manipulated as
64-bit variables by the kernel and the MMU.

When supported by the CPU, it also allows the use of the NX/XD bit that
provides no-execution right enforcement on a per physical page basis.

Notes:

- reworked locore.S

- introduce cpu_load_pmap(), used to switch pmap for the curcpu. Due to the
different handling of pmap mappings with PAE vs !PAE, Xen vs native, details
are hidden within this function. This helps calling it from assembly,
as some features, like BIOS calls, switch to pmap_kernel before mapping
trampoline code in low memory.

- some changes in bioscall and kvm86_call, to reflect the above.

- the L3 is "pinned" per-CPU, and is only manipulated by a
reduced set of functions within pmap. To track the L3, I added two
elements to struct cpu_info, namely ci_pae_l3_pdirpa (PA of the L3), and
ci_pae_l3_pdir (the L3 VA). The rest of the code considers that it runs "just
like" a normal i386, except that the L2 is 4 pages long (PTP_LEVELS is
still 2).

- similar to the ci_pae_l3_pdir{,pa} variables, amd64's xen_current_user_pgd
becomes an element of cpu_info (slowly paving the way for MP world).

- bootinfo_source struct declaration is modified, to cope with paddr_t size
change with PAE (it is not correct to assume that bs_addrs is a paddr_t array
when compiled with PAE - its entries should remain 32 bits). bs_addrs is now a
void * array (in bootloader's code under i386/stand/, the bs_addrs
is a physaddr_t, which is an unsigned long).

- fixes in multiboot code (same reason as bootinfo): paddr_t size
change. I used Elf32_* types, used RELOC() where necessary, and moved the
memcpy() calls out of the if/else if (I do not expect sym and str tables
to overlap with ELF).

- 64 bits atomic functions for pmap

- all pm_pdirpa accesses are now done through the pmap_pdirpa macro. It
hides the L3/L2 stuff from PAE, as well as the pm_pdirpa change in
struct pmap (it now becomes a PDP_SIZE array, with or without PAE).

- manipulation of recursive mappings ( PDIR_SLOT_{,A}PTEs ) is done via
loops on PDP_SIZE.

See also http://mail-index.netbsd.org/port-i386/2010/07/17/msg002062.html

No objection raised on port-i386@ and port-xen@ for about a week.

XXX kvm(3) will be fixed in another patch to properly handle both PAE and !PAE
kernel dumps (VA => PA macros are slightly different, and need proper 64 bits
PA support in kvm_i386).

XXX Mixing PAE and !PAE modules may lead to unwanted/unexpected results. This
cannot be solved easily, and needs lots of thinking before being declared
safe (paddr_t/bus_addr_t size handling, PD/PT macros abstractions).


To generate a diff of this commit:
cvs rdiff -u -r1.988 -r1.989 src/sys/arch/i386/conf/GENERIC
cvs rdiff -u -r1.8 -r1.9 src/sys/arch/i386/i386/bioscall.S
cvs rdiff -u -r1.9 -r1.10 src/sys/arch/i386/i386/kvm86call.S
cvs rdiff -u -r1.92 -r1.93 src/sys/arch/i386/i386/locore.S
cvs rdiff -u -r1.690 -r1.691 src/sys/arch/i386/i386/machdep.c
cvs rdiff -u -r1.20 -r1.21 src/sys/arch/i386/i386/mptramp.S
cvs rdiff -u -r1.19 -r1.20 src/sys/arch/i386/i386/multiboot.c
cvs rdiff -u -r1.106 -r1.107 src/sys/arch/i386/include/pmap.h
cvs rdiff -u -r1.22 -r1.23 src/sys/arch/x86/include/cpu.h
cvs rdiff -u -r1.32 -r1.33 src/sys/arch/x86/include/pmap.h
cvs rdiff -u -r1.72 -r1.73 src/sys/arch/x86/x86/cpu.c
cvs rdiff -u -r1.112 -r1.113 src/sys/arch/x86/x86/pmap.c
cvs rdiff -u -r1.46 -r1.47 src/sys/arch/xen/x86/cpu.c
cvs rdiff -u -r1.20 -r1.21 src/sys/arch/xen/x86/x86_xpmap.c
cvs rdiff -u -r1.10 -r1.11 src/sys/arch/xen/x86/xenfunc.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/arch/i386/conf/GENERIC
diff -u src/sys/arch/i386/conf/GENERIC:1.988 src/sys/arch/i386/conf/GENERIC:1.989
--- src/sys/arch/i386/conf/GENERIC:1.988	Fri Jul 23 00:43:20 2010
+++ src/sys/arch/i386/conf/GENERIC	Sat Jul 24 00:45:54 2010
@@ -1,4 +1,4 @@
-# $NetBSD: GENERIC,v 1.988 2010/07/23 00:43:20 jakllsch Exp $
+# $NetBSD: GENERIC,v 1.989 2010/07/24 00:45:54 jym Exp $
 #
 # GENERIC machine description file
 #
@@ -22,7 +22,7 @@
 
 options 	INCLUDE_CONFIG_FILE	# embed config file in kernel binary
 
-#ident 		"GENERIC-$Revision: 1.988 $"
+#ident 		"GENERIC-$Revision: 1.989 $"
 
 maxusers	64		# estimated number of users
 
@@ -35,6 +35,7 @@
 # CPU-related options.
 options 	VM86		# virtual 8086 emulation
 options 	USER_LDT	# user-settable LDT; used by WINE
+#options 	PAE		# PAE mode (36 bits physical addressing)
 
 # Enhanced SpeedStep Technology in the Pentium M
 options 	ENHANCED_SPEEDSTEP

Index: src/sys/arch/i386/i386/bioscall.S
diff -u src/sys/arch/i386/i386/bioscall.S:1.8 src/sys/arch/i386/i386/bioscall.S:1.9
--- src/sys/arch/i386/i386/bioscall.S:1.8	Mon Apr 28 20:23:24 2008
+++ src/sys/arch/i386/i386/bioscall.S	Sat Jul 24 00:45:54 2010
@@ -1,4 +1,4 @@
-/*	$NetBSD: bioscall.S,v 1.8 2008/04/28 20:23:24 martin Exp $ */
+/*	$NetBSD: bioscall.S,v 1.9 2010/07/24 00:45:54 jym Exp $ */
 
 /*-
  * Copyright (c) 1997 The NetBSD Foundation, Inc.
@@ -30,7 +30,7 @@
  */
 
 #include <machine/asm.h>
-__KERNEL_RCSID(0, "$NetBSD: bioscall.S,v 1.8 2008/04/28 20:23:24 martin Exp $");
+__KERNEL_RCSID(0, "$NetBSD: bioscall.S,v 1.9 2010/07/24 00:45:54 jym Exp $");
 
 #include <machine/bioscall.h>
 
@@ -39,8 +39,6 @@
 /* LINTSTUB: include <sys/types.h> */
 /* LINTSTUB: include <machine/bioscall.h> */
 
-	.globl	_C_LABEL(PDPpaddr)	/* from locore.S */
-
 	.section ".rodata"
 _C_LABEL(biostramp_image):
 	.globl	_C_LABEL(biostramp_image)
@@ -69,11 +67,11 @@
 	pushl	%ebp
 	movl	%esp,%ebp		/* set up frame ptr */
 
-	movl	%cr3,%eax		/* save PDP base register */
+	/* install lwp0 pmap */
+	movl	_C_LABEL(kernel_pmap_ptr),%eax
 	pushl	%eax
-
-	movl	_C_LABEL(PDPpaddr),%eax	/* install proc0 PDP */
-	movl	%eax,%cr3
+	call	_C_LABEL(cpu_load_pmap)
+	addl	$4,%esp
 
 	movl	$(BIOSTRAMP_BASE),%eax	/* address of trampoline area */
 	pushl	12(%ebp)
@@ -81,8 +79,11 @@
 	call	*%eax			/* machdep.c initializes it */
 	addl	$8,%esp			/* clear args from stack */
 
-	popl	%eax
-	movl	%eax,%cr3			/* restore PTDB register */
+	/* restore pmap - saved value is in curcpu()->ci_pmap */
+	movl	%fs:(CPU_INFO_PMAP),%eax
+	pushl	%eax
+	call	_C_LABEL(cpu_load_pmap)
+	addl	$4,%esp
 
 	leave
 	ret

Index: src/sys/arch/i386/i386/kvm86call.S
diff -u src/sys/arch/i386/i386/kvm86call.S:1.9 src/sys/arch/i386/i386/kvm86call.S:1.10
--- src/sys/arch/i386/i386/kvm86call.S:1.9	Fri Jan  4 15:55:31 2008
+++ src/sys/arch/i386/i386/kvm86call.S	Sat Jul 24 00:45:54 2010
@@ -1,4 +1,4 @@
-/* $NetBSD: kvm86call.S,v 1.9 2008/01/04 15:55:31 yamt Exp $ */
+/* $NetBSD: kvm86call.S,v 1.10 2010/07/24 00:45:54 jym Exp $ */
 
 /*-
  * Copyright (c) 1998 Jonathan Lemon
@@ -34,7 +34,7 @@
 
 #include "assym.h"
 
-__KERNEL_RCSID(0, "$NetBSD: kvm86call.S,v 1.9 2008/01/04 15:55:31 yamt Exp $");
+__KERNEL_RCSID(0, "$NetBSD: kvm86call.S,v 1.10 2010/07/24 00:45:54 jym Exp $");
 
 	.data
 	.align 4
@@ -79,10 +79,7 @@
 	andl	$~0x0200,4(%eax,%edi,1)	/* reset "task busy" */
 	ltr	%di
 
-	movl	%cr3,%eax
-	pushl	%eax			/* save address space */
-	movl	PDPpaddr,%ecx
-	movl	%ecx,%ebx
+	movl	_C_LABEL(PDPpaddr),%ebx
 	addl	$KERNBASE,%ebx		/* va of Idle PDP */
 	movl	0(%ebx),%eax
 	pushl	%eax			/* old pde */
@@ -93,7 +90,12 @@
 	movl	vm86newptd,%eax		/* mapping for vm86 page table */
 	movl	%eax,0(%ebx)		/* ... install as PDP entry 0 */
 
-	movl	%ecx,%cr3		/* new page tables */
+	/* install Idle pmap (lwp0 pmap) */
+	movl	_C_LABEL(kernel_pmap_ptr),%eax
+	pushl	%eax
+	call	_C_LABEL(cpu_load_pmap)
+	addl	$4,%esp
+
 	movl	vm86frame,%esp		/* switch to new stack */
 
 	movl	$1,kvm86_incall		/* set flag for trap() */
@@ -129,8 +131,12 @@
 	popl	%ebx			/* saved va of Idle PDP */
 	popl	%eax
 	movl	%eax,0(%ebx)		/* restore old pde */
-	popl	%eax
-	movl	%eax,%cr3		/* install old page table */
+
+	/* restore pmap - saved value is in curcpu()->ci_pmap */
+	movl	%fs:(CPU_INFO_PMAP),%eax
+	pushl	%eax
+	call	_C_LABEL(cpu_load_pmap)
+	addl	$4,%esp
 
 	movl	$0,kvm86_incall		/* reset trapflag */
 

Index: src/sys/arch/i386/i386/locore.S
diff -u src/sys/arch/i386/i386/locore.S:1.92 src/sys/arch/i386/i386/locore.S:1.93
--- src/sys/arch/i386/i386/locore.S:1.92	Thu Jul 15 18:55:27 2010
+++ src/sys/arch/i386/i386/locore.S	Sat Jul 24 00:45:54 2010
@@ -1,4 +1,4 @@
-/*	$NetBSD: locore.S,v 1.92 2010/07/15 18:55:27 jym Exp $	*/
+/*	$NetBSD: locore.S,v 1.93 2010/07/24 00:45:54 jym Exp $	*/
 
 /*
  * Copyright-o-rama!
@@ -129,7 +129,7 @@
  */
 
 #include <machine/asm.h>
-__KERNEL_RCSID(0, "$NetBSD: locore.S,v 1.92 2010/07/15 18:55:27 jym Exp $");
+__KERNEL_RCSID(0, "$NetBSD: locore.S,v 1.93 2010/07/24 00:45:54 jym Exp $");
 
 #include "opt_compat_oldboot.h"
 #include "opt_ddb.h"
@@ -482,29 +482,43 @@
 	movl	$_RELOC(tmpstk),%esp	# bootstrap stack end location
 
 /*
- * Virtual address space of kernel:
+ * Virtual address space of kernel, without PAE. The page dir is 1 page long.
  *
  * text | data | bss | [syms] | [blobs] | page dir | proc0 kstack | L1 ptp
  *			                0          1       2      3
+ *
+ * Virtual address space of kernel, with PAE. We need 4 pages for the page dir
+ * and 1 page for the L3.
+ * text | data | bss | [syms] | [blobs] | L3 | page dir | proc0 kstack | L1 ptp
+ * 					0    1          5       6      7
  */
-
+#ifndef PAE
 #define	PROC0_PDIR_OFF	0
-#define	PROC0_STK_OFF	(PROC0_PDIR_OFF + PAGE_SIZE)
+#else
+#define PROC0_L3_OFF	0
+#define PROC0_PDIR_OFF	1 * PAGE_SIZE
+#endif
+
+#define	PROC0_STK_OFF	(PROC0_PDIR_OFF + PDP_SIZE * PAGE_SIZE)
 #define	PROC0_PTP1_OFF	(PROC0_STK_OFF + UPAGES * PAGE_SIZE)
 
 /*
- * fillkpt
+ * fillkpt - Fill in a kernel page table
  *	eax = pte (page frame | control | status)
  *	ebx = page table address
  *	ecx = number of pages to map
+ * 
+ * For PAE, each entry is 8 bytes long: we must set the 4 upper bytes to 0.
+ * This is done by the first instruction of fillkpt. In the non-PAE case, this
+ * instruction just clears the page table entry.
  */
 
 #define fillkpt	\
-1:	movl	%eax,(%ebx)	; 	/* store phys addr */ \
-	addl	$4,%ebx		; 	/* next pte/pde */ \
-	addl	$PAGE_SIZE,%eax	; 	/* next phys page */ \
-	loop	1b		;  \
-
+1:	movl	$0,(PDE_SIZE-4)(%ebx)	;	/* clear bits */	\
+	movl	%eax,(%ebx)		;	/* store phys addr */   \
+	addl	$PDE_SIZE,%ebx		;	/* next pte/pde */      \
+	addl	$PAGE_SIZE,%eax		;	/* next phys page */    \
+	loop	1b			;
 
 	/* Find end of kernel image. */
 	movl	$RELOC(end),%edi
@@ -538,9 +552,14 @@
 	incl	%eax		/* one more ptp for VAs stolen by bootstrap */
 1:	movl	%eax,RELOC(nkptp)+1*4
 
-	/* tablesize = (1 + UPAGES + nkptp) << PGSHIFT; */
-	addl	$(1+UPAGES),%eax
+	/* tablesize = (PDP_SIZE + UPAGES + nkptp) << PGSHIFT; */
+	addl	$(PDP_SIZE+UPAGES),%eax
+#ifdef PAE
+	incl	%eax 		/* one more page for the L3 PD */
+	shll	$PGSHIFT+1,%eax	/* PTP tables are twice larger with PAE */
+#else
 	shll	$PGSHIFT,%eax
+#endif
 	movl	%eax,RELOC(tablesize)
 
 	/* ensure that nkptp covers bootstrap tables */
@@ -578,7 +597,10 @@
  	 */
 	movl	$_RELOC(KERNTEXTOFF),%eax
 	movl	%eax,%ecx
- 	shrl	$(PGSHIFT-2),%ecx	/* ((n >> PGSHIFT) << 2) for # pdes */
+	shrl	$(PGSHIFT-2),%ecx	/* ((n >> PGSHIFT) << 2) for # pdes */
+#ifdef PAE
+ 	shll	$1,%ecx			/* pdes are twice larger with PAE */
+#endif
  	addl	%ecx,%ebx
 
 	/* Map the kernel text read-only. */
@@ -605,36 +627,51 @@
  * Construct a page table directory.
  */
  	/* Set up top level entries for identity mapping */
- 	leal    (PROC0_PDIR_OFF)(%esi),%ebx
+ 	leal	(PROC0_PDIR_OFF)(%esi),%ebx
  	leal	(PROC0_PTP1_OFF)(%esi),%eax
  	orl	$(PG_V|PG_KW), %eax
  	movl	RELOC(nkptp)+1*4,%ecx
 	fillkpt
 
  	/* Set up top level entries for actual kernel mapping */
- 	leal    (PROC0_PDIR_OFF + L2_SLOT_KERNBASE*4)(%esi),%ebx
+ 	leal	(PROC0_PDIR_OFF + L2_SLOT_KERNBASE*PDE_SIZE)(%esi),%ebx
  	leal	(PROC0_PTP1_OFF)(%esi),%eax
  	orl	$(PG_V|PG_KW), %eax
  	movl	RELOC(nkptp)+1*4,%ecx
 	fillkpt
 
 	/* Install a PDE recursively mapping page directory as a page table! */
- 	leal    (PROC0_PDIR_OFF + PDIR_SLOT_PTE*4)(%esi),%ebx
- 	leal    (PROC0_PDIR_OFF)(%esi),%eax
+ 	leal	(PROC0_PDIR_OFF + PDIR_SLOT_PTE*PDE_SIZE)(%esi),%ebx
+ 	leal	(PROC0_PDIR_OFF)(%esi),%eax
  	orl	$(PG_V|PG_KW),%eax
- 	movl	%eax,(%ebx)
- 
+	movl	$PDP_SIZE,%ecx
+	fillkpt
+
+#ifdef PAE
+	/* Fill in proc0 L3 page with entries pointing to the page dirs */
+	leal	(PROC0_L3_OFF)(%esi),%ebx
+	leal	(PROC0_PDIR_OFF)(%esi),%eax
+	orl	$(PG_V),%eax
+	movl	$PDP_SIZE,%ecx
+	fillkpt
+
+	/* Enable PAE mode */
+	movl	%cr4,%eax
+	orl	$CR4_PAE,%eax
+	movl	%eax,%cr4
+#endif
 
 	/* Save phys. addr of PDP, for libkvm. */
-	movl	%esi,RELOC(PDPpaddr)
+	leal	(PROC0_PDIR_OFF)(%esi),%eax
+	movl	%eax,RELOC(PDPpaddr)
 
- 	/*
- 	 * Startup checklist:
- 	 * 1. Load %cr3 with pointer to PDIR.
- 	 */
+	/*
+	 * Startup checklist:
+	 * 1. Load %cr3 with pointer to PDIR (or L3 PD page for PAE).
+	 */
 	movl	%esi,%eax		# phys address of ptd in proc 0
 	movl	%eax,%cr3		# load ptd addr into mmu
- 
+
  	/*
  	 * 2. Enable paging and the rest of it.
  	 */
@@ -653,10 +690,11 @@
  	 * memory, remove it.
  	 */
  	movl	_C_LABEL(nkptp)+1*4,%ecx
- 	leal	(PROC0_PDIR_OFF)(%esi),%ebx	# old, phys  address of PDIR
- 	addl	$(KERNBASE), %ebx	# new, virtual address of PDIR
-1:	movl	$0,(%ebx)
- 	addl	$4,%ebx
+ 	leal	(PROC0_PDIR_OFF)(%esi),%ebx	# old, phys address of PDIR
+ 	addl	$(KERNBASE), %ebx		# new, virtual address of PDIR
+1:	movl	$0,(PDE_SIZE-4)(%ebx)		# Upper bits (for PAE)
+	movl	$0,(%ebx)
+ 	addl	$PDE_SIZE,%ebx
 	loop	1b
 
 	/* Relocate atdevbase. */
@@ -688,9 +726,13 @@
  	movl	_C_LABEL(tablesize),%eax
 	addl	%esi,%eax		# skip past stack and page tables
 
+#ifdef PAE
+	pushl	$0	# init386() expects a 64 bits paddr_t with PAE
+#endif
 	pushl	%eax
 	call	_C_LABEL(init386)	# wire 386 chip for unix operation
-	addl	$4+NGDT*8,%esp		# pop temporary gdt
+	addl	$PDE_SIZE,%esp		# pop paddr_t
+	addl	$NGDT*8,%esp		# pop temporary gdt
 
 #ifdef SAFARI_FIFO_HACK
 	movb	$5,%al
@@ -765,7 +807,7 @@
 #endif
 	pushl	%esi
 	call	_C_LABEL(init386)	# wire 386 chip for unix operation
-	addl	$PDE_SIZE,%esp
+	addl	$PDE_SIZE,%esp		# pop paddr_t
 	call 	_C_LABEL(main)
 
 #if defined(XEN) && !defined(XEN_COMPAT_030001)

Index: src/sys/arch/i386/i386/machdep.c
diff -u src/sys/arch/i386/i386/machdep.c:1.690 src/sys/arch/i386/i386/machdep.c:1.691
--- src/sys/arch/i386/i386/machdep.c:1.690	Thu Jul 15 23:20:34 2010
+++ src/sys/arch/i386/i386/machdep.c	Sat Jul 24 00:45:54 2010
@@ -1,4 +1,4 @@
-/*	$NetBSD: machdep.c,v 1.690 2010/07/15 23:20:34 jym Exp $	*/
+/*	$NetBSD: machdep.c,v 1.691 2010/07/24 00:45:54 jym Exp $	*/
 
 /*-
  * Copyright (c) 1996, 1997, 1998, 2000, 2004, 2006, 2008, 2009
@@ -67,7 +67,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: machdep.c,v 1.690 2010/07/15 23:20:34 jym Exp $");
+__KERNEL_RCSID(0, "$NetBSD: machdep.c,v 1.691 2010/07/24 00:45:54 jym Exp $");
 
 #include "opt_beep.h"
 #include "opt_compat_ibcs2.h"
@@ -320,7 +320,7 @@
  * boot loader.  Only be used by native_loader(). */
 struct bootinfo_source {
 	uint32_t bs_naddrs;
-	paddr_t bs_addrs[1]; /* Actually longer. */
+	void *bs_addrs[1]; /* Actually longer. */
 };
 
 /* Only called by locore.h; no need to be in a header file. */
@@ -384,10 +384,10 @@
 		for (i = 0; i < bl_bootinfo->bs_naddrs; i++) {
 			struct btinfo_common *bc;
 
-			bc = (struct btinfo_common *)(bl_bootinfo->bs_addrs[i]);
+			bc = bl_bootinfo->bs_addrs[i];
 
-			if ((paddr_t)(data + bc->len) >
-			    (paddr_t)(&bidest->bi_data[0] + BOOTINFO_MAXSIZE))
+			if ((data + bc->len) >
+			    (&bidest->bi_data[0] + BOOTINFO_MAXSIZE))
 				break;
 
 			memcpy(data, bc, bc->len);
@@ -1312,6 +1312,14 @@
 	    (void *)atdevbase));
 #endif
 
+#if defined(PAE) && !defined(XEN)
+	/*
+	 * Save VA and PA of L3 PD of boot processor (for Xen, this is done
+	 * in xen_pmap_bootstrap())
+	 */
+	cpu_info_primary.ci_pae_l3_pdirpa = rcr3();
+	cpu_info_primary.ci_pae_l3_pdir = (pd_entry_t *)(rcr3() + KERNBASE);
+#endif /* PAE && !XEN */
 
 #ifdef XBOX
 	/*
@@ -1457,6 +1465,9 @@
 		       VM_PROT_ALL, 0);		/* protection */
 	pmap_update(pmap_kernel());
 	memcpy((void *)BIOSTRAMP_BASE, biostramp_image, biostramp_image_size);
+
+	/* Needed early, for bioscall() and kvm86_call() */
+	cpu_info_primary.ci_pmap = pmap_kernel();
 #endif
 #endif /* !XEN */
 

Index: src/sys/arch/i386/i386/mptramp.S
diff -u src/sys/arch/i386/i386/mptramp.S:1.20 src/sys/arch/i386/i386/mptramp.S:1.21
--- src/sys/arch/i386/i386/mptramp.S:1.20	Tue Feb  9 23:09:47 2010
+++ src/sys/arch/i386/i386/mptramp.S	Sat Jul 24 00:45:55 2010
@@ -1,4 +1,4 @@
-/*	$NetBSD: mptramp.S,v 1.20 2010/02/09 23:09:47 jym Exp $	*/
+/*	$NetBSD: mptramp.S,v 1.21 2010/07/24 00:45:55 jym Exp $	*/
 
 /*-
  * Copyright (c) 2000 The NetBSD Foundation, Inc.
@@ -76,7 +76,7 @@
  */
 
 #include <machine/asm.h>
-__KERNEL_RCSID(0, "$NetBSD: mptramp.S,v 1.20 2010/02/09 23:09:47 jym Exp $");
+__KERNEL_RCSID(0, "$NetBSD: mptramp.S,v 1.21 2010/07/24 00:45:55 jym Exp $");
 	
 #include "opt_mpbios.h"		/* for MPDEBUG */
 		
@@ -160,6 +160,12 @@
 	movl	%eax,%cr4
 1:
 
+#ifdef PAE /* Enable PAE */
+	movl	%cr4,%eax
+	or	$CR4_PAE,%eax
+	movl	%eax,%cr4
+#endif
+
 	movl	RELOC(mp_pdirpa),%ecx
 	HALTT(0x5,%ecx)
 	

Index: src/sys/arch/i386/i386/multiboot.c
diff -u src/sys/arch/i386/i386/multiboot.c:1.19 src/sys/arch/i386/i386/multiboot.c:1.20
--- src/sys/arch/i386/i386/multiboot.c:1.19	Sun Feb 22 18:05:42 2009
+++ src/sys/arch/i386/i386/multiboot.c	Sat Jul 24 00:45:55 2010
@@ -1,4 +1,4 @@
-/*	$NetBSD: multiboot.c,v 1.19 2009/02/22 18:05:42 ahoka Exp $	*/
+/*	$NetBSD: multiboot.c,v 1.20 2010/07/24 00:45:55 jym Exp $	*/
 
 /*-
  * Copyright (c) 2005, 2006 The NetBSD Foundation, Inc.
@@ -30,7 +30,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: multiboot.c,v 1.19 2009/02/22 18:05:42 ahoka Exp $");
+__KERNEL_RCSID(0, "$NetBSD: multiboot.c,v 1.20 2010/07/24 00:45:55 jym Exp $");
 
 #include "opt_multiboot.h"
 
@@ -276,12 +276,11 @@
 {
 #define RELOC(type, x) ((type)((vaddr_t)(x) - KERNBASE))
 	int i;
-	Elf32_Shdr *symtabp, *strtabp;
 	struct multiboot_symbols *ms;
-	size_t symsize, strsize;
-	paddr_t symaddr, straddr;
-	paddr_t symstart, strstart;
-
+	Elf32_Shdr *symtabp, *strtabp;
+	Elf32_Word symsize, strsize;
+	Elf32_Addr symaddr, straddr;
+	Elf32_Addr symstart, strstart;
 
 	/*
 	 * Check if the Multiboot information header has symbols or not.
@@ -336,38 +335,32 @@
 	 * that if the tables start before the kernel's end address,
 	 * they will not grow over this address.
 	 */
-        if ((paddr_t)symtabp < (paddr_t)&end - KERNBASE &&
-	    (paddr_t)strtabp < (paddr_t)&end - KERNBASE) {
-		symstart = (paddr_t)((vaddr_t)&end - KERNBASE);
+        if ((void *)symtabp < RELOC(void *, &end) &&
+	    (void *)strtabp < RELOC(void *, &end)) {
+		symstart = RELOC(Elf32_Addr, &end);
 		strstart = symstart + symsize;
-		memcpy((void *)symstart, (void *)symaddr, symsize);
-		memcpy((void *)strstart, (void *)straddr, strsize);
-        } else if ((paddr_t)symtabp > (paddr_t)&end - KERNBASE &&
-	           (paddr_t)strtabp < (paddr_t)&end - KERNBASE) {
-		symstart = (paddr_t)((vaddr_t)&end - KERNBASE);
+        } else if ((void *)symtabp > RELOC(void *, &end) &&
+	           (void *)strtabp < RELOC(void *, &end)) {
+		symstart = RELOC(Elf32_Addr, &end);
 		strstart = symstart + symsize;
-		memcpy((void *)symstart, (void *)symaddr, symsize);
-		memcpy((void *)strstart, (void *)straddr, strsize);
-        } else if ((paddr_t)symtabp < (paddr_t)&end - KERNBASE &&
-	           (paddr_t)strtabp > (paddr_t)&end - KERNBASE) {
-		strstart = (paddr_t)((vaddr_t)&end - KERNBASE);
+        } else if ((void *)symtabp < RELOC(void *, &end) &&
+	           (void *)strtabp > RELOC(void *, &end)) {
+		strstart = RELOC(Elf32_Addr, &end);
 		symstart = strstart + strsize;
-		memcpy((void *)strstart, (void *)straddr, strsize);
-		memcpy((void *)symstart, (void *)symaddr, symsize);
 	} else {
 		/* symtabp and strtabp are both over end */
-		if ((paddr_t)symtabp < (paddr_t)strtabp) {
-			symstart = (paddr_t)((vaddr_t)&end - KERNBASE);
+		if (symtabp < strtabp) {
+			symstart = RELOC(Elf32_Addr, &end);
 			strstart = symstart + symsize;
-			memcpy((void *)symstart, (void *)symaddr, symsize);
-			memcpy((void *)strstart, (void *)straddr, strsize);
 		} else {
-			strstart = (paddr_t)((vaddr_t)&end - KERNBASE);
+			strstart = RELOC(Elf32_Addr, &end);
 			symstart = strstart + strsize;
-			memcpy((void *)strstart, (void *)straddr, strsize);
-			memcpy((void *)symstart, (void *)symaddr, symsize);
 		}
 	}
+
+	memcpy((void *)strstart, (void *)straddr, strsize);
+	memcpy((void *)symstart, (void *)symaddr, symsize);
+
 	*RELOC(int *, &esym) =
 	    (int)(symstart + symsize + strsize + KERNBASE);
 

Index: src/sys/arch/i386/include/pmap.h
diff -u src/sys/arch/i386/include/pmap.h:1.106 src/sys/arch/i386/include/pmap.h:1.107
--- src/sys/arch/i386/include/pmap.h:1.106	Thu Jul 15 18:58:40 2010
+++ src/sys/arch/i386/include/pmap.h	Sat Jul 24 00:45:55 2010
@@ -1,4 +1,4 @@
-/*	$NetBSD: pmap.h,v 1.106 2010/07/15 18:58:40 jym Exp $	*/
+/*	$NetBSD: pmap.h,v 1.107 2010/07/24 00:45:55 jym Exp $	*/
 
 /*
  *
@@ -181,25 +181,45 @@
  * note that in the APTE_BASE space, the APDP appears at VA
  * "APDP_BASE" (0xfffff000).
  *
- * When PAE is in use, the L3 page directory breaks up the address space in
- * 4 1GB * regions, each of them broken in 512 2MB regions by the L2 PD
- * (the size of the pages at the L1 level is still 4K).
+ * - PAE support -
+ * ---------------
+ *
+ * PAE adds another layer of indirection during address translation, breaking
+ * up the translation process in 3 different levels:
+ * - L3 page directory, containing 4 * 64-bits addresses (index determined by
+ * bits [31:30] from the virtual address). This breaks up the address space
+ * in 4 1GB regions.
+ * - the PD (L2), containing 512 64-bits addresses, breaking each L3 region
+ * in 512 * 2MB regions.
+ * - the PT (L1), also containing 512 64-bits addresses (at L1, the size of
+ * the pages is still 4K).
+ *
  * The kernel virtual space is mapped by the last entry in the L3 page,
  * the first 3 entries mapping the user VA space.
+ *
  * Because the L3 has only 4 entries of 1GB each, we can't use recursive
- * mappings at this level for PDP_PDE and APDP_PDE (this would eat 2 of the
- * 4GB virtual space). There's also restrictions imposed by Xen on the
- * last entry of the L3 PD, which makes it hard to use one L3 page per pmap
- * switch %cr3 to switch pmaps. So we use one static L3 page which is
- * always loaded in %cr3, and we use it as 2 virtual PD pointers: one for
- * kernel space (L3[3], always loaded), and one for user space (in fact the
- * first 3 entries of the L3 PD), and we claim the VM has only a 2-level
- * PTP (with the L2 index extended by 2 bytes).
- * PTE_BASE and APTE_BASE will need 4 entries in the L2 page table.
- * In addition, we can't recursively map L3[3] (Xen wants the ref count on
- * this page to be exactly once), so we use a shadow PD page for the last
- * L2 PD. The shadow page could be static too, but to make pm_pdir[]
- * contigous we'll allocate/copy one page per pmap.
+ * mappings at this level for PDP_PDE and APDP_PDE (this would eat up 2 of
+ * the 4GB virtual space). There are also restrictions imposed by Xen on the
+ * last entry of the L3 PD (reference count to this page cannot be bigger
+ * than 1), which makes it hard to use one L3 page per pmap to switch
+ * between pmaps using %cr3.
+ *
+ * As such, each CPU gets its own L3 page that is always loaded into its %cr3
+ * (ci_pae_l3_pd in the associated cpu_info struct). We claim that the VM has
+ * only a 2-level PTP (similar to the non-PAE case). L2 PD is now 4 contiguous
+ * pages long (corresponding to the 4 entries of the L3), and the different
+ * index/slots (like PDP_PDE) are adapted accordingly.
+ * 
+ * Kernel space remains in L3[3], L3[0-2] maps the user VA space. Switching
+ * between pmaps consists in modifying the first 3 entries of the CPU's L3 page.
+ *
+ * PTE_BASE and APTE_BASE will need 4 entries in the L2 PD pages to map the
+ * L2 pages recursively.
+ *
+ * In addition, for Xen, we can't recursively map L3[3] (Xen wants the ref
+ * count on this page to be exactly one), so we use a shadow PD page for
+ * the last L2 PD. The shadow page could be static too, but to make pm_pdir[]
+ * contiguous we'll allocate/copy one page per pmap.
  */
 /* XXX MP should we allocate one APDP_PDE per processor?? */
 
@@ -219,12 +239,16 @@
 #ifdef PAE
 #define L2_SLOT_PTE	(KERNBASE/NBPD_L2-4) /* 1532: for recursive PDP map */
 #define L2_SLOT_KERN	(KERNBASE/NBPD_L2)   /* 1536: start of kernel space */
-#define L2_SLOT_APTE	1960                 /* 1964-2047 reserved by Xen */
+#ifndef XEN
+#define L2_SLOT_APTE	2044		/* 2044: alternative recursive slot */
+#else
+#define L2_SLOT_APTE	1960		/* 1964-2047 reserved by Xen */
+#endif
 #else /* PAE */
 #define L2_SLOT_PTE	(KERNBASE/NBPD_L2-1) /* 767: for recursive PDP map */
 #define L2_SLOT_KERN	(KERNBASE/NBPD_L2)   /* 768: start of kernel space */
 #ifndef XEN
-#define L2_SLOT_APTE	1023		 /* 1023: alternative recursive slot */
+#define L2_SLOT_APTE	1023		/* 1023: alternative recursive slot */
 #else
 #define L2_SLOT_APTE	1007		/* 1008-1023 reserved by Xen */
 #endif
@@ -254,17 +278,17 @@
 #define AL2_BASE ((pd_entry_t *)((char *)AL1_BASE + L2_SLOT_PTE * NBPD_L1))
 
 #define PDP_PDE		(L2_BASE + PDIR_SLOT_PTE)
-#ifdef PAE
+#if defined(PAE) && defined(XEN)
 /*
- * when PAE is in use we can't write APDP_PDE though the recursive mapping,
- * because it points to the shadow PD. Use the kernel PD instead, which is 
- * static
+ * when PAE is in use under Xen, we can't write APDP_PDE through the recursive
+ * mapping, because it points to the shadow PD. Use the kernel PD instead,
+ * which is static
  */
 #define APDP_PDE	(&pmap_kl2pd[l2tol2(PDIR_SLOT_APTE)])
 #define APDP_PDE_SHADOW	(L2_BASE + PDIR_SLOT_APTE)
-#else /* PAE */
+#else /* PAE && XEN */
 #define APDP_PDE	(L2_BASE + PDIR_SLOT_APTE)
-#endif /* PAE */
+#endif /* PAE && XEN */
 
 #define PDP_BASE	L2_BASE
 #define APDP_BASE	AL2_BASE
@@ -316,6 +340,17 @@
 #define pmap_pa2pte(a)			(a)
 #define pmap_pte2pa(a)			((a) & PG_FRAME)
 #define pmap_pte_set(p, n)		do { *(p) = (n); } while (0)
+#define pmap_pte_flush()		/* nothing */
+
+#ifdef PAE
+#define pmap_pte_cas(p, o, n)		atomic_cas_64((p), (o), (n))
+#define pmap_pte_testset(p, n)		\
+    atomic_swap_64((volatile uint64_t *)p, n)
+#define pmap_pte_setbits(p, b)		\
+    atomic_or_64((volatile uint64_t *)p, b)
+#define pmap_pte_clearbits(p, b)	\
+    atomic_and_64((volatile uint64_t *)p, ~(b))
+#else /* PAE */
 #define pmap_pte_cas(p, o, n)		atomic_cas_32((p), (o), (n))
 #define pmap_pte_testset(p, n)		\
     atomic_swap_ulong((volatile unsigned long *)p, n)
@@ -323,8 +358,9 @@
     atomic_or_ulong((volatile unsigned long *)p, b)
 #define pmap_pte_clearbits(p, b)	\
     atomic_and_ulong((volatile unsigned long *)p, ~(b))
-#define pmap_pte_flush()		/* nothing */
-#else
+#endif /* PAE */
+
+#else /* XEN */
 static __inline pt_entry_t
 pmap_pa2pte(paddr_t pa)
 {
@@ -400,11 +436,7 @@
 #endif
 
 #ifdef PAE
-/* addresses of static pages used for PAE pmap: */
-/* the L3 page */
-pd_entry_t *pmap_l3pd;
-paddr_t pmap_l3paddr;
-/* the kernel's L2 page */
+/* Address of the static kernel's L2 page */
 pd_entry_t *pmap_kl2pd;
 paddr_t pmap_kl2paddr;
 #endif

Index: src/sys/arch/x86/include/cpu.h
diff -u src/sys/arch/x86/include/cpu.h:1.22 src/sys/arch/x86/include/cpu.h:1.23
--- src/sys/arch/x86/include/cpu.h:1.22	Sun May  9 20:32:41 2010
+++ src/sys/arch/x86/include/cpu.h	Sat Jul 24 00:45:56 2010
@@ -1,4 +1,4 @@
-/*	$NetBSD: cpu.h,v 1.22 2010/05/09 20:32:41 rmind Exp $	*/
+/*	$NetBSD: cpu.h,v 1.23 2010/07/24 00:45:56 jym Exp $	*/
 
 /*-
  * Copyright (c) 1990 The Regents of the University of California.
@@ -50,6 +50,7 @@
  * Definitions unique to x86 cpu support.
  */
 #include <machine/frame.h>
+#include <machine/pte.h>
 #include <machine/segments.h>
 #include <machine/tss.h>
 #include <machine/intrdefs.h>
@@ -162,6 +163,17 @@
 	struct i386tss	ci_doubleflt_tss;
 	struct i386tss	ci_ddbipi_tss;
 #endif
+
+#ifdef PAE
+	uint32_t	ci_pae_l3_pdirpa; /* PA of L3 PD */
+	pd_entry_t *	ci_pae_l3_pdir; /* VA pointer to L3 PD */
+#endif
+
+#if defined(XEN) && defined(__x86_64__)
+	/* Currently active user PGD (can't use rcr3() with Xen) */
+	paddr_t		ci_xen_current_user_pgd;
+#endif
+
 	char *ci_doubleflt_stack;
 	char *ci_ddbipi_stack;
 
@@ -276,6 +288,7 @@
 void cpu_boot_secondary_processors(void);
 void cpu_init_idle_lwps(void);
 void cpu_init_msrs(struct cpu_info *, bool);
+void cpu_load_pmap(struct pmap *);
 
 extern uint32_t cpus_attached;
 #ifndef XEN

Index: src/sys/arch/x86/include/pmap.h
diff -u src/sys/arch/x86/include/pmap.h:1.32 src/sys/arch/x86/include/pmap.h:1.33
--- src/sys/arch/x86/include/pmap.h:1.32	Thu Jul 15 19:02:26 2010
+++ src/sys/arch/x86/include/pmap.h	Sat Jul 24 00:45:56 2010
@@ -1,4 +1,4 @@
-/*	$NetBSD: pmap.h,v 1.32 2010/07/15 19:02:26 jym Exp $	*/
+/*	$NetBSD: pmap.h,v 1.33 2010/07/24 00:45:56 jym Exp $	*/
 
 /*
  *
@@ -144,11 +144,7 @@
 #define	pm_lock	pm_obj[0].vmobjlock
 	LIST_ENTRY(pmap) pm_list;	/* list (lck by pm_list lock) */
 	pd_entry_t *pm_pdir;		/* VA of PD (lck by object lock) */
-#ifdef PAE
-	paddr_t pm_pdirpa[PDP_SIZE];
-#else
-	paddr_t pm_pdirpa;		/* PA of PD (read-only after create) */
-#endif
+	paddr_t pm_pdirpa[PDP_SIZE];	/* PA of PDs (read-only after create) */
 	struct vm_page *pm_ptphint[PTP_LEVELS-1];
 					/* pointer to a PTP in our pmap */
 	struct pmap_statistics pm_stats;  /* pmap stats (lck by object lock) */
@@ -166,13 +162,13 @@
 					 of pmap */
 };
 
-/* macro to access pm_pdirpa */
+/* macro to access pm_pdirpa slots */
 #ifdef PAE
 #define pmap_pdirpa(pmap, index) \
 	((pmap)->pm_pdirpa[l2tol3(index)] + l2tol2(index) * sizeof(pd_entry_t))
 #else
 #define pmap_pdirpa(pmap, index) \
-	((pmap)->pm_pdirpa + (index) * sizeof(pd_entry_t))
+	((pmap)->pm_pdirpa[0] + (index) * sizeof(pd_entry_t))
 #endif
 
 /*
@@ -187,6 +183,8 @@
  * PDPpaddr is the physical address of the kernel's PDP.
  * - i386 non-PAE and amd64: PDPpaddr corresponds directly to the %cr3
  * value associated to the kernel process, proc0.
+ * - i386 PAE: it still represents the PA of the kernel's PDP (L2). Due to
+ * the L3 PD, it cannot be considered as the equivalent of a %cr3 any more.
  * - Xen: it corresponds to the PFN of the kernel's PDP.
  */
 extern u_long PDPpaddr;

Index: src/sys/arch/x86/x86/cpu.c
diff -u src/sys/arch/x86/x86/cpu.c:1.72 src/sys/arch/x86/x86/cpu.c:1.73
--- src/sys/arch/x86/x86/cpu.c:1.72	Thu Jul  8 11:22:24 2010
+++ src/sys/arch/x86/x86/cpu.c	Sat Jul 24 00:45:56 2010
@@ -1,4 +1,4 @@
-/*	$NetBSD: cpu.c,v 1.72 2010/07/08 11:22:24 rmind Exp $	*/
+/*	$NetBSD: cpu.c,v 1.73 2010/07/24 00:45:56 jym Exp $	*/
 
 /*-
  * Copyright (c) 2000, 2006, 2007, 2008 The NetBSD Foundation, Inc.
@@ -62,7 +62,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: cpu.c,v 1.72 2010/07/08 11:22:24 rmind Exp $");
+__KERNEL_RCSID(0, "$NetBSD: cpu.c,v 1.73 2010/07/24 00:45:56 jym Exp $");
 
 #include "opt_ddb.h"
 #include "opt_mpbios.h"		/* for MPDEBUG */
@@ -717,9 +717,18 @@
 
 	KASSERT((ci->ci_flags & CPUF_RUNNING) == 0);
 
-	lcr3(pmap_kernel()->pm_pdirpa);
+#ifdef PAE
+	pd_entry_t * l3_pd = ci->ci_pae_l3_pdir;
+	for (i = 0 ; i < PDP_SIZE; i++) {
+		l3_pd[i] = pmap_kernel()->pm_pdirpa[i] | PG_V;
+	}
+	lcr3(ci->ci_pae_l3_pdirpa);
+#else
+	lcr3(pmap_pdirpa(pmap_kernel(), 0));
+#endif
+
 	pcb = lwp_getpcb(curlwp);
-	pcb->pcb_cr3 = pmap_kernel()->pm_pdirpa;
+	pcb->pcb_cr3 = rcr3();
 	pcb = lwp_getpcb(ci->ci_data.cpu_idlelwp);
 	lcr0(pcb->pcb_cr0);
 
@@ -812,6 +821,8 @@
 static void
 tss_init(struct i386tss *tss, void *stack, void *func)
 {
+	KASSERT(curcpu()->ci_pmap == pmap_kernel());
+
 	memset(tss, 0, sizeof *tss);
 	tss->tss_esp0 = tss->tss_esp = (int)((char *)stack + USPACE - 16);
 	tss->tss_ss0 = GSEL(GDATA_SEL, SEL_KPL);
@@ -819,7 +830,8 @@
 	tss->tss_fs = GSEL(GCPU_SEL, SEL_KPL);
 	tss->tss_gs = tss->__tss_es = tss->__tss_ds =
 	    tss->__tss_ss = GSEL(GDATA_SEL, SEL_KPL);
-	tss->tss_cr3 = pmap_kernel()->pm_pdirpa;
+	/* %cr3 contains the value associated to pmap_kernel */
+	tss->tss_cr3 = rcr3();
 	tss->tss_esp = (int)((char *)stack + USPACE - 16);
 	tss->tss_ldt = GSEL(GLDT_SEL, SEL_KPL);
 	tss->__tss_eflags = PSL_MBO | PSL_NT;	/* XXX not needed? */
@@ -1094,3 +1106,26 @@
 		x86_enable_intr();
 	}
 }
+
+/*
+ * Loads pmap for the current CPU. PAE: copy pm_pdirpa[] (| PG_V) into the
+ * per-CPU L3 PD, then tlbflush(). Non-PAE: lcr3() the pmap's PD address. */
+void
+cpu_load_pmap(struct pmap *pmap)
+{
+#ifdef PAE
+	int i, s;
+	struct cpu_info *ci;
+
+	s = splvm(); /* just to be safe while rewriting the L3 slots */
+	ci = curcpu();
+	pd_entry_t *l3_pd = ci->ci_pae_l3_pdir;
+	for (i = 0 ; i < PDP_SIZE; i++) {
+		l3_pd[i] = pmap->pm_pdirpa[i] | PG_V;
+	}
+	splx(s);
+	tlbflush();
+#else /* PAE */
+	lcr3(pmap_pdirpa(pmap, 0));
+#endif /* PAE */
+}

Index: src/sys/arch/x86/x86/pmap.c
diff -u src/sys/arch/x86/x86/pmap.c:1.112 src/sys/arch/x86/x86/pmap.c:1.113
--- src/sys/arch/x86/x86/pmap.c:1.112	Thu Jul 15 21:14:31 2010
+++ src/sys/arch/x86/x86/pmap.c	Sat Jul 24 00:45:56 2010
@@ -1,4 +1,4 @@
-/*	$NetBSD: pmap.c,v 1.112 2010/07/15 21:14:31 jym Exp $	*/
+/*	$NetBSD: pmap.c,v 1.113 2010/07/24 00:45:56 jym Exp $	*/
 
 /*
  * Copyright (c) 2007 Manuel Bouyer.
@@ -149,7 +149,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.112 2010/07/15 21:14:31 jym Exp $");
+__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.113 2010/07/24 00:45:56 jym Exp $");
 
 #include "opt_user_ldt.h"
 #include "opt_lockdebug.h"
@@ -422,8 +422,6 @@
 #ifdef __x86_64__
 /* Dummy PGD for user cr3, used between pmap_deactivate() and pmap_activate() */
 static paddr_t xen_dummy_user_pgd;
-/* Currently active user PGD (can't use rcr3()) */
-static paddr_t xen_current_user_pgd = 0;
 #endif /* __x86_64__ */
 paddr_t pmap_pa_start; /* PA of first physical page for this domain */
 paddr_t pmap_pa_end;   /* PA of last physical page for this domain */
@@ -1283,7 +1281,6 @@
 {
 	struct pmap *kpm;
 	pt_entry_t *pte;
-	struct pcb *pcb;
 	int i;
 	vaddr_t kva;
 #ifndef XEN
@@ -1334,14 +1331,11 @@
 		kpm->pm_ptphint[i] = NULL;
 	}
 	memset(&kpm->pm_list, 0, sizeof(kpm->pm_list));  /* pm_list not used */
-	pcb = lwp_getpcb(&lwp0);
-	kpm->pm_pdir = (pd_entry_t *)(pcb->pcb_cr3 + KERNBASE);
-#ifdef PAE
+
+	kpm->pm_pdir = (pd_entry_t *)(PDPpaddr + KERNBASE);
 	for (i = 0; i < PDP_SIZE; i++)
-		kpm->pm_pdirpa[i] = (paddr_t)pcb->pcb_cr3 + PAGE_SIZE * i;
-#else
-	kpm->pm_pdirpa = (paddr_t)pcb->pcb_cr3;
-#endif
+		kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i;
+
 	kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
 		x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS);
 
@@ -1612,7 +1606,7 @@
 	paddr_t newp;
 	paddr_t pdes_pa;
 
-	pdes_pa = pmap_kernel()->pm_pdirpa;
+	pdes_pa = pmap_pdirpa(pmap_kernel(), 0);
 	level = PTP_LEVELS;
 	for (;;) {
 		newp = avail_start;
@@ -1715,6 +1709,40 @@
 
 	evcnt_attach_dynamic(&ci->ci_tlb_evcnt, EVCNT_TYPE_MISC,
 	    NULL, device_xname(ci->ci_dev), "TLB IPI");
+
+#ifdef PAE
+	int ret;
+	struct pglist pg;
+	struct vm_page *vmap;
+
+	/* The BP already has its own L3 page allocated in locore.S. */
+	if (ci == &cpu_info_primary)
+		return;
+
+	/*
+	 * Allocate a page for the per-CPU L3 PD. cr3 being 32 bits, PA must
+	 * reside below the 4GB boundary.
+	 */
+	ret = uvm_pglistalloc(PAGE_SIZE, 0, 0x100000000ULL, 32, 0, &pg, 1, 0);
+	vmap = TAILQ_FIRST(&pg);
+
+	if (ret != 0 || vmap == NULL)
+		panic("%s: failed to allocate L3 pglist for CPU %d (ret %d)\n",
+			__func__, cpu_index(ci), ret);
+
+	ci->ci_pae_l3_pdirpa = vmap->phys_addr;
+
+	ci->ci_pae_l3_pdir = (paddr_t *)uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
+		UVM_KMF_VAONLY | UVM_KMF_NOWAIT);
+	if (ci->ci_pae_l3_pdir == NULL)
+		panic("%s: failed to allocate L3 PD for CPU %d\n",
+			__func__, cpu_index(ci));
+
+	pmap_kenter_pa((vaddr_t)ci->ci_pae_l3_pdir, ci->ci_pae_l3_pdirpa,
+		VM_PROT_READ | VM_PROT_WRITE, 0);
+
+	pmap_update(pmap_kernel());
+#endif
 }
 
 /*
@@ -1931,7 +1959,7 @@
 		 * If ptp is a L3 currently mapped in kernel space,
 		 * clear it before freeing
 		 */
-		if (pmap->pm_pdirpa == xen_current_user_pgd
+		if (pmap_pdirpa(pmap, 0) == curcpu()->ci_xen_current_user_pgd
 		    && level == PTP_LEVELS - 1)
 			pmap_pte_set(&pmap_kernel()->pm_pdir[index], 0);
 #endif /* XEN && __x86_64__ */
@@ -2274,13 +2302,9 @@
 		goto try_again;
 	}
 
-#ifdef PAE
 	for (i = 0; i < PDP_SIZE; i++)
 		pmap->pm_pdirpa[i] =
 		    pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]);
-#else
-	pmap->pm_pdirpa = pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE]);
-#endif
 
 	LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
 
@@ -2602,11 +2626,11 @@
 
 	KASSERT(kpreempt_disabled());
 #if defined(XEN) && defined(__x86_64__)
-	KASSERT(pmap->pm_pdirpa == xen_current_user_pgd);
+	KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd);
 #elif defined(PAE)
-	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(pmap_l3pd[0]));
+	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]));
 #elif !defined(XEN) 
-	KASSERT(pmap->pm_pdirpa == pmap_pte2pa(rcr3()));
+	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3()));
 #endif
 
 	/*
@@ -2708,12 +2732,12 @@
 	atomic_and_32(&oldpmap->pm_kernel_cpus, ~cpumask);
 
 #if defined(XEN) && defined(__x86_64__)
-	KASSERT(oldpmap->pm_pdirpa == xen_current_user_pgd ||
+	KASSERT(pmap_pdirpa(oldpmap, 0) == ci->ci_xen_current_user_pgd ||
 	    oldpmap == pmap_kernel());
 #elif defined(PAE)
-	KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(pmap_l3pd[0]));
+	KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]));
 #elif !defined(XEN)
-	KASSERT(oldpmap->pm_pdirpa == pmap_pte2pa(rcr3()));
+	KASSERT(pmap_pdirpa(oldpmap, 0) == pmap_pte2pa(rcr3()));
 #endif
 	KASSERT((pmap->pm_cpus & cpumask) == 0);
 	KASSERT((pmap->pm_kernel_cpus & cpumask) == 0);
@@ -2735,36 +2759,13 @@
 	 * from other CPUs, we're good to load the page tables.
 	 */
 #ifdef PAE
-	pcb->pcb_cr3 = pmap_l3paddr;
+	pcb->pcb_cr3 = ci->ci_pae_l3_pdirpa;
 #else
-	pcb->pcb_cr3 = pmap->pm_pdirpa;
+	pcb->pcb_cr3 = pmap_pdirpa(pmap, 0);
 #endif
-#if defined(XEN) && defined(__x86_64__)
-	/* kernel pmap always in cr3 and should never go in user cr3 */
-	if (pmap_pdirpa(pmap, 0) != pmap_pdirpa(pmap_kernel(), 0)) {
-		/*
-		 * Map user space address in kernel space and load
-		 * user cr3
-		 */
-		int i, s;
-		pd_entry_t *old_pgd, *new_pgd;
-		paddr_t addr;
-		s = splvm();
-		new_pgd  = pmap->pm_pdir;
-		old_pgd = pmap_kernel()->pm_pdir;
-		addr = xpmap_ptom(pmap_pdirpa(pmap_kernel(), 0));
-		for (i = 0; i < PDIR_SLOT_PTE;
-		    i++, addr += sizeof(pd_entry_t)) {
-			if ((new_pgd[i] & PG_V) || (old_pgd[i] & PG_V))
-				xpq_queue_pte_update(addr, new_pgd[i]);
-		}
-		tlbflush();
-		xen_set_user_pgd(pmap_pdirpa(pmap, 0));
-		xen_current_user_pgd = pmap_pdirpa(pmap, 0);
-		splx(s);
-	}
-#else /* XEN && x86_64 */
-#if defined(XEN)
+
+#ifdef i386
+#ifdef XEN
 	/*
 	 * clear APDP slot, in case it points to a page table that has 
 	 * been freed
@@ -2773,34 +2774,19 @@
 		pmap_unmap_apdp();
 	}
 	/* lldt() does pmap_pte_flush() */
-#else /* XEN */
-#if defined(i386)
+#endif /* XEN */
+
+#ifndef XEN
 	ci->ci_tss.tss_ldt = pmap->pm_ldt_sel;
 	ci->ci_tss.tss_cr3 = pcb->pcb_cr3;
-#endif
-#endif /* XEN */
+#endif /* !XEN */
+#endif /* i386 */
+
 	lldt(pmap->pm_ldt_sel);
-#ifdef PAE
-	{
-	paddr_t l3_pd = xpmap_ptom_masked(pmap_l3paddr);
-	int i;
-	int s = splvm();
-	/* don't update the kernel L3 slot */
-	for (i = 0 ; i < PDP_SIZE - 1; i++, l3_pd += sizeof(pd_entry_t)) {
-		xpq_queue_pte_update(l3_pd,
-		    xpmap_ptom(pmap->pm_pdirpa[i]) | PG_V);
-	}
-	tlbflush();
-	splx(s);
-	}
-#else /* PAE */
-	{
+
 	u_int gen = uvm_emap_gen_return();
-	lcr3(pcb->pcb_cr3);
+	cpu_load_pmap(pmap);
 	uvm_emap_update(gen);
-	}
-#endif /* PAE */
-#endif /* XEN && x86_64 */
 
 	ci->ci_want_pmapload = 0;
 
@@ -2867,11 +2853,11 @@
 	}
 
 #if defined(XEN) && defined(__x86_64__)
-	KASSERT(pmap->pm_pdirpa == xen_current_user_pgd);
+	KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd);
 #elif defined(PAE)
-	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(pmap_l3pd[0]));
+	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]));
 #elif !defined(XEN) 
-	KASSERT(pmap->pm_pdirpa == pmap_pte2pa(rcr3()));
+	KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3()));
 #endif
 	KASSERT(ci->ci_pmap == pmap);
 
@@ -4761,6 +4747,21 @@
 	tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1];
 	memcpy(tmp_pml, kernel_pml, PAGE_SIZE);
 
+#ifdef PAE
+	/*
+	 * Use the last 4 entries of the L2 page as L3 PD entries. These
+	 * last entries are unlikely to be used for temporary mappings.
+	 * 508: maps 0->1GB (userland)
+	 * 509: unused
+	 * 510: unused
+	 * 511: maps 3->4GB (kernel)
+	 */
+	tmp_pml[508] = x86_tmp_pml_paddr[PTP_LEVELS - 1] | PG_V;
+	tmp_pml[509] = 0;
+	tmp_pml[510] = 0;
+	tmp_pml[511] = pmap_pdirpa(pmap_kernel(),PDIR_SLOT_KERN) | PG_V;
+#endif
+
 	for (level = PTP_LEVELS - 1; level > 0; --level) {
 		tmp_pml = (void *)x86_tmp_pml_vaddr[level];
 
@@ -4771,5 +4772,10 @@
 	tmp_pml = (void *)x86_tmp_pml_vaddr[0];
 	tmp_pml[pl_i(pg, 1)] = (pg & PG_FRAME) | PG_RW | PG_V;
 
+#ifdef PAE
+	/* Return the PA of the L3 page (entry 508 of the L2 page) */
+	return x86_tmp_pml_paddr[PTP_LEVELS - 1] + 508 * sizeof(pd_entry_t);
+#endif
+
 	return x86_tmp_pml_paddr[PTP_LEVELS - 1];
 }

Index: src/sys/arch/xen/x86/cpu.c
diff -u src/sys/arch/xen/x86/cpu.c:1.46 src/sys/arch/xen/x86/cpu.c:1.47
--- src/sys/arch/xen/x86/cpu.c:1.46	Tue Jul  6 20:50:35 2010
+++ src/sys/arch/xen/x86/cpu.c	Sat Jul 24 00:45:56 2010
@@ -1,4 +1,4 @@
-/*	$NetBSD: cpu.c,v 1.46 2010/07/06 20:50:35 cegger Exp $	*/
+/*	$NetBSD: cpu.c,v 1.47 2010/07/24 00:45:56 jym Exp $	*/
 /* NetBSD: cpu.c,v 1.18 2004/02/20 17:35:01 yamt Exp  */
 
 /*-
@@ -66,7 +66,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: cpu.c,v 1.46 2010/07/06 20:50:35 cegger Exp $");
+__KERNEL_RCSID(0, "$NetBSD: cpu.c,v 1.47 2010/07/24 00:45:56 jym Exp $");
 
 #include "opt_ddb.h"
 #include "opt_multiprocessor.h"
@@ -582,6 +582,11 @@
 			lcr4(rcr4() | CR4_OSXMMEXCPT);
 	}
 
+#ifdef __x86_64__
+	/* No user PGD mapped for this CPU yet */
+	ci->ci_xen_current_user_pgd = 0;
+#endif
+
 	atomic_or_32(&cpus_running, ci->ci_cpumask);
 	atomic_or_32(&ci->ci_flags, CPUF_RUNNING);
 }
@@ -1111,3 +1116,59 @@
 		x86_enable_intr();
 	}
 }
+
+/*
+ * Loads pmap for the current CPU (Xen). i386 PAE: queue updates of the L3
+ * slots; i386 non-PAE: lcr3(); amd64: sync kernel PGD, then set user PGD. */
+void
+cpu_load_pmap(struct pmap *pmap)
+{
+#ifdef i386
+#ifdef PAE
+	int i, s;
+	struct cpu_info *ci;
+
+	s = splvm(); /* just to be safe while queueing the L3 updates */
+	ci = curcpu();
+	paddr_t l3_pd = xpmap_ptom_masked(ci->ci_pae_l3_pdirpa);
+	/* don't update the kernel L3 slot: stop at PDP_SIZE - 1 */
+	for (i = 0 ; i < PDP_SIZE - 1; i++) {
+		xpq_queue_pte_update(l3_pd + i * sizeof(pd_entry_t),
+		    xpmap_ptom(pmap->pm_pdirpa[i]) | PG_V);
+	}
+	splx(s);
+	tlbflush();
+#else /* PAE */
+	lcr3(pmap_pdirpa(pmap, 0));
+#endif /* PAE */
+#endif /* i386 */
+
+#ifdef __x86_64__
+	int i, s;
+	pd_entry_t *old_pgd, *new_pgd;
+	paddr_t addr;
+	struct cpu_info *ci;
+
+	/* kernel pmap always in cr3 and should never go in user cr3 */
+	if (pmap_pdirpa(pmap, 0) != pmap_pdirpa(pmap_kernel(), 0)) {
+		ci = curcpu();
+		/*
+		 * Map user space address in kernel space and load
+		 * user cr3
+		 */
+		s = splvm();
+		new_pgd = pmap->pm_pdir;
+		old_pgd = pmap_kernel()->pm_pdir;
+		addr = xpmap_ptom(pmap_pdirpa(pmap_kernel(), 0));
+		for (i = 0; i < PDIR_SLOT_PTE;
+		    i++, addr += sizeof(pd_entry_t)) {
+			if ((new_pgd[i] & PG_V) || (old_pgd[i] & PG_V))
+				xpq_queue_pte_update(addr, new_pgd[i]);
+		}
+		tlbflush();
+		xen_set_user_pgd(pmap_pdirpa(pmap, 0));
+		ci->ci_xen_current_user_pgd = pmap_pdirpa(pmap, 0);
+		splx(s);
+	}
+#endif /* __x86_64__ */
+}

Index: src/sys/arch/xen/x86/x86_xpmap.c
diff -u src/sys/arch/xen/x86/x86_xpmap.c:1.20 src/sys/arch/xen/x86/x86_xpmap.c:1.21
--- src/sys/arch/xen/x86/x86_xpmap.c:1.20	Thu Jul 15 23:20:34 2010
+++ src/sys/arch/xen/x86/x86_xpmap.c	Sat Jul 24 00:45:56 2010
@@ -1,4 +1,4 @@
-/*	$NetBSD: x86_xpmap.c,v 1.20 2010/07/15 23:20:34 jym Exp $	*/
+/*	$NetBSD: x86_xpmap.c,v 1.21 2010/07/24 00:45:56 jym Exp $	*/
 
 /*
  * Copyright (c) 2006 Mathieu Ropert <[email protected]>
@@ -69,7 +69,7 @@
 
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: x86_xpmap.c,v 1.20 2010/07/15 23:20:34 jym Exp $");
+__KERNEL_RCSID(0, "$NetBSD: x86_xpmap.c,v 1.21 2010/07/24 00:45:56 jym Exp $");
 
 #include "opt_xen.h"
 #include "opt_ddb.h"
@@ -814,22 +814,26 @@
 #else
 	xpq_queue_pin_table(xpmap_ptom_masked(new_pgd - KERNBASE));
 #endif
-#ifdef __i386__
+
 	/* Save phys. addr of PDP, for libkvm. */
-	PDPpaddr = (long)pde - KERNBASE;
 #ifdef PAE
-	/* also save the address of the L3 page */
-	pmap_l3pd = pdtpe;
-	pmap_l3paddr = (new_pgd - KERNBASE);
-#endif /* PAE */
-#endif /* i386 */
+	PDPpaddr = (u_long)pde - KERNBASE; /* PDP is the L2 with PAE */
+#else
+	PDPpaddr = (u_long)new_pgd - KERNBASE;
+#endif
+
 	/* Switch to new tables */
 	__PRINTK(("switch to PGD\n"));
 	xpq_queue_pt_switch(xpmap_ptom_masked(new_pgd - KERNBASE));
 	__PRINTK(("bt_pgd[PDIR_SLOT_PTE] now entry %#" PRIxPADDR "\n",
 	    bt_pgd[PDIR_SLOT_PTE]));
+
 #ifdef PAE
 	if (final) {
+		/* save the address of the L3 page */
+		cpu_info_primary.ci_pae_l3_pdir = pdtpe;
+		cpu_info_primary.ci_pae_l3_pdirpa = (new_pgd - KERNBASE);
+
 		/* now enter kernel's PTE mappings */
 		addr =  (u_long)pde - KERNBASE + PAGE_SIZE * 3;
 		xpq_queue_pte_update(
@@ -839,8 +843,6 @@
 	}
 #endif
 
-
-
 	/* Now we can safely reclaim space taken by old tables */
 	
 	__PRINTK(("unpin old PGD\n"));

Index: src/sys/arch/xen/x86/xenfunc.c
diff -u src/sys/arch/xen/x86/xenfunc.c:1.10 src/sys/arch/xen/x86/xenfunc.c:1.11
--- src/sys/arch/xen/x86/xenfunc.c:1.10	Fri Feb 12 01:55:46 2010
+++ src/sys/arch/xen/x86/xenfunc.c	Sat Jul 24 00:45:56 2010
@@ -1,4 +1,4 @@
-/*	$NetBSD: xenfunc.c,v 1.10 2010/02/12 01:55:46 jym Exp $	*/
+/*	$NetBSD: xenfunc.c,v 1.11 2010/07/24 00:45:56 jym Exp $	*/
 
 /*
  *
@@ -27,7 +27,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: xenfunc.c,v 1.10 2010/02/12 01:55:46 jym Exp $");
+__KERNEL_RCSID(0, "$NetBSD: xenfunc.c,v 1.11 2010/07/24 00:45:56 jym Exp $");
 
 #include <sys/param.h>
 
@@ -58,10 +58,10 @@
 	splx(s);
 }  
 
-#ifndef __x86_64__
 void
 lldt(u_short sel)
 {
+#ifndef __x86_64__
 	struct cpu_info *ci;
 
 	ci = curcpu();
@@ -75,8 +75,8 @@
 		xen_set_ldt(ci->ci_gdt[IDXSELN(sel)].ld.ld_base,
 		    ci->ci_gdt[IDXSELN(sel)].ld.ld_entries);
 	ci->ci_curldt = sel;
-}
 #endif
+}
 
 void
 ltr(u_short sel)

Reply via email to