Module Name:    src
Committed By:   ad
Date:           Sat Jan  4 22:49:20 UTC 2020

Modified Files:
        src/sys/arch/x86/include: pmap.h pmap_pv.h
        src/sys/arch/x86/x86: pmap.c
        src/sys/arch/xen/x86: xen_pmap.c

Log Message:
x86 pmap improvements, reducing system time during a build by about 15% on
my test machine:

- Replace the global pv_hash with a per-pmap record of dynamically allocated
  pv entries.  The data structure used for this can be changed easily, and
  has no special concurrency requirements.  For now go with radixtree.

- Change pmap_pdp_cache back into a pool; cache the page directory with the
  pmap, and avoid contention on pmaps_lock by adjusting the global list in
  the pool_cache ctor & dtor.  Align struct pmap and its lock, and update
  some comments.

- Simplify pv_entry lists slightly.  Allow both PP_EMBEDDED and dynamically
  allocated entries to co-exist on a single page.  This adds a pointer to
  struct vm_page on x86, but shrinks pv_entry to 32 bytes (which also gets
  it nicely aligned).

- More elegantly solve the chicken-and-egg problem introduced into the pmap
  with radixtree lookup for pages, where we need PTEs mapped and page
  allocations to happen under a single hold of the pmap's lock.  While here
  undo some cut-and-paste.

- Don't adjust pmap_kernel's stats with atomics, because its mutex is now
  held in the places the stats are changed.
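
To illustrate the first item: dynamically allocated pv entries are now keyed
by virtual page number in a radix tree hanging off the pmap, and the tree is
only ever touched with the pmap's own lock held.  The sketch below is
abridged from the pmap.c diff further down (remove path omitted, comments
added):

static struct pv_entry *
pmap_pvmap_lookup(struct pmap *pmap, vaddr_t va)
{

	/* pm_lock covers the whole tree; no extra locking needed. */
	KASSERT(mutex_owned(&pmap->pm_lock));

	return radix_tree_lookup_node(&pmap->pm_pvtree, va >> PAGE_SHIFT);
}

static int
pmap_pvmap_insert(struct pmap *pmap, vaddr_t va, struct pv_entry *pve)
{

	KASSERT(mutex_owned(&pmap->pm_lock));
	KASSERT(pmap_pvmap_lookup(pmap, va) == NULL);

	/* Key is the virtual page number within this pmap. */
	return radix_tree_insert_node(&pmap->pm_pvtree, va >> PAGE_SHIFT, pve);
}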
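
To illustrate the second item: a pmap keeps its page directory for as long
as it sits in the pool cache, so pmaps_lock is only taken when an object is
constructed or destructed, not on every pmap_create()/pmap_destroy() that is
satisfied from the cache.  Heavily abridged from the new ctor/dtor in the
diff (the real versions also create the kcpusets, record pm_pdirpa, and
retry pmap_pdp_init() if they race pmap_growkernel()):

static int
pmap_ctor(void *arg, void *obj, int flags)
{
	struct pmap *pmap = obj;

	mutex_init(&pmap->pm_lock, MUTEX_DEFAULT, IPL_NONE);
	radix_tree_init_tree(&pmap->pm_pvtree);

	/* The PDP stays with the pmap while it sits in the cache. */
	pmap->pm_pdir = pool_get(&pmap_pdp_pool, PR_WAITOK);
	pmap_pdp_init(pmap->pm_pdir);

	/* The global list is only adjusted here and in the dtor. */
	mutex_enter(&pmaps_lock);
	LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
	mutex_exit(&pmaps_lock);

	return 0;
}

static void
pmap_dtor(void *arg, void *obj)
{
	struct pmap *pmap = obj;

	mutex_enter(&pmaps_lock);
	LIST_REMOVE(pmap, pm_list);
	mutex_exit(&pmaps_lock);

	pmap_pdp_fini(pmap->pm_pdir);
	pool_put(&pmap_pdp_pool, pmap->pm_pdir);
	radix_tree_fini_tree(&pmap->pm_pvtree);
	mutex_destroy(&pmap->pm_lock);
}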

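To illustrate the third item: pp_pvlist now lives beside the embedded pte
rather than in a union with it, so a walk over a page's mappings visits the
embedded entry first (when PP_EMBEDDED is set) and then continues onto the
list of dynamically allocated entries.  Taken from the updated iterators in
the diff, with one NULL assertion dropped and a comment added:

static struct pv_pte *
pv_pte_first(struct pmap_page *pp)
{

	if ((pp->pp_flags & PP_EMBEDDED) != 0) {
		return &pp->pp_pte;
	}
	return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist));
}

static struct pv_pte *
pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte)
{

	if (pvpte == &pp->pp_pte) {
		/* Embedded entry done; continue onto the dynamic list. */
		KASSERT((pp->pp_flags & PP_EMBEDDED) != 0);
		return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist));
	}
	return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list));
}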
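
To illustrate the fourth item: pmap_get_ptp() now only allocates pages,
which can sleep, and the new PTPs are wired into the page tables later by
pmap_install_ptp() once the PTEs are mapped and preemption is disabled.
The resulting shape of pmap_enter_ma(), condensed from the diff with error
handling, the kernel-pmap special case, and the PTE update itself elided:

	mutex_enter(&pmap->pm_lock);

	/* Anything that can sleep or fail comes first. */
	error = pmap_get_ptp(pmap, &pt, va, flags, &ptp);
	pve = pmap_pvmap_lookup(pmap, va);
	if (pve == NULL && pmap_pp_needs_pve(new_pp, ptp, va)) {
		pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
		error = pmap_pvmap_insert(pmap, va, pve);
	}

	/* Now map the PTEs (disables preemption) and commit. */
	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
	if (ptp != NULL) {
		pmap_install_ptp(pmap, &pt, va, pdes);
	}

	/* ... update the PTE and adjust the pv lists ... */

	pmap_unmap_ptes(pmap, pmap2);
	mutex_exit(&pmap->pm_lock);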

To generate a diff of this commit:
cvs rdiff -u -r1.107 -r1.108 src/sys/arch/x86/include/pmap.h
cvs rdiff -u -r1.8 -r1.9 src/sys/arch/x86/include/pmap_pv.h
cvs rdiff -u -r1.352 -r1.353 src/sys/arch/x86/x86/pmap.c
cvs rdiff -u -r1.34 -r1.35 src/sys/arch/xen/x86/xen_pmap.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/arch/x86/include/pmap.h
diff -u src/sys/arch/x86/include/pmap.h:1.107 src/sys/arch/x86/include/pmap.h:1.108
--- src/sys/arch/x86/include/pmap.h:1.107	Sun Dec 15 19:24:11 2019
+++ src/sys/arch/x86/include/pmap.h	Sat Jan  4 22:49:20 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: pmap.h,v 1.107 2019/12/15 19:24:11 ad Exp $	*/
+/*	$NetBSD: pmap.h,v 1.108 2020/01/04 22:49:20 ad Exp $	*/
 
 /*
  * Copyright (c) 1997 Charles D. Cranor and Washington University.
@@ -67,6 +67,8 @@
 #ifndef _X86_PMAP_H_
 #define	_X86_PMAP_H_
 
+#include <sys/radixtree.h>
+
 /*
  * pl*_pi: index in the ptp page for a pde mapping a VA.
  * (pl*_i below is the index in the virtual array of all pdes per level)
@@ -232,9 +234,9 @@ extern struct pmap_head pmaps;
 extern kmutex_t pmaps_lock;    /* protects pmaps */
 
 /*
- * pool_cache(9) that PDPs are allocated from 
+ * pool_cache(9) that pmaps are allocated from 
  */
-extern struct pool_cache pmap_pdp_cache;
+extern struct pool_cache pmap_cache;
 
 /*
  * the pmap structure
@@ -248,14 +250,14 @@ extern struct pool_cache pmap_pdp_cache;
  */
 
 struct pmap {
-	struct uvm_object pm_obj[PTP_LEVELS-1]; /* objects for lvl >= 1) */
-	kmutex_t pm_lock;		/* locks for pm_objs */
-	LIST_ENTRY(pmap) pm_list;	/* list (lck by pm_list lock) */
-	pd_entry_t *pm_pdir;		/* VA of PD (lck by object lock) */
+	struct uvm_object pm_obj[PTP_LEVELS-1];/* objects for lvl >= 1) */
+	LIST_ENTRY(pmap) pm_list;	/* list of all pmaps */
+	pd_entry_t *pm_pdir;		/* VA of PD */
 	paddr_t pm_pdirpa[PDP_SIZE];	/* PA of PDs (read-only after create) */
 	struct vm_page *pm_ptphint[PTP_LEVELS-1];
 					/* pointer to a PTP in our pmap */
-	struct pmap_statistics pm_stats;  /* pmap stats (lck by object lock) */
+	struct radix_tree pm_pvtree;	/* tree of non-embedded pv entries */
+	struct pmap_statistics pm_stats;  /* pmap stats */
 
 #if !defined(__x86_64__)
 	vaddr_t pm_hiexec;		/* highest executable mapping */
@@ -286,6 +288,9 @@ struct pmap {
 
 	void (*pm_tlb_flush)(struct pmap *);
 	void *pm_data;
+
+	kmutex_t pm_lock		/* locks for pm_objs */
+	    __aligned(64);		/* give lock own cache line */
 };
 
 /* macro to access pm_pdirpa slots */
@@ -374,7 +379,7 @@ void		pmap_pv_untrack(paddr_t, psize_t);
 
 void		pmap_map_ptes(struct pmap *, struct pmap **, pd_entry_t **,
 		    pd_entry_t * const **);
-void		pmap_unmap_ptes(struct pmap *, struct pmap *, struct vm_page *);
+void		pmap_unmap_ptes(struct pmap *, struct pmap *);
 
 bool		pmap_pdes_valid(vaddr_t, pd_entry_t * const *, pd_entry_t *,
 		    int *lastlvl);

Index: src/sys/arch/x86/include/pmap_pv.h
diff -u src/sys/arch/x86/include/pmap_pv.h:1.8 src/sys/arch/x86/include/pmap_pv.h:1.9
--- src/sys/arch/x86/include/pmap_pv.h:1.8	Thu Jan  2 21:39:42 2020
+++ src/sys/arch/x86/include/pmap_pv.h	Sat Jan  4 22:49:20 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: pmap_pv.h,v 1.8 2020/01/02 21:39:42 ad Exp $	*/
+/*	$NetBSD: pmap_pv.h,v 1.9 2020/01/04 22:49:20 ad Exp $	*/
 
 /*-
  * Copyright (c)2008 YAMAMOTO Takashi,
@@ -55,8 +55,7 @@ struct pv_pte {
 
 struct pv_entry {
 	struct pv_pte pve_pte;		/* should be the first member */
-	LIST_ENTRY(pv_entry) pve_list;	/* on pv_head::pvh_list */
-	SLIST_ENTRY(pv_entry) pve_hash;
+	LIST_ENTRY(pv_entry) pve_list;	/* on pmap_page::pp_pvlist */
 };
 #define	pve_next	pve_list.le_next
 
@@ -69,16 +68,11 @@ struct pmap_page {
 		/* PP_EMBEDDED */
 		struct pv_pte u_pte;
 
-		/* !PP_EMBEDDED */
-		struct pv_head {
-			LIST_HEAD(, pv_entry) pvh_list;
-		} u_head;
-
 		/* PTPs */
 		struct vm_page *u_link;
 	} pp_u;
+	LIST_HEAD(, pv_entry) pp_pvlist;
 #define	pp_pte	pp_u.u_pte
-#define	pp_head	pp_u.u_head
 #define	pp_link	pp_u.u_link
 	uint8_t pp_flags;
 	uint8_t pp_attrs;
@@ -90,6 +84,6 @@ struct pmap_page {
 /* pp_flags */
 #define	PP_EMBEDDED	1
 
-#define	PMAP_PAGE_INIT(pp)	/* none */
+#define	PMAP_PAGE_INIT(pp)	LIST_INIT(&(pp)->pp_pvlist)
 
 #endif /* !_X86_PMAP_PV_H_ */

Index: src/sys/arch/x86/x86/pmap.c
diff -u src/sys/arch/x86/x86/pmap.c:1.352 src/sys/arch/x86/x86/pmap.c:1.353
--- src/sys/arch/x86/x86/pmap.c:1.352	Thu Jan  2 21:39:42 2020
+++ src/sys/arch/x86/x86/pmap.c	Sat Jan  4 22:49:20 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: pmap.c,v 1.352 2020/01/02 21:39:42 ad Exp $	*/
+/*	$NetBSD: pmap.c,v 1.353 2020/01/04 22:49:20 ad Exp $	*/
 
 /*
  * Copyright (c) 2008, 2010, 2016, 2017, 2019 The NetBSD Foundation, Inc.
@@ -130,7 +130,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.352 2020/01/02 21:39:42 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.353 2020/01/04 22:49:20 ad Exp $");
 
 #include "opt_user_ldt.h"
 #include "opt_lockdebug.h"
@@ -213,8 +213,8 @@ __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.3
  *  - struct pmap_page: describes one pv-tracked page, without
  *    necessarily a corresponding vm_page
  *  - struct pv_entry: describes one <PMAP,VA> mapping of a PA
- *  - struct pv_head: there is one pv_head per pv-tracked page of
- *    physical memory.   the pv_head points to a list of pv_entry
+ *  - pmap_page::pp_pvlist: there is one list per pv-tracked page of
+ *    physical memory.   the pp_pvlist points to a list of pv_entry
  *    structures which describe all the <PMAP,VA> pairs that this
  *    page is mapped in.    this is critical for page based operations
  *    such as pmap_page_protect() [change protection on _all_ mappings
@@ -224,16 +224,19 @@ __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.3
 /*
  * Locking
  *
- * We have the following locks that we must contend with:
+ * We have the following locks that we must contend with, listed in the
+ * order that they must be acquired:
  *
- * - pmap lock (per pmap, part of uvm_object)
- *   This lock protects the fields in the pmap structure including the
- *   non-kernel PDEs in the PDP, and the PTEs.
+ * - pg->uobject->vmobjlock, pg->uanon->an_lock
+ *   These per-object locks are taken by the VM system before calling into
+ *   the pmap module.  Holding them prevents concurrent operations on the
+ *   given page or set of pages.  Asserted with uvm_page_owner_locked_p().
  *
- * - pvh_lock (per pv_head)
- *   This lock protects the pv_entry list which is chained off the pv_head
- *   structure for a specific pv-tracked PA. It is locked when traversing
- *   the list (e.g. adding/removing mappings, syncing R/M bits, etc).
+ * - pmap->pm_lock (per pmap)
+ *   This lock protects the fields in the pmap structure including the
+ *   non-kernel PDEs in the PDP, the PTEs, and the PVE radix tree.  For
+ *   modifying kernel PTEs it is not required as kernel PDEs are never
+ *   freed, and the kernel is expected to be self consistent.
  *
  * - pmaps_lock
  *   This lock protects the list of active pmaps (headed by "pmaps"). We
@@ -254,7 +257,7 @@ pd_entry_t *normal_pdes[3];
 long nkptp[] = NKPTP_INITIALIZER;
 
 struct pmap_head pmaps;
-kmutex_t pmaps_lock;
+kmutex_t pmaps_lock __cacheline_aligned;
 
 struct pcpu_area *pcpuarea __read_mostly;
 
@@ -275,7 +278,7 @@ static bool cpu_pat_enabled __read_mostl
  * Global data structures
  */
 
-static struct pmap kernel_pmap_store;	/* the kernel's pmap (proc0) */
+static struct pmap kernel_pmap_store __cacheline_aligned; /* kernel's pmap */
 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store;
 
 struct bootspace bootspace __read_mostly;
@@ -301,61 +304,6 @@ paddr_t pmap_pa_end;   /* PA of last phy
 
 #define	VM_PAGE_TO_PP(pg)	(&(pg)->mdpage.mp_pp)
 
-#define	PV_HASH_SIZE		32768
-#define	PV_HASH_LOCK_CNT	32
-
-struct pv_hash_lock {
-	kmutex_t lock;
-} __aligned(CACHE_LINE_SIZE) pv_hash_locks[PV_HASH_LOCK_CNT]
-    __aligned(CACHE_LINE_SIZE);
-
-struct pv_hash_head {
-	SLIST_HEAD(, pv_entry) hh_list;
-} pv_hash_heads[PV_HASH_SIZE];
-
-static u_int
-pvhash_hash(struct vm_page *ptp, vaddr_t va)
-{
-
-	return (uintptr_t)ptp / sizeof(*ptp) + (va >> PAGE_SHIFT);
-}
-
-static struct pv_hash_head *
-pvhash_head(u_int hash)
-{
-
-	return &pv_hash_heads[hash % PV_HASH_SIZE];
-}
-
-static kmutex_t *
-pvhash_lock(u_int hash)
-{
-
-	return &pv_hash_locks[hash % PV_HASH_LOCK_CNT].lock;
-}
-
-static struct pv_entry *
-pvhash_remove(struct pv_hash_head *hh, struct vm_page *ptp, vaddr_t va)
-{
-	struct pv_entry *pve;
-	struct pv_entry *prev;
-
-	prev = NULL;
-	SLIST_FOREACH(pve, &hh->hh_list, pve_hash) {
-		if (pve->pve_pte.pte_ptp == ptp &&
-		    pve->pve_pte.pte_va == va) {
-			if (prev != NULL) {
-				SLIST_REMOVE_AFTER(prev, pve_hash);
-			} else {
-				SLIST_REMOVE_HEAD(&hh->hh_list, pve_hash);
-			}
-			break;
-		}
-		prev = pve;
-	}
-	return pve;
-}
-
 /*
  * Other data structures
  */
@@ -384,7 +332,9 @@ paddr_t local_apic_pa __read_mostly;
 /*
  * pool that pmap structures are allocated from
  */
-static struct pool_cache pmap_cache;
+struct pool_cache pmap_cache;
+static int  pmap_ctor(void *, void *, int);
+static void pmap_dtor(void *, void *);
 
 /*
  * pv_entry cache
@@ -411,10 +361,10 @@ static char *early_zerop;
 
 int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int);
 
-/* PDP pool_cache(9) and its callbacks */
-struct pool_cache pmap_pdp_cache;
-static int  pmap_pdp_ctor(void *, void *, int);
-static void pmap_pdp_dtor(void *, void *);
+/* PDP pool and its callbacks */
+static struct pool pmap_pdp_pool;
+static void pmap_pdp_init(pd_entry_t *);
+static void pmap_pdp_fini(pd_entry_t *);
 
 #ifdef PAE
 /* need to allocate items of 4 pages */
@@ -439,6 +389,12 @@ extern paddr_t ldt_paddr;
 extern vaddr_t pentium_idt_vaddr;
 #endif
 
+/* Array of freshly allocated PTPs, for pmap_get_ptp(). */
+struct pmap_ptparray {
+	struct vm_page *pg[PTP_LEVELS + 1];
+	bool alloced[PTP_LEVELS + 1];
+};
+
 /*
  * Local prototypes
  */
@@ -457,8 +413,11 @@ static void pmap_init_lapic(void);
 static void pmap_remap_largepages(void);
 #endif
 
-static int pmap_get_ptp(struct pmap *, vaddr_t,
-    pd_entry_t * const *, int, struct vm_page **);
+static int pmap_get_ptp(struct pmap *, struct pmap_ptparray *, vaddr_t, int,
+    struct vm_page **);
+static void pmap_unget_ptp(struct pmap *, struct pmap_ptparray *);
+static void pmap_install_ptp(struct pmap *, struct pmap_ptparray *, vaddr_t,
+    pd_entry_t * const *);
 static struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, paddr_t, int);
 static void pmap_freepages(struct pmap *, struct vm_page *);
 static void pmap_free_ptp(struct pmap *, struct vm_page *, vaddr_t,
@@ -468,6 +427,10 @@ static bool pmap_remove_pte(struct pmap 
 static void pmap_remove_ptes(struct pmap *, struct vm_page *, vaddr_t, vaddr_t,
     vaddr_t, struct pv_entry **);
 
+static int pmap_pvmap_insert(struct pmap *, vaddr_t, struct pv_entry *);
+static struct pv_entry *pmap_pvmap_lookup(struct pmap *, vaddr_t);
+static void pmap_pvmap_remove(struct pmap *, vaddr_t, struct pv_entry *);
+
 static void pmap_alloc_level(struct pmap *, vaddr_t, long *);
 
 static void pmap_load1(struct lwp *, struct pmap *, struct pmap *);
@@ -477,21 +440,16 @@ static void pmap_reactivate(struct pmap 
  * p m a p   h e l p e r   f u n c t i o n s
  */
 
-static void
+static inline void
 pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff)
 {
 
-	if (pmap == pmap_kernel()) {
-		atomic_add_long(&pmap->pm_stats.resident_count, resid_diff);
-		atomic_add_long(&pmap->pm_stats.wired_count, wired_diff);
-	} else {
-		KASSERT(mutex_owned(&pmap->pm_lock));
-		pmap->pm_stats.resident_count += resid_diff;
-		pmap->pm_stats.wired_count += wired_diff;
-	}
+	KASSERT(cold || mutex_owned(&pmap->pm_lock));
+	pmap->pm_stats.resident_count += resid_diff;
+	pmap->pm_stats.wired_count += wired_diff;
 }
 
-static void
+static inline void
 pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
 {
 	int resid_diff = ((npte & PTE_P) ? 1 : 0) - ((opte & PTE_P) ? 1 : 0);
@@ -549,7 +507,7 @@ pv_pte_first(struct pmap_page *pp)
 	if ((pp->pp_flags & PP_EMBEDDED) != 0) {
 		return &pp->pp_pte;
 	}
-	return pve_to_pvpte(LIST_FIRST(&pp->pp_head.pvh_list));
+	return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist));
 }
 
 static struct pv_pte *
@@ -559,9 +517,8 @@ pv_pte_next(struct pmap_page *pp, struct
 	KASSERT(pvpte != NULL);
 	if (pvpte == &pp->pp_pte) {
 		KASSERT((pp->pp_flags & PP_EMBEDDED) != 0);
-		return NULL;
+		return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist));
 	}
-	KASSERT((pp->pp_flags & PP_EMBEDDED) == 0);
 	return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list));
 }
 
@@ -601,8 +558,9 @@ pmap_reference(struct pmap *pmap)
  *	- vm_map_pmap(&curproc->p_vmspace->vm_map)
  *		current process' pmap.
  *
- * => we lock enough pmaps to keep things locked in
+ * => caller must lock pmap first (if not the kernel pmap)
  * => must be undone with pmap_unmap_ptes before returning
+ * => disables kernel preemption
  */
 void
 pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2, pd_entry_t **ptepp,
@@ -612,6 +570,8 @@ pmap_map_ptes(struct pmap *pmap, struct 
 	struct cpu_info *ci;
 	lwp_t *l;
 
+	kpreempt_disable();
+
 	/* The kernel's pmap is always accessible. */
 	if (pmap == pmap_kernel()) {
 		*pmap2 = NULL;
@@ -619,39 +579,35 @@ pmap_map_ptes(struct pmap *pmap, struct 
 		*pdeppp = normal_pdes;
 		return;
 	}
-	KASSERT(kpreempt_disabled());
+
+	KASSERT(mutex_owned(&pmap->pm_lock));
 
 	l = curlwp;
- retry:
-	mutex_enter(&pmap->pm_lock);
-	ci = curcpu();
+	ci = l->l_cpu;
 	curpmap = ci->ci_pmap;
-	if (vm_map_pmap(&l->l_proc->p_vmspace->vm_map) == pmap) {
-		/* Our own pmap so just load it: easy. */
-		if (__predict_false(ci->ci_want_pmapload)) {
-			mutex_exit(&pmap->pm_lock);
-			pmap_load();
-			goto retry;
-		}
-		KASSERT(pmap == curpmap);
-	} else if (pmap == curpmap) {
+	if (pmap == curpmap) {
 		/*
 		 * Already on the CPU: make it valid.  This is very
 		 * often the case during exit(), when we have switched
 		 * to the kernel pmap in order to destroy a user pmap.
 		 */
 		pmap_reactivate(pmap);
+		*pmap2 = NULL;
 	} else {
 		/*
 		 * Toss current pmap from CPU and install new pmap, but keep
-		 * a reference to the old one - dropping the reference can
-		 * block, so we'll defer to pmap_unmap_ptes().
+		 * a reference to the old one.  Dropping the reference can
+		 * can block as it needs to take locks, so defer that to
+		 * pmap_unmap_ptes().
 		 */
 		pmap_reference(pmap);
 		pmap_load1(l, pmap, curpmap);
+		*pmap2 = curpmap;
 	}
+	KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
+#ifdef DIAGNOSTIC
 	pmap->pm_ncsw = lwp_pctr();
-	*pmap2 = curpmap;
+#endif
 	*ptepp = PTE_BASE;
 
 #if defined(XENPV) && defined(__x86_64__)
@@ -665,68 +621,52 @@ pmap_map_ptes(struct pmap *pmap, struct 
 
 /*
  * pmap_unmap_ptes: unlock the PTE mapping of "pmap"
+ *
+ * => we cannot tolerate context switches while mapped in: assert this.
+ * => reenables kernel preemption.
+ * => does not unlock pmap.
  */
 void
-pmap_unmap_ptes(struct pmap *pmap, struct pmap *pmap2,
-    struct vm_page *ptp_tofree)
+pmap_unmap_ptes(struct pmap *pmap, struct pmap * pmap2)
 {
 	struct cpu_info *ci;
 	struct pmap *mypmap;
+	struct lwp *l;
 
 	KASSERT(kpreempt_disabled());
 
 	/* The kernel's pmap is always accessible. */
 	if (pmap == pmap_kernel()) {
-		KASSERT(ptp_tofree == NULL);
+		kpreempt_enable();
 		return;
 	}
 
-	ci = curcpu();
+	l = curlwp;
+	ci = l->l_cpu;
+
+	KASSERT(mutex_owned(&pmap->pm_lock));
+	KASSERT(pmap->pm_ncsw == lwp_pctr());
 
 #if defined(XENPV) && defined(__x86_64__)
 	KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] != L4_BASE);
 	ci->ci_normal_pdes[PTP_LEVELS - 2] = L4_BASE;
 #endif
 
-	/*
-	 * We cannot tolerate context switches while mapped in.
-	 * If it is our own pmap all we have to do is unlock.
-	 */
-	KASSERT(pmap->pm_ncsw == lwp_pctr());
-	mypmap = vm_map_pmap(&curproc->p_vmspace->vm_map);
-	if (pmap == mypmap) {
-		/* Now safe to free PTPs, with the pmap still locked. */
-		if (ptp_tofree != NULL) {
-			pmap_freepages(pmap, ptp_tofree);
-		}
-		mutex_exit(&pmap->pm_lock);
-		return;
-	}
-
-	/*
-	 * Mark whatever's on the CPU now as lazy and unlock.
-	 * If the pmap was already installed, we are done.
-	 */
-	if (ci->ci_pmap != mypmap) {
+	/* If not our own pmap, mark whatever's on the CPU now as lazy. */
+	KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
+	mypmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
+	if (ci->ci_pmap == vm_map_pmap(&l->l_proc->p_vmspace->vm_map)) {
+		ci->ci_want_pmapload = 0;
+	} else {
 		ci->ci_want_pmapload = (mypmap != pmap_kernel());
-		if (ci->ci_tlbstate == TLBSTATE_VALID) {
-			ci->ci_tlbstate = TLBSTATE_LAZY;
-		} else {
-			/*
-			 * This can happen when undoing after pmap_get_ptp
-			 * blocked.
-			 */
-		}
+		ci->ci_tlbstate = TLBSTATE_LAZY;
 	}
 
-	/* Now safe to free PTPs, with the pmap still locked. */
-	if (ptp_tofree != NULL) {
-		pmap_freepages(pmap, ptp_tofree);
-	}
-	mutex_exit(&pmap->pm_lock);
+	/* Now safe to re-enable preemption. */
+	kpreempt_enable();
 
-	/* Toss the pmap we evicted earlier (can block). */
-	if (pmap != pmap2) {
+	/* Toss reference to other pmap taken earlier. */
+	if (pmap2 != NULL) {
 		pmap_destroy(pmap2);
 	}
 }
@@ -1239,11 +1179,7 @@ pmap_bootstrap(vaddr_t kva_start)
 	virtual_avail = reserve_dumppages(virtual_avail);
 
 	/*
-	 * Init the static-global locks and global lists.
-	 *
-	 * => pventry::pvh_lock (initialized elsewhere) must also be
-	 *    a spin lock, again at IPL_VM to prevent deadlock, and
-	 *    again is never taken from interrupt context.
+	 * Init the global lock and global list.
 	 */
 	mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE);
 	LIST_INIT(&pmaps);
@@ -1709,21 +1645,14 @@ pmap_remap_largepages(void)
 void
 pmap_init(void)
 {
-	int i, flags;
-
-	for (i = 0; i < PV_HASH_SIZE; i++) {
-		SLIST_INIT(&pv_hash_heads[i].hh_list);
-	}
-	for (i = 0; i < PV_HASH_LOCK_CNT; i++) {
-		mutex_init(&pv_hash_locks[i].lock, MUTEX_NODEBUG, IPL_VM);
-	}
+	int flags;
 
 	/*
 	 * initialize caches.
 	 */
 
-	pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), 0, 0, 0,
-	    "pmappl", NULL, IPL_NONE, NULL, NULL, NULL);
+	pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), COHERENCY_UNIT,
+	    0, 0, "pmappl", NULL, IPL_NONE, pmap_ctor, pmap_dtor, NULL);
 
 #ifdef XENPV
 	/*
@@ -1736,12 +1665,11 @@ pmap_init(void)
 #endif
 
 #ifdef PAE
-	pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE * PDP_SIZE, 0, 0, flags,
-	    "pdppl", &pmap_pdp_allocator, IPL_NONE,
-	    pmap_pdp_ctor, pmap_pdp_dtor, NULL);
+	pool_init(&pmap_pdp_pool, PAGE_SIZE * PDP_SIZE, 0, 0, flags,
+	    "pdppl", &pmap_pdp_allocator, IPL_NONE);
 #else
-	pool_cache_bootstrap(&pmap_pdp_cache, PAGE_SIZE, 0, 0, flags,
-	    "pdppl", NULL, IPL_NONE, pmap_pdp_ctor, pmap_pdp_dtor, NULL);
+	pool_init(&pmap_pdp_pool, PAGE_SIZE, 0, 0, flags,
+	    "pdppl", NULL, IPL_NONE);
 #endif
 	pool_cache_bootstrap(&pmap_pv_cache, sizeof(struct pv_entry), 0, 0,
 	    PR_LARGECACHE, "pvpl", &pool_allocator_kmem, IPL_NONE, NULL,
@@ -1757,6 +1685,8 @@ pmap_init(void)
 	evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC,
 	    NULL, "x86", "ldt sync");
 
+	radix_tree_init_tree(&pmap_kernel()->pm_pvtree);
+
 	/*
 	 * done: pmap module is up (and ready for business)
 	 */
@@ -1838,99 +1768,131 @@ pmap_vpage_cpu_init(struct cpu_info *ci)
  * p v _ e n t r y   f u n c t i o n s
  */
 
+
+/*
+ * pmap_pp_needs_pve: return true if we need to allocate a pv entry and
+ * corresponding radix tree entry for the page.
+ */
 static bool
-pmap_pp_needs_pve(struct pmap_page *pp)
+pmap_pp_needs_pve(struct pmap_page *pp, struct vm_page *ptp, vaddr_t va)
 {
 
 	/*
 	 * Adding a pv entry for this page only needs to allocate a pv_entry
-	 * structure if the page already has at least one pv entry,
-	 * since the first pv entry is stored in the pmap_page.
+	 * structure if the page already has at least one pv entry, since
+	 * the first pv entry is stored in the pmap_page.  However, because
+	 * of subsequent removal(s), PP_EMBEDDED can be false and there can
+	 * still be pv entries on the list.
 	 */
 
-	return pp && ((pp->pp_flags & PP_EMBEDDED) != 0 ||
-	    !LIST_EMPTY(&pp->pp_head.pvh_list));
+	if (pp == NULL || (pp->pp_flags & PP_EMBEDDED) == 0) {
+		return false;
+	}
+	return pp->pp_pte.pte_ptp != ptp || pp->pp_pte.pte_va != va;
 }
 
+/*
+ * pmap_free_pvs: free a linked list of pv entries.  the pv entries have
+ * been removed from their respective pages, but are still entered into the
+ * map and we must undo that.
+ *
+ * => must be called with pmap locked.
+ */
 static void
-pmap_free_pvs(struct pv_entry *pve)
+pmap_free_pvs(struct pmap *pmap, struct pv_entry *pve)
 {
 	struct pv_entry *next;
 
+	KASSERT(mutex_owned(&pmap->pm_lock));
+
 	for ( /* null */ ; pve != NULL ; pve = next) {
+		pmap_pvmap_remove(pmap, pve->pve_pte.pte_va, pve);
 		next = pve->pve_next;
 		pool_cache_put(&pmap_pv_cache, pve);
 	}
 }
 
 /*
- * main pv_entry manipulation functions:
- *   pmap_enter_pv: enter a mapping onto a pv_head list
- *   pmap_remove_pv: remove a mapping from a pv_head list
+ * pmap_pvmap_lookup: look up a non-PP_EMBEDDED pv entry for the given pmap
+ *
+ * => pmap must be locked
+ */
+
+static struct pv_entry *
+pmap_pvmap_lookup(struct pmap *pmap, vaddr_t va)
+{
+
+	KASSERT(mutex_owned(&pmap->pm_lock));
+
+	return radix_tree_lookup_node(&pmap->pm_pvtree, va >> PAGE_SHIFT);
+}
+
+/*
+ * pmap_pvmap_insert: insert a non-PP_EMBEDDED pv entry for the given pmap
  *
- * NOTE: Both pmap_enter_pv and pmap_remove_pv expect the caller to lock
- *       the pvh before calling
+ * => pmap must be locked
+ * => an error can be returned
  */
 
+static int
+pmap_pvmap_insert(struct pmap *pmap, vaddr_t va, struct pv_entry *pve)
+{
+
+	KASSERT(mutex_owned(&pmap->pm_lock));
+	KASSERT(pmap_pvmap_lookup(pmap, va) == NULL);
+
+	return radix_tree_insert_node(&pmap->pm_pvtree, va >> PAGE_SHIFT, pve);
+}
+
 /*
- * insert_pv: a helper of pmap_enter_pv
+ * pmap_pvmap_remove: remove a non-PP_EMBEDDED pv entry for the given pmap
+ *
+ * => pmap must be locked
  */
+
 static void
-insert_pv(struct pmap_page *pp, struct pv_entry *pve)
+pmap_pvmap_remove(struct pmap *pmap, vaddr_t va, struct pv_entry *pve)
 {
-	struct pv_hash_head *hh;
-	kmutex_t *lock;
-	u_int hash;
+	struct pv_entry *pve2 __diagused;
+
+	KASSERT(mutex_owned(&pmap->pm_lock));
 
-	hash = pvhash_hash(pve->pve_pte.pte_ptp, pve->pve_pte.pte_va);
-	lock = pvhash_lock(hash);
-	hh = pvhash_head(hash);
-	mutex_spin_enter(lock);
-	SLIST_INSERT_HEAD(&hh->hh_list, pve, pve_hash);
-	mutex_spin_exit(lock);
+	pve2 = radix_tree_remove_node(&pmap->pm_pvtree, va >> PAGE_SHIFT);
 
-	LIST_INSERT_HEAD(&pp->pp_head.pvh_list, pve, pve_list);
+	KASSERT(pve2 == pve);
 }
 
 /*
- * pmap_enter_pv: enter a mapping onto a pv_head lst
+ * pmap_enter_pv: enter a mapping onto a pmap_page list
  *
  * => caller should adjust ptp's wire_count before calling
- * => caller has preallocated pve and *sparepve for us
+ * => caller has preallocated pve for us
+ * => if not embedded, tree node must be in place beforehand
  */
 static struct pv_entry *
-pmap_enter_pv(struct pmap_page *pp, struct pv_entry *pve,
-    struct pv_entry **sparepve, struct vm_page *ptp, vaddr_t va)
+pmap_enter_pv(struct pmap *pmap, struct pmap_page *pp, struct pv_entry *pve,
+    struct vm_page *ptp, vaddr_t va)
 {
 
+	KASSERT(mutex_owned(&pmap->pm_lock));
+	KASSERT(ptp_to_pmap(ptp) == pmap);
 	KASSERT(ptp == NULL || ptp->wire_count >= 2);
 	KASSERT(ptp == NULL || ptp->uobject != NULL);
 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
 
 	if ((pp->pp_flags & PP_EMBEDDED) == 0) {
-		if (LIST_EMPTY(&pp->pp_head.pvh_list)) {
-			pp->pp_flags |= PP_EMBEDDED;
-			pp->pp_pte.pte_ptp = ptp;
-			pp->pp_pte.pte_va = va;
-
-			return pve;
-		}
-	} else {
-		struct pv_entry *pve2;
-
-		pve2 = *sparepve;
-		*sparepve = NULL;
-
-		pve2->pve_pte = pp->pp_pte;
-		pp->pp_flags &= ~PP_EMBEDDED;
-		LIST_INIT(&pp->pp_head.pvh_list);
-		insert_pv(pp, pve2);
+		pp->pp_flags |= PP_EMBEDDED;
+		pp->pp_pte.pte_ptp = ptp;
+		pp->pp_pte.pte_va = va;
+		return pve;
 	}
 
+	KASSERT(pve != NULL);
 	pve->pve_pte.pte_ptp = ptp;
 	pve->pve_pte.pte_va = va;
-	insert_pv(pp, pve);
-
+	KASSERT(pmap_pvmap_lookup(pmap, va) != NULL);
+	KASSERT(pmap_pvmap_lookup(pmap, va) == pve);
+	LIST_INSERT_HEAD(&pp->pp_pvlist, pve, pve_list);
 	return NULL;
 }
 
@@ -1938,38 +1900,39 @@ pmap_enter_pv(struct pmap_page *pp, stru
  * pmap_remove_pv: try to remove a mapping from a pv_list
  *
  * => caller should adjust ptp's wire_count and free PTP if needed
+ * => we don't remove radix tree entry; defer till later (it could block)
  * => we return the removed pve
+ * => caller can optionally supply pve, if looked up already
  */
 static struct pv_entry *
-pmap_remove_pv(struct pmap_page *pp, struct vm_page *ptp, vaddr_t va)
+pmap_remove_pv(struct pmap *pmap, struct pmap_page *pp, struct vm_page *ptp,
+    vaddr_t va, struct pv_entry *pve)
 {
-	struct pv_hash_head *hh;
-	struct pv_entry *pve;
-	kmutex_t *lock;
-	u_int hash;
 
+	KASSERT(mutex_owned(&pmap->pm_lock));
+	KASSERT(ptp_to_pmap(ptp) == pmap);
 	KASSERT(ptp == NULL || ptp->uobject != NULL);
 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
 
-	if ((pp->pp_flags & PP_EMBEDDED) != 0) {
-		KASSERT(pp->pp_pte.pte_ptp == ptp);
-		KASSERT(pp->pp_pte.pte_va == va);
-
+	if ((pp->pp_flags & PP_EMBEDDED) != 0 &&
+	    pp->pp_pte.pte_ptp == ptp &&
+	    pp->pp_pte.pte_va == va) {
+	    	KASSERT(pve == NULL);
 		pp->pp_flags &= ~PP_EMBEDDED;
-		LIST_INIT(&pp->pp_head.pvh_list);
-
+		pp->pp_pte.pte_ptp = NULL;
+		pp->pp_pte.pte_va = 0;
 		return NULL;
 	}
 
-	hash = pvhash_hash(ptp, va);
-	lock = pvhash_lock(hash);
-	hh = pvhash_head(hash);
-	mutex_spin_enter(lock);
-	pve = pvhash_remove(hh, ptp, va);
-	mutex_spin_exit(lock);
-
+	if (pve == NULL) {
+		pve = pmap_pvmap_lookup(pmap, va);
+		KASSERT(pve != NULL);
+	} else {
+		KASSERT(pve == pmap_pvmap_lookup(pmap, va));
+	}
+	KASSERT(pve->pve_pte.pte_ptp == ptp);
+	KASSERT(pve->pve_pte.pte_va == va);
 	LIST_REMOVE(pve, pve_list);
-
 	return pve;
 }
 
@@ -2079,56 +2042,69 @@ pmap_free_ptp(struct pmap *pmap, struct 
  *
  * => pmap should NOT be pmap_kernel()
  * => pmap should be locked
- * => preemption should be disabled
+ * => we are not touching any PTEs yet, so they need not be mapped in
  */
 static int
-pmap_get_ptp(struct pmap *pmap, vaddr_t va, pd_entry_t * const *pdes, int flags,
-	     struct vm_page **resultp)
+pmap_get_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va,
+    int flags, struct vm_page **resultp)
 {
 	struct vm_page *ptp;
-	struct {
-		struct vm_page *pg;
-		bool new;
-	} pt[PTP_LEVELS + 1];
-	int i, aflags, error;
-	unsigned long index;
-	pd_entry_t *pva;
-	paddr_t pa;
+	int i, aflags;
 	struct uvm_object *obj;
 	voff_t off;
-	uint64_t ncsw;
 
 	KASSERT(pmap != pmap_kernel());
 	KASSERT(mutex_owned(&pmap->pm_lock));
-	KASSERT(kpreempt_disabled());
 
 	/*
 	 * Loop through all page table levels allocating a page
 	 * for any level where we don't already have one.
 	 */
-	memset(pt, 0, sizeof(pt));
+	memset(pt, 0, sizeof(*pt));
 	aflags = ((flags & PMAP_CANFAIL) ? 0 : UVM_PGA_USERESERVE) |
 		UVM_PGA_ZERO;
 	for (i = PTP_LEVELS; i > 1; i--) {
 		obj = &pmap->pm_obj[i - 2];
 		off = ptp_va2o(va, i - 1);
 
-		pt[i].pg = uvm_pagelookup(obj, off);
-		if (pt[i].pg == NULL) {
-			ncsw = lwp_pctr();
-			pt[i].pg = uvm_pagealloc(obj, off, NULL, aflags);
-			pt[i].new = true;
-			if (__predict_false(ncsw != lwp_pctr())) {
-				/* uvm_pagealloc can block. */
-				error = EAGAIN;
-				goto fail;
-			}
+		pt->pg[i] = uvm_pagelookup(obj, off);
+		if (pt->pg[i] == NULL) {
+			pt->pg[i] = uvm_pagealloc(obj, off, NULL, aflags);
+			pt->alloced[i] = true;
 		}
-		if (pt[i].pg == NULL) {
-			error = ENOMEM;
-			goto fail;
+		if (pt->pg[i] == NULL) {
+			pmap_unget_ptp(pmap, pt);
+			return ENOMEM;
 		}
 	}
+	ptp = pt->pg[2];
+	KASSERT(ptp != NULL);
+	*resultp = ptp;
+	pmap->pm_ptphint[0] = ptp;
+	return 0;
+}
+
+/*
+ * pmap_install_ptp: install any freshly allocated PTPs
+ *
+ * => pmap should NOT be pmap_kernel()
+ * => pmap should be locked
+ * => PTEs must be mapped
+ * => preemption must be disabled
+ */
+static void
+pmap_install_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va,
+    pd_entry_t * const *pdes)
+{
+	struct vm_page *ptp;
+	unsigned long index;
+	pd_entry_t *pva;
+	paddr_t pa;
+	int i;
+
+	KASSERT(pmap != pmap_kernel());
+	KASSERT(mutex_owned(&pmap->pm_lock));
+	KASSERT(kpreempt_disabled());
 
 	/*
 	 * Now that we have all the pages looked up or allocated,
@@ -2139,11 +2115,11 @@ pmap_get_ptp(struct pmap *pmap, vaddr_t 
 		pva = pdes[i - 2];
 
 		if (pmap_valid_entry(pva[index])) {
-			KASSERT(!pt[i].new);
+			KASSERT(!pt->alloced[i]);
 			continue;
 		}
 
-		ptp = pt[i].pg;
+		ptp = pt->pg[i];
 		ptp->flags &= ~PG_BUSY; /* never busy */
 		ptp->wire_count = 1;
 		pmap->pm_ptphint[i - 2] = ptp;
@@ -2173,31 +2149,36 @@ pmap_get_ptp(struct pmap *pmap, vaddr_t 
 		 * wire count of the parent page.
 		 */
 		if (i < PTP_LEVELS) {
-			pt[i + 1].pg->wire_count++;
+			pt->pg[i + 1]->wire_count++;
 		}
 	}
-	ptp = pt[2].pg;
-	KASSERT(ptp != NULL);
-	pmap->pm_ptphint[0] = ptp;
-	*resultp = ptp;
-	return 0;
+}
+
+/*
+ * pmap_unget_ptp: free unused PTPs
+ *
+ * => pmap should NOT be pmap_kernel()
+ * => pmap should be locked
+ */
+static void
+pmap_unget_ptp(struct pmap *pmap, struct pmap_ptparray *pt)
+{
+	int i;
+
+	KASSERT(pmap != pmap_kernel());
+	KASSERT(mutex_owned(&pmap->pm_lock));
 
-	/*
-	 * Allocation of a PTP failed, free any others that we just allocated.
-	 */
-fail:
 	for (i = PTP_LEVELS; i > 1; i--) {
-		if (pt[i].pg == NULL) {
+		if (pt->pg[i] == NULL) {
 			break;
 		}
-		if (!pt[i].new) {
+		if (!pt->alloced[i]) {
 			continue;
 		}
-		uvm_pagefree(pt[i].pg);
+		uvm_pagefree(pt->pg[i]);
+		pt->pg[i] = NULL;
+		pmap->pm_ptphint[0] = NULL;
 	}
-	/* XXX silence assertion in pmap_unmap_ptes */
-	pmap->pm_ncsw = lwp_pctr();
-	return error;
 }
 
 /*
@@ -2205,12 +2186,11 @@ fail:
  */
 
 /*
- * pmap_pdp_ctor: constructor for the PDP cache.
+ * pmap_pdp_init: construct a new PDP.
  */
-static int
-pmap_pdp_ctor(void *arg, void *v, int flags)
+static void
+pmap_pdp_init(pd_entry_t *pdir)
 {
-	pd_entry_t *pdir = v;
 	paddr_t pdirpa = 0;
 	vaddr_t object;
 	int i;
@@ -2225,7 +2205,8 @@ pmap_pdp_ctor(void *arg, void *v, int fl
 	memset(pdir, 0, PDP_SIZE * PAGE_SIZE);
 
 	/*
-	 * NOTE: The `pmaps_lock' is held when the PDP is allocated.
+	 * NOTE: This is all done unlocked, but we will check afterwards
+	 * if we have raced with pmap_growkernel().
 	 */
 
 #if defined(XENPV) && defined(__x86_64__)
@@ -2248,7 +2229,7 @@ pmap_pdp_ctor(void *arg, void *v, int fl
 	pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] =
 	     (pd_entry_t)-1 & PTE_FRAME;
 #else /* XENPV && __x86_64__*/
-	object = (vaddr_t)v;
+	object = (vaddr_t)pdir;
 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
 		/* Fetch the physical address of the page directory */
 		(void)pmap_extract(pmap_kernel(), object, &pdirpa);
@@ -2288,7 +2269,7 @@ pmap_pdp_ctor(void *arg, void *v, int fl
 
 #ifdef XENPV
 	s = splvm();
-	object = (vaddr_t)v;
+	object = (vaddr_t)pdir;
 	pmap_protect(pmap_kernel(), object, object + (PAGE_SIZE * PDP_SIZE),
 	    VM_PROT_READ);
 	pmap_update(pmap_kernel());
@@ -2316,19 +2297,17 @@ pmap_pdp_ctor(void *arg, void *v, int fl
 #endif
 	splx(s);
 #endif /* XENPV */
-
-	return 0;
 }
 
 /*
- * pmap_pdp_dtor: destructor for the PDP cache.
+ * pmap_pdp_fini: destructor for the PDPs.
  */
 static void
-pmap_pdp_dtor(void *arg, void *v)
+pmap_pdp_fini(pd_entry_t *pdir)
 {
 #ifdef XENPV
 	paddr_t pdirpa = 0;	/* XXX: GCC */
-	vaddr_t object = (vaddr_t)v;
+	vaddr_t object = (vaddr_t)pdir;
 	int i;
 	int s = splvm();
 	pt_entry_t *pte;
@@ -2339,7 +2318,7 @@ pmap_pdp_dtor(void *arg, void *v)
 		/* unpin page table */
 		xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa));
 	}
-	object = (vaddr_t)v;
+	object = (vaddr_t)pdir;
 	for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
 		/* Set page RW again */
 		pte = kvtopte(object);
@@ -2369,6 +2348,72 @@ pmap_pdp_free(struct pool *pp, void *v)
 #endif /* PAE */
 
 /*
+ * pmap_ctor: constructor for the pmap cache.
+ */
+static int
+pmap_ctor(void *arg, void *obj, int flags)
+{
+	struct pmap *pmap = obj;
+	pt_entry_t p;
+	int i;
+
+	KASSERT((flags & PR_WAITOK) != 0);
+
+	mutex_init(&pmap->pm_lock, MUTEX_DEFAULT, IPL_NONE);
+	radix_tree_init_tree(&pmap->pm_pvtree);
+	kcpuset_create(&pmap->pm_cpus, true);
+	kcpuset_create(&pmap->pm_kernel_cpus, true);
+#ifdef XENPV
+	kcpuset_create(&pmap->pm_xen_ptp_cpus, true);
+#endif
+
+	/* allocate and init PDP */
+	pmap->pm_pdir = pool_get(&pmap_pdp_pool, PR_WAITOK);
+
+	for (;;) {
+		pmap_pdp_init(pmap->pm_pdir);
+		mutex_enter(&pmaps_lock);
+		p = pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1];
+		if (__predict_true(p != 0)) {
+			break;
+		}
+		mutex_exit(&pmaps_lock);
+	}
+
+	for (i = 0; i < PDP_SIZE; i++)
+		pmap->pm_pdirpa[i] =
+		    pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]);
+
+	LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
+	mutex_exit(&pmaps_lock);
+
+	return 0;
+}
+
+/*
+ * pmap_dtor: destructor for the pmap cache.
+ */
+static void
+pmap_dtor(void *arg, void *obj)
+{
+	struct pmap *pmap = obj;
+
+	mutex_enter(&pmaps_lock);
+	LIST_REMOVE(pmap, pm_list);
+	mutex_exit(&pmaps_lock);
+
+	pmap_pdp_fini(pmap->pm_pdir);
+	pool_put(&pmap_pdp_pool, pmap->pm_pdir);
+	radix_tree_fini_tree(&pmap->pm_pvtree);
+	mutex_destroy(&pmap->pm_lock);
+	kcpuset_destroy(pmap->pm_cpus);
+	kcpuset_destroy(pmap->pm_kernel_cpus);
+#ifdef XENPV
+	kcpuset_destroy(pmap->pm_xen_ptp_cpus);
+#endif
+}
+
+/*
  * pmap_create: create a pmap object.
  */
 struct pmap *
@@ -2380,7 +2425,6 @@ pmap_create(void)
 	pmap = pool_cache_get(&pmap_cache, PR_WAITOK);
 
 	/* init uvm_object */
-	mutex_init(&pmap->pm_lock, MUTEX_DEFAULT, IPL_NONE);
 	for (i = 0; i < PTP_LEVELS - 1; i++) {
 		uvm_obj_init(&pmap->pm_obj[i], NULL, false, 1);
 		uvm_obj_setlock(&pmap->pm_obj[i], &pmap->pm_lock);
@@ -2406,36 +2450,11 @@ pmap_create(void)
 	pmap->pm_tlb_flush = NULL;
 	pmap->pm_data = NULL;
 
-	kcpuset_create(&pmap->pm_cpus, true);
-	kcpuset_create(&pmap->pm_kernel_cpus, true);
-#ifdef XENPV
-	kcpuset_create(&pmap->pm_xen_ptp_cpus, true);
-#endif
 	/* init the LDT */
 	pmap->pm_ldt = NULL;
 	pmap->pm_ldt_len = 0;
 	pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
 
-	/* allocate PDP */
- try_again:
-	pmap->pm_pdir = pool_cache_get(&pmap_pdp_cache, PR_WAITOK);
-
-	mutex_enter(&pmaps_lock);
-
-	if (pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] == 0) {
-		mutex_exit(&pmaps_lock);
-		pool_cache_destruct_object(&pmap_pdp_cache, pmap->pm_pdir);
-		goto try_again;
-	}
-
-	for (i = 0; i < PDP_SIZE; i++)
-		pmap->pm_pdirpa[i] =
-		    pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]);
-
-	LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
-
-	mutex_exit(&pmaps_lock);
-
 	return (pmap);
 }
 
@@ -2451,7 +2470,7 @@ pmap_free_ptps(struct vm_page *empty_ptp
 	while ((ptp = empty_ptps) != NULL) {
 		pp = VM_PAGE_TO_PP(ptp);
 		empty_ptps = pp->pp_link;
-		LIST_INIT(&pp->pp_head.pvh_list);
+		LIST_INIT(&pp->pp_pvlist);
 		uvm_pagefree(ptp);
 	}
 }
@@ -2503,6 +2522,9 @@ pmap_check_inuse(struct pmap *pmap)
 /*
  * pmap_destroy: drop reference count on pmap.   free pmap if
  * reference count goes to zero.
+ *
+ * => we can be called from pmap_unmap_ptes() with a different, unrelated
+ *    pmap's lock held.  be careful!
  */
 void
 pmap_destroy(struct pmap *pmap)
@@ -2536,13 +2558,8 @@ pmap_destroy(struct pmap *pmap)
 
 	/*
 	 * Reference count is zero, free pmap resources and then free pmap.
-	 * First, remove it from global list of pmaps.
 	 */
 
-	mutex_enter(&pmaps_lock);
-	LIST_REMOVE(pmap, pm_list);
-	mutex_exit(&pmaps_lock);
-
 	/*
 	 * Process deferred PTP frees.  No TLB shootdown required, as the
 	 * PTP pages are no longer visible to any CPU.
@@ -2550,12 +2567,6 @@ pmap_destroy(struct pmap *pmap)
 
 	pmap_free_ptps(pmap->pm_gc_ptp);
 
-	if (__predict_false(pmap->pm_enter != NULL)) {
-		pool_cache_destruct_object(&pmap_pdp_cache, pmap->pm_pdir);
-	} else {
-		pool_cache_put(&pmap_pdp_cache, pmap->pm_pdir);
-	}
-
 #ifdef USER_LDT
 	if (pmap->pm_ldt != NULL) {
 		/*
@@ -2576,15 +2587,20 @@ pmap_destroy(struct pmap *pmap)
 	for (i = 0; i < PTP_LEVELS - 1; i++) {
 		uvm_obj_destroy(&pmap->pm_obj[i], false);
 	}
-	mutex_destroy(&pmap->pm_lock);
-	kcpuset_destroy(pmap->pm_cpus);
-	kcpuset_destroy(pmap->pm_kernel_cpus);
+	kcpuset_zero(pmap->pm_cpus);
+	kcpuset_zero(pmap->pm_kernel_cpus);
 #ifdef XENPV
-	kcpuset_destroy(pmap->pm_xen_ptp_cpus);
+	kcpuset_zero(pmap->pm_xen_ptp_cpus);
 #endif
 
+	KASSERT(radix_tree_empty_tree_p(&pmap->pm_pvtree));
 	pmap_check_ptps(pmap);
-	pool_cache_put(&pmap_cache, pmap);
+	if (__predict_false(pmap->pm_enter != NULL)) {
+		/* XXX make this a different cache */
+		pool_cache_destruct_object(&pmap_cache, pmap);
+	} else {
+		pool_cache_put(&pmap_cache, pmap);
+	}
 }
 
 /*
@@ -3114,7 +3130,6 @@ pmap_extract(struct pmap *pmap, vaddr_t 
 	pa = 0;
 	l = curlwp;
 
-	kpreempt_disable();
 	ci = l->l_cpu;
 	if (pmap == pmap_kernel() ||
 	    __predict_true(!ci->ci_want_pmapload && ci->ci_pmap == pmap)) {
@@ -3129,9 +3144,11 @@ pmap_extract(struct pmap *pmap, vaddr_t 
 		hard = false;
 		ptes = PTE_BASE;
 		pdes = normal_pdes;
+		kpreempt_disable();
 	} else {
 		/* we lose, do it the hard way. */
 		hard = true;
+		mutex_enter(&pmap->pm_lock);
 		pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
 	}
 	if (pmap_pdes_valid(va, pdes, &pde, &lvl)) {
@@ -3148,9 +3165,11 @@ pmap_extract(struct pmap *pmap, vaddr_t 
 		}
 	}
 	if (__predict_false(hard)) {
-		pmap_unmap_ptes(pmap, pmap2, NULL);
+		pmap_unmap_ptes(pmap, pmap2);
+		mutex_exit(&pmap->pm_lock);
+	} else {
+		kpreempt_enable();
 	}
-	kpreempt_enable();
 	if (pap != NULL) {
 		*pap = pa;
 	}
@@ -3423,7 +3442,7 @@ pmap_remove_ptes(struct pmap *pmap, stru
 {
 	pt_entry_t *pte = (pt_entry_t *)ptpva;
 
-	KASSERT(pmap == pmap_kernel() || mutex_owned(&pmap->pm_lock));
+	KASSERT(mutex_owned(&pmap->pm_lock));
 	KASSERT(kpreempt_disabled());
 
 	/*
@@ -3485,7 +3504,7 @@ pmap_remove_pte(struct pmap *pmap, struc
 	struct pmap_page *pp;
 	pt_entry_t opte;
 
-	KASSERT(pmap == pmap_kernel() || mutex_owned(&pmap->pm_lock));
+	KASSERT(mutex_owned(&pmap->pm_lock));
 	KASSERT(kpreempt_disabled());
 
 	if (!pmap_valid_entry(*pte)) {
@@ -3517,7 +3536,7 @@ pmap_remove_pte(struct pmap *pmap, struc
 	}
 
 	/*
-	 * If we are not on a pv_head list - we are done.
+	 * If we are not on a pv list - we are done.
 	 */
 	if ((opte & PTE_PVLIST) == 0) {
 #ifndef DOM0OPS
@@ -3541,7 +3560,7 @@ pmap_remove_pte(struct pmap *pmap, struc
 
 	/* Sync R/M bits. */
 	pp->pp_attrs |= pmap_pte_to_pp_attrs(opte);
-	pve = pmap_remove_pv(pp, ptp, va);
+	pve = pmap_remove_pv(pmap, pp, ptp, va, NULL);
 
 	if (pve) {
 		pve->pve_next = *pv_tofree;
@@ -3575,8 +3594,8 @@ pmap_remove(struct pmap *pmap, vaddr_t s
 		return;
 	}
 
-	kpreempt_disable();
-	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
+	mutex_enter(&pmap->pm_lock);
+	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
 
 	/*
 	 * removing one page?  take shortcut function.
@@ -3651,17 +3670,30 @@ pmap_remove(struct pmap *pmap, vaddr_t s
 		}
 		pmap_free_ptp(pmap, ptp, va, ptes, pdes, &ptp_tofree);
 		if (ptp_tofree != NULL) {
-			pmap_unmap_ptes(pmap, pmap2, ptp_tofree);
+			pmap_unmap_ptes(pmap, pmap2);
+			/* Now safe to free, with the pmap still locked. */
+			pmap_freepages(pmap, ptp_tofree);
 			ptp_tofree = NULL;
+			if (pv_tofree != NULL) {
+				pmap_free_pvs(pmap, pv_tofree);
+				pv_tofree = NULL;
+			}
 			pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
 		}
 	}
-	pmap_unmap_ptes(pmap, pmap2, ptp_tofree);	/* unlock pmap */
-	kpreempt_enable();
-
-	/* Now we free unused PVs */
-	if (pv_tofree)
-		pmap_free_pvs(pv_tofree);
+	pmap_unmap_ptes(pmap, pmap2);
+	/*
+	 * Now safe to free, as we no longer have the PTEs mapped and can
+	 * block again.  Radix tree nodes are removed here, so we need to
+	 * continue holding the pmap locked until complete.
+	 */
+	if (ptp_tofree != NULL) {
+		pmap_freepages(pmap, ptp_tofree);
+	}
+	if (pv_tofree != NULL) {
+		pmap_free_pvs(pmap, pv_tofree);
+	}
+	mutex_exit(&pmap->pm_lock);
 }
 
 /*
@@ -3776,20 +3808,25 @@ pmap_pp_remove_ent(struct pmap *pmap, st
 	pt_entry_t *ptes;
 	pd_entry_t * const *pdes;
 
+	KASSERT(mutex_owned(&pmap->pm_lock));
+
 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
 	pmap_stats_update_bypte(pmap, 0, opte);
 	ptp->wire_count--;
 	if (ptp->wire_count <= 1) {
 		pmap_free_ptp(pmap, ptp, va, ptes, pdes, &ptp_tofree);
 	}
-	pmap_unmap_ptes(pmap, pmap2, ptp_tofree);
+	pmap_unmap_ptes(pmap, pmap2);
+	/* Now safe to free PTPs, with the pmap still locked. */
+	if (ptp_tofree != NULL) {
+		pmap_freepages(pmap, ptp_tofree);
+	}
 }
 
 static void
 pmap_pp_remove(struct pmap_page *pp, paddr_t pa)
 {
 	struct pv_pte *pvpte;
-	struct pv_entry *killlist = NULL;
 	struct vm_page *ptp;
 	uint8_t oattrs;
 	int count;
@@ -3813,11 +3850,12 @@ startover:
 		if (ptp != NULL) {
 			pmap_reference(pmap);
 		}
-
+		mutex_enter(&pmap->pm_lock);
 		error = pmap_sync_pv(pvpte, pa, ~0, &oattrs, &opte);
 		if (error == EAGAIN) {
 			int hold_count;
 			KERNEL_UNLOCK_ALL(curlwp, &hold_count);
+			mutex_exit(&pmap->pm_lock);
 			if (ptp != NULL) {
 				pmap_destroy(pmap);
 			}
@@ -3828,7 +3866,7 @@ startover:
 
 		pp->pp_attrs |= oattrs;
 		va = pvpte->pte_va;
-		pve = pmap_remove_pv(pp, ptp, va);
+		pve = pmap_remove_pv(pmap, pp, ptp, va, NULL);
 
 		/* Update the PTP reference count. Free if last reference. */
 		if (ptp != NULL) {
@@ -3839,22 +3877,25 @@ startover:
 			} else {
 				pmap_pp_remove_ent(pmap, ptp, opte, va);
 			}
-			pmap_destroy(pmap);
 		} else {
 			KASSERT(pmap == pmap_kernel());
 			pmap_stats_update_bypte(pmap, 0, opte);
 		}
-
 		if (pve != NULL) {
-			pve->pve_next = killlist;	/* mark it for death */
-			killlist = pve;
+			/*
+			 * Must free pve, and remove from pmap's radix tree
+			 * with the pmap's lock still held.
+			 */
+			pve->pve_next = NULL;
+			pmap_free_pvs(pmap, pve);
+		}
+		mutex_exit(&pmap->pm_lock);
+		if (ptp != NULL) {
+			pmap_destroy(pmap);
 		}
 	}
 	pmap_tlb_shootnow();
 	kpreempt_enable();
-
-	/* Now free unused pvs. */
-	pmap_free_pvs(killlist);
 }
 
 /*
@@ -4073,8 +4114,13 @@ pmap_write_protect(struct pmap *pmap, va
 	sva &= ~PAGE_MASK;
 	eva &= ~PAGE_MASK;
 
-	/* Acquire pmap. */
-	kpreempt_disable();
+	/*
+	 * Acquire pmap.  No need to lock the kernel pmap as we won't
+	 * be touching the pvmap nor the stats.
+	 */
+	if (pmap != pmap_kernel()) {
+		mutex_enter(&pmap->pm_lock);
+	}
 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
 
 	for (va = sva ; va < eva; va = blockend) {
@@ -4115,8 +4161,10 @@ next:;
 	}
 
 	/* Release pmap. */
-	pmap_unmap_ptes(pmap, pmap2, NULL);
-	kpreempt_enable();
+	pmap_unmap_ptes(pmap, pmap2);
+	if (pmap != pmap_kernel()) {
+		mutex_exit(&pmap->pm_lock);
+	}
 }
 
 /*
@@ -4137,8 +4185,11 @@ pmap_unwire(struct pmap *pmap, vaddr_t v
 		return;
 	}
 
-	/* Acquire pmap. */
-	kpreempt_disable();
+	/*
+	 * Acquire pmap.  Need to lock the kernel pmap only to protect the
+	 * statistics.
+	 */
+	mutex_enter(&pmap->pm_lock);
 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
 
 	if (!pmap_pdes_valid(va, pdes, NULL, &lvl)) {
@@ -4161,8 +4212,8 @@ pmap_unwire(struct pmap *pmap, vaddr_t v
 	}
 
 	/* Release pmap. */
-	pmap_unmap_ptes(pmap, pmap2, NULL);
-	kpreempt_enable();
+	pmap_unmap_ptes(pmap, pmap2);
+	mutex_exit(&pmap->pm_lock);
 }
 
 /*
@@ -4193,7 +4244,6 @@ pmap_enter_default(pmap_t pmap, vaddr_t 
  * pmap_enter: enter a mapping into a pmap
  *
  * => must be done "now" ... no lazy-evaluation
- * => we set pmap => pv_head locking
  */
 int
 pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa,
@@ -4205,12 +4255,11 @@ pmap_enter_ma(struct pmap *pmap, vaddr_t
 	struct vm_page *ptp;
 	struct vm_page *new_pg, *old_pg;
 	struct pmap_page *new_pp, *old_pp;
-	struct pv_entry *old_pve = NULL;
-	struct pv_entry *new_pve;
-	struct pv_entry *new_sparepve;
+	struct pv_entry *pve;
 	int error;
 	bool wired = (flags & PMAP_WIRED) != 0;
 	struct pmap *pmap2;
+	struct pmap_ptparray pt;
 
 	KASSERT(pmap_initialized);
 	KASSERT(curlwp->l_md.md_gc_pmap != pmap);
@@ -4259,62 +4308,68 @@ pmap_enter_ma(struct pmap *pmap, vaddr_t
 		new_pp = NULL;
 	}
 
-	/*
-	 * Try to get pves now if we might need them.
-	 * Keep going even if we fail, since we will not actually need them
-	 * if we are just changing the permissions on an existing mapping,
-	 * but we won't know if that's the case until later.
-	 */
-
-	bool needpves = pmap_pp_needs_pve(new_pp);
-	if (needpves) {
-		new_pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
-		new_sparepve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
-	} else {
-		new_pve = NULL;
-		new_sparepve = NULL;
-	}
-
-	kpreempt_disable();
- retry:
-	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
+	/* Make sure we have PTPs allocated. */
+	mutex_enter(&pmap->pm_lock);
 	ptp = NULL;
 	if (pmap != pmap_kernel()) {
-		error = pmap_get_ptp(pmap, va, pdes, flags, &ptp);
-		if (error == EAGAIN) {
-			pmap_unmap_ptes(pmap, pmap2, NULL);
-			goto retry;
-		}
-		if (error == ENOMEM) {
-			pmap_unmap_ptes(pmap, pmap2, NULL);
+		error = pmap_get_ptp(pmap, &pt, va, flags, &ptp);
+		if (error != 0) {
 			if (flags & PMAP_CANFAIL) {
-				goto out;
+				mutex_exit(&pmap->pm_lock);
+				return error;
 			}
-			panic("%s: get ptp failed", __func__);
+			panic("%s: get ptp failed, error=%d", __func__,
+			    error);
 		}
 	}
 
 	/*
-	 * Check if there is an existing mapping.  If we are now sure that
-	 * we need pves and we failed to allocate them earlier, handle that.
-	 * Caching the value of oldpa here is safe because only the mod/ref
-	 * bits can change while the pmap is locked.
+	 * Now check to see if we need a pv entry for this VA.  If we do,
+	 * allocate and install in the radix tree.  In any case look up the
+	 * pv entry in case the old mapping used it.
 	 */
+	pve = pmap_pvmap_lookup(pmap, va);
+	if (pve == NULL && pmap_pp_needs_pve(new_pp, ptp, va)) {
+		pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
+		if (pve == NULL) {
+			if (flags & PMAP_CANFAIL) {
+				if (ptp != NULL) {
+					pmap_unget_ptp(pmap, &pt);
+				}
+				mutex_exit(&pmap->pm_lock);
+				return error;
+			}
+			panic("%s: alloc pve failed", __func__);
+		}	
+		error = pmap_pvmap_insert(pmap, va, pve);
+		if (error != 0) {
+			if (flags & PMAP_CANFAIL) {
+				if (ptp != NULL) {
+					pmap_unget_ptp(pmap, &pt);
+				}
+				pool_cache_put(&pmap_pv_cache, pve);
+				mutex_exit(&pmap->pm_lock);
+				return error;
+			}
+			panic("%s: radixtree insert failed, error=%d",
+			    __func__, error);
+		}
+	}
+
+	/* Map PTEs into address space. */
+	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
+
+	/* Install any newly allocated PTPs. */
+	if (ptp != NULL) {
+		pmap_install_ptp(pmap, &pt, va, pdes);
+	}
+
+	/* Check if there is an existing mapping. */
 	ptep = &ptes[pl1_i(va)];
 	opte = *ptep;
 	bool have_oldpa = pmap_valid_entry(opte);
 	paddr_t oldpa = pmap_pte2pa(opte);
 
-	if (needpves && (!have_oldpa || oldpa != pa) &&
-	    (new_pve == NULL || new_sparepve == NULL)) {
-		pmap_unmap_ptes(pmap, pmap2, NULL);
-		if (flags & PMAP_CANFAIL) {
-			error = ENOMEM;
-			goto out;
-		}
-		panic("%s: pve allocation failed", __func__);
-	}
-
 	/*
 	 * Update the pte.
 	 */
@@ -4344,7 +4399,6 @@ pmap_enter_ma(struct pmap *pmap, vaddr_t
 					pmap_free_ptp(pmap, ptp, va,
 					    ptes, pdes, &ptp_tofree);
 				}
-				pmap_unmap_ptes(pmap, pmap2, ptp_tofree);
 				goto out;
 			}
 			break;
@@ -4366,6 +4420,11 @@ pmap_enter_ma(struct pmap *pmap, vaddr_t
 	 */
 	if (((opte ^ npte) & (PTE_FRAME | PTE_P)) == 0) {
 		KASSERT(((opte ^ npte) & PTE_PVLIST) == 0);
+		if ((opte & PTE_PVLIST) != 0 && pve != NULL) {
+			KASSERT(pve->pve_pte.pte_ptp == ptp);
+			KASSERT(pve->pve_pte.pte_va == va);
+		}
+		pve = NULL;
 		goto same_pa;
 	}
 
@@ -4383,7 +4442,7 @@ pmap_enter_ma(struct pmap *pmap, vaddr_t
 			    __func__, va, oldpa, atop(pa));
 		}
 
-		old_pve = pmap_remove_pv(old_pp, ptp, va);
+		(void)pmap_remove_pv(pmap, old_pp, ptp, va, pve);
 		old_pp->pp_attrs |= pmap_pte_to_pp_attrs(opte);
 	}
 
@@ -4391,12 +4450,10 @@ pmap_enter_ma(struct pmap *pmap, vaddr_t
 	 * If new page is pv-tracked, insert pv_entry into its list.
 	 */
 	if (new_pp) {
-		new_pve = pmap_enter_pv(new_pp, new_pve, &new_sparepve, ptp, va);
+		pve = pmap_enter_pv(pmap, new_pp, pve, ptp, va);
 	}
 
 same_pa:
-	pmap_unmap_ptes(pmap, pmap2, NULL);
-
 	/*
 	 * shootdown tlb if necessary.
 	 */
@@ -4407,18 +4464,15 @@ same_pa:
 	}
 
 	error = 0;
+#if defined(XENPV)
 out:
-	kpreempt_enable();
-	if (old_pve != NULL) {
-		pool_cache_put(&pmap_pv_cache, old_pve);
-	}
-	if (new_pve != NULL) {
-		pool_cache_put(&pmap_pv_cache, new_pve);
-	}
-	if (new_sparepve != NULL) {
-		pool_cache_put(&pmap_pv_cache, new_sparepve);
+#endif
+	pmap_unmap_ptes(pmap, pmap2);
+	if (pve != NULL) {
+		pmap_pvmap_remove(pmap, va, pve);
+		pool_cache_put(&pmap_pv_cache, pve);
 	}
-
+	mutex_exit(&pmap->pm_lock);
 	return error;
 }
 
@@ -4660,8 +4714,8 @@ pmap_growkernel(vaddr_t maxkvaddr)
 	splx(s);
 
 	if (invalidate && pmap_initialized) {
-		/* Invalidate the PDP cache. */
-		pool_cache_invalidate(&pmap_pdp_cache);
+		/* Invalidate the pmap cache. */
+		pool_cache_invalidate(&pmap_cache);
 	}
 
 	return maxkvaddr;
@@ -4692,12 +4746,8 @@ pmap_dump(struct pmap *pmap, vaddr_t sva
 	if (eva > VM_MAXUSER_ADDRESS || eva <= sva)
 		eva = VM_MAXUSER_ADDRESS;
 
-	/*
-	 * we lock in the pmap => pv_head direction
-	 */
-
-	kpreempt_disable();
-	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);	/* locks pmap */
+	mutex_enter(&pmap->pm_lock);
+	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
 
 	/*
 	 * dumping a range of pages: we dump in PTP sized blocks (4MB)
@@ -4724,8 +4774,8 @@ pmap_dump(struct pmap *pmap, vaddr_t sva
 			    sva, (paddr_t)pmap_pte2pa(*pte), (paddr_t)*pte);
 		}
 	}
-	pmap_unmap_ptes(pmap, pmap2, NULL);
-	kpreempt_enable();
+	pmap_unmap_ptes(pmap, pmap2);
+	mutex_exit(&pmap->pm_lock);
 }
 #endif
 
@@ -5044,47 +5094,19 @@ pmap_ept_free_ptp(struct pmap *pmap, str
 }
 
 /* Allocate L4->L3->L2. Return L2. */
-static struct vm_page *
-pmap_ept_get_ptp(struct pmap *pmap, vaddr_t va, int flags)
+static void
+pmap_ept_install_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va)
 {
 	struct vm_page *ptp;
-	struct {
-		struct vm_page *pg;
-		bool new;
-	} pt[PTP_LEVELS + 1];
-	int i, aflags;
 	unsigned long index;
 	pd_entry_t *pteva;
 	paddr_t ptepa;
-	struct uvm_object *obj;
-	voff_t off;
+	int i;
 
 	KASSERT(pmap != pmap_kernel());
 	KASSERT(mutex_owned(&pmap->pm_lock));
 	KASSERT(kpreempt_disabled());
 
-	memset(pt, 0, sizeof(pt));
-	aflags = ((flags & PMAP_CANFAIL) ? 0 : UVM_PGA_USERESERVE) |
-	    UVM_PGA_ZERO;
-
-	/*
-	 * Loop through all page table levels allocating a page
-	 * for any level where we don't already have one.
-	 */
-	for (i = PTP_LEVELS; i > 1; i--) {
-		obj = &pmap->pm_obj[i - 2];
-		off = ptp_va2o(va, i - 1);
-
-		pt[i].pg = uvm_pagelookup(obj, off);
-		if (pt[i].pg == NULL) {
-			pt[i].pg = uvm_pagealloc(obj, off, NULL, aflags);
-			pt[i].new = true;
-		}
-
-		if (pt[i].pg == NULL)
-			goto fail;
-	}
-
 	/*
 	 * Now that we have all the pages looked up or allocated,
 	 * loop through again installing any new ones into the tree.
@@ -5095,12 +5117,12 @@ pmap_ept_get_ptp(struct pmap *pmap, vadd
 		pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa);
 
 		if (pmap_ept_valid_entry(pteva[index])) {
-			KASSERT(!pt[i].new);
+			KASSERT(!pt->alloced[i]);
 			ptepa = pmap_pte2pa(pteva[index]);
 			continue;
 		}
 
-		ptp = pt[i].pg;
+		ptp = pt->pg[i];
 		ptp->flags &= ~PG_BUSY; /* never busy */
 		ptp->wire_count = 1;
 		pmap->pm_ptphint[i - 2] = ptp;
@@ -5115,29 +5137,9 @@ pmap_ept_get_ptp(struct pmap *pmap, vadd
 		 * wire count of the parent page.
 		 */
 		if (i < PTP_LEVELS) {
-			pt[i + 1].pg->wire_count++;
+			pt->pg[i + 1]->wire_count++;
 		}
 	}
-	ptp = pt[2].pg;
-	KASSERT(ptp != NULL);
-	pmap->pm_ptphint[0] = ptp;
-	return ptp;
-
-	/*
-	 * Allocation of a PTP failed, free any others that we just allocated.
-	 */
-fail:
-	for (i = PTP_LEVELS; i > 1; i--) {
-		if (pt[i].pg == NULL) {
-			break;
-		}
-		if (!pt[i].new) {
-			continue;
-		}
-		obj = &pmap->pm_obj[i - 2];
-		uvm_pagefree(pt[i].pg);
-	}
-	return NULL;
 }
 
 static int
@@ -5149,11 +5151,10 @@ pmap_ept_enter(struct pmap *pmap, vaddr_
 	struct vm_page *ptp;
 	struct vm_page *new_pg, *old_pg;
 	struct pmap_page *new_pp, *old_pp;
-	struct pv_entry *old_pve = NULL;
-	struct pv_entry *new_pve;
-	struct pv_entry *new_sparepve;
+	struct pv_entry *pve;
 	bool wired = (flags & PMAP_WIRED) != 0;
 	bool accessed;
+	struct pmap_ptparray pt;
 	int error;
 
 	KASSERT(pmap_initialized);
@@ -5184,33 +5185,60 @@ pmap_ept_enter(struct pmap *pmap, vaddr_
 		new_pp = NULL;
 	}
 
+	/* Make sure we have PTPs allocated. */
+	mutex_enter(&pmap->pm_lock);
+	ptp = NULL;
+	if (pmap != pmap_kernel()) {
+		error = pmap_get_ptp(pmap, &pt, va, flags, &ptp);
+		if (error != 0) {
+			if (flags & PMAP_CANFAIL) {
+				mutex_exit(&pmap->pm_lock);
+				return error;
+			}
+			panic("%s: get ptp failed, error=%d", __func__,
+			    error);
+		}
+	}
+
 	/*
-	 * Try to get pves now if we might need them.
-	 * Keep going even if we fail, since we will not actually need them
-	 * if we are just changing the permissions on an existing mapping,
-	 * but we won't know if that's the case until later.
+	 * Now check to see if we need a pv entry for this VA.  If we do,
+	 * allocate and install in the radix tree.  In any case look up the
+	 * pv entry in case the old mapping used it.
 	 */
-
-	bool needpves = pmap_pp_needs_pve(new_pp);
-	if (needpves) {
-		new_pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
-		new_sparepve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
-	} else {
-		new_pve = NULL;
-		new_sparepve = NULL;
+	pve = pmap_pvmap_lookup(pmap, va);
+	if (pve == NULL && pmap_pp_needs_pve(new_pp, ptp, va)) {
+		pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
+		if (pve == NULL) {
+			if (flags & PMAP_CANFAIL) {
+				if (ptp != NULL) {
+					pmap_unget_ptp(pmap, &pt);
+				}
+				mutex_exit(&pmap->pm_lock);
+				return error;
+			}
+			panic("%s: alloc pve failed", __func__);
+		}	
+		error = pmap_pvmap_insert(pmap, va, pve);
+		if (error != 0) {
+			if (flags & PMAP_CANFAIL) {
+				if (ptp != NULL) {
+					pmap_unget_ptp(pmap, &pt);
+				}
+				pool_cache_put(&pmap_pv_cache, pve);
+				mutex_exit(&pmap->pm_lock);
+				return error;
+			}
+			panic("%s: radixtree insert failed, error=%d",
+			    __func__, error);
+		}
 	}
 
+	/* Map PTEs into address space. */
 	kpreempt_disable();
-	mutex_enter(&pmap->pm_lock);
 
-	ptp = pmap_ept_get_ptp(pmap, va, flags);
-	if (ptp == NULL) {
-		mutex_exit(&pmap->pm_lock);
-		if (flags & PMAP_CANFAIL) {
-			error = ENOMEM;
-			goto out;
-		}
-		panic("%s: get ptp failed", __func__);
+	/* Install any newly allocated PTPs. */
+	if (ptp != NULL) {
+		pmap_ept_install_ptp(pmap, &pt, va);
 	}
 
 	/*
@@ -5225,16 +5253,6 @@ pmap_ept_enter(struct pmap *pmap, vaddr_
 	bool have_oldpa = pmap_ept_valid_entry(opte);
 	paddr_t oldpa = pmap_pte2pa(opte);
 
-	if (needpves && (!have_oldpa || oldpa != pa) &&
-	    (new_pve == NULL || new_sparepve == NULL)) {
-		mutex_exit(&pmap->pm_lock);
-		if (flags & PMAP_CANFAIL) {
-			error = ENOMEM;
-			goto out;
-		}
-		panic("%s: pve allocation failed", __func__);
-	}
-
 	/*
 	 * Update the pte.
 	 */
@@ -5263,11 +5281,16 @@ pmap_ept_enter(struct pmap *pmap, vaddr_
 	 */
 	if (((opte ^ npte) & (PTE_FRAME | EPT_R)) == 0) {
 		KASSERT(((opte ^ npte) & EPT_PVLIST) == 0);
+		if ((opte & PTE_PVLIST) != 0 && pve != NULL) {
+			KASSERT(pve->pve_pte.pte_ptp == ptp);
+			KASSERT(pve->pve_pte.pte_va == va);
+		}
+		pve = NULL;
 		goto same_pa;
 	}
 
 	/*
-	 * If old page is pv-tracked, remove pv_entry from its list.
+	 * If old page is pv-tracked, replace pv_entry from its list.
 	 */
 	if ((~opte & (EPT_R | EPT_PVLIST)) == 0) {
 		if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) {
@@ -5280,20 +5303,18 @@ pmap_ept_enter(struct pmap *pmap, vaddr_
 			    __func__, va, oldpa, atop(pa));
 		}
 
-		old_pve = pmap_remove_pv(old_pp, ptp, va);
-		old_pp->pp_attrs |= pmap_ept_to_pp_attrs(opte);
+		(void)pmap_remove_pv(pmap, old_pp, ptp, va, pve);
+		old_pp->pp_attrs |= pmap_pte_to_pp_attrs(opte);
 	}
 
 	/*
 	 * If new page is pv-tracked, insert pv_entry into its list.
 	 */
 	if (new_pp) {
-		new_pve = pmap_enter_pv(new_pp, new_pve, &new_sparepve, ptp, va);
+		pve = pmap_enter_pv(pmap, new_pp, pve, ptp, va);
 	}
 
 same_pa:
-	mutex_exit(&pmap->pm_lock);
-
 	if (pmap_ept_has_ad) {
 		accessed = (~opte & (EPT_R | EPT_A)) == 0;
 	} else {
@@ -5304,17 +5325,12 @@ same_pa:
 	}
 
 	error = 0;
-out:
 	kpreempt_enable();
-	if (old_pve != NULL) {
-		pool_cache_put(&pmap_pv_cache, old_pve);
-	}
-	if (new_pve != NULL) {
-		pool_cache_put(&pmap_pv_cache, new_pve);
-	}
-	if (new_sparepve != NULL) {
-		pool_cache_put(&pmap_pv_cache, new_sparepve);
+	if (pve != NULL) {
+		pmap_pvmap_remove(pmap, va, pve);
+		pool_cache_put(&pmap_pv_cache, pve);
 	}
+	mutex_exit(&pmap->pm_lock);
 
 	return error;
 }
@@ -5364,8 +5380,8 @@ pmap_ept_extract(struct pmap *pmap, vadd
 	rv = false;
 	pa = 0;
 
-	kpreempt_disable();
 	mutex_enter(&pmap->pm_lock);
+	kpreempt_disable();
 
 	if (!pmap_ept_pdes_invalid(pmap, va, &pde)) {
 		ptppa = pmap_pte2pa(pde);
@@ -5377,8 +5393,8 @@ pmap_ept_extract(struct pmap *pmap, vadd
 		}
 	}
 
-	mutex_exit(&pmap->pm_lock);
 	kpreempt_enable();
+	mutex_exit(&pmap->pm_lock);
 
 	if (pap != NULL) {
 		*pap = pa;
@@ -5433,7 +5449,7 @@ pmap_ept_remove_pte(struct pmap *pmap, s
 	}
 
 	/*
-	 * If we are not on a pv_head list - we are done.
+	 * If we are not on a pv list - we are done.
 	 */
 	if ((opte & EPT_PVLIST) == 0) {
 		KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL),
@@ -5455,7 +5471,7 @@ pmap_ept_remove_pte(struct pmap *pmap, s
 
 	/* Sync R/M bits. */
 	pp->pp_attrs |= pmap_ept_to_pp_attrs(opte);
-	pve = pmap_remove_pv(pp, ptp, va);
+	pve = pmap_remove_pv(pmap, pp, ptp, va, NULL);
 
 	if (pve) {
 		pve->pve_next = *pv_tofree;
@@ -5499,8 +5515,8 @@ pmap_ept_remove(struct pmap *pmap, vaddr
 	vaddr_t blkendva, va = sva;
 	struct vm_page *ptp;
 
-	kpreempt_disable();
 	mutex_enter(&pmap->pm_lock);
+	kpreempt_disable();
 
 	for (/* null */ ; va < eva ; va = blkendva) {
 		int lvl;
@@ -5535,12 +5551,15 @@ pmap_ept_remove(struct pmap *pmap, vaddr
 		}
 	}
 
-	mutex_exit(&pmap->pm_lock);
 	kpreempt_enable();
-
-	/* Now we free unused PVs */
-	if (pv_tofree)
-		pmap_free_pvs(pv_tofree);
+	/*
+	 * Radix tree nodes are removed here, so we need to continue holding
+	 * the pmap locked until complete.
+	 */
+	if (pv_tofree != NULL) {
+		pmap_free_pvs(pmap, pv_tofree);
+	}
+	mutex_exit(&pmap->pm_lock);
 }
 
 static int
@@ -5633,13 +5652,14 @@ static void
 pmap_ept_pp_remove_ent(struct pmap *pmap, struct vm_page *ptp, pt_entry_t opte,
     vaddr_t va)
 {
-	mutex_enter(&pmap->pm_lock);
+
+	KASSERT(mutex_owned(&pmap->pm_lock));
+
 	pmap_ept_stats_update_bypte(pmap, 0, opte);
 	ptp->wire_count--;
 	if (ptp->wire_count <= 1) {
 		pmap_ept_free_ptp(pmap, ptp, va);
 	}
-	mutex_exit(&pmap->pm_lock);
 }
 
 static void
@@ -5661,9 +5681,8 @@ pmap_ept_write_protect(struct pmap *pmap
 	eva &= PTE_FRAME;
 
 	/* Acquire pmap. */
-	kpreempt_disable();
-
 	mutex_enter(&pmap->pm_lock);
+	kpreempt_disable();
 
 	for (va = sva; va < eva; va += PAGE_SIZE) {
 		if (pmap_ept_pdes_invalid(pmap, va, &pde)) {
@@ -5695,8 +5714,8 @@ pmap_ept_write_protect(struct pmap *pmap
 next:;
 	}
 
-	mutex_exit(&pmap->pm_lock);
 	kpreempt_enable();
+	mutex_exit(&pmap->pm_lock);
 }
 
 static void
@@ -5707,8 +5726,8 @@ pmap_ept_unwire(struct pmap *pmap, vaddr
 	paddr_t ptppa;
 
 	/* Acquire pmap. */
-	kpreempt_disable();
 	mutex_enter(&pmap->pm_lock);
+	kpreempt_disable();
 
 	if (pmap_ept_pdes_invalid(pmap, va, &pde)) {
 		panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va);
@@ -5731,8 +5750,8 @@ pmap_ept_unwire(struct pmap *pmap, vaddr
 	}
 
 	/* Release pmap. */
-	mutex_exit(&pmap->pm_lock);
 	kpreempt_enable();
+	mutex_exit(&pmap->pm_lock);
 }
 
 /* -------------------------------------------------------------------------- */

Index: src/sys/arch/xen/x86/xen_pmap.c
diff -u src/sys/arch/xen/x86/xen_pmap.c:1.34 src/sys/arch/xen/x86/xen_pmap.c:1.35
--- src/sys/arch/xen/x86/xen_pmap.c:1.34	Sun Dec 15 19:24:11 2019
+++ src/sys/arch/xen/x86/xen_pmap.c	Sat Jan  4 22:49:20 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: xen_pmap.c,v 1.34 2019/12/15 19:24:11 ad Exp $	*/
+/*	$NetBSD: xen_pmap.c,v 1.35 2020/01/04 22:49:20 ad Exp $	*/
 
 /*
  * Copyright (c) 2007 Manuel Bouyer.
@@ -101,7 +101,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: xen_pmap.c,v 1.34 2019/12/15 19:24:11 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: xen_pmap.c,v 1.35 2020/01/04 22:49:20 ad Exp $");
 
 #include "opt_user_ldt.h"
 #include "opt_lockdebug.h"
@@ -214,18 +214,18 @@ pmap_extract_ma(struct pmap *pmap, vaddr
 	struct pmap *pmap2;
 	int lvl;
 
-	kpreempt_disable();
+	mutex_enter(&pmap->pm_lock);
 	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
 	if (!pmap_pdes_valid(va, pdes, &pde, &lvl)) {
-		pmap_unmap_ptes(pmap, pmap2, NULL);
-		kpreempt_enable();
+		pmap_unmap_ptes(pmap, pmap2);
+		mutex_exit(&pmap->pm_lock);
 		return false;
 	}
 
 	KASSERT(lvl == 1);
 	pte = ptes[pl1_i(va)];
-	pmap_unmap_ptes(pmap, pmap2, NULL);
-	kpreempt_enable();
+	pmap_unmap_ptes(pmap, pmap2);
+	mutex_exit(&pmap->pm_lock);
 
 	if (__predict_true((pte & PTE_P) != 0)) {
 		if (pap != NULL)
@@ -305,7 +305,7 @@ pmap_unmap_recursive_entries(void)
 	 * XXX jym@ : find a way to drain per-CPU caches to. pool_cache_inv
 	 * does not do that.
 	 */
-	pool_cache_invalidate(&pmap_pdp_cache);
+	pool_cache_invalidate(&pmap_cache);
 
 	mutex_enter(&pmaps_lock);
 	LIST_FOREACH(pm, &pmaps, pm_list) {
