Module Name:    src
Committed By:   ad
Date:           Tue Mar 17 22:29:19 UTC 2020

Modified Files:
        src/sys/arch/x86/include: pmap.h pmap_pv.h
        src/sys/arch/x86/x86: pmap.c

Log Message:
Hallelujah, the bug has been found.  Resurrect prior changes, to be fixed
with the following commit.


To generate a diff of this commit:
cvs rdiff -u -r1.114 -r1.115 src/sys/arch/x86/include/pmap.h
cvs rdiff -u -r1.16 -r1.17 src/sys/arch/x86/include/pmap_pv.h
cvs rdiff -u -r1.373 -r1.374 src/sys/arch/x86/x86/pmap.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/arch/x86/include/pmap.h
diff -u src/sys/arch/x86/include/pmap.h:1.114 src/sys/arch/x86/include/pmap.h:1.115
--- src/sys/arch/x86/include/pmap.h:1.114	Tue Mar 17 21:02:56 2020
+++ src/sys/arch/x86/include/pmap.h	Tue Mar 17 22:29:19 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: pmap.h,v 1.114 2020/03/17 21:02:56 ad Exp $	*/
+/*	$NetBSD: pmap.h,v 1.115 2020/03/17 22:29:19 ad Exp $	*/
 
 /*
  * Copyright (c) 1997 Charles D. Cranor and Washington University.
@@ -248,6 +248,8 @@ extern struct pool_cache pmap_cache;
  * (the other object locks are only used when uvm_pagealloc is called)
  */
 
+struct pv_page;
+
 struct pmap {
 	struct uvm_object pm_obj[PTP_LEVELS-1];/* objects for lvl >= 1) */
 	LIST_ENTRY(pmap) pm_list;	/* list of all pmaps */
@@ -256,11 +258,11 @@ struct pmap {
 	struct vm_page *pm_ptphint[PTP_LEVELS-1];
 					/* pointer to a PTP in our pmap */
 	struct pmap_statistics pm_stats;  /* pmap stats */
+	struct pv_entry *pm_pve;	/* spare pv_entry */
 
 #if !defined(__x86_64__)
 	vaddr_t pm_hiexec;		/* highest executable mapping */
 #endif /* !defined(__x86_64__) */
-	struct lwp *pm_remove_all;	/* who's emptying the pmap */
 
 	union descriptor *pm_ldt;	/* user-set LDT */
 	size_t pm_ldt_len;		/* size of LDT in bytes */

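The new pm_pve field above caches a single spare pv_entry per pmap.  As a
minimal sketch of the intended pattern (helper name invented here for
illustration; it mirrors the pool_cache usage in pmap_enter_pv() in the
pmap.c diff below and is not part of the commit):

/*
 * Illustrative sketch only: with the pmap locked, preallocate one spare
 * pv_entry before the page's spin lock is taken.  It is consumed later
 * only if an embedded entry is not available.
 */
static int
pmap_pve_reserve(struct pmap *pmap)
{

	KASSERT(mutex_owned(&pmap->pm_lock));
	if (pmap->pm_pve == NULL) {
		/* PR_NOWAIT: cannot sleep here; caller handles ENOMEM. */
		pmap->pm_pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
	}
	return pmap->pm_pve != NULL ? 0 : ENOMEM;
}
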
Index: src/sys/arch/x86/include/pmap_pv.h
diff -u src/sys/arch/x86/include/pmap_pv.h:1.16 src/sys/arch/x86/include/pmap_pv.h:1.17
--- src/sys/arch/x86/include/pmap_pv.h:1.16	Tue Mar 17 21:02:56 2020
+++ src/sys/arch/x86/include/pmap_pv.h	Tue Mar 17 22:29:19 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: pmap_pv.h,v 1.16 2020/03/17 21:02:56 ad Exp $	*/
+/*	$NetBSD: pmap_pv.h,v 1.17 2020/03/17 22:29:19 ad Exp $	*/
 
 /*-
  * Copyright (c)2008 YAMAMOTO Takashi,
@@ -34,6 +34,7 @@
 #include <sys/rbtree.h>
 
 struct vm_page;
+struct pmap_page;
 
 /*
  * structures to track P->V mapping
@@ -51,14 +52,14 @@ struct pv_pte {
 };
 
 /*
- * pv_entry: plug pv_pte into lists.
+ * pv_entry: plug pv_pte into lists.  32 bytes on i386, 64 on amd64.
  */
 
 struct pv_entry {
 	struct pv_pte pve_pte;		/* should be the first member */
 	LIST_ENTRY(pv_entry) pve_list;	/* on pmap_page::pp_pvlist */
 	rb_node_t pve_rb;		/* red-black tree node */
-	uintptr_t pve_padding;		/* unused */
+	struct pmap_page *pve_pp;	/* backpointer to mapped page */
 };
 #define	pve_next	pve_list.le_next
 
@@ -71,16 +72,13 @@ struct pmap_page {
 		/* PTPs */
 		rb_tree_t rb;
 
-		/* PTPs */
+		/* PTPs, when being freed */
 		LIST_ENTRY(vm_page) link;
 
-		/* Non-PTPs */
+		/* Non-PTPs (i.e. normal pages) */
 		struct {
-			/* PP_EMBEDDED */
 			struct pv_pte pte;
-
 			LIST_HEAD(, pv_entry) pvlist;
-			uint8_t flags;
 			uint8_t attrs;
 		} s;
 	} pp_u;
@@ -89,7 +87,6 @@ struct pmap_page {
 #define	pp_link		pp_u.link
 #define	pp_pte		pp_u.s.pte
 #define pp_pvlist	pp_u.s.pvlist
-#define	pp_pflags	pp_u.s.flags
 #define	pp_attrs	pp_u.s.attrs
 };
 
@@ -97,10 +94,6 @@ struct pmap_page {
 #define PP_ATTRS_A	0x02	/* Accessed */
 #define PP_ATTRS_W	0x04	/* Writable */
 
-/* pp_flags */
-#define	PP_EMBEDDED	1
-#define	PP_FREEING	2
-
 #define	PMAP_PAGE_INIT(pp) \
 do { \
 	LIST_INIT(&(pp)->pp_pvlist); \

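With PP_EMBEDDED and PP_FREEING removed, the first tracker of a page lives
directly in pp_pte and any further trackers hang off pp_pvlist, each now
carrying a pve_pp backpointer.  A rough sketch of walking every mapping of
a page under pp_lock, modelled on the pv_pte_first()/pv_pte_next()
iterators in the pmap.c changes below (function name made up for
illustration, not committed code):

/*
 * Sketch: visit every {PTP, VA} pair that maps a page.  The embedded
 * entry is "present" when pte_ptp/pte_va are non-zero; the rest are
 * dynamically allocated pv_entry structures on pp_pvlist.
 */
static void
pmap_page_visit_all(struct pmap_page *pp,
    void (*fn)(struct vm_page *ptp, vaddr_t va))
{
	struct pv_entry *pve;

	mutex_spin_enter(&pp->pp_lock);
	if (pp->pp_pte.pte_ptp != NULL || pp->pp_pte.pte_va != 0) {
		(*fn)(pp->pp_pte.pte_ptp, pp->pp_pte.pte_va);
	}
	LIST_FOREACH(pve, &pp->pp_pvlist, pve_list) {
		(*fn)(pve->pve_pte.pte_ptp, pve->pve_pte.pte_va);
	}
	mutex_spin_exit(&pp->pp_lock);
}
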
Index: src/sys/arch/x86/x86/pmap.c
diff -u src/sys/arch/x86/x86/pmap.c:1.373 src/sys/arch/x86/x86/pmap.c:1.374
--- src/sys/arch/x86/x86/pmap.c:1.373	Tue Mar 17 21:02:56 2020
+++ src/sys/arch/x86/x86/pmap.c	Tue Mar 17 22:29:19 2020
@@ -1,4 +1,4 @@
-/*	$NetBSD: pmap.c,v 1.373 2020/03/17 21:02:56 ad Exp $	*/
+/*	$NetBSD: pmap.c,v 1.374 2020/03/17 22:29:19 ad Exp $	*/
 
 /*
  * Copyright (c) 2008, 2010, 2016, 2017, 2019, 2020 The NetBSD Foundation, Inc.
@@ -130,7 +130,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.373 2020/03/17 21:02:56 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.374 2020/03/17 22:29:19 ad Exp $");
 
 #include "opt_user_ldt.h"
 #include "opt_lockdebug.h"
@@ -139,6 +139,8 @@ __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.3
 #include "opt_svs.h"
 #include "opt_kaslr.h"
 
+#define	__MUTEX_PRIVATE	/* for assertions */
+
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/proc.h>
@@ -224,23 +226,39 @@ __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.3
 /*
  * Locking
  *
- * We have the following locks that we must contend with, listed in the
- * order that they must be acquired:
+ * We have the following locks that we must deal with, listed in the order
+ * that they are acquired:
+ *
+ * pg->uobject->vmobjlock, pg->uanon->an_lock
  *
- * - pg->uobject->vmobjlock, pg->uanon->an_lock
- *   These per-object locks are taken by the VM system before calling into
- *   the pmap module.  Holding them prevents concurrent operations on the
- *   given page or set of pages.
- *
- * - pmap->pm_lock (per pmap)
- *   This lock protects the fields in the pmap structure including the
- *   non-kernel PDEs in the PDP, the PTEs, and the PVE radix tree.  For
- *   modifying kernel PTEs it is not required as kernel PDEs are never
- *   freed, and the kernel is expected to be self consistent.
- *
- * - pmaps_lock
- *   This lock protects the list of active pmaps (headed by "pmaps"). We
- *   lock it when adding or removing pmaps from this list.
+ * 	For managed pages, these per-object locks are taken by the VM system
+ *	before calling into the pmap module - either a read or write hold. 
+ *	The lock hold prevents pages from changing identity while the pmap is
+ *	operating on them.  For example, the same lock is held across a call
+ *	to pmap_remove() and the following call to pmap_update(), so that a
+ *	page does not gain a new identity while its TLB visibility is stale.
+ *
+ * pmap->pm_lock
+ *
+ *	This lock protects the fields in the pmap structure including the
+ *	non-kernel PDEs in the PDP, the PTEs, and PTPs and connected data
+ *	structures.  For modifying unmanaged kernel PTEs it is not needed as
+ *	kernel PDEs are never freed, and the kernel is expected to be self
+ *	consistent (and the lock can't be taken for unmanaged kernel PTEs,
+ *	because they can be modified from interrupt context).
+ *
+ * pmaps_lock
+ *
+ *	This lock protects the list of active pmaps (headed by "pmaps"). 
+ *	It's acquired when adding or removing pmaps or adjusting kernel PDEs.
+ *
+ * pp_lock
+ *
+ *	This per-page lock protects PV entry lists and the embedded PV entry
+ *	in each vm_page, allowing for concurrent operation on pages by
+ *	different pmaps.  This is a spin mutex at IPL_VM, because at the
+ *	points it is taken context switching is usually not tolerable, and
+ *	spin mutexes must block out interrupts that could take kernel_lock.
  */
 
 /* uvm_object is abused here to index pmap_pages; make assertions happy. */
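
The ordering described in the comment above (object lock, then pm_lock,
then pp_lock, with pp_lock a spin mutex at IPL_VM) can be pictured with a
small hypothetical operation; this is only a sketch of the documented
order, not code from the commit:

/*
 * Sketch: the pmap's adaptive mutex is taken first and may sleep; the
 * per-page pp_lock is a spin mutex at IPL_VM, taken last and held only
 * briefly around PV list manipulation.
 */
static void
pmap_example_locked_op(struct pmap *pmap, struct pmap_page *pp)
{

	mutex_enter(&pmap->pm_lock);		/* may sleep */
	mutex_spin_enter(&pp->pp_lock);		/* raises IPL to IPL_VM */
	/* ... examine or update the page's PV tracking here ... */
	mutex_spin_exit(&pp->pp_lock);
	mutex_exit(&pmap->pm_lock);
}
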
@@ -317,6 +335,8 @@ paddr_t pmap_pa_end;   /* PA of last phy
 #endif
 
 #define	VM_PAGE_TO_PP(pg)	(&(pg)->mdpage.mp_pp)
+#define	PMAP_CHECK_PP(pp) \
+    KASSERTMSG((pp)->pp_lock.mtx_ipl._ipl == IPL_VM, "bad pmap_page %p", pp)
 
 /*
  * Other data structures
@@ -523,6 +543,17 @@ pvpte_to_pve(struct pv_pte *pvpte)
 }
 
 /*
+ * Return true if the pmap page has an embedded PV entry.
+ */
+static inline bool
+pv_pte_embedded(struct pmap_page *pp)
+{
+
+	KASSERT(mutex_owned(&pp->pp_lock));
+	return (bool)((vaddr_t)pp->pp_pte.pte_ptp | pp->pp_pte.pte_va);
+}
+
+/*
  * pv_pte_first, pv_pte_next: PV list iterator.
  */
 static struct pv_pte *
@@ -530,7 +561,7 @@ pv_pte_first(struct pmap_page *pp)
 {
 
 	KASSERT(mutex_owned(&pp->pp_lock));
-	if ((pp->pp_pflags & PP_EMBEDDED) != 0) {
+	if (pv_pte_embedded(pp)) {
 		return &pp->pp_pte;
 	}
 	return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist));
@@ -543,7 +574,6 @@ pv_pte_next(struct pmap_page *pp, struct
 	KASSERT(mutex_owned(&pp->pp_lock));
 	KASSERT(pvpte != NULL);
 	if (pvpte == &pp->pp_pte) {
-		KASSERT((pp->pp_pflags & PP_EMBEDDED) != 0);
 		return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist));
 	}
 	return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list));
@@ -605,6 +635,61 @@ pmap_compare_key(void *context, const vo
 }
 
 /*
+ * pmap_ptp_init: initialize new page table page
+ */
+static inline void
+pmap_ptp_init(struct vm_page *ptp)
+{
+
+	ptp->uanon = (struct vm_anon *)(vaddr_t)~0L;
+	rb_tree_init(&VM_PAGE_TO_PP(ptp)->pp_rb, &pmap_rbtree_ops);
+	PMAP_CHECK_PP(VM_PAGE_TO_PP(ptp));
+}
+
+/*
+ * pmap_ptp_fini: finalize a page table page
+ */
+static inline void
+pmap_ptp_fini(struct vm_page *ptp)
+{
+
+	KASSERT(RB_TREE_MIN(&VM_PAGE_TO_PP(ptp)->pp_rb) == NULL);
+	PMAP_CHECK_PP(VM_PAGE_TO_PP(ptp));
+	ptp->uanon = NULL;
+}
+
+/*
+ * pmap_ptp_range_set: abuse ptp->uanon to record minimum VA of PTE
+ */
+static inline void
+pmap_ptp_range_set(struct vm_page *ptp, vaddr_t va)
+{
+	vaddr_t *min = (vaddr_t *)&ptp->uanon;
+
+	if (va < *min) {
+		*min = va;
+	}
+}
+
+/*
+ * pmap_ptp_range_clip: abuse ptp->uanon to clip range of PTEs to remove
+ */
+static inline void
+pmap_ptp_range_clip(struct vm_page *ptp, vaddr_t *startva, pt_entry_t **pte)
+{
+	vaddr_t sclip;
+
+	if (ptp == NULL) {
+		return;
+	}
+
+	sclip = (vaddr_t)ptp->uanon;
+	sclip = (*startva < sclip ? sclip : *startva);
+	*pte += (sclip - *startva) / PAGE_SIZE;
+	*startva = sclip;
+}
+
+/*
  * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
  *
  * there are several pmaps involved.  some or all of them might be same.
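
As a worked example of the clipping above (assumed numbers, 4 kB pages):
pmap_ptp_init() seeds ptp->uanon with ~0, and pmap_ptp_range_set() lowers
it to the smallest VA ever entered into that PTP.  If a removal starts at
startva = 0x7f0000000000 but the lowest recorded mapping is
0x7f0000040000, pmap_ptp_range_clip() advances the PTE pointer by
(0x7f0000040000 - 0x7f0000000000) / PAGE_SIZE = 0x40 entries and bumps
startva to match, so pmap_remove_ptes() never scans the known-empty front
of the page table page.
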
@@ -656,7 +741,9 @@ pmap_map_ptes(struct pmap *pmap, struct 
 		 * often the case during exit(), when we have switched
 		 * to the kernel pmap in order to destroy a user pmap.
 		 */
-		pmap_reactivate(pmap);
+		if (__predict_false(ci->ci_tlbstate != TLBSTATE_VALID)) {
+			pmap_reactivate(pmap);
+		}
 		*pmap2 = NULL;
 	} else {
 		/*
@@ -1771,7 +1858,7 @@ pmap_init(void)
 	 * The kernel doesn't keep track of PTPs, so there's nowhere handy
 	 * to hang a tree of pv_entry records.  Dynamically allocated
 	 * pv_entry lists are not heavily used in the kernel's pmap (the
-	 * usual case is PP_EMBEDDED), so cop out and use a single RB tree
+	 * usual case is embedded), so cop out and use a single RB tree
 	 * to cover them.
 	 */
 	rb_tree_init(&pmap_kernel_rb, &pmap_rbtree_ops);
@@ -1857,28 +1944,6 @@ pmap_vpage_cpu_init(struct cpu_info *ci)
  * p v _ e n t r y   f u n c t i o n s
  */
 
-
-/*
- * pmap_pp_needs_pve: return true if we need to allocate a pv entry.
- */
-static bool
-pmap_pp_needs_pve(struct pmap_page *pp, struct vm_page *ptp, vaddr_t va)
-{
-
-	/*
-	 * Adding a pv entry for this page only needs to allocate a pv_entry
-	 * structure if the page already has at least one pv entry, since
-	 * the first pv entry is stored in the pmap_page.  However, because
-	 * of subsequent removal(s), PP_EMBEDDED can be false and there can
-	 * still be pv entries on the list.
-	 */
-
-	if (pp == NULL || (pp->pp_pflags & PP_EMBEDDED) == 0) {
-		return false;
-	}
-	return pp->pp_pte.pte_ptp != ptp || pp->pp_pte.pte_va != va;
-}
-
 /*
  * pmap_free_pvs: free a linked list of pv entries.  the pv entries have
  * been removed from their respective pages, but are still entered into the
@@ -1900,49 +1965,57 @@ pmap_free_pvs(struct pmap *pmap, struct 
 }
 
 /*
- * pmap_lookup_pv: look up a non-PP_EMBEDDED pv entry for the given pmap
- *
- * => pmap must be locked
+ * pmap_check_pv: verify {VA, PTP} pair is (un)tracked by page, as expected
  */
-
-static struct pv_entry *
-pmap_lookup_pv(struct pmap *pmap, struct vm_page *ptp,
-    struct pmap_page *pp, vaddr_t va)
+static void
+pmap_check_pv(struct pmap *pmap, struct vm_page *ptp, struct pmap_page *pp,
+    vaddr_t va, bool tracked)
 {
-	struct rb_node *node;
-	struct pv_entry *pve;
+#ifdef DIAGNOSTIC /* XXX too slow; make this DEBUG before April 2020 */
+	struct pv_pte *pvpte;
 
-	KASSERT(mutex_owned(&pmap->pm_lock));
+	PMAP_CHECK_PP(pp);
 
-	/*
-	 * Do an unlocked check on the page: if tracked with PP_EMBEDDED we
-	 * can avoid touching the tree.
-	 */
-	if ((pp->pp_pflags & PP_EMBEDDED) != 0 &&
-	    pp->pp_pte.pte_ptp == ptp &&
-	    pp->pp_pte.pte_va == va) {
-		return NULL;
+	mutex_spin_enter(&pp->pp_lock);
+	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
+		if (pvpte->pte_ptp == ptp && pvpte->pte_va == va) {
+			break;
+		}
 	}
+	mutex_spin_exit(&pp->pp_lock);
 
-	if (ptp != NULL) {
-		node = VM_PAGE_TO_PP(ptp)->pp_rb.rbt_root;
-	} else {
-		KASSERT(pmap == pmap_kernel());
-		node = pmap_kernel_rb.rbt_root;
+	if (pvpte && !tracked) {
+		panic("pmap_check_pv: %p/%lx found on pp %p", ptp, va, pp);
+	} else if (!pvpte && tracked) {
+		panic("pmap_check_pv: %p/%lx missing on pp %p", ptp, va, pp);
 	}
+#endif
+}
+
+/*
+ * pmap_treelookup_pv: search the PV tree for a dynamic entry
+ *
+ * => pmap must be locked
+ */
+static struct pv_entry *
+pmap_treelookup_pv(const struct pmap *pmap, const struct vm_page *ptp,
+    const rb_tree_t *tree, const vaddr_t va)
+{
+	struct pv_entry *pve;
+	rb_node_t *node;
 
 	/*
-	 * Search the RB tree for the key.  This is an inlined lookup
-	 * tailored for exactly what's needed here that is quite a bit
-	 * faster than using rb_tree_find_node().
+	 * Inlined lookup tailored for exactly what's needed here that is
+	 * quite a bit faster than using rb_tree_find_node().
 	 */
-	for (;;) {
+	for (node = tree->rbt_root;;) {
 		if (__predict_false(RB_SENTINEL_P(node))) {
 			return NULL;
 		}
 		pve = (struct pv_entry *)
 		    ((uintptr_t)node - offsetof(struct pv_entry, pve_rb));
 		if (pve->pve_pte.pte_va == va) {
+			KASSERT(pve->pve_pte.pte_ptp == ptp);
 			return pve;
 		}
 		node = node->rb_nodes[pve->pve_pte.pte_va < va];
@@ -1950,91 +2023,194 @@ pmap_lookup_pv(struct pmap *pmap, struct
 }
 
 /*
- * pmap_enter_pv: enter a mapping onto a pmap_page lst
+ * pmap_lookup_pv: look up a non-embedded pv entry for the given pmap
  *
- * => caller should adjust ptp's wire_count before calling
- * => caller has preallocated pve for us
- * => if not embedded, tree node must be in place beforehand
+ * => a PV entry must be known present (doesn't check for existence)
+ * => pmap must be locked
  */
 static struct pv_entry *
-pmap_enter_pv(struct pmap *pmap, struct pmap_page *pp, struct pv_entry *pve,
-    struct vm_page *ptp, vaddr_t va)
+pmap_lookup_pv(const struct pmap *pmap, const struct vm_page *ptp,
+    const struct pmap_page * const old_pp, const vaddr_t va)
 {
+	struct pv_entry *pve;
+	const rb_tree_t *tree;
+
+	KASSERT(mutex_owned(&pmap->pm_lock));
+	KASSERT(ptp != NULL || pmap == pmap_kernel());
+
+	/*
+	 * [This mostly deals with the case of process-private pages, i.e.
+	 * anonymous memory allocations or COW.]
+	 *
+	 * If the page is tracked with an embedded entry then the tree
+	 * lookup can be avoided.  It's safe to check for this specific
+	 * set of values without pp_lock because both will only ever be
+	 * set together for this pmap.
+	 *
+	 */
+	if (atomic_load_relaxed(&old_pp->pp_pte.pte_ptp) == ptp &&
+	    atomic_load_relaxed(&old_pp->pp_pte.pte_va) == va) {
+		return NULL;
+	}
+
+	/*
+	 * [This mostly deals with shared mappings, for example shared libs
+	 * and executables.]
+	 *
+	 * Optimise for pmap_remove_all() which works by ascending scan:
+	 * look at the lowest numbered node in the tree first.  The tree is
+	 * known non-empty because of the check above.  For short lived
+	 * processes where pmap_remove() isn't used much this gets close to
+	 * a 100% hit rate.
+	 */
+	tree = (ptp != NULL ? &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb);
+	KASSERT(!RB_SENTINEL_P(tree->rbt_root));
+	pve = (struct pv_entry *)
+	    ((uintptr_t)tree->rbt_minmax[RB_DIR_LEFT] -
+	    offsetof(struct pv_entry, pve_rb));
+	if (__predict_true(pve->pve_pte.pte_va == va)) {
+		KASSERT(pve->pve_pte.pte_ptp == ptp);
+		return pve;
+	}
+
+	/* Search the RB tree for the key (uncommon). */
+	return pmap_treelookup_pv(pmap, ptp, tree, va);
+}
+
+/*
+ * pmap_enter_pv: enter a mapping onto a pmap_page list
+ *
+ * => pmap must be locked
+ * => does NOT insert dynamic entries to tree (pmap_enter() does later)
+ */
+static int
+pmap_enter_pv(struct pmap *pmap, struct pmap_page *pp, struct vm_page *ptp,
+    vaddr_t va, struct pv_entry **new_pve, struct pv_entry **old_pve,
+    bool *samepage, bool *new_embedded, rb_tree_t *tree)
+{
+	struct pv_entry *pve;
+	int error;
 
 	KASSERT(mutex_owned(&pmap->pm_lock));
 	KASSERT(ptp_to_pmap(ptp) == pmap);
-	KASSERT(ptp == NULL || ptp->wire_count >= 2);
 	KASSERT(ptp == NULL || ptp->uobject != NULL);
 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
+	PMAP_CHECK_PP(pp);
 
+	/*
+	 * If entering the same page and it's already tracked with an
+	 * embedded entry, we can avoid the expense below.  It's safe
+	 * to check for this very specific set of values without a lock
+	 * because both will only ever be set together for this pmap.
+	 */
+	if (atomic_load_relaxed(&pp->pp_pte.pte_ptp) == ptp &&
+	    atomic_load_relaxed(&pp->pp_pte.pte_va) == va) {
+		*samepage = true;
+		pmap_check_pv(pmap, ptp, pp, va, true);
+		return 0;
+	}
+
+	/*
+	 * Check for an existing dynamic mapping at this address.  If it's
+	 * for the same page, then it will be reused and nothing needs to be
+	 * changed.
+	 */
+	*old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
+	if (*old_pve != NULL && (*old_pve)->pve_pp == pp) {
+		*samepage = true;
+		pmap_check_pv(pmap, ptp, pp, va, true);
+		return 0;
+	}
+
+	/*
+	 * Need to put a new mapping in place.  Grab a spare pv_entry in
+	 * case it's needed; won't know for sure until the lock is taken.
+	 */
+	if (pmap->pm_pve == NULL) {
+		pmap->pm_pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
+	}
+
+	error = 0;
+	pmap_check_pv(pmap, ptp, pp, va, false);
 	mutex_spin_enter(&pp->pp_lock);
-	if ((pp->pp_pflags & PP_EMBEDDED) == 0) {
-		pp->pp_pflags |= PP_EMBEDDED;
+	if (!pv_pte_embedded(pp)) {
+		/*
+		 * Embedded PV tracking available - easy.
+		 */
 		pp->pp_pte.pte_ptp = ptp;
 		pp->pp_pte.pte_va = va;
-		mutex_spin_exit(&pp->pp_lock);
-		return pve;
+		*new_embedded = true;
+	} else if (__predict_false(pmap->pm_pve == NULL)) {
+		/*
+		 * No memory.
+		 */
+		error = ENOMEM;
+	} else {
+		/*
+		 * Install new pv_entry on the page.
+		 */
+		pve = pmap->pm_pve;
+		pmap->pm_pve = NULL;
+		*new_pve = pve;
+		pve->pve_pte.pte_ptp = ptp;
+		pve->pve_pte.pte_va = va;
+		pve->pve_pp = pp;
+		LIST_INSERT_HEAD(&pp->pp_pvlist, pve, pve_list);
 	}
-
-	KASSERT(pve != NULL);
-	pve->pve_pte.pte_ptp = ptp;
-	pve->pve_pte.pte_va = va;
-	KASSERT(pmap_lookup_pv(pmap, ptp, pp, va) == NULL);
-	LIST_INSERT_HEAD(&pp->pp_pvlist, pve, pve_list);
 	mutex_spin_exit(&pp->pp_lock);
+	pmap_check_pv(pmap, ptp, pp, va, true);
 
-	if (ptp != NULL) {
-		rb_tree_insert_node(&VM_PAGE_TO_PP(ptp)->pp_rb, pve);
-	} else {
-		KASSERT(pmap == pmap_kernel());
-		rb_tree_insert_node(&pmap_kernel_rb, pve);
-	}
-	return NULL;
+	return error;
 }
 
 /*
  * pmap_remove_pv: try to remove a mapping from a pv_list
  *
+ * => pmap must be locked
+ * => removes dynamic entries from tree
  * => caller should adjust ptp's wire_count and free PTP if needed
- * => we don't remove radix tree entry; defer till later (it could block)
- * => we return the removed pve
- * => caller can optionally supply pve, if looked up already
  */
 static void
 pmap_remove_pv(struct pmap *pmap, struct pmap_page *pp, struct vm_page *ptp,
     vaddr_t va, struct pv_entry *pve, uint8_t oattrs)
 {
+	rb_tree_t *tree = (ptp != NULL ?
+	    &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb);
 
 	KASSERT(mutex_owned(&pmap->pm_lock));
 	KASSERT(ptp_to_pmap(ptp) == pmap);
 	KASSERT(ptp == NULL || ptp->uobject != NULL);
 	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
+	KASSERT(ptp != NULL || pmap == pmap_kernel());
+
+	pmap_check_pv(pmap, ptp, pp, va, true);
 
 	mutex_spin_enter(&pp->pp_lock);
 	pp->pp_attrs |= oattrs;
-	if ((pp->pp_pflags & PP_EMBEDDED) != 0 &&
-	    pp->pp_pte.pte_ptp == ptp &&
-	    pp->pp_pte.pte_va == va) {
-	    	KASSERT(pve == NULL);
-		pp->pp_pflags &= ~PP_EMBEDDED;
+	if (pve == NULL) {
+		KASSERT(pp->pp_pte.pte_ptp == ptp);
+		KASSERT(pp->pp_pte.pte_va == va);
 		pp->pp_pte.pte_ptp = NULL;
 		pp->pp_pte.pte_va = 0;
 		mutex_spin_exit(&pp->pp_lock);
 	} else {
-		KASSERT(pve != NULL);
-		KASSERT(pve == pmap_lookup_pv(pmap, ptp, pp, va));
+		KASSERT(pp->pp_pte.pte_ptp != ptp ||
+		    pp->pp_pte.pte_va != va);
 		KASSERT(pve->pve_pte.pte_ptp == ptp);
 		KASSERT(pve->pve_pte.pte_va == va);
+		KASSERT(pve->pve_pp == pp);
 		LIST_REMOVE(pve, pve_list);
 		mutex_spin_exit(&pp->pp_lock);
 
-		if (ptp != NULL) {
-			rb_tree_remove_node(&VM_PAGE_TO_PP(ptp)->pp_rb, pve);
-		} else {
-			KASSERT(pmap == pmap_kernel());
-			rb_tree_remove_node(&pmap_kernel_rb, pve);
-		}
+		KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == pve);
+		rb_tree_remove_node(tree, pve);
+#ifdef DIAGNOSTIC
+		memset(pve, 0, sizeof(*pve));
+#endif
 	}
+
+	KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
+	pmap_check_pv(pmap, ptp, pp, va, false);
 }
 
 /*
@@ -2052,7 +2228,9 @@ pmap_find_ptp(struct pmap *pmap, vaddr_t
 
 	if (pmap->pm_ptphint[lidx] && off == pmap->pm_ptphint[lidx]->offset) {
 		KASSERT(pmap->pm_ptphint[lidx]->wire_count > 0);
-		return pmap->pm_ptphint[lidx];
+		pg = pmap->pm_ptphint[lidx];
+		PMAP_CHECK_PP(VM_PAGE_TO_PP(pg));
+		return pg;
 	}
 	PMAP_DUMMY_LOCK(pmap);
 	pg = uvm_pagelookup(&pmap->pm_obj[lidx], off);
@@ -2061,6 +2239,9 @@ pmap_find_ptp(struct pmap *pmap, vaddr_t
 		/* This page is queued to be freed - ignore. */
 		pg = NULL;
 	}
+	if (pg != NULL) {
+		PMAP_CHECK_PP(VM_PAGE_TO_PP(pg));
+	}
 	pmap->pm_ptphint[lidx] = pg;
 	return pg;
 }
@@ -2077,6 +2258,7 @@ pmap_freepage(struct pmap *pmap, struct 
 	if (pmap->pm_ptphint[lidx] == ptp)
 		pmap->pm_ptphint[lidx] = NULL;
 	ptp->wire_count = 0;
+	pmap_ptp_fini(ptp);
 
 	/*
 	 * Enqueue the PTP to be freed by pmap_update().  We can't remove
@@ -2085,7 +2267,6 @@ pmap_freepage(struct pmap *pmap, struct 
 	 * Instead mark the PTP as free and if we bump into it again, we'll
 	 * either ignore or reuse (depending on what's useful at the time).
 	 */
-	KASSERT(RB_TREE_MIN(&VM_PAGE_TO_PP(ptp)->pp_rb) == NULL);
 	LIST_INSERT_HEAD(&pmap->pm_gc_ptp, ptp, mdpage.mp_pp.pp_link);
 }
 
@@ -2178,14 +2359,12 @@ pmap_get_ptp(struct pmap *pmap, struct p
 			pt->pg[i] = uvm_pagealloc(obj, off, NULL, aflags);
 			pt->alloced[i] = true;
 			if (pt->pg[i] != NULL) {
-				rb_tree_init(&VM_PAGE_TO_PP(pt->pg[i])->pp_rb,
-				    &pmap_rbtree_ops);
+				pmap_ptp_init(pt->pg[i]);
 			}
 		} else if (pt->pg[i]->wire_count == 0) {
 			/* This page was queued to be freed; dequeue it. */
 			LIST_REMOVE(pt->pg[i], mdpage.mp_pp.pp_link);
-			rb_tree_init(&VM_PAGE_TO_PP(pt->pg[i])->pp_rb,
-			    &pmap_rbtree_ops);
+			pmap_ptp_init(pt->pg[i]);
 		}
 		PMAP_DUMMY_UNLOCK(pmap);
 		if (pt->pg[i] == NULL) {
@@ -2292,8 +2471,10 @@ pmap_unget_ptp(struct pmap *pmap, struct
 			continue;
 		}
 		KASSERT(pt->pg[i]->wire_count == 0);
+		PMAP_CHECK_PP(VM_PAGE_TO_PP(pt->pg[i]));
 		/* pmap zeros all pages before freeing. */
 		pt->pg[i]->flags |= PG_ZERO; 
+		pmap_ptp_fini(pt->pg[i]);
 		PMAP_DUMMY_LOCK(pmap);
 		uvm_pagefree(pt->pg[i]);
 		PMAP_DUMMY_UNLOCK(pmap);
@@ -2488,7 +2669,7 @@ pmap_ctor(void *arg, void *obj, int flag
 	kcpuset_create(&pmap->pm_xen_ptp_cpus, true);
 #endif
 	LIST_INIT(&pmap->pm_gc_ptp);
-	pmap->pm_remove_all = NULL;
+	pmap->pm_pve = NULL;
 
 	/* allocate and init PDP */
 	pmap->pm_pdir = pool_get(&pmap_pdp_pool, PR_WAITOK);
@@ -2521,6 +2702,10 @@ pmap_dtor(void *arg, void *obj)
 {
 	struct pmap *pmap = obj;
 
+	if (pmap->pm_pve != NULL) {
+		pool_cache_put(&pmap_pv_cache, pmap->pm_pve);
+	}
+
 	mutex_enter(&pmaps_lock);
 	LIST_REMOVE(pmap, pm_list);
 	mutex_exit(&pmaps_lock);
@@ -2637,26 +2822,28 @@ pmap_destroy(struct pmap *pmap)
 {
 	int i;
 
-	/* Undo pmap_remove_all(). */
-	if (pmap->pm_remove_all == curlwp) {
-		pmap_update(pmap);
-	}
-
 	/*
-	 * drop reference count
+	 * drop reference count and verify not in use.
 	 */
 
 	if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) {
 		return;
 	}
-
 	pmap_check_inuse(pmap);
 
 	/*
+	 * XXX handle deferred PTP page free for EPT.  ordinarily this is
+	 * taken care of by pmap_remove_all().  once shared with EPT this
+	 * can go away.
+	 */
+	if (__predict_false(!LIST_EMPTY(&pmap->pm_gc_ptp))) {
+		pmap_update(pmap);
+	}
+
+	/*
 	 * Reference count is zero, free pmap resources and then free pmap.
 	 */
 
-	KASSERT(pmap->pm_remove_all == NULL);
 	pmap_check_ptps(pmap);
 	KASSERT(LIST_EMPTY(&pmap->pm_gc_ptp));
 
@@ -2697,20 +2884,85 @@ pmap_destroy(struct pmap *pmap)
 }
 
 /*
- * pmap_remove_all: pmap is being torn down by the current thread.
- * avoid unnecessary invalidations.
+ * pmap_remove_all: remove all mappings from pmap in bulk.
+ *
+ * Ordinarily when removing mappings it's important to hold the UVM object's
+ * lock, so that pages do not gain a new identity while retaining stale TLB
+ * entries (the same lock hold covers both pmap_remove() and pmap_update()). 
+ * Here it's known that the address space is no longer visible to any user
+ * process, so we don't need to worry about that.
  */
 bool
 pmap_remove_all(struct pmap *pmap)
 {
+	struct vm_page *ptps[32];
+	vaddr_t va, blkendva;
+	struct pmap *pmap2;
+	pt_entry_t *ptes;
+	pd_entry_t pde __diagused;
+	pd_entry_t * const *pdes;
+	struct pv_entry *pv_tofree;
+	int lvl __diagused, i, n;
 
-	/*
-	 * No locking needed; at this point it should only ever be checked
-	 * by curlwp.
-	 */
-	KASSERT(pmap->pm_remove_all == NULL);
-	pmap->pm_remove_all = curlwp;
-	return false;
+	/* XXX Can't handle EPT just yet. */
+	if (pmap->pm_remove != NULL) {
+		return false;
+	}
+ 
+	for (;;) {
+		/* Fetch a block of PTPs from tree. */
+		mutex_enter(&pmap->pm_lock);
+		n = radix_tree_gang_lookup_node(&pmap->pm_obj[0].uo_pages, 0,
+		    (void **)ptps, __arraycount(ptps), false);
+		if (n == 0) {
+			mutex_exit(&pmap->pm_lock);
+			break;
+		}
+
+		/* Remove all mappings in the set of PTPs. */
+		pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
+		pv_tofree = NULL;
+		for (i = 0; i < n; i++) {
+			if (ptps[i]->wire_count == 0) {
+				/* It's dead: pmap_update() will expunge. */
+				continue;
+			}
+
+			/* Determine range of block. */
+			va = ptps[i]->offset * PAGE_SIZE / sizeof(pt_entry_t);
+			blkendva = x86_round_pdr(va + 1);
+
+			/* Make sure everything squares up... */
+			KASSERT(pmap_pdes_valid(va, pdes, &pde, &lvl));
+			KASSERT(lvl == 1);
+			KASSERT(pmap_find_ptp(pmap, va, 1) == ptps[i]);
+
+			/* Zap! */
+			pmap_remove_ptes(pmap, ptps[i],
+			    (vaddr_t)&ptes[pl1_i(va)], va,
+			    blkendva, &pv_tofree);
+
+			/* PTP should now be unused - free it. */
+			KASSERT(ptps[i]->wire_count == 1);
+			pmap_free_ptp(pmap, ptps[i], va, ptes, pdes);
+		}
+		pmap_unmap_ptes(pmap, pmap2);
+		pmap_free_pvs(pmap, pv_tofree);
+		mutex_exit(&pmap->pm_lock);
+
+		/* Process deferred frees. */
+		pmap_update(pmap);
+
+		/* A breathing point. */
+		preempt_point();
+	}
+
+	/* Verify that the pmap is now completely empty. */
+	pmap_check_ptps(pmap);
+	KASSERTMSG(pmap->pm_stats.resident_count == PDP_SIZE,
+	    "pmap %p not empty", pmap);
+
+	return true;
 }
 
 #if defined(PMAP_FORK)
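
The new pmap_remove_all() above works in batches: it repeatedly pulls up
to 32 PTPs out of the pmap's radix tree, tears their mappings down with
the pmap locked, then drops the lock, flushes, and yields.  A condensed
sketch of that loop shape, paraphrased from the diff with the per-PTP
details elided:

	/*
	 * Sketch of the batching pattern: gang-look-up a bounded set of
	 * PTPs, process them under pm_lock, then let deferred frees and
	 * TLB shootdowns happen via pmap_update() before the next batch.
	 */
	for (;;) {
		struct vm_page *ptps[32];
		int n;

		mutex_enter(&pmap->pm_lock);
		n = radix_tree_gang_lookup_node(&pmap->pm_obj[0].uo_pages, 0,
		    (void **)ptps, __arraycount(ptps), false);
		if (n == 0) {
			mutex_exit(&pmap->pm_lock);
			break;
		}
		/* ... remove PTEs covered by ptps[0..n-1], free the PTPs ... */
		mutex_exit(&pmap->pm_lock);
		pmap_update(pmap);	/* deferred PTP frees + shootdowns */
		preempt_point();	/* a breathing point between batches */
	}
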
@@ -2952,7 +3204,7 @@ pmap_reactivate(struct pmap *pmap)
 	ci->ci_tlbstate = TLBSTATE_VALID;
 	KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid));
 
-	if (kcpuset_isset(pmap->pm_cpus, cid)) {
+	if (__predict_true(kcpuset_isset(pmap->pm_cpus, cid))) {
 		/* We have the reference, state is valid. */
 	} else {
 		/*
@@ -3542,6 +3794,12 @@ pmap_remove_ptes(struct pmap *pmap, stru
 	KASSERT(kpreempt_disabled());
 
 	/*
+	 * mappings are very often sparse, so clip the given range to the
+	 * range of PTEs that are known present in the PTP.
+	 */
+	pmap_ptp_range_clip(ptp, &startva, &pte);
+
+	/*
 	 * note that ptpva points to the PTE that maps startva.   this may
 	 * or may not be the first PTE in the PTP.
 	 *
@@ -3641,6 +3899,8 @@ pmap_remove_pte(struct pmap *pmap, struc
 		KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL),
 		    "pv-tracked page without PTE_PVLIST for %#"PRIxVADDR, va);
 #endif
+		KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ?
+		    &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), va) == NULL);
 		return true;
 	}
 
@@ -3753,8 +4013,7 @@ pmap_remove(struct pmap *pmap, vaddr_t s
 	pmap_unmap_ptes(pmap, pmap2);
 	/*
 	 * Now safe to free, as we no longer have the PTEs mapped and can
-	 * block again.  Radix tree nodes are removed here, so we need to
-	 * continue holding the pmap locked until complete.
+	 * block again.
 	 */
 	if (pv_tofree != NULL) {
 		pmap_free_pvs(pmap, pv_tofree);
@@ -3889,20 +4148,36 @@ pmap_pp_remove(struct pmap_page *pp, pad
 {
 	struct pv_pte *pvpte;
 	struct vm_page *ptp;
+	uintptr_t sum;
 	uint8_t oattrs;
 	bool locked;
-	int count;
 
-	count = SPINLOCK_BACKOFF_MIN;
+	/*
+	 * Do an unlocked check to see if the page has no mappings, eg when
+	 * pmap_remove_all() was called before amap_wipeout() for a process
+	 * private amap - common.  The page being removed must be on the way
+	 * out, so we don't have to worry about concurrent attempts to enter
+	 * it (otherwise the caller either doesn't care or has screwed up).
+	 */
+	sum = (uintptr_t)atomic_load_relaxed(&pp->pp_pte.pte_va);
+	sum |= (uintptr_t)atomic_load_relaxed(&pp->pp_pte.pte_ptp);
+	sum |= (uintptr_t)atomic_load_relaxed(&pp->pp_pvlist.lh_first);
+	if (sum == 0) {
+	    	return;
+	}
+
 	kpreempt_disable();
-startover:
-	mutex_spin_enter(&pp->pp_lock);
-	while ((pvpte = pv_pte_first(pp)) != NULL) {
+	for (;;) {
 		struct pmap *pmap;
 		struct pv_entry *pve;
 		pt_entry_t opte;
 		vaddr_t va;
-		int error;
+
+		mutex_spin_enter(&pp->pp_lock);
+		if ((pvpte = pv_pte_first(pp)) == NULL) {
+			mutex_spin_exit(&pp->pp_lock);
+			break;
+		}
 
 		/*
 		 * Add a reference to the pmap before clearing the pte.
@@ -3930,23 +4205,37 @@ startover:
 			if (ptp != NULL) {
 				pmap_destroy(pmap);
 			}
-			goto startover;
+			continue;
 		}
-			
-		error = pmap_sync_pv(pvpte, pa, ~0, &oattrs, &opte);
-		if (error == EAGAIN) {
-			int hold_count;
-			KERNEL_UNLOCK_ALL(curlwp, &hold_count);
-			mutex_exit(&pmap->pm_lock);
-			if (ptp != NULL) {
-				pmap_destroy(pmap);
-			}
-			SPINLOCK_BACKOFF(count);
-			KERNEL_LOCK(hold_count, curlwp);
-			goto startover;
+		va = pvpte->pte_va;
+
+		KASSERTMSG(pmap->pm_stats.resident_count > PDP_SIZE,
+		    "va %lx pmap %p ptp %p is empty", va, pmap, ptp);
+		KASSERTMSG(ptp == NULL || (ptp->flags & PG_FREE) == 0,
+		    "va %lx pmap %p ptp %p is free", va, pmap, ptp);
+		KASSERTMSG(ptp == NULL || ptp->wire_count > 1,
+		    "va %lx pmap %p ptp %p is empty", va, pmap, ptp);
+		    
+#ifdef DIAGNOSTIC /* XXX Too expensive; make DEBUG before April 2020 */
+		pmap_check_pv(pmap, ptp, pp, pvpte->pte_va, true);
+		rb_tree_t *tree = (ptp != NULL ?
+		    &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb);
+		pve = pmap_treelookup_pv(pmap, ptp, tree, va);
+		if (pve == NULL) {
+			KASSERTMSG(&pp->pp_pte == pvpte,
+			    "va %lx pmap %p ptp %p pvpte %p pve %p oops 1",
+			    va, pmap, ptp, pvpte, pve);
+		} else {
+			KASSERTMSG(&pve->pve_pte == pvpte,
+			    "va %lx pmap %p ptp %p pvpte %p pve %p oops 2",
+			    va, pmap, ptp, pvpte, pve);
+		}
+#endif
+
+		if (pmap_sync_pv(pvpte, pa, ~0, &oattrs, &opte)) {
+			panic("pmap_pp_remove: mapping not present");
 		}
 
-		va = pvpte->pte_va;
 		pve = pmap_lookup_pv(pmap, ptp, pp, va);
 		pmap_remove_pv(pmap, pp, ptp, va, pve, oattrs);
 
@@ -3964,21 +4253,15 @@ startover:
 			pmap_stats_update_bypte(pmap, 0, opte);
 		}
 		if (pve != NULL) {
-			/*
-			 * Must free pve, and remove from PV tree with the
-			 * pmap's lock still held.
-			 */
 			pve->pve_next = NULL;
 			pmap_free_pvs(pmap, pve);
 		}
+		pmap_tlb_shootnow();
 		mutex_exit(&pmap->pm_lock);
 		if (ptp != NULL) {
 			pmap_destroy(pmap);
 		}
-		mutex_spin_enter(&pp->pp_lock);
 	}
-	mutex_spin_exit(&pp->pp_lock);
-	pmap_tlb_shootnow();
 	kpreempt_enable();
 }
 
@@ -4028,6 +4311,7 @@ pmap_test_attrs(struct vm_page *pg, unsi
 {
 	struct pmap_page *pp;
 	struct pv_pte *pvpte;
+	struct pmap *pmap;
 	uint8_t oattrs;
 	u_int result;
 	paddr_t pa;
@@ -4037,17 +4321,29 @@ pmap_test_attrs(struct vm_page *pg, unsi
 		return true;
 	}
 	pa = VM_PAGE_TO_PHYS(pg);
+ startover:
 	mutex_spin_enter(&pp->pp_lock);
 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
-		int error;
-
 		if ((pp->pp_attrs & testbits) != 0) {
 			break;
 		}
-		error = pmap_sync_pv(pvpte, pa, 0, &oattrs, NULL);
-		if (error == 0) {
-			pp->pp_attrs |= oattrs;
+		if (pmap_sync_pv(pvpte, pa, 0, &oattrs, NULL)) {
+			/*
+			 * raced with a V->P operation.  wait for the other
+			 * side to finish by acquring pmap's lock.  if no
+			 * side to finish by acquiring the pmap's lock.  if no
+			 * go unseen.
+			 */
+			pmap = ptp_to_pmap(pvpte->pte_ptp);
+			pmap_reference(pmap);
+			mutex_spin_exit(&pp->pp_lock);
+			mutex_enter(&pmap->pm_lock);
+			/* nothing. */
+			mutex_exit(&pmap->pm_lock);
+			pmap_destroy(pmap);
+			goto startover;
 		}
+		pp->pp_attrs |= oattrs;
 	}
 	result = pp->pp_attrs & testbits;
 	mutex_spin_exit(&pp->pp_lock);
@@ -4064,23 +4360,27 @@ static bool
 pmap_pp_clear_attrs(struct pmap_page *pp, paddr_t pa, unsigned clearbits)
 {
 	struct pv_pte *pvpte;
+	struct pmap *pmap;
 	uint8_t oattrs;
 	u_int result;
-	int count;
 
-	count = SPINLOCK_BACKOFF_MIN;
-	mutex_spin_enter(&pp->pp_lock);
 startover:
+	mutex_spin_enter(&pp->pp_lock);
 	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
-		int error;
-
-		error = pmap_sync_pv(pvpte, pa, clearbits, &oattrs, NULL);
-		if (error == EAGAIN) {
-			int hold_count;
+		if (pmap_sync_pv(pvpte, pa, clearbits, &oattrs, NULL)) {
+			/*
+			 * raced with a V->P operation.  wait for the other
+			 * side to finish by acquiring the pmap's lock.  it is
+			 * probably unmapping the page, and it will be gone
+			 * when the loop is restarted.
+			 */
+			pmap = ptp_to_pmap(pvpte->pte_ptp);
+			pmap_reference(pmap);
 			mutex_spin_exit(&pp->pp_lock);
-			KERNEL_UNLOCK_ALL(curlwp, &hold_count);
-			SPINLOCK_BACKOFF(count);
-			KERNEL_LOCK(hold_count, curlwp);
+			mutex_enter(&pmap->pm_lock);
+			/* nothing. */
+			mutex_exit(&pmap->pm_lock);
+			pmap_destroy(pmap);
 			goto startover;
 		}
 		pp->pp_attrs |= oattrs;
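
Both pmap_test_attrs() and pmap_pp_clear_attrs() now handle a lost race
with a V->P operation the same way: instead of spinning with backoff, they
take a reference on the owning pmap, drop pp_lock, and briefly
acquire/release pm_lock so the competing operation finishes before the
scan restarts.  A minimal sketch of that wait step (helper name invented
for illustration):

/*
 * Sketch: wait out a concurrent V->P operation on the pmap that owns
 * pvpte, then return so the caller can restart its pp_pvlist scan.
 */
static void
pmap_wait_for_vp_op(struct pmap_page *pp, struct pv_pte *pvpte)
{
	struct pmap *pmap = ptp_to_pmap(pvpte->pte_ptp);

	pmap_reference(pmap);
	mutex_spin_exit(&pp->pp_lock);
	mutex_enter(&pmap->pm_lock);	/* other side holds this until done */
	mutex_exit(&pmap->pm_lock);
	pmap_destroy(pmap);		/* drop the reference taken above */
}
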
@@ -4175,8 +4475,6 @@ pmap_write_protect(struct pmap *pmap, va
 	vaddr_t blockend, va;
 	int lvl, i;
 
-	KASSERT(pmap->pm_remove_all == NULL);
-
 	if (__predict_false(pmap->pm_write_protect != NULL)) {
 		(*pmap->pm_write_protect)(pmap, sva, eva, prot);
 		return;
@@ -4195,7 +4493,8 @@ pmap_write_protect(struct pmap *pmap, va
 
 	/*
 	 * Acquire pmap.  No need to lock the kernel pmap as we won't
-	 * be touching the pvmap nor the stats.
+	 * be touching PV entries or stats, and kernel PDEs aren't
+	 * freed.
 	 */
 	if (pmap != pmap_kernel()) {
 		mutex_enter(&pmap->pm_lock);
@@ -4335,14 +4634,14 @@ pmap_enter_ma(struct pmap *pmap, vaddr_t
 	struct vm_page *new_pg, *old_pg;
 	struct pmap_page *new_pp, *old_pp;
 	struct pv_entry *old_pve, *new_pve;
-	int error;
 	bool wired = (flags & PMAP_WIRED) != 0;
 	struct pmap *pmap2;
 	struct pmap_ptparray pt;
-	bool getptp;
+	int error;
+	bool getptp, samepage, new_embedded;
+	rb_tree_t *tree;
 
 	KASSERT(pmap_initialized);
-	KASSERT(pmap->pm_remove_all == NULL);
 	KASSERT(va < VM_MAX_KERNEL_ADDRESS);
 	KASSERTMSG(va != (vaddr_t)PDP_BASE, "%s: trying to map va=%#"
 	    PRIxVADDR " over PDP!", __func__, va);
@@ -4377,13 +4676,16 @@ pmap_enter_ma(struct pmap *pmap, vaddr_t
 	else
 #endif
 		new_pg = PHYS_TO_VM_PAGE(pa);
+		
 	if (new_pg != NULL) {
 		/* This is a managed page */
 		npte |= PTE_PVLIST;
 		new_pp = VM_PAGE_TO_PP(new_pg);
+		PMAP_CHECK_PP(new_pp);
 	} else if ((new_pp = pmap_pv_tracked(pa)) != NULL) {
 		/* This is an unmanaged pv-tracked page */
 		npte |= PTE_PVLIST;
+		PMAP_CHECK_PP(new_pp);
 	} else {
 		new_pp = NULL;
 	}
@@ -4408,18 +4710,36 @@ pmap_enter_ma(struct pmap *pmap, vaddr_t
 				    error);
 			}
 		}
+		tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
+	} else {
+		/* Embedded PV entries rely on this. */
+		KASSERT(va != 0);
+		tree = &pmap_kernel_rb;
 	}
 
 	/*
-	 * Now check to see if we need a pv entry for this VA.  If we do,
-	 * allocate and install in the PV tree.  In any case look up the
-	 * pv entry in case the old mapping used it.
+	 * Look up the old PV entry at this VA (if any), and insert a new PV
+	 * entry if required for the new mapping.  Temporarily track the old
+	 * and new mappings concurrently.  Only after the old mapping is
+	 * evicted from the pmap will we remove its PV entry.  Otherwise,
+	 * our picture of modified/accessed state for either page could get
+	 * out of sync (we need any P->V operation for either page to stall
+	 * on pmap->pm_lock until done here).
 	 */
-	old_pve = NULL;
 	new_pve = NULL;
-	if (pmap_pp_needs_pve(new_pp, ptp, va)) {
-		new_pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
-		if (new_pve == NULL) {
+	old_pve = NULL;
+	samepage = false;
+	new_embedded = false;
+
+    	if (new_pp != NULL) {
+    		error = pmap_enter_pv(pmap, new_pp, ptp, va, &new_pve,
+    		    &old_pve, &samepage, &new_embedded, tree);
+
+		/*
+		 * If a new pv_entry was needed and none was available, we
+		 * can go no further.
+		 */
+		if (error != 0) {
 			if (flags & PMAP_CANFAIL) {
 				if (getptp) {
 					pmap_unget_ptp(pmap, &pt);
@@ -4429,6 +4749,8 @@ pmap_enter_ma(struct pmap *pmap, vaddr_t
 			}
 			panic("%s: alloc pve failed", __func__);
 		}
+	} else {
+		old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
 	}
 
 	/* Map PTEs into address space. */
@@ -4469,11 +4791,27 @@ pmap_enter_ma(struct pmap *pmap, vaddr_t
 			    vtomach((vaddr_t)ptep), npte, domid);
 			splx(s);
 			if (error) {
+				/* Undo pv_entry tracking - oof. */
+				if (new_pp != NULL) {
+					mutex_spin_enter(&new_pp->pp_lock);
+					if (new_pve != NULL) {
+						LIST_REMOVE(new_pve, pve_list);
+						KASSERT(pmap->pm_pve == NULL);
+						pmap->pm_pve = new_pve;
+					} else if (new_embedded) {
+						new_pp->pp_pte.pte_ptp = NULL;
+						new_pp->pp_pte.pte_va = 0;
+					}
+					mutex_spin_exit(&new_pp->pp_lock);
+				}
+				pmap_unmap_ptes(pmap, pmap2);
+				/* Free new PTP. */
 				if (ptp != NULL && ptp->wire_count <= 1) {
 					pmap_free_ptp(pmap, ptp, va, ptes,
 					    pdes);
 				}
-				goto out;
+				mutex_exit(&pmap->pm_lock);
+				return error;
 			}
 			break;
 		}
@@ -4481,11 +4819,20 @@ pmap_enter_ma(struct pmap *pmap, vaddr_t
 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
 
 	/*
+	 * Done with the PTEs: they can now be unmapped.
+	 */
+	pmap_unmap_ptes(pmap, pmap2);
+
+	/*
 	 * Update statistics and PTP's reference count.
 	 */
 	pmap_stats_update_bypte(pmap, npte, opte);
-	if (ptp != NULL && !have_oldpa) {
-		ptp->wire_count++;
+	if (ptp != NULL) {
+		if (!have_oldpa) {
+			ptp->wire_count++;
+		}
+		/* Remember minimum VA in PTP. */
+		pmap_ptp_range_set(ptp, va);
 	}
 	KASSERT(ptp == NULL || ptp->wire_count > 1);
 
@@ -4494,7 +4841,13 @@ pmap_enter_ma(struct pmap *pmap, vaddr_t
 	 */
 	if (((opte ^ npte) & (PTE_FRAME | PTE_P)) == 0) {
 		KASSERT(((opte ^ npte) & PTE_PVLIST) == 0);
+		if ((npte & PTE_PVLIST) != 0) {
+			KASSERT(samepage);
+			pmap_check_pv(pmap, ptp, new_pp, va, true);
+		}
 		goto same_pa;
+	} else if ((npte & PTE_PVLIST) != 0) {
+		KASSERT(!samepage);
 	}
 
 	/*
@@ -4510,16 +4863,28 @@ pmap_enter_ma(struct pmap *pmap, vaddr_t
 			    __func__, va, oldpa, atop(pa));
 		}
 
-		old_pve = pmap_lookup_pv(pmap, ptp, old_pp, va);
 		pmap_remove_pv(pmap, old_pp, ptp, va, old_pve,
 		    pmap_pte_to_pp_attrs(opte));
+		if (old_pve != NULL) {
+			if (pmap->pm_pve == NULL) {
+				pmap->pm_pve = old_pve;
+			} else {
+				pool_cache_put(&pmap_pv_cache, old_pve);
+			}
+		}
+	} else {
+		KASSERT(old_pve == NULL);
+		KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
 	}
 
 	/*
-	 * If new page is pv-tracked, insert pv_entry into its list.
+	 * If new page is dynamically PV tracked, insert to tree.
 	 */
-	if (new_pp) {
-		new_pve = pmap_enter_pv(pmap, new_pp, new_pve, ptp, va);
+	if (new_pve != NULL) {
+		KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
+		old_pve = rb_tree_insert_node(tree, new_pve);
+		KASSERT(old_pve == new_pve);
+		pmap_check_pv(pmap, ptp, new_pp, va, true);
 	}
 
 same_pa:
@@ -4531,20 +4896,8 @@ same_pa:
 	    ((opte ^ npte) & (PTE_FRAME | PTE_W)) != 0) {
 		pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_ENTER);
 	}
-
-	error = 0;
-#if defined(XENPV)
-out:
-#endif
-	pmap_unmap_ptes(pmap, pmap2);
-	if (old_pve != NULL) {
-		pool_cache_put(&pmap_pv_cache, old_pve);
-	}
-	if (new_pve != NULL) {
-		pool_cache_put(&pmap_pv_cache, new_pve);
-	}
 	mutex_exit(&pmap->pm_lock);
-	return error;
+	return 0;
 }
 
 paddr_t
@@ -4863,20 +5216,10 @@ pmap_update(struct pmap *pmap)
 	struct vm_page *ptp;
 
 	/*
-	 * If pmap_remove_all() was in effect, re-enable invalidations from
-	 * this point on; issue a shootdown for all the mappings just
-	 * removed.
-	 */
-	kpreempt_disable();
-	if (pmap->pm_remove_all == curlwp) {
-		pmap->pm_remove_all = NULL;
-		pmap_tlb_shootdown(pmap, (vaddr_t)-1LL, 0, TLBSHOOT_UPDATE);
-	}
-
-	/*
 	 * Initiate any pending TLB shootdowns.  Wait for them to
 	 * complete before returning control to the caller.
 	 */
+	kpreempt_disable();
 	pmap_tlb_shootnow();
 	kpreempt_enable();
 
@@ -4885,7 +5228,7 @@ pmap_update(struct pmap *pmap)
 	 * is an unlocked check, but is safe as we're only interested in
 	 * work done in this LWP - we won't get a false negative.
 	 */
-	if (!LIST_EMPTY(&pmap->pm_gc_ptp)) {
+	if (__predict_false(!LIST_EMPTY(&pmap->pm_gc_ptp))) {
 		mutex_enter(&pmap->pm_lock);
 		while ((ptp = LIST_FIRST(&pmap->pm_gc_ptp)) != NULL) {
 			KASSERT(ptp->wire_count == 0);
@@ -4893,7 +5236,9 @@ pmap_update(struct pmap *pmap)
 			pp = VM_PAGE_TO_PP(ptp);
 			LIST_INIT(&pp->pp_pvlist);
 			pp->pp_attrs = 0;
-			pp->pp_pflags = 0;
+			pp->pp_pte.pte_ptp = NULL;
+			pp->pp_pte.pte_va = 0;
+			PMAP_CHECK_PP(VM_PAGE_TO_PP(ptp));
 
 			/*
 			 * XXX Hack to avoid extra locking, and lock
@@ -5248,10 +5593,10 @@ pmap_ept_enter(struct pmap *pmap, vaddr_
 	bool accessed;
 	struct pmap_ptparray pt;
 	int error;
-	bool getptp;
+	bool getptp, samepage, new_embedded;
+	rb_tree_t *tree;
 
 	KASSERT(pmap_initialized);
-	KASSERT(pmap->pm_remove_all == NULL);
 	KASSERT(va < VM_MAXUSER_ADDRESS);
 
 	npte = pa | pmap_ept_prot(prot) | pmap_ept_type(flags);
@@ -5298,18 +5643,36 @@ pmap_ept_enter(struct pmap *pmap, vaddr_
 				    error);
 			}
 		}
+		tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
+	} else {
+		/* Embedded PV entries rely on this. */
+		KASSERT(va != 0);
+		tree = &pmap_kernel_rb;
 	}
 
 	/*
-	 * Now check to see if we need a pv entry for this VA.  If we do,
-	 * allocate and install in the radix tree.  In any case look up the
-	 * pv entry in case the old mapping used it.
+	 * Look up the old PV entry at this VA (if any), and insert a new PV
+	 * entry if required for the new mapping.  Temporarily track the old
+	 * and new mappings concurrently.  Only after the old mapping is
+	 * evicted from the pmap will we remove its PV entry.  Otherwise,
+	 * our picture of modified/accessed state for either page could get
+	 * out of sync (we need any P->V operation for either page to stall
+	 * on pmap->pm_lock until done here).
 	 */
-	old_pve = NULL;
 	new_pve = NULL;
-	if (pmap_pp_needs_pve(new_pp, ptp, va)) {
-		new_pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT);
-		if (new_pve == NULL) {
+	old_pve = NULL;
+	samepage = false;
+	new_embedded = false;
+
+    	if (new_pp != NULL) {
+    		error = pmap_enter_pv(pmap, new_pp, ptp, va, &new_pve,
+    		    &old_pve, &samepage, &new_embedded, tree);
+
+		/*
+		 * If a new pv_entry was needed and none was available, we
+		 * can go no further.
+		 */
+		if (error != 0) {
 			if (flags & PMAP_CANFAIL) {
 				if (getptp) {
 					pmap_unget_ptp(pmap, &pt);
@@ -5318,7 +5681,9 @@ pmap_ept_enter(struct pmap *pmap, vaddr_
 				return error;
 			}
 			panic("%s: alloc pve failed", __func__);
-		}	
+		}
+	} else {
+		old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
 	}
 
 	/* Map PTEs into address space. */
@@ -5329,12 +5694,7 @@ pmap_ept_enter(struct pmap *pmap, vaddr_
 		pmap_ept_install_ptp(pmap, &pt, va);
 	}
 
-	/*
-	 * Check if there is an existing mapping.  If we are now sure that
-	 * we need pves and we failed to allocate them earlier, handle that.
-	 * Caching the value of oldpa here is safe because only the mod/ref
-	 * bits can change while the pmap is locked.
-	 */
+	/* Check if there is an existing mapping. */
 	ptes = (pt_entry_t *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp));
 	ptep = &ptes[pl1_pi(va)];
 	opte = *ptep;
@@ -5356,11 +5716,20 @@ pmap_ept_enter(struct pmap *pmap, vaddr_
 	} while (pmap_pte_cas(ptep, opte, npte) != opte);
 
 	/*
+	 * Done with the PTEs: they can now be unmapped.
+	 */
+	kpreempt_enable();
+
+	/*
 	 * Update statistics and PTP's reference count.
 	 */
 	pmap_ept_stats_update_bypte(pmap, npte, opte);
-	if (ptp != NULL && !have_oldpa) {
-		ptp->wire_count++;
+	if (ptp != NULL) {
+		if (!have_oldpa) {
+			ptp->wire_count++;
+		}
+		/* Remember minimum VA in PTP. */
+		pmap_ptp_range_set(ptp, va);
 	}
 	KASSERT(ptp == NULL || ptp->wire_count > 1);
 
@@ -5369,11 +5738,17 @@ pmap_ept_enter(struct pmap *pmap, vaddr_
 	 */
 	if (((opte ^ npte) & (PTE_FRAME | EPT_R)) == 0) {
 		KASSERT(((opte ^ npte) & EPT_PVLIST) == 0);
+		if ((npte & EPT_PVLIST) != 0) {
+			KASSERT(samepage);
+			pmap_check_pv(pmap, ptp, new_pp, va, true);
+		}
 		goto same_pa;
+	} else if ((npte & EPT_PVLIST) != 0) {
+		KASSERT(!samepage);
 	}
 
 	/*
-	 * If old page is pv-tracked, replace pv_entry from its list.
+	 * If old page is pv-tracked, remove pv_entry from its list.
 	 */
 	if ((~opte & (EPT_R | EPT_PVLIST)) == 0) {
 		if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) {
@@ -5385,19 +5760,35 @@ pmap_ept_enter(struct pmap *pmap, vaddr_
 			    __func__, va, oldpa, atop(pa));
 		}
 
-		old_pve = pmap_lookup_pv(pmap, ptp, old_pp, va);
 		pmap_remove_pv(pmap, old_pp, ptp, va, old_pve,
 		    pmap_ept_to_pp_attrs(opte));
+		if (old_pve != NULL) {
+			if (pmap->pm_pve == NULL) {
+				pmap->pm_pve = old_pve;
+			} else {
+				pool_cache_put(&pmap_pv_cache, old_pve);
+			}
+		}
+	} else {
+		KASSERT(old_pve == NULL);
+		KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
 	}
 
 	/*
-	 * If new page is pv-tracked, insert pv_entry into its list.
+	 * If new page is dynamically PV tracked, insert to tree.
 	 */
-	if (new_pp) {
-		new_pve = pmap_enter_pv(pmap, new_pp, new_pve, ptp, va);
+	if (new_pve != NULL) {
+		KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
+		old_pve = rb_tree_insert_node(tree, new_pve);
+		KASSERT(old_pve == new_pve);
+		pmap_check_pv(pmap, ptp, new_pp, va, true);
 	}
 
 same_pa:
+	/*
+	 * shootdown tlb if necessary.
+	 */
+
 	if (pmap_ept_has_ad) {
 		accessed = (~opte & (EPT_R | EPT_A)) == 0;
 	} else {
@@ -5406,18 +5797,8 @@ same_pa:
 	if (accessed && ((opte ^ npte) & (PTE_FRAME | EPT_W)) != 0) {
 		pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_ENTER);
 	}
-
-	error = 0;
-	kpreempt_enable();
-	if (old_pve != NULL) {
-		pool_cache_put(&pmap_pv_cache, old_pve);
-	}
-	if (new_pve != NULL) {
-		pool_cache_put(&pmap_pv_cache, new_pve);
-	}
 	mutex_exit(&pmap->pm_lock);
-
-	return error;
+	return 0;
 }
 
 /* Pay close attention, this returns L2. */
@@ -5541,6 +5922,8 @@ pmap_ept_remove_pte(struct pmap *pmap, s
 		    "managed page without EPT_PVLIST for %#"PRIxVADDR, va);
 		KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL),
 		    "pv-tracked page without EPT_PVLIST for %#"PRIxVADDR, va);
+		KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ?
+		    &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), va) == NULL);
 		return true;
 	}
 
@@ -5575,6 +5958,12 @@ pmap_ept_remove_ptes(struct pmap *pmap, 
 	KASSERT(kpreempt_disabled());
 
 	/*
+	 * mappings are very often sparse, so clip the given range to the
+	 * range of PTEs that are known present in the PTP.
+	 */
+	pmap_ptp_range_clip(ptp, &startva, &pte);
+
+	/*
 	 * note that ptpva points to the PTE that maps startva.   this may
 	 * or may not be the first PTE in the PTP.
 	 *
@@ -5636,10 +6025,6 @@ pmap_ept_remove(struct pmap *pmap, vaddr
 	}
 
 	kpreempt_enable();
-	/*
-	 * Radix tree nodes are removed here, so we need to continue holding
-	 * the pmap locked until complete.
-	 */
 	if (pv_tofree != NULL) {
 		pmap_free_pvs(pmap, pv_tofree);
 	}
