Module Name:	src
Committed By:	ad
Date:		Tue Mar 17 22:29:19 UTC 2020
Modified Files:
	src/sys/arch/x86/include: pmap.h pmap_pv.h
	src/sys/arch/x86/x86: pmap.c

Log Message:
Hallelujah, the bug has been found. Resurrect the prior changes, to be
fixed with a following commit.

To generate a diff of this commit:
cvs rdiff -u -r1.114 -r1.115 src/sys/arch/x86/include/pmap.h
cvs rdiff -u -r1.16 -r1.17 src/sys/arch/x86/include/pmap_pv.h
cvs rdiff -u -r1.373 -r1.374 src/sys/arch/x86/x86/pmap.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
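For context on the diff below: it drops the PP_EMBEDDED/PP_FREEING flags and
instead treats the per-page embedded PV entry as "in use" whenever either of
its fields is non-zero, with additional mappings tracked as pv_entry
structures in per-PTP red-black trees (each entry now carrying a pve_pp
backpointer to its page). The fragment below is a minimal standalone sketch
of just that embedded-slot convention; pv_try_embed() is an illustrative
helper and the types are simplified stand-ins, not the kernel's definitions,
which additionally assert that pp_lock is held.

#include <stdbool.h>
#include <stdint.h>

struct ptp;                              /* stand-in for a page table page */

struct pv_pte {                          /* one tracked {PTP, VA} pair */
	struct ptp *pte_ptp;
	uintptr_t   pte_va;
};

struct pmap_page {                       /* per managed physical page */
	struct pv_pte pp_pte;            /* embedded entry for the common,
					    single-mapping case */
	/* dynamic pv_entry list/tree members omitted in this sketch */
};

/*
 * The old code kept a PP_EMBEDDED flag in pp_pflags; the new code infers
 * "embedded slot in use" from the slot itself: both fields zero == free.
 */
static inline bool
pv_pte_embedded(const struct pmap_page *pp)
{
	return ((uintptr_t)pp->pp_pte.pte_ptp | pp->pp_pte.pte_va) != 0;
}

/*
 * Illustrative helper (not in the diff): claim the embedded slot if it is
 * free, otherwise the caller must fall back to a dynamic pv_entry.
 */
static inline bool
pv_try_embed(struct pmap_page *pp, struct ptp *ptp, uintptr_t va)
{
	if (pv_pte_embedded(pp))
		return false;
	pp->pp_pte.pte_ptp = ptp;
	pp->pp_pte.pte_va = va;
	return true;
}

In the committed code the slot is only examined or claimed while holding
pp_lock (a spin mutex at IPL_VM, per the new locking comments), which is what
makes the zero/non-zero convention safe.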
Modified files: Index: src/sys/arch/x86/include/pmap.h diff -u src/sys/arch/x86/include/pmap.h:1.114 src/sys/arch/x86/include/pmap.h:1.115 --- src/sys/arch/x86/include/pmap.h:1.114 Tue Mar 17 21:02:56 2020 +++ src/sys/arch/x86/include/pmap.h Tue Mar 17 22:29:19 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: pmap.h,v 1.114 2020/03/17 21:02:56 ad Exp $ */ +/* $NetBSD: pmap.h,v 1.115 2020/03/17 22:29:19 ad Exp $ */ /* * Copyright (c) 1997 Charles D. Cranor and Washington University. @@ -248,6 +248,8 @@ extern struct pool_cache pmap_cache; * (the other object locks are only used when uvm_pagealloc is called) */ +struct pv_page; + struct pmap { struct uvm_object pm_obj[PTP_LEVELS-1];/* objects for lvl >= 1) */ LIST_ENTRY(pmap) pm_list; /* list of all pmaps */ @@ -256,11 +258,11 @@ struct pmap { struct vm_page *pm_ptphint[PTP_LEVELS-1]; /* pointer to a PTP in our pmap */ struct pmap_statistics pm_stats; /* pmap stats */ + struct pv_entry *pm_pve; /* spare pv_entry */ #if !defined(__x86_64__) vaddr_t pm_hiexec; /* highest executable mapping */ #endif /* !defined(__x86_64__) */ - struct lwp *pm_remove_all; /* who's emptying the pmap */ union descriptor *pm_ldt; /* user-set LDT */ size_t pm_ldt_len; /* size of LDT in bytes */ Index: src/sys/arch/x86/include/pmap_pv.h diff -u src/sys/arch/x86/include/pmap_pv.h:1.16 src/sys/arch/x86/include/pmap_pv.h:1.17 --- src/sys/arch/x86/include/pmap_pv.h:1.16 Tue Mar 17 21:02:56 2020 +++ src/sys/arch/x86/include/pmap_pv.h Tue Mar 17 22:29:19 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: pmap_pv.h,v 1.16 2020/03/17 21:02:56 ad Exp $ */ +/* $NetBSD: pmap_pv.h,v 1.17 2020/03/17 22:29:19 ad Exp $ */ /*- * Copyright (c)2008 YAMAMOTO Takashi, @@ -34,6 +34,7 @@ #include <sys/rbtree.h> struct vm_page; +struct pmap_page; /* * structures to track P->V mapping @@ -51,14 +52,14 @@ struct pv_pte { }; /* - * pv_entry: plug pv_pte into lists. + * pv_entry: plug pv_pte into lists. 32 bytes on i386, 64 on amd64. */ struct pv_entry { struct pv_pte pve_pte; /* should be the first member */ LIST_ENTRY(pv_entry) pve_list; /* on pmap_page::pp_pvlist */ rb_node_t pve_rb; /* red-black tree node */ - uintptr_t pve_padding; /* unused */ + struct pmap_page *pve_pp; /* backpointer to mapped page */ }; #define pve_next pve_list.le_next @@ -71,16 +72,13 @@ struct pmap_page { /* PTPs */ rb_tree_t rb; - /* PTPs */ + /* PTPs, when being freed */ LIST_ENTRY(vm_page) link; - /* Non-PTPs */ + /* Non-PTPs (i.e. normal pages) */ struct { - /* PP_EMBEDDED */ struct pv_pte pte; - LIST_HEAD(, pv_entry) pvlist; - uint8_t flags; uint8_t attrs; } s; } pp_u; @@ -89,7 +87,6 @@ struct pmap_page { #define pp_link pp_u.link #define pp_pte pp_u.s.pte #define pp_pvlist pp_u.s.pvlist -#define pp_pflags pp_u.s.flags #define pp_attrs pp_u.s.attrs }; @@ -97,10 +94,6 @@ struct pmap_page { #define PP_ATTRS_A 0x02 /* Accessed */ #define PP_ATTRS_W 0x04 /* Writable */ -/* pp_flags */ -#define PP_EMBEDDED 1 -#define PP_FREEING 2 - #define PMAP_PAGE_INIT(pp) \ do { \ LIST_INIT(&(pp)->pp_pvlist); \ Index: src/sys/arch/x86/x86/pmap.c diff -u src/sys/arch/x86/x86/pmap.c:1.373 src/sys/arch/x86/x86/pmap.c:1.374 --- src/sys/arch/x86/x86/pmap.c:1.373 Tue Mar 17 21:02:56 2020 +++ src/sys/arch/x86/x86/pmap.c Tue Mar 17 22:29:19 2020 @@ -1,4 +1,4 @@ -/* $NetBSD: pmap.c,v 1.373 2020/03/17 21:02:56 ad Exp $ */ +/* $NetBSD: pmap.c,v 1.374 2020/03/17 22:29:19 ad Exp $ */ /* * Copyright (c) 2008, 2010, 2016, 2017, 2019, 2020 The NetBSD Foundation, Inc. 
@@ -130,7 +130,7 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.373 2020/03/17 21:02:56 ad Exp $"); +__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.374 2020/03/17 22:29:19 ad Exp $"); #include "opt_user_ldt.h" #include "opt_lockdebug.h" @@ -139,6 +139,8 @@ __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.3 #include "opt_svs.h" #include "opt_kaslr.h" +#define __MUTEX_PRIVATE /* for assertions */ + #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> @@ -224,23 +226,39 @@ __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.3 /* * Locking * - * We have the following locks that we must contend with, listed in the - * order that they must be acquired: + * We have the following locks that we must deal with, listed in the order + * that they are acquired: + * + * pg->uobject->vmobjlock, pg->uanon->an_lock * - * - pg->uobject->vmobjlock, pg->uanon->an_lock - * These per-object locks are taken by the VM system before calling into - * the pmap module. Holding them prevents concurrent operations on the - * given page or set of pages. - * - * - pmap->pm_lock (per pmap) - * This lock protects the fields in the pmap structure including the - * non-kernel PDEs in the PDP, the PTEs, and the PVE radix tree. For - * modifying kernel PTEs it is not required as kernel PDEs are never - * freed, and the kernel is expected to be self consistent. - * - * - pmaps_lock - * This lock protects the list of active pmaps (headed by "pmaps"). We - * lock it when adding or removing pmaps from this list. + * For managed pages, these per-object locks are taken by the VM system + * before calling into the pmap module - either a read or write hold. + * The lock hold prevent pages from changing identity while the pmap is + * operating on them. For example, the same lock is held across a call + * to pmap_remove() and the following call to pmap_update(), so that a + * page does not gain a new identity while its TLB visibility is stale. + * + * pmap->pm_lock + * + * This lock protects the fields in the pmap structure including the + * non-kernel PDEs in the PDP, the PTEs, and PTPs and connected data + * structures. For modifying unmanaged kernel PTEs it is not needed as + * kernel PDEs are never freed, and the kernel is expected to be self + * consistent (and the lock can't be taken for unmanaged kernel PTEs, + * because they can be modified from interrupt context). + * + * pmaps_lock + * + * This lock protects the list of active pmaps (headed by "pmaps"). + * It's acqired when adding or removing pmaps or adjusting kernel PDEs. + * + * pp_lock + * + * This per-page lock protects PV entry lists and the embedded PV entry + * in each vm_page, allowing for concurrent operation on pages by + * different pmaps. This is a spin mutex at IPL_VM, because at the + * points it is taken context switching is usually not tolerable, and + * spin mutexes must block out interrupts that could take kernel_lock. */ /* uvm_object is abused here to index pmap_pages; make assertions happy. */ @@ -317,6 +335,8 @@ paddr_t pmap_pa_end; /* PA of last phy #endif #define VM_PAGE_TO_PP(pg) (&(pg)->mdpage.mp_pp) +#define PMAP_CHECK_PP(pp) \ + KASSERTMSG((pp)->pp_lock.mtx_ipl._ipl == IPL_VM, "bad pmap_page %p", pp) /* * Other data structures @@ -523,6 +543,17 @@ pvpte_to_pve(struct pv_pte *pvpte) } /* + * Return true if the pmap page has an embedded PV entry. 
+ */ +static inline bool +pv_pte_embedded(struct pmap_page *pp) +{ + + KASSERT(mutex_owned(&pp->pp_lock)); + return (bool)((vaddr_t)pp->pp_pte.pte_ptp | pp->pp_pte.pte_va); +} + +/* * pv_pte_first, pv_pte_next: PV list iterator. */ static struct pv_pte * @@ -530,7 +561,7 @@ pv_pte_first(struct pmap_page *pp) { KASSERT(mutex_owned(&pp->pp_lock)); - if ((pp->pp_pflags & PP_EMBEDDED) != 0) { + if (pv_pte_embedded(pp)) { return &pp->pp_pte; } return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist)); @@ -543,7 +574,6 @@ pv_pte_next(struct pmap_page *pp, struct KASSERT(mutex_owned(&pp->pp_lock)); KASSERT(pvpte != NULL); if (pvpte == &pp->pp_pte) { - KASSERT((pp->pp_pflags & PP_EMBEDDED) != 0); return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist)); } return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list)); @@ -605,6 +635,61 @@ pmap_compare_key(void *context, const vo } /* + * pmap_ptp_init: initialize new page table page + */ +static inline void +pmap_ptp_init(struct vm_page *ptp) +{ + + ptp->uanon = (struct vm_anon *)(vaddr_t)~0L; + rb_tree_init(&VM_PAGE_TO_PP(ptp)->pp_rb, &pmap_rbtree_ops); + PMAP_CHECK_PP(VM_PAGE_TO_PP(ptp)); +} + +/* + * pmap_ptp_fini: finalize a page table page + */ +static inline void +pmap_ptp_fini(struct vm_page *ptp) +{ + + KASSERT(RB_TREE_MIN(&VM_PAGE_TO_PP(ptp)->pp_rb) == NULL); + PMAP_CHECK_PP(VM_PAGE_TO_PP(ptp)); + ptp->uanon = NULL; +} + +/* + * pmap_ptp_range_set: abuse ptp->uanon to record minimum VA of PTE + */ +static inline void +pmap_ptp_range_set(struct vm_page *ptp, vaddr_t va) +{ + vaddr_t *min = (vaddr_t *)&ptp->uanon; + + if (va < *min) { + *min = va; + } +} + +/* + * pmap_ptp_range_clip: abuse ptp->uanon to clip range of PTEs to remove + */ +static inline void +pmap_ptp_range_clip(struct vm_page *ptp, vaddr_t *startva, pt_entry_t **pte) +{ + vaddr_t sclip; + + if (ptp == NULL) { + return; + } + + sclip = (vaddr_t)ptp->uanon; + sclip = (*startva < sclip ? sclip : *startva); + *pte += (sclip - *startva) / PAGE_SIZE; + *startva = sclip; +} + +/* * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in * * there are several pmaps involved. some or all of them might be same. @@ -656,7 +741,9 @@ pmap_map_ptes(struct pmap *pmap, struct * often the case during exit(), when we have switched * to the kernel pmap in order to destroy a user pmap. */ - pmap_reactivate(pmap); + if (__predict_false(ci->ci_tlbstate != TLBSTATE_VALID)) { + pmap_reactivate(pmap); + } *pmap2 = NULL; } else { /* @@ -1771,7 +1858,7 @@ pmap_init(void) * The kernel doesn't keep track of PTPs, so there's nowhere handy * to hang a tree of pv_entry records. Dynamically allocated * pv_entry lists are not heavily used in the kernel's pmap (the - * usual case is PP_EMBEDDED), so cop out and use a single RB tree + * usual case is embedded), so cop out and use a single RB tree * to cover them. */ rb_tree_init(&pmap_kernel_rb, &pmap_rbtree_ops); @@ -1857,28 +1944,6 @@ pmap_vpage_cpu_init(struct cpu_info *ci) * p v _ e n t r y f u n c t i o n s */ - -/* - * pmap_pp_needs_pve: return true if we need to allocate a pv entry. - */ -static bool -pmap_pp_needs_pve(struct pmap_page *pp, struct vm_page *ptp, vaddr_t va) -{ - - /* - * Adding a pv entry for this page only needs to allocate a pv_entry - * structure if the page already has at least one pv entry, since - * the first pv entry is stored in the pmap_page. However, because - * of subsequent removal(s), PP_EMBEDDED can be false and there can - * still be pv entries on the list. 
- */ - - if (pp == NULL || (pp->pp_pflags & PP_EMBEDDED) == 0) { - return false; - } - return pp->pp_pte.pte_ptp != ptp || pp->pp_pte.pte_va != va; -} - /* * pmap_free_pvs: free a linked list of pv entries. the pv entries have * been removed from their respective pages, but are still entered into the @@ -1900,49 +1965,57 @@ pmap_free_pvs(struct pmap *pmap, struct } /* - * pmap_lookup_pv: look up a non-PP_EMBEDDED pv entry for the given pmap - * - * => pmap must be locked + * pmap_check_pv: verify {VA, PTP} pair is either tracked/untracked by page */ - -static struct pv_entry * -pmap_lookup_pv(struct pmap *pmap, struct vm_page *ptp, - struct pmap_page *pp, vaddr_t va) +static void +pmap_check_pv(struct pmap *pmap, struct vm_page *ptp, struct pmap_page *pp, + vaddr_t va, bool tracked) { - struct rb_node *node; - struct pv_entry *pve; +#ifdef DIAGNOSTIC /* XXX too slow make this DEBUG before April 2020 */ + struct pv_pte *pvpte; - KASSERT(mutex_owned(&pmap->pm_lock)); + PMAP_CHECK_PP(pp); - /* - * Do an unlocked check on the page: if tracked with PP_EMBEDDED we - * can avoid touching the tree. - */ - if ((pp->pp_pflags & PP_EMBEDDED) != 0 && - pp->pp_pte.pte_ptp == ptp && - pp->pp_pte.pte_va == va) { - return NULL; + mutex_spin_enter(&pp->pp_lock); + for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { + if (pvpte->pte_ptp == ptp && pvpte->pte_va == va) { + break; + } } + mutex_spin_exit(&pp->pp_lock); - if (ptp != NULL) { - node = VM_PAGE_TO_PP(ptp)->pp_rb.rbt_root; - } else { - KASSERT(pmap == pmap_kernel()); - node = pmap_kernel_rb.rbt_root; + if (pvpte && !tracked) { + panic("pmap_check_pv: %p/%lx found on pp %p", ptp, va, pp); + } else if (!pvpte && tracked) { + panic("pmap_check_pv: %p/%lx missing on pp %p", ptp, va, pp); } +#endif +} + +/* + * pmap_treelookup_pv: search the PV tree for a dynamic entry + * + * => pmap must be locked + */ +static struct pv_entry * +pmap_treelookup_pv(const struct pmap *pmap, const struct vm_page *ptp, + const rb_tree_t *tree, const vaddr_t va) +{ + struct pv_entry *pve; + rb_node_t *node; /* - * Search the RB tree for the key. This is an inlined lookup - * tailored for exactly what's needed here that is quite a bit - * faster than using rb_tree_find_node(). + * Inlined lookup tailored for exactly what's needed here that is + * quite a bit faster than using rb_tree_find_node(). 
*/ - for (;;) { + for (node = tree->rbt_root;;) { if (__predict_false(RB_SENTINEL_P(node))) { return NULL; } pve = (struct pv_entry *) ((uintptr_t)node - offsetof(struct pv_entry, pve_rb)); if (pve->pve_pte.pte_va == va) { + KASSERT(pve->pve_pte.pte_ptp == ptp); return pve; } node = node->rb_nodes[pve->pve_pte.pte_va < va]; @@ -1950,91 +2023,194 @@ pmap_lookup_pv(struct pmap *pmap, struct } /* - * pmap_enter_pv: enter a mapping onto a pmap_page lst + * pmap_lookup_pv: look up a non-embedded pv entry for the given pmap * - * => caller should adjust ptp's wire_count before calling - * => caller has preallocated pve for us - * => if not embedded, tree node must be in place beforehand + * => a PV entry must be known present (doesn't check for existence) + * => pmap must be locked */ static struct pv_entry * -pmap_enter_pv(struct pmap *pmap, struct pmap_page *pp, struct pv_entry *pve, - struct vm_page *ptp, vaddr_t va) +pmap_lookup_pv(const struct pmap *pmap, const struct vm_page *ptp, + const struct pmap_page * const old_pp, const vaddr_t va) { + struct pv_entry *pve; + const rb_tree_t *tree; + + KASSERT(mutex_owned(&pmap->pm_lock)); + KASSERT(ptp != NULL || pmap == pmap_kernel()); + + /* + * [This mostly deals with the case of process-private pages, i.e. + * anonymous memory allocations or COW.] + * + * If the page is tracked with an embedded entry then the tree + * lookup can be avoided. It's safe to check for this specific + * set of values without pp_lock because both will only ever be + * set together for this pmap. + * + */ + if (atomic_load_relaxed(&old_pp->pp_pte.pte_ptp) == ptp && + atomic_load_relaxed(&old_pp->pp_pte.pte_va) == va) { + return NULL; + } + + /* + * [This mostly deals with shared mappings, for example shared libs + * and executables.] + * + * Optimise for pmap_remove_all() which works by ascending scan: + * look at the lowest numbered node in the tree first. The tree is + * known non-empty because of the check above. For short lived + * processes where pmap_remove() isn't used much this gets close to + * a 100% hit rate. + */ + tree = (ptp != NULL ? &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb); + KASSERT(!RB_SENTINEL_P(tree->rbt_root)); + pve = (struct pv_entry *) + ((uintptr_t)tree->rbt_minmax[RB_DIR_LEFT] - + offsetof(struct pv_entry, pve_rb)); + if (__predict_true(pve->pve_pte.pte_va == va)) { + KASSERT(pve->pve_pte.pte_ptp == ptp); + return pve; + } + + /* Search the RB tree for the key (uncommon). */ + return pmap_treelookup_pv(pmap, ptp, tree, va); +} + +/* + * pmap_enter_pv: enter a mapping onto a pmap_page lst + * + * => pmap must be locked + * => does NOT insert dynamic entries to tree (pmap_enter() does later) + */ +static int +pmap_enter_pv(struct pmap *pmap, struct pmap_page *pp, struct vm_page *ptp, + vaddr_t va, struct pv_entry **new_pve, struct pv_entry **old_pve, + bool *samepage, bool *new_embedded, rb_tree_t *tree) +{ + struct pv_entry *pve; + int error; KASSERT(mutex_owned(&pmap->pm_lock)); KASSERT(ptp_to_pmap(ptp) == pmap); - KASSERT(ptp == NULL || ptp->wire_count >= 2); KASSERT(ptp == NULL || ptp->uobject != NULL); KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); + PMAP_CHECK_PP(pp); + /* + * If entering the same page and it's already tracked with an + * embedded entry, we can avoid the expense below. It's safe + * to check for this very specific set of values without a lock + * because both will only ever be set together for this pmap. 
+ */ + if (atomic_load_relaxed(&pp->pp_pte.pte_ptp) == ptp && + atomic_load_relaxed(&pp->pp_pte.pte_va) == va) { + *samepage = true; + pmap_check_pv(pmap, ptp, pp, va, true); + return 0; + } + + /* + * Check for an existing dynamic mapping at this address. If it's + * for the same page, then it will be reused and nothing needs to be + * changed. + */ + *old_pve = pmap_treelookup_pv(pmap, ptp, tree, va); + if (*old_pve != NULL && (*old_pve)->pve_pp == pp) { + *samepage = true; + pmap_check_pv(pmap, ptp, pp, va, true); + return 0; + } + + /* + * Need to put a new mapping in place. Grab a spare pv_entry in + * case it's needed; won't know for sure until the lock is taken. + */ + if (pmap->pm_pve == NULL) { + pmap->pm_pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT); + } + + error = 0; + pmap_check_pv(pmap, ptp, pp, va, false); mutex_spin_enter(&pp->pp_lock); - if ((pp->pp_pflags & PP_EMBEDDED) == 0) { - pp->pp_pflags |= PP_EMBEDDED; + if (!pv_pte_embedded(pp)) { + /* + * Embedded PV tracking available - easy. + */ pp->pp_pte.pte_ptp = ptp; pp->pp_pte.pte_va = va; - mutex_spin_exit(&pp->pp_lock); - return pve; + *new_embedded = true; + } else if (__predict_false(pmap->pm_pve == NULL)) { + /* + * No memory. + */ + error = ENOMEM; + } else { + /* + * Install new pv_entry on the page. + */ + pve = pmap->pm_pve; + pmap->pm_pve = NULL; + *new_pve = pve; + pve->pve_pte.pte_ptp = ptp; + pve->pve_pte.pte_va = va; + pve->pve_pp = pp; + LIST_INSERT_HEAD(&pp->pp_pvlist, pve, pve_list); } - - KASSERT(pve != NULL); - pve->pve_pte.pte_ptp = ptp; - pve->pve_pte.pte_va = va; - KASSERT(pmap_lookup_pv(pmap, ptp, pp, va) == NULL); - LIST_INSERT_HEAD(&pp->pp_pvlist, pve, pve_list); mutex_spin_exit(&pp->pp_lock); + pmap_check_pv(pmap, ptp, pp, va, true); - if (ptp != NULL) { - rb_tree_insert_node(&VM_PAGE_TO_PP(ptp)->pp_rb, pve); - } else { - KASSERT(pmap == pmap_kernel()); - rb_tree_insert_node(&pmap_kernel_rb, pve); - } - return NULL; + return error; } /* * pmap_remove_pv: try to remove a mapping from a pv_list * + * => pmap must be locked + * => removes dynamic entries from tree * => caller should adjust ptp's wire_count and free PTP if needed - * => we don't remove radix tree entry; defer till later (it could block) - * => we return the removed pve - * => caller can optionally supply pve, if looked up already */ static void pmap_remove_pv(struct pmap *pmap, struct pmap_page *pp, struct vm_page *ptp, vaddr_t va, struct pv_entry *pve, uint8_t oattrs) { + rb_tree_t *tree = (ptp != NULL ? 
+ &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb); KASSERT(mutex_owned(&pmap->pm_lock)); KASSERT(ptp_to_pmap(ptp) == pmap); KASSERT(ptp == NULL || ptp->uobject != NULL); KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset); + KASSERT(ptp != NULL || pmap == pmap_kernel()); + + pmap_check_pv(pmap, ptp, pp, va, true); mutex_spin_enter(&pp->pp_lock); pp->pp_attrs |= oattrs; - if ((pp->pp_pflags & PP_EMBEDDED) != 0 && - pp->pp_pte.pte_ptp == ptp && - pp->pp_pte.pte_va == va) { - KASSERT(pve == NULL); - pp->pp_pflags &= ~PP_EMBEDDED; + if (pve == NULL) { + KASSERT(pp->pp_pte.pte_ptp == ptp); + KASSERT(pp->pp_pte.pte_va == va); pp->pp_pte.pte_ptp = NULL; pp->pp_pte.pte_va = 0; mutex_spin_exit(&pp->pp_lock); } else { - KASSERT(pve != NULL); - KASSERT(pve == pmap_lookup_pv(pmap, ptp, pp, va)); + KASSERT(pp->pp_pte.pte_ptp != ptp || + pp->pp_pte.pte_va != va); KASSERT(pve->pve_pte.pte_ptp == ptp); KASSERT(pve->pve_pte.pte_va == va); + KASSERT(pve->pve_pp == pp); LIST_REMOVE(pve, pve_list); mutex_spin_exit(&pp->pp_lock); - if (ptp != NULL) { - rb_tree_remove_node(&VM_PAGE_TO_PP(ptp)->pp_rb, pve); - } else { - KASSERT(pmap == pmap_kernel()); - rb_tree_remove_node(&pmap_kernel_rb, pve); - } + KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == pve); + rb_tree_remove_node(tree, pve); +#ifdef DIAGNOSTIC + memset(pve, 0, sizeof(*pve)); +#endif } + + KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); + pmap_check_pv(pmap, ptp, pp, va, false); } /* @@ -2052,7 +2228,9 @@ pmap_find_ptp(struct pmap *pmap, vaddr_t if (pmap->pm_ptphint[lidx] && off == pmap->pm_ptphint[lidx]->offset) { KASSERT(pmap->pm_ptphint[lidx]->wire_count > 0); - return pmap->pm_ptphint[lidx]; + pg = pmap->pm_ptphint[lidx]; + PMAP_CHECK_PP(VM_PAGE_TO_PP(pg)); + return pg; } PMAP_DUMMY_LOCK(pmap); pg = uvm_pagelookup(&pmap->pm_obj[lidx], off); @@ -2061,6 +2239,9 @@ pmap_find_ptp(struct pmap *pmap, vaddr_t /* This page is queued to be freed - ignore. */ pg = NULL; } + if (pg != NULL) { + PMAP_CHECK_PP(VM_PAGE_TO_PP(pg)); + } pmap->pm_ptphint[lidx] = pg; return pg; } @@ -2077,6 +2258,7 @@ pmap_freepage(struct pmap *pmap, struct if (pmap->pm_ptphint[lidx] == ptp) pmap->pm_ptphint[lidx] = NULL; ptp->wire_count = 0; + pmap_ptp_fini(ptp); /* * Enqueue the PTP to be freed by pmap_update(). We can't remove @@ -2085,7 +2267,6 @@ pmap_freepage(struct pmap *pmap, struct * Instead mark the PTP as free and if we bump into it again, we'll * either ignore or reuse (depending on what's useful at the time). */ - KASSERT(RB_TREE_MIN(&VM_PAGE_TO_PP(ptp)->pp_rb) == NULL); LIST_INSERT_HEAD(&pmap->pm_gc_ptp, ptp, mdpage.mp_pp.pp_link); } @@ -2178,14 +2359,12 @@ pmap_get_ptp(struct pmap *pmap, struct p pt->pg[i] = uvm_pagealloc(obj, off, NULL, aflags); pt->alloced[i] = true; if (pt->pg[i] != NULL) { - rb_tree_init(&VM_PAGE_TO_PP(pt->pg[i])->pp_rb, - &pmap_rbtree_ops); + pmap_ptp_init(pt->pg[i]); } } else if (pt->pg[i]->wire_count == 0) { /* This page was queued to be freed; dequeue it. */ LIST_REMOVE(pt->pg[i], mdpage.mp_pp.pp_link); - rb_tree_init(&VM_PAGE_TO_PP(pt->pg[i])->pp_rb, - &pmap_rbtree_ops); + pmap_ptp_init(pt->pg[i]); } PMAP_DUMMY_UNLOCK(pmap); if (pt->pg[i] == NULL) { @@ -2292,8 +2471,10 @@ pmap_unget_ptp(struct pmap *pmap, struct continue; } KASSERT(pt->pg[i]->wire_count == 0); + PMAP_CHECK_PP(VM_PAGE_TO_PP(pt->pg[i])); /* pmap zeros all pages before freeing. 
*/ pt->pg[i]->flags |= PG_ZERO; + pmap_ptp_fini(pt->pg[i]); PMAP_DUMMY_LOCK(pmap); uvm_pagefree(pt->pg[i]); PMAP_DUMMY_UNLOCK(pmap); @@ -2488,7 +2669,7 @@ pmap_ctor(void *arg, void *obj, int flag kcpuset_create(&pmap->pm_xen_ptp_cpus, true); #endif LIST_INIT(&pmap->pm_gc_ptp); - pmap->pm_remove_all = NULL; + pmap->pm_pve = NULL; /* allocate and init PDP */ pmap->pm_pdir = pool_get(&pmap_pdp_pool, PR_WAITOK); @@ -2521,6 +2702,10 @@ pmap_dtor(void *arg, void *obj) { struct pmap *pmap = obj; + if (pmap->pm_pve != NULL) { + pool_cache_put(&pmap_pv_cache, pmap->pm_pve); + } + mutex_enter(&pmaps_lock); LIST_REMOVE(pmap, pm_list); mutex_exit(&pmaps_lock); @@ -2637,26 +2822,28 @@ pmap_destroy(struct pmap *pmap) { int i; - /* Undo pmap_remove_all(). */ - if (pmap->pm_remove_all == curlwp) { - pmap_update(pmap); - } - /* - * drop reference count + * drop reference count and verify not in use. */ if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) { return; } - pmap_check_inuse(pmap); /* + * XXX handle deferred PTP page free for EPT. ordinarily this is + * taken care of by pmap_remove_all(). once shared with EPT this + * can go away. + */ + if (__predict_false(!LIST_EMPTY(&pmap->pm_gc_ptp))) { + pmap_update(pmap); + } + + /* * Reference count is zero, free pmap resources and then free pmap. */ - KASSERT(pmap->pm_remove_all == NULL); pmap_check_ptps(pmap); KASSERT(LIST_EMPTY(&pmap->pm_gc_ptp)); @@ -2697,20 +2884,85 @@ pmap_destroy(struct pmap *pmap) } /* - * pmap_remove_all: pmap is being torn down by the current thread. - * avoid unnecessary invalidations. + * pmap_remove_all: remove all mappings from pmap in bulk. + * + * Ordinarily when removing mappings it's important to hold the UVM object's + * lock, so that pages do not gain a new identity while retaining stale TLB + * entries (the same lock hold covers both pmap_remove() and pmap_update()). + * Here it's known that the address space is no longer visible to any user + * process, so we don't need to worry about that. */ bool pmap_remove_all(struct pmap *pmap) { + struct vm_page *ptps[32]; + vaddr_t va, blkendva; + struct pmap *pmap2; + pt_entry_t *ptes; + pd_entry_t pde __diagused; + pd_entry_t * const *pdes; + struct pv_entry *pv_tofree; + int lvl __diagused, i, n; - /* - * No locking needed; at this point it should only ever be checked - * by curlwp. - */ - KASSERT(pmap->pm_remove_all == NULL); - pmap->pm_remove_all = curlwp; - return false; + /* XXX Can't handle EPT just yet. */ + if (pmap->pm_remove != NULL) { + return false; + } + + for (;;) { + /* Fetch a block of PTPs from tree. */ + mutex_enter(&pmap->pm_lock); + n = radix_tree_gang_lookup_node(&pmap->pm_obj[0].uo_pages, 0, + (void **)ptps, __arraycount(ptps), false); + if (n == 0) { + mutex_exit(&pmap->pm_lock); + break; + } + + /* Remove all mappings in the set of PTPs. */ + pmap_map_ptes(pmap, &pmap2, &ptes, &pdes); + pv_tofree = NULL; + for (i = 0; i < n; i++) { + if (ptps[i]->wire_count == 0) { + /* It's dead: pmap_update() will expunge. */ + continue; + } + + /* Determine range of block. */ + va = ptps[i]->offset * PAGE_SIZE / sizeof(pt_entry_t); + blkendva = x86_round_pdr(va + 1); + + /* Make sure everything squares up... */ + KASSERT(pmap_pdes_valid(va, pdes, &pde, &lvl)); + KASSERT(lvl == 1); + KASSERT(pmap_find_ptp(pmap, va, 1) == ptps[i]); + + /* Zap! */ + pmap_remove_ptes(pmap, ptps[i], + (vaddr_t)&ptes[pl1_i(va)], va, + blkendva, &pv_tofree); + + /* PTP should now be unused - free it. 
*/ + KASSERT(ptps[i]->wire_count == 1); + pmap_free_ptp(pmap, ptps[i], va, ptes, pdes); + } + pmap_unmap_ptes(pmap, pmap2); + pmap_free_pvs(pmap, pv_tofree); + mutex_exit(&pmap->pm_lock); + + /* Process deferred frees. */ + pmap_update(pmap); + + /* A breathing point. */ + preempt_point(); + } + + /* Verify that the pmap is now completely empty. */ + pmap_check_ptps(pmap); + KASSERTMSG(pmap->pm_stats.resident_count == PDP_SIZE, + "pmap %p not empty", pmap); + + return true; } #if defined(PMAP_FORK) @@ -2952,7 +3204,7 @@ pmap_reactivate(struct pmap *pmap) ci->ci_tlbstate = TLBSTATE_VALID; KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid)); - if (kcpuset_isset(pmap->pm_cpus, cid)) { + if (__predict_true(kcpuset_isset(pmap->pm_cpus, cid))) { /* We have the reference, state is valid. */ } else { /* @@ -3542,6 +3794,12 @@ pmap_remove_ptes(struct pmap *pmap, stru KASSERT(kpreempt_disabled()); /* + * mappings are very often sparse, so clip the given range to the + * range of PTEs that are known present in the PTP. + */ + pmap_ptp_range_clip(ptp, &startva, &pte); + + /* * note that ptpva points to the PTE that maps startva. this may * or may not be the first PTE in the PTP. * @@ -3641,6 +3899,8 @@ pmap_remove_pte(struct pmap *pmap, struc KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL), "pv-tracked page without PTE_PVLIST for %#"PRIxVADDR, va); #endif + KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ? + &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), va) == NULL); return true; } @@ -3753,8 +4013,7 @@ pmap_remove(struct pmap *pmap, vaddr_t s pmap_unmap_ptes(pmap, pmap2); /* * Now safe to free, as we no longer have the PTEs mapped and can - * block again. Radix tree nodes are removed here, so we need to - * continue holding the pmap locked until complete. + * block again. */ if (pv_tofree != NULL) { pmap_free_pvs(pmap, pv_tofree); @@ -3889,20 +4148,36 @@ pmap_pp_remove(struct pmap_page *pp, pad { struct pv_pte *pvpte; struct vm_page *ptp; + uintptr_t sum; uint8_t oattrs; bool locked; - int count; - count = SPINLOCK_BACKOFF_MIN; + /* + * Do an unlocked check to see if the page has no mappings, eg when + * pmap_remove_all() was called before amap_wipeout() for a process + * private amap - common. The page being removed must be on the way + * out, so we don't have to worry about concurrent attempts to enter + * it (otherwise the caller either doesn't care or has screwed up). + */ + sum = (uintptr_t)atomic_load_relaxed(&pp->pp_pte.pte_va); + sum |= (uintptr_t)atomic_load_relaxed(&pp->pp_pte.pte_ptp); + sum |= (uintptr_t)atomic_load_relaxed(&pp->pp_pvlist.lh_first); + if (sum == 0) { + return; + } + kpreempt_disable(); -startover: - mutex_spin_enter(&pp->pp_lock); - while ((pvpte = pv_pte_first(pp)) != NULL) { + for (;;) { struct pmap *pmap; struct pv_entry *pve; pt_entry_t opte; vaddr_t va; - int error; + + mutex_spin_enter(&pp->pp_lock); + if ((pvpte = pv_pte_first(pp)) == NULL) { + mutex_spin_exit(&pp->pp_lock); + break; + } /* * Add a reference to the pmap before clearing the pte. 
@@ -3930,23 +4205,37 @@ startover: if (ptp != NULL) { pmap_destroy(pmap); } - goto startover; + continue; } - - error = pmap_sync_pv(pvpte, pa, ~0, &oattrs, &opte); - if (error == EAGAIN) { - int hold_count; - KERNEL_UNLOCK_ALL(curlwp, &hold_count); - mutex_exit(&pmap->pm_lock); - if (ptp != NULL) { - pmap_destroy(pmap); - } - SPINLOCK_BACKOFF(count); - KERNEL_LOCK(hold_count, curlwp); - goto startover; + va = pvpte->pte_va; + + KASSERTMSG(pmap->pm_stats.resident_count > PDP_SIZE, + "va %lx pmap %p ptp %p is empty", va, pmap, ptp); + KASSERTMSG(ptp == NULL || (ptp->flags & PG_FREE) == 0, + "va %lx pmap %p ptp %p is free", va, pmap, ptp); + KASSERTMSG(ptp == NULL || ptp->wire_count > 1, + "va %lx pmap %p ptp %p is empty", va, pmap, ptp); + +#ifdef DIAGNOSTIC /* XXX Too expensive make DEBUG before April 2020 */ + pmap_check_pv(pmap, ptp, pp, pvpte->pte_va, true); + rb_tree_t *tree = (ptp != NULL ? + &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb); + pve = pmap_treelookup_pv(pmap, ptp, tree, va); + if (pve == NULL) { + KASSERTMSG(&pp->pp_pte == pvpte, + "va %lx pmap %p ptp %p pvpte %p pve %p oops 1", + va, pmap, ptp, pvpte, pve); + } else { + KASSERTMSG(&pve->pve_pte == pvpte, + "va %lx pmap %p ptp %p pvpte %p pve %p oops 2", + va, pmap, ptp, pvpte, pve); + } +#endif + + if (pmap_sync_pv(pvpte, pa, ~0, &oattrs, &opte)) { + panic("pmap_pp_remove: mapping not present"); } - va = pvpte->pte_va; pve = pmap_lookup_pv(pmap, ptp, pp, va); pmap_remove_pv(pmap, pp, ptp, va, pve, oattrs); @@ -3964,21 +4253,15 @@ startover: pmap_stats_update_bypte(pmap, 0, opte); } if (pve != NULL) { - /* - * Must free pve, and remove from PV tree with the - * pmap's lock still held. - */ pve->pve_next = NULL; pmap_free_pvs(pmap, pve); } + pmap_tlb_shootnow(); mutex_exit(&pmap->pm_lock); if (ptp != NULL) { pmap_destroy(pmap); } - mutex_spin_enter(&pp->pp_lock); } - mutex_spin_exit(&pp->pp_lock); - pmap_tlb_shootnow(); kpreempt_enable(); } @@ -4028,6 +4311,7 @@ pmap_test_attrs(struct vm_page *pg, unsi { struct pmap_page *pp; struct pv_pte *pvpte; + struct pmap *pmap; uint8_t oattrs; u_int result; paddr_t pa; @@ -4037,17 +4321,29 @@ pmap_test_attrs(struct vm_page *pg, unsi return true; } pa = VM_PAGE_TO_PHYS(pg); + startover: mutex_spin_enter(&pp->pp_lock); for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { - int error; - if ((pp->pp_attrs & testbits) != 0) { break; } - error = pmap_sync_pv(pvpte, pa, 0, &oattrs, NULL); - if (error == 0) { - pp->pp_attrs |= oattrs; + if (pmap_sync_pv(pvpte, pa, 0, &oattrs, NULL)) { + /* + * raced with a V->P operation. wait for the other + * side to finish by acquring pmap's lock. if no + * wait, updates to pp_attrs by the other side may + * go unseen. + */ + pmap = ptp_to_pmap(pvpte->pte_ptp); + pmap_reference(pmap); + mutex_spin_exit(&pp->pp_lock); + mutex_enter(&pmap->pm_lock); + /* nothing. 
*/ + mutex_exit(&pmap->pm_lock); + pmap_destroy(pmap); + goto startover; } + pp->pp_attrs |= oattrs; } result = pp->pp_attrs & testbits; mutex_spin_exit(&pp->pp_lock); @@ -4064,23 +4360,27 @@ static bool pmap_pp_clear_attrs(struct pmap_page *pp, paddr_t pa, unsigned clearbits) { struct pv_pte *pvpte; + struct pmap *pmap; uint8_t oattrs; u_int result; - int count; - count = SPINLOCK_BACKOFF_MIN; - mutex_spin_enter(&pp->pp_lock); startover: + mutex_spin_enter(&pp->pp_lock); for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) { - int error; - - error = pmap_sync_pv(pvpte, pa, clearbits, &oattrs, NULL); - if (error == EAGAIN) { - int hold_count; + if (pmap_sync_pv(pvpte, pa, clearbits, &oattrs, NULL)) { + /* + * raced with a V->P operation. wait for the other + * side to finish by acquring pmap's lock. it is + * probably unmapping the page, and it will be gone + * when the loop is restarted. + */ + pmap = ptp_to_pmap(pvpte->pte_ptp); + pmap_reference(pmap); mutex_spin_exit(&pp->pp_lock); - KERNEL_UNLOCK_ALL(curlwp, &hold_count); - SPINLOCK_BACKOFF(count); - KERNEL_LOCK(hold_count, curlwp); + mutex_enter(&pmap->pm_lock); + /* nothing. */ + mutex_exit(&pmap->pm_lock); + pmap_destroy(pmap); goto startover; } pp->pp_attrs |= oattrs; @@ -4175,8 +4475,6 @@ pmap_write_protect(struct pmap *pmap, va vaddr_t blockend, va; int lvl, i; - KASSERT(pmap->pm_remove_all == NULL); - if (__predict_false(pmap->pm_write_protect != NULL)) { (*pmap->pm_write_protect)(pmap, sva, eva, prot); return; @@ -4195,7 +4493,8 @@ pmap_write_protect(struct pmap *pmap, va /* * Acquire pmap. No need to lock the kernel pmap as we won't - * be touching the pvmap nor the stats. + * be touching PV entries nor stats and kernel PDEs aren't + * freed. */ if (pmap != pmap_kernel()) { mutex_enter(&pmap->pm_lock); @@ -4335,14 +4634,14 @@ pmap_enter_ma(struct pmap *pmap, vaddr_t struct vm_page *new_pg, *old_pg; struct pmap_page *new_pp, *old_pp; struct pv_entry *old_pve, *new_pve; - int error; bool wired = (flags & PMAP_WIRED) != 0; struct pmap *pmap2; struct pmap_ptparray pt; - bool getptp; + int error; + bool getptp, samepage, new_embedded; + rb_tree_t *tree; KASSERT(pmap_initialized); - KASSERT(pmap->pm_remove_all == NULL); KASSERT(va < VM_MAX_KERNEL_ADDRESS); KASSERTMSG(va != (vaddr_t)PDP_BASE, "%s: trying to map va=%#" PRIxVADDR " over PDP!", __func__, va); @@ -4377,13 +4676,16 @@ pmap_enter_ma(struct pmap *pmap, vaddr_t else #endif new_pg = PHYS_TO_VM_PAGE(pa); + if (new_pg != NULL) { /* This is a managed page */ npte |= PTE_PVLIST; new_pp = VM_PAGE_TO_PP(new_pg); + PMAP_CHECK_PP(new_pp); } else if ((new_pp = pmap_pv_tracked(pa)) != NULL) { /* This is an unmanaged pv-tracked page */ npte |= PTE_PVLIST; + PMAP_CHECK_PP(new_pp); } else { new_pp = NULL; } @@ -4408,18 +4710,36 @@ pmap_enter_ma(struct pmap *pmap, vaddr_t error); } } + tree = &VM_PAGE_TO_PP(ptp)->pp_rb; + } else { + /* Embedded PV entries rely on this. */ + KASSERT(va != 0); + tree = &pmap_kernel_rb; } /* - * Now check to see if we need a pv entry for this VA. If we do, - * allocate and install in the PV tree. In any case look up the - * pv entry in case the old mapping used it. + * Look up the old PV entry at this VA (if any), and insert a new PV + * entry if required for the new mapping. Temporarily track the old + * and new mappings concurrently. Only after the old mapping is + * evicted from the pmap will we remove its PV entry. 
Otherwise, + * our picture of modified/accessed state for either page could get + * out of sync (we need any P->V operation for either page to stall + * on pmap->pm_lock until done here). */ - old_pve = NULL; new_pve = NULL; - if (pmap_pp_needs_pve(new_pp, ptp, va)) { - new_pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT); - if (new_pve == NULL) { + old_pve = NULL; + samepage = false; + new_embedded = false; + + if (new_pp != NULL) { + error = pmap_enter_pv(pmap, new_pp, ptp, va, &new_pve, + &old_pve, &samepage, &new_embedded, tree); + + /* + * If a new pv_entry was needed and none was available, we + * can go no further. + */ + if (error != 0) { if (flags & PMAP_CANFAIL) { if (getptp) { pmap_unget_ptp(pmap, &pt); @@ -4429,6 +4749,8 @@ pmap_enter_ma(struct pmap *pmap, vaddr_t } panic("%s: alloc pve failed", __func__); } + } else { + old_pve = pmap_treelookup_pv(pmap, ptp, tree, va); } /* Map PTEs into address space. */ @@ -4469,11 +4791,27 @@ pmap_enter_ma(struct pmap *pmap, vaddr_t vtomach((vaddr_t)ptep), npte, domid); splx(s); if (error) { + /* Undo pv_entry tracking - oof. */ + if (new_pp != NULL) { + mutex_spin_enter(&new_pp->pp_lock); + if (new_pve != NULL) { + LIST_REMOVE(new_pve, pve_list); + KASSERT(pmap->pm_pve == NULL); + pmap->pm_pve = new_pve; + } else if (new_embedded) { + new_pp->pp_pte.pte_ptp = NULL; + new_pp->pp_pte.pte_va = 0; + } + mutex_spin_exit(&new_pp->pp_lock); + } + pmap_unmap_ptes(pmap, pmap2); + /* Free new PTP. */ if (ptp != NULL && ptp->wire_count <= 1) { pmap_free_ptp(pmap, ptp, va, ptes, pdes); } - goto out; + mutex_exit(&pmap->pm_lock); + return error; } break; } @@ -4481,11 +4819,20 @@ pmap_enter_ma(struct pmap *pmap, vaddr_t } while (pmap_pte_cas(ptep, opte, npte) != opte); /* + * Done with the PTEs: they can now be unmapped. + */ + pmap_unmap_ptes(pmap, pmap2); + + /* * Update statistics and PTP's reference count. */ pmap_stats_update_bypte(pmap, npte, opte); - if (ptp != NULL && !have_oldpa) { - ptp->wire_count++; + if (ptp != NULL) { + if (!have_oldpa) { + ptp->wire_count++; + } + /* Remember minimum VA in PTP. */ + pmap_ptp_range_set(ptp, va); } KASSERT(ptp == NULL || ptp->wire_count > 1); @@ -4494,7 +4841,13 @@ pmap_enter_ma(struct pmap *pmap, vaddr_t */ if (((opte ^ npte) & (PTE_FRAME | PTE_P)) == 0) { KASSERT(((opte ^ npte) & PTE_PVLIST) == 0); + if ((npte & PTE_PVLIST) != 0) { + KASSERT(samepage); + pmap_check_pv(pmap, ptp, new_pp, va, true); + } goto same_pa; + } else if ((npte & PTE_PVLIST) != 0) { + KASSERT(!samepage); } /* @@ -4510,16 +4863,28 @@ pmap_enter_ma(struct pmap *pmap, vaddr_t __func__, va, oldpa, atop(pa)); } - old_pve = pmap_lookup_pv(pmap, ptp, old_pp, va); pmap_remove_pv(pmap, old_pp, ptp, va, old_pve, pmap_pte_to_pp_attrs(opte)); + if (old_pve != NULL) { + if (pmap->pm_pve == NULL) { + pmap->pm_pve = old_pve; + } else { + pool_cache_put(&pmap_pv_cache, old_pve); + } + } + } else { + KASSERT(old_pve == NULL); + KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); } /* - * If new page is pv-tracked, insert pv_entry into its list. + * If new page is dynamically PV tracked, insert to tree. 
*/ - if (new_pp) { - new_pve = pmap_enter_pv(pmap, new_pp, new_pve, ptp, va); + if (new_pve != NULL) { + KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); + old_pve = rb_tree_insert_node(tree, new_pve); + KASSERT(old_pve == new_pve); + pmap_check_pv(pmap, ptp, new_pp, va, true); } same_pa: @@ -4531,20 +4896,8 @@ same_pa: ((opte ^ npte) & (PTE_FRAME | PTE_W)) != 0) { pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_ENTER); } - - error = 0; -#if defined(XENPV) -out: -#endif - pmap_unmap_ptes(pmap, pmap2); - if (old_pve != NULL) { - pool_cache_put(&pmap_pv_cache, old_pve); - } - if (new_pve != NULL) { - pool_cache_put(&pmap_pv_cache, new_pve); - } mutex_exit(&pmap->pm_lock); - return error; + return 0; } paddr_t @@ -4863,20 +5216,10 @@ pmap_update(struct pmap *pmap) struct vm_page *ptp; /* - * If pmap_remove_all() was in effect, re-enable invalidations from - * this point on; issue a shootdown for all the mappings just - * removed. - */ - kpreempt_disable(); - if (pmap->pm_remove_all == curlwp) { - pmap->pm_remove_all = NULL; - pmap_tlb_shootdown(pmap, (vaddr_t)-1LL, 0, TLBSHOOT_UPDATE); - } - - /* * Initiate any pending TLB shootdowns. Wait for them to * complete before returning control to the caller. */ + kpreempt_disable(); pmap_tlb_shootnow(); kpreempt_enable(); @@ -4885,7 +5228,7 @@ pmap_update(struct pmap *pmap) * is an unlocked check, but is safe as we're only interested in * work done in this LWP - we won't get a false negative. */ - if (!LIST_EMPTY(&pmap->pm_gc_ptp)) { + if (__predict_false(!LIST_EMPTY(&pmap->pm_gc_ptp))) { mutex_enter(&pmap->pm_lock); while ((ptp = LIST_FIRST(&pmap->pm_gc_ptp)) != NULL) { KASSERT(ptp->wire_count == 0); @@ -4893,7 +5236,9 @@ pmap_update(struct pmap *pmap) pp = VM_PAGE_TO_PP(ptp); LIST_INIT(&pp->pp_pvlist); pp->pp_attrs = 0; - pp->pp_pflags = 0; + pp->pp_pte.pte_ptp = NULL; + pp->pp_pte.pte_va = 0; + PMAP_CHECK_PP(VM_PAGE_TO_PP(ptp)); /* * XXX Hack to avoid extra locking, and lock @@ -5248,10 +5593,10 @@ pmap_ept_enter(struct pmap *pmap, vaddr_ bool accessed; struct pmap_ptparray pt; int error; - bool getptp; + bool getptp, samepage, new_embedded; + rb_tree_t *tree; KASSERT(pmap_initialized); - KASSERT(pmap->pm_remove_all == NULL); KASSERT(va < VM_MAXUSER_ADDRESS); npte = pa | pmap_ept_prot(prot) | pmap_ept_type(flags); @@ -5298,18 +5643,36 @@ pmap_ept_enter(struct pmap *pmap, vaddr_ error); } } + tree = &VM_PAGE_TO_PP(ptp)->pp_rb; + } else { + /* Embedded PV entries rely on this. */ + KASSERT(va != 0); + tree = &pmap_kernel_rb; } /* - * Now check to see if we need a pv entry for this VA. If we do, - * allocate and install in the radix tree. In any case look up the - * pv entry in case the old mapping used it. + * Look up the old PV entry at this VA (if any), and insert a new PV + * entry if required for the new mapping. Temporarily track the old + * and new mappings concurrently. Only after the old mapping is + * evicted from the pmap will we remove its PV entry. Otherwise, + * our picture of modified/accessed state for either page could get + * out of sync (we need any P->V operation for either page to stall + * on pmap->pm_lock until done here). 
*/ - old_pve = NULL; new_pve = NULL; - if (pmap_pp_needs_pve(new_pp, ptp, va)) { - new_pve = pool_cache_get(&pmap_pv_cache, PR_NOWAIT); - if (new_pve == NULL) { + old_pve = NULL; + samepage = false; + new_embedded = false; + + if (new_pp != NULL) { + error = pmap_enter_pv(pmap, new_pp, ptp, va, &new_pve, + &old_pve, &samepage, &new_embedded, tree); + + /* + * If a new pv_entry was needed and none was available, we + * can go no further. + */ + if (error != 0) { if (flags & PMAP_CANFAIL) { if (getptp) { pmap_unget_ptp(pmap, &pt); @@ -5318,7 +5681,9 @@ pmap_ept_enter(struct pmap *pmap, vaddr_ return error; } panic("%s: alloc pve failed", __func__); - } + } + } else { + old_pve = pmap_treelookup_pv(pmap, ptp, tree, va); } /* Map PTEs into address space. */ @@ -5329,12 +5694,7 @@ pmap_ept_enter(struct pmap *pmap, vaddr_ pmap_ept_install_ptp(pmap, &pt, va); } - /* - * Check if there is an existing mapping. If we are now sure that - * we need pves and we failed to allocate them earlier, handle that. - * Caching the value of oldpa here is safe because only the mod/ref - * bits can change while the pmap is locked. - */ + /* Check if there is an existing mapping. */ ptes = (pt_entry_t *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp)); ptep = &ptes[pl1_pi(va)]; opte = *ptep; @@ -5356,11 +5716,20 @@ pmap_ept_enter(struct pmap *pmap, vaddr_ } while (pmap_pte_cas(ptep, opte, npte) != opte); /* + * Done with the PTEs: they can now be unmapped. + */ + kpreempt_enable(); + + /* * Update statistics and PTP's reference count. */ pmap_ept_stats_update_bypte(pmap, npte, opte); - if (ptp != NULL && !have_oldpa) { - ptp->wire_count++; + if (ptp != NULL) { + if (!have_oldpa) { + ptp->wire_count++; + } + /* Remember minimum VA in PTP. */ + pmap_ptp_range_set(ptp, va); } KASSERT(ptp == NULL || ptp->wire_count > 1); @@ -5369,11 +5738,17 @@ pmap_ept_enter(struct pmap *pmap, vaddr_ */ if (((opte ^ npte) & (PTE_FRAME | EPT_R)) == 0) { KASSERT(((opte ^ npte) & EPT_PVLIST) == 0); + if ((npte & EPT_PVLIST) != 0) { + KASSERT(samepage); + pmap_check_pv(pmap, ptp, new_pp, va, true); + } goto same_pa; + } else if ((npte & EPT_PVLIST) != 0) { + KASSERT(!samepage); } /* - * If old page is pv-tracked, replace pv_entry from its list. + * If old page is pv-tracked, remove pv_entry from its list. */ if ((~opte & (EPT_R | EPT_PVLIST)) == 0) { if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) { @@ -5385,19 +5760,35 @@ pmap_ept_enter(struct pmap *pmap, vaddr_ __func__, va, oldpa, atop(pa)); } - old_pve = pmap_lookup_pv(pmap, ptp, old_pp, va); pmap_remove_pv(pmap, old_pp, ptp, va, old_pve, pmap_ept_to_pp_attrs(opte)); + if (old_pve != NULL) { + if (pmap->pm_pve == NULL) { + pmap->pm_pve = old_pve; + } else { + pool_cache_put(&pmap_pv_cache, old_pve); + } + } + } else { + KASSERT(old_pve == NULL); + KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); } /* - * If new page is pv-tracked, insert pv_entry into its list. + * If new page is dynamically PV tracked, insert to tree. */ - if (new_pp) { - new_pve = pmap_enter_pv(pmap, new_pp, new_pve, ptp, va); + if (new_pve != NULL) { + KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL); + old_pve = rb_tree_insert_node(tree, new_pve); + KASSERT(old_pve == new_pve); + pmap_check_pv(pmap, ptp, new_pp, va, true); } same_pa: + /* + * shootdown tlb if necessary. 
+ */ + if (pmap_ept_has_ad) { accessed = (~opte & (EPT_R | EPT_A)) == 0; } else { @@ -5406,18 +5797,8 @@ same_pa: if (accessed && ((opte ^ npte) & (PTE_FRAME | EPT_W)) != 0) { pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_ENTER); } - - error = 0; - kpreempt_enable(); - if (old_pve != NULL) { - pool_cache_put(&pmap_pv_cache, old_pve); - } - if (new_pve != NULL) { - pool_cache_put(&pmap_pv_cache, new_pve); - } mutex_exit(&pmap->pm_lock); - - return error; + return 0; } /* Pay close attention, this returns L2. */ @@ -5541,6 +5922,8 @@ pmap_ept_remove_pte(struct pmap *pmap, s "managed page without EPT_PVLIST for %#"PRIxVADDR, va); KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL), "pv-tracked page without EPT_PVLIST for %#"PRIxVADDR, va); + KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ? + &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), va) == NULL); return true; } @@ -5575,6 +5958,12 @@ pmap_ept_remove_ptes(struct pmap *pmap, KASSERT(kpreempt_disabled()); /* + * mappings are very often sparse, so clip the given range to the + * range of PTEs that are known present in the PTP. + */ + pmap_ptp_range_clip(ptp, &startva, &pte); + + /* * note that ptpva points to the PTE that maps startva. this may * or may not be the first PTE in the PTP. * @@ -5636,10 +6025,6 @@ pmap_ept_remove(struct pmap *pmap, vaddr } kpreempt_enable(); - /* - * Radix tree nodes are removed here, so we need to continue holding - * the pmap locked until complete. - */ if (pv_tofree != NULL) { pmap_free_pvs(pmap, pv_tofree); }
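A final note on the sparseness optimization added for pmap_remove_ptes() and
pmap_ept_remove_ptes(): pmap_ptp_range_set() records the lowest VA ever
entered into a PTP (reusing ptp->uanon as raw storage), and
pmap_ptp_range_clip() raises the start of a removal scan to that point. The
sketch below restates the arithmetic in isolation; the sketch_* names, the
4 KiB page size and the 8-byte PTE type are assumptions for illustration, not
the kernel's actual definitions.

#include <stdint.h>

#define SKETCH_PAGE_SIZE 4096UL          /* assumption: 4 KiB pages */

struct sketch_ptp {
	uintptr_t min_va;                /* the kernel reuses ptp->uanon */
};

static inline void
sketch_ptp_init(struct sketch_ptp *ptp)
{
	ptp->min_va = ~(uintptr_t)0;     /* "no VA entered yet" */
}

/* Called on every entry into the PTP: remember the lowest VA seen. */
static inline void
sketch_ptp_range_set(struct sketch_ptp *ptp, uintptr_t va)
{
	if (va < ptp->min_va)
		ptp->min_va = va;
}

/*
 * Called before a PTE removal scan: raise *startva to the recorded minimum
 * and advance the PTE pointer by the slots skipped (one PTE per
 * SKETCH_PAGE_SIZE bytes of VA).
 */
static inline void
sketch_ptp_range_clip(const struct sketch_ptp *ptp, uintptr_t *startva,
    uint64_t **pte)
{
	uintptr_t sclip;

	if (ptp == NULL)
		return;
	sclip = (*startva < ptp->min_va) ? ptp->min_va : *startva;
	*pte += (sclip - *startva) / SKETCH_PAGE_SIZE;
	*startva = sclip;
}

Since mappings are usually clustered near the top of a PTP's range for stack
and similar allocations, this lets the removal loop skip the leading empty
PTE slots in one step (e.g. on amd64 a PTP covers 512 PTEs).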