Module Name:    src
Committed By:   ad
Date:           Tue Dec 31 22:42:51 UTC 2019

Modified Files:
        src/sys/kern: kern_idle.c
        src/sys/miscfs/genfs: genfs_io.c
        src/sys/ufs/lfs: lfs_pages.c lfs_vfsops.c ulfs_inode.c
        src/sys/ufs/ufs: ufs_inode.c
        src/sys/uvm: uvm.h uvm_anon.c uvm_aobj.c uvm_bio.c uvm_extern.h
            uvm_fault.c uvm_glue.c uvm_loan.c uvm_map.c uvm_object.c uvm_page.c
            uvm_page.h uvm_pager.c uvm_pdaemon.c uvm_pdpolicy.h
            uvm_pdpolicy_clock.c uvm_pdpolicy_clockpro.c

Log Message:
- Add and use wrapper functions that acquire and release page interlocks, and
  pairs of page interlocks.  Require that the page interlock be held over
  calls to uvm_pageactivate(), uvm_pagewire() and similar.

- Solve the concurrency problem with page replacement state.  Rather than
  updating the global state synchronously, set an intended state on
  individual pages (active, inactive, enqueued, dequeued) while holding the
  page interlock.  After the interlock is released, put the pages on a
  128-entry per-CPU queue for their state changes to be made real in batch.
  This results in a ~400-fold decrease in contention on my test system.
  Proposed on tech-kern but modified to use the page interlock rather than
  atomics to synchronise, as it's much easier to maintain that way, and
  cheaper.


To generate a diff of this commit:
cvs rdiff -u -r1.28 -r1.29 src/sys/kern/kern_idle.c
cvs rdiff -u -r1.82 -r1.83 src/sys/miscfs/genfs/genfs_io.c
cvs rdiff -u -r1.18 -r1.19 src/sys/ufs/lfs/lfs_pages.c
cvs rdiff -u -r1.366 -r1.367 src/sys/ufs/lfs/lfs_vfsops.c
cvs rdiff -u -r1.22 -r1.23 src/sys/ufs/lfs/ulfs_inode.c
cvs rdiff -u -r1.106 -r1.107 src/sys/ufs/ufs/ufs_inode.c
cvs rdiff -u -r1.72 -r1.73 src/sys/uvm/uvm.h
cvs rdiff -u -r1.69 -r1.70 src/sys/uvm/uvm_anon.c
cvs rdiff -u -r1.132 -r1.133 src/sys/uvm/uvm_aobj.c
cvs rdiff -u -r1.101 -r1.102 src/sys/uvm/uvm_bio.c
cvs rdiff -u -r1.217 -r1.218 src/sys/uvm/uvm_extern.h
cvs rdiff -u -r1.213 -r1.214 src/sys/uvm/uvm_fault.c
cvs rdiff -u -r1.174 -r1.175 src/sys/uvm/uvm_glue.c
cvs rdiff -u -r1.92 -r1.93 src/sys/uvm/uvm_loan.c src/sys/uvm/uvm_page.h
cvs rdiff -u -r1.368 -r1.369 src/sys/uvm/uvm_map.c
cvs rdiff -u -r1.18 -r1.19 src/sys/uvm/uvm_object.c
cvs rdiff -u -r1.219 -r1.220 src/sys/uvm/uvm_page.c
cvs rdiff -u -r1.118 -r1.119 src/sys/uvm/uvm_pager.c
cvs rdiff -u -r1.121 -r1.122 src/sys/uvm/uvm_pdaemon.c
cvs rdiff -u -r1.5 -r1.6 src/sys/uvm/uvm_pdpolicy.h
cvs rdiff -u -r1.27 -r1.28 src/sys/uvm/uvm_pdpolicy_clock.c
cvs rdiff -u -r1.21 -r1.22 src/sys/uvm/uvm_pdpolicy_clockpro.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/kern/kern_idle.c
diff -u src/sys/kern/kern_idle.c:1.28 src/sys/kern/kern_idle.c:1.29
--- src/sys/kern/kern_idle.c:1.28	Fri Dec  6 21:36:10 2019
+++ src/sys/kern/kern_idle.c	Tue Dec 31 22:42:51 2019
@@ -1,4 +1,4 @@
-/*	$NetBSD: kern_idle.c,v 1.28 2019/12/06 21:36:10 ad Exp $	*/
+/*	$NetBSD: kern_idle.c,v 1.29 2019/12/31 22:42:51 ad Exp $	*/
 
 /*-
  * Copyright (c)2002, 2006, 2007 YAMAMOTO Takashi,
@@ -28,7 +28,7 @@
 
 #include <sys/cdefs.h>
 
-__KERNEL_RCSID(0, "$NetBSD: kern_idle.c,v 1.28 2019/12/06 21:36:10 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: kern_idle.c,v 1.29 2019/12/31 22:42:51 ad Exp $");
 
 #include <sys/param.h>
 #include <sys/cpu.h>
@@ -39,7 +39,7 @@ __KERNEL_RCSID(0, "$NetBSD: kern_idle.c,
 #include <sys/proc.h>
 #include <sys/atomic.h>
 
-#include <uvm/uvm.h>	/* uvm_pageidlezero */
+#include <uvm/uvm.h>	/* uvm_idle */
 #include <uvm/uvm_extern.h>
 
 void
@@ -81,7 +81,7 @@ idle_loop(void *dummy)
 		sched_idle();
 		if (!sched_curcpu_runnable_p()) {
 			if ((spc->spc_flags & SPCF_OFFLINE) == 0) {
-				uvm_pageidlezero();
+				uvm_idle();
 			}
 			if (!sched_curcpu_runnable_p()) {
 				cpu_idle();

Index: src/sys/miscfs/genfs/genfs_io.c
diff -u src/sys/miscfs/genfs/genfs_io.c:1.82 src/sys/miscfs/genfs/genfs_io.c:1.83
--- src/sys/miscfs/genfs/genfs_io.c:1.82	Tue Dec 31 12:40:27 2019
+++ src/sys/miscfs/genfs/genfs_io.c	Tue Dec 31 22:42:50 2019
@@ -1,4 +1,4 @@
-/*	$NetBSD: genfs_io.c,v 1.82 2019/12/31 12:40:27 ad Exp $	*/
+/*	$NetBSD: genfs_io.c,v 1.83 2019/12/31 22:42:50 ad Exp $	*/
 
 /*
  * Copyright (c) 1982, 1986, 1989, 1993
@@ -31,7 +31,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: genfs_io.c,v 1.82 2019/12/31 12:40:27 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: genfs_io.c,v 1.83 2019/12/31 22:42:50 ad Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -491,7 +491,9 @@ out:
 				uvm_pagefree(pg);
 				continue;
 			}
+			uvm_pagelock(pg);
 			uvm_pageenqueue(pg);
+			uvm_pageunlock(pg);
 			pg->flags &= ~(PG_WANTED|PG_BUSY|PG_FAKE);
 			UVM_PAGE_OWN(pg, NULL);
 		}
@@ -1164,14 +1166,18 @@ retry:
 			if (tpg->offset < startoff || tpg->offset >= endoff)
 				continue;
 			if (flags & PGO_DEACTIVATE && tpg->wire_count == 0) {
+				uvm_pagelock(tpg);
 				uvm_pagedeactivate(tpg);
+				uvm_pageunlock(tpg);
 			} else if (flags & PGO_FREE) {
 				pmap_page_protect(tpg, VM_PROT_NONE);
 				if (tpg->flags & PG_BUSY) {
 					tpg->flags |= freeflag;
 					if (pagedaemon) {
 						uvm_pageout_start(1);
+						uvm_pagelock(tpg);
 						uvm_pagedequeue(tpg);
+						uvm_pageunlock(tpg);
 					}
 				} else {
 
@@ -1603,7 +1609,9 @@ genfs_compat_getpages(void *v)
 			pg->flags |= PG_RELEASED;
 		} else {
 			pmap_clear_modify(pg);
+			uvm_pagelock(pg);
 			uvm_pageactivate(pg);
+			uvm_pageunlock(pg);
 		}
 	}
 	if (error) {

Index: src/sys/ufs/lfs/lfs_pages.c
diff -u src/sys/ufs/lfs/lfs_pages.c:1.18 src/sys/ufs/lfs/lfs_pages.c:1.19
--- src/sys/ufs/lfs/lfs_pages.c:1.18	Fri Dec 20 20:54:48 2019
+++ src/sys/ufs/lfs/lfs_pages.c	Tue Dec 31 22:42:51 2019
@@ -1,4 +1,4 @@
-/*	$NetBSD: lfs_pages.c,v 1.18 2019/12/20 20:54:48 ad Exp $	*/
+/*	$NetBSD: lfs_pages.c,v 1.19 2019/12/31 22:42:51 ad Exp $	*/
 
 /*-
  * Copyright (c) 1999, 2000, 2001, 2002, 2003, 2019 The NetBSD Foundation, Inc.
@@ -60,7 +60,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: lfs_pages.c,v 1.18 2019/12/20 20:54:48 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: lfs_pages.c,v 1.19 2019/12/31 22:42:51 ad Exp $");
 
 #ifdef _KERNEL_OPT
 #include "opt_compat_netbsd.h"
@@ -338,7 +338,9 @@ check_dirty(struct lfs *fs, struct vnode
 					 * Wire the page so that
 					 * pdaemon doesn't see it again.
 					 */
+					uvm_pagelock(pg);
 					uvm_pagewire(pg);
+					uvm_pageunlock(pg);
 
 					/* Suspended write flag */
 					pg->flags |= PG_DELWRI;
@@ -495,7 +497,9 @@ retry:
 						    "lfsput2", 0);
 				mutex_enter(vp->v_interlock);
 			}
+			uvm_pagelock(pg);
 			uvm_pageactivate(pg);
+			uvm_pageunlock(pg);
 		}
 		ap->a_offlo = blkeof;
 		if (ap->a_offhi > 0 && ap->a_offhi <= ap->a_offlo) {

Index: src/sys/ufs/lfs/lfs_vfsops.c
diff -u src/sys/ufs/lfs/lfs_vfsops.c:1.366 src/sys/ufs/lfs/lfs_vfsops.c:1.367
--- src/sys/ufs/lfs/lfs_vfsops.c:1.366	Fri Dec 13 20:10:22 2019
+++ src/sys/ufs/lfs/lfs_vfsops.c	Tue Dec 31 22:42:51 2019
@@ -1,4 +1,4 @@
-/*	$NetBSD: lfs_vfsops.c,v 1.366 2019/12/13 20:10:22 ad Exp $	*/
+/*	$NetBSD: lfs_vfsops.c,v 1.367 2019/12/31 22:42:51 ad Exp $	*/
 
 /*-
  * Copyright (c) 1999, 2000, 2001, 2002, 2003, 2007, 2007
@@ -61,7 +61,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: lfs_vfsops.c,v 1.366 2019/12/13 20:10:22 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: lfs_vfsops.c,v 1.367 2019/12/31 22:42:51 ad Exp $");
 
 #if defined(_KERNEL_OPT)
 #include "opt_lfs.h"
@@ -2054,7 +2054,9 @@ lfs_gop_write(struct vnode *vp, struct v
 			pgs[i]->flags |= PG_PAGEOUT;
 			uvm_pageout_start(1);
 			mutex_enter(vp->v_interlock);
+			uvm_pagelock(pgs[i]);
 			uvm_pageunwire(pgs[i]);
+			uvm_pageunlock(pgs[i]);
 			mutex_exit(vp->v_interlock);
 		}
 	}
@@ -2241,10 +2243,12 @@ lfs_gop_write(struct vnode *vp, struct v
 
 		if (pg->flags & PG_PAGEOUT)
 			uvm_pageout_done(1);
+		uvm_pagelock(pg);
 		if (pg->flags & PG_DELWRI) {
 			uvm_pageunwire(pg);
 		}
 		uvm_pageactivate(pg);
+		uvm_pageunlock(pg);
 		pg->flags &= ~(PG_CLEAN|PG_DELWRI|PG_PAGEOUT|PG_RELEASED);
 		DLOG((DLOG_PAGE, "pg[%d] = %p (vp %p off %" PRIx64 ")\n", i, pg,
 			vp, pg->offset));

Index: src/sys/ufs/lfs/ulfs_inode.c
diff -u src/sys/ufs/lfs/ulfs_inode.c:1.22 src/sys/ufs/lfs/ulfs_inode.c:1.23
--- src/sys/ufs/lfs/ulfs_inode.c:1.22	Fri Dec 13 20:10:22 2019
+++ src/sys/ufs/lfs/ulfs_inode.c	Tue Dec 31 22:42:51 2019
@@ -1,4 +1,4 @@
-/*	$NetBSD: ulfs_inode.c,v 1.22 2019/12/13 20:10:22 ad Exp $	*/
+/*	$NetBSD: ulfs_inode.c,v 1.23 2019/12/31 22:42:51 ad Exp $	*/
 /*  from NetBSD: ufs_inode.c,v 1.95 2015/06/13 14:56:45 hannken Exp  */
 
 /*
@@ -38,7 +38,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: ulfs_inode.c,v 1.22 2019/12/13 20:10:22 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: ulfs_inode.c,v 1.23 2019/12/31 22:42:51 ad Exp $");
 
 #if defined(_KERNEL_OPT)
 #include "opt_lfs.h"
@@ -243,7 +243,9 @@ ulfs_balloc_range(struct vnode *vp, off_
 			}
 			pgs[i]->flags &= ~PG_CLEAN;
 		}
+		uvm_pagelock(pgs[i]);
 		uvm_pageactivate(pgs[i]);
+		uvm_pageunlock(pgs[i]);
 	}
 	uvm_page_unbusy(pgs, npages);
 	mutex_exit(uobj->vmobjlock);

Index: src/sys/ufs/ufs/ufs_inode.c
diff -u src/sys/ufs/ufs/ufs_inode.c:1.106 src/sys/ufs/ufs/ufs_inode.c:1.107
--- src/sys/ufs/ufs/ufs_inode.c:1.106	Fri Dec 13 20:10:22 2019
+++ src/sys/ufs/ufs/ufs_inode.c	Tue Dec 31 22:42:51 2019
@@ -1,4 +1,4 @@
-/*	$NetBSD: ufs_inode.c,v 1.106 2019/12/13 20:10:22 ad Exp $	*/
+/*	$NetBSD: ufs_inode.c,v 1.107 2019/12/31 22:42:51 ad Exp $	*/
 
 /*
  * Copyright (c) 1991, 1993
@@ -37,7 +37,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: ufs_inode.c,v 1.106 2019/12/13 20:10:22 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: ufs_inode.c,v 1.107 2019/12/31 22:42:51 ad Exp $");
 
 #if defined(_KERNEL_OPT)
 #include "opt_ffs.h"
@@ -279,7 +279,9 @@ ufs_balloc_range(struct vnode *vp, off_t
 			}
 			pgs[i]->flags &= ~PG_CLEAN;
 		}
+		uvm_pagelock(pgs[i]);
 		uvm_pageactivate(pgs[i]);
+		uvm_pageunlock(pgs[i]);
 	}
 	uvm_page_unbusy(pgs, npages);
 	mutex_exit(uobj->vmobjlock);

Index: src/sys/uvm/uvm.h
diff -u src/sys/uvm/uvm.h:1.72 src/sys/uvm/uvm.h:1.73
--- src/sys/uvm/uvm.h:1.72	Fri Dec 27 13:19:24 2019
+++ src/sys/uvm/uvm.h	Tue Dec 31 22:42:51 2019
@@ -1,4 +1,4 @@
-/*	$NetBSD: uvm.h,v 1.72 2019/12/27 13:19:24 ad Exp $	*/
+/*	$NetBSD: uvm.h,v 1.73 2019/12/31 22:42:51 ad Exp $	*/
 
 /*
  * Copyright (c) 1997 Charles D. Cranor and Washington University.
@@ -87,6 +87,12 @@ struct uvm_cpu {
 
 	/* entropy */
 	krndsource_t 	rs;			/* entropy source */
+
+	/* uvmpdpol: queue of intended page status changes. */
+	struct vm_page	**pdq;			/* queue entries */
+	u_int		pdqhead;		/* current queue head */
+	u_int		pdqtail;		/* maximum number entries */
+	int		pdqtime;		/* last time queue cleared */
 };
 
 /*

Index: src/sys/uvm/uvm_anon.c
diff -u src/sys/uvm/uvm_anon.c:1.69 src/sys/uvm/uvm_anon.c:1.70
--- src/sys/uvm/uvm_anon.c:1.69	Fri Dec 13 20:10:22 2019
+++ src/sys/uvm/uvm_anon.c	Tue Dec 31 22:42:51 2019
@@ -1,4 +1,4 @@
-/*	$NetBSD: uvm_anon.c,v 1.69 2019/12/13 20:10:22 ad Exp $	*/
+/*	$NetBSD: uvm_anon.c,v 1.70 2019/12/31 22:42:51 ad Exp $	*/
 
 /*
  * Copyright (c) 1997 Charles D. Cranor and Washington University.
@@ -30,7 +30,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: uvm_anon.c,v 1.69 2019/12/13 20:10:22 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: uvm_anon.c,v 1.70 2019/12/31 22:42:51 ad Exp $");
 
 #include "opt_uvmhist.h"
 
@@ -352,7 +352,9 @@ uvm_anon_pagein(struct vm_amap *amap, st
 	 * Deactivate the page (to put it on a page queue).
 	 */
 
+	uvm_pagelock(pg);
 	uvm_pagedeactivate(pg);
+	uvm_pageunlock(pg);
 	if (pg->flags & PG_WANTED) {
 		pg->flags &= ~PG_WANTED;
 		wakeup(pg);

Index: src/sys/uvm/uvm_aobj.c
diff -u src/sys/uvm/uvm_aobj.c:1.132 src/sys/uvm/uvm_aobj.c:1.133
--- src/sys/uvm/uvm_aobj.c:1.132	Sun Dec 15 21:11:35 2019
+++ src/sys/uvm/uvm_aobj.c	Tue Dec 31 22:42:51 2019
@@ -1,4 +1,4 @@
-/*	$NetBSD: uvm_aobj.c,v 1.132 2019/12/15 21:11:35 ad Exp $	*/
+/*	$NetBSD: uvm_aobj.c,v 1.133 2019/12/31 22:42:51 ad Exp $	*/
 
 /*
  * Copyright (c) 1998 Chuck Silvers, Charles D. Cranor and
@@ -38,7 +38,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: uvm_aobj.c,v 1.132 2019/12/15 21:11:35 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: uvm_aobj.c,v 1.133 2019/12/31 22:42:51 ad Exp $");
 
 #ifdef _KERNEL_OPT
 #include "opt_uvmhist.h"
@@ -738,7 +738,9 @@ uao_put(struct uvm_object *uobj, voff_t 
 		case PGO_CLEANIT|PGO_DEACTIVATE:
 		case PGO_DEACTIVATE:
  deactivate_it:
+ 			uvm_pagelock(pg);
 			uvm_pagedeactivate(pg);
+ 			uvm_pageunlock(pg);
 			break;
 
 		case PGO_FREE:
@@ -1299,7 +1301,9 @@ uao_pagein_page(struct uvm_aobj *aobj, i
 	/*
 	 * make sure it's on a page queue.
 	 */
+	uvm_pagelock(pg);
 	uvm_pageenqueue(pg);
+	uvm_pageunlock(pg);
 
 	if (pg->flags & PG_WANTED) {
 		wakeup(pg);

Index: src/sys/uvm/uvm_bio.c
diff -u src/sys/uvm/uvm_bio.c:1.101 src/sys/uvm/uvm_bio.c:1.102
--- src/sys/uvm/uvm_bio.c:1.101	Fri Dec 13 20:10:22 2019
+++ src/sys/uvm/uvm_bio.c	Tue Dec 31 22:42:51 2019
@@ -1,4 +1,4 @@
-/*	$NetBSD: uvm_bio.c,v 1.101 2019/12/13 20:10:22 ad Exp $	*/
+/*	$NetBSD: uvm_bio.c,v 1.102 2019/12/31 22:42:51 ad Exp $	*/
 
 /*
  * Copyright (c) 1998 Chuck Silvers.
@@ -34,7 +34,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: uvm_bio.c,v 1.101 2019/12/13 20:10:22 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: uvm_bio.c,v 1.102 2019/12/31 22:42:51 ad Exp $");
 
 #include "opt_uvmhist.h"
 #include "opt_ubc.h"
@@ -285,7 +285,9 @@ ubc_fault_page(const struct uvm_faultinf
 	error = pmap_enter(ufi->orig_map->pmap, va, VM_PAGE_TO_PHYS(pg),
 	    prot & mask, PMAP_CANFAIL | (access_type & mask));
 
+	uvm_pagelock(pg);
 	uvm_pageactivate(pg);
+	uvm_pageunlock(pg);
 	pg->flags &= ~(PG_BUSY|PG_WANTED);
 	UVM_PAGE_OWN(pg, NULL);
 
@@ -665,7 +667,9 @@ ubc_release(void *va, int flags)
 			pgs[i] = PHYS_TO_VM_PAGE(pa);
 			pgs[i]->flags &= ~(PG_FAKE|PG_CLEAN);
 			KASSERT(pgs[i]->loan_count == 0);
+			uvm_pagelock(pgs[i]);
 			uvm_pageactivate(pgs[i]);
+			uvm_pageunlock(pgs[i]);
 		}
 		pmap_kremove(umapva, ubc_winsize);
 		pmap_update(pmap_kernel());
@@ -888,7 +892,9 @@ ubc_direct_release(struct uvm_object *uo
 	for (int i = 0; i < npages; i++) {
 		struct vm_page *pg = pgs[i];
 
+		uvm_pagelock(pg);
 		uvm_pageactivate(pg);
+		uvm_pageunlock(pg);
 
 		/* Page was changed, no longer fake and neither clean */
 		if (flags & UBC_WRITE)

Index: src/sys/uvm/uvm_extern.h
diff -u src/sys/uvm/uvm_extern.h:1.217 src/sys/uvm/uvm_extern.h:1.218
--- src/sys/uvm/uvm_extern.h:1.217	Tue Dec 31 13:07:14 2019
+++ src/sys/uvm/uvm_extern.h	Tue Dec 31 22:42:51 2019
@@ -1,4 +1,4 @@
-/*	$NetBSD: uvm_extern.h,v 1.217 2019/12/31 13:07:14 ad Exp $	*/
+/*	$NetBSD: uvm_extern.h,v 1.218 2019/12/31 22:42:51 ad Exp $	*/
 
 /*
  * Copyright (c) 1997 Charles D. Cranor and Washington University.
@@ -639,6 +639,7 @@ int			uvm_coredump_walkmap(struct proc *
 int			uvm_coredump_count_segs(struct proc *);
 void			uvm_proc_exit(struct proc *);
 void			uvm_lwp_exit(struct lwp *);
+void			uvm_idle(void);
 void			uvm_init_limits(struct proc *);
 bool			uvm_kernacc(void *, size_t, vm_prot_t);
 __dead void		uvm_scheduler(void);

Index: src/sys/uvm/uvm_fault.c
diff -u src/sys/uvm/uvm_fault.c:1.213 src/sys/uvm/uvm_fault.c:1.214
--- src/sys/uvm/uvm_fault.c:1.213	Mon Dec 16 22:47:55 2019
+++ src/sys/uvm/uvm_fault.c	Tue Dec 31 22:42:51 2019
@@ -1,4 +1,4 @@
-/*	$NetBSD: uvm_fault.c,v 1.213 2019/12/16 22:47:55 ad Exp $	*/
+/*	$NetBSD: uvm_fault.c,v 1.214 2019/12/31 22:42:51 ad Exp $	*/
 
 /*
  * Copyright (c) 1997 Charles D. Cranor and Washington University.
@@ -32,7 +32,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: uvm_fault.c,v 1.213 2019/12/16 22:47:55 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: uvm_fault.c,v 1.214 2019/12/31 22:42:51 ad Exp $");
 
 #include "opt_uvmhist.h"
 
@@ -197,7 +197,9 @@ uvmfault_anonflush(struct vm_anon **anon
 		KASSERT(mutex_owned(anons[lcv]->an_lock));
 		pg = anons[lcv]->an_page;
 		if (pg && (pg->flags & PG_BUSY) == 0) {
+			uvm_pagelock(pg);
 			uvm_pagedeactivate(pg);
+			uvm_pageunlock(pg);
 		}
 	}
 }
@@ -482,7 +484,9 @@ released:
 			 * We have successfully read the page, activate it.
 			 */
 
+			uvm_pagelock(pg);
 			uvm_pageactivate(pg);
+			uvm_pageunlock(pg);
 			pg->flags &= ~(PG_WANTED|PG_BUSY|PG_FAKE);
 			UVM_PAGE_OWN(pg, NULL);
 #else
@@ -1252,7 +1256,9 @@ uvm_fault_upper_neighbor(
 
 	/* locked: amap, anon */
 
+	uvm_pagelock(pg);
 	uvm_pageenqueue(pg);
+	uvm_pageunlock(pg);
 	UVMHIST_LOG(maphist,
 	    "  MAPPING: n anon: pm=%#jx, va=%#jx, pg=%#jx",
 	    (uintptr_t)ufi->orig_map->pmap, currva, (uintptr_t)pg, 0);
@@ -1469,7 +1475,9 @@ uvm_fault_upper_promote(
 
 	pg = anon->an_page;
 	/* uvm_fault_upper_done will activate the page */
+	uvm_pagelock(pg);
 	uvm_pageenqueue(pg);
+	uvm_pageunlock(pg);
 	pg->flags &= ~(PG_BUSY|PG_FAKE);
 	UVM_PAGE_OWN(pg, NULL);
 
@@ -1601,6 +1609,7 @@ uvm_fault_upper_done(
 	 * ... update the page queues.
 	 */
 
+	uvm_pagelock(pg);
 	if (wire_paging) {
 		uvm_pagewire(pg);
 
@@ -1615,6 +1624,7 @@ uvm_fault_upper_done(
 	} else {
 		uvm_pageactivate(pg);
 	}
+	uvm_pageunlock(pg);
 
 	if (wire_paging) {
 		uvm_anon_dropswap(anon);
@@ -1833,7 +1843,9 @@ uvm_fault_lower_neighbor(
 	 * for this.  we can just directly enter the pages.
 	 */
 
+	uvm_pagelock(pg);
 	uvm_pageenqueue(pg);
+	uvm_pageunlock(pg);
 	UVMHIST_LOG(maphist,
 	    "  MAPPING: n obj: pm=%#jx, va=%#jx, pg=%#jx",
 	    (uintptr_t)ufi->orig_map->pmap, currva, (uintptr_t)pg, 0);
@@ -1950,7 +1962,9 @@ uvm_fault_lower_io(
 	mutex_enter(uobj->vmobjlock);
 	KASSERT((pg->flags & PG_BUSY) != 0);
 
+	uvm_pagelock(pg);
 	uvm_pageactivate(pg);
+	uvm_pageunlock(pg);
 
 	/* locked(locked): maps(read), amap(if !null), uobj, pg */
 	/* locked(!locked): uobj, pg */
@@ -2249,7 +2263,9 @@ uvm_fault_lower_enter(
 		 * we just promoted the page.
 		 */
 
+		uvm_pagelock(pg);
 		uvm_pageenqueue(pg);
+		uvm_pageunlock(pg);
 
 		if (pg->flags & PG_WANTED)
 			wakeup(pg);
@@ -2308,6 +2324,7 @@ uvm_fault_lower_done(
 
 	UVMHIST_FUNC("uvm_fault_lower_done"); UVMHIST_CALLED(maphist);
 
+	uvm_pagelock(pg);
 	if (flt->wire_paging) {
 		uvm_pagewire(pg);
 		if (pg->flags & PG_AOBJ) {
@@ -2326,6 +2343,7 @@ uvm_fault_lower_done(
 	} else {
 		uvm_pageactivate(pg);
 	}
+	uvm_pageunlock(pg);
 
 	if (dropswap) {
 		uao_dropswap(uobj, pg->offset >> PAGE_SHIFT);
@@ -2455,8 +2473,11 @@ uvm_fault_unwire_locked(struct vm_map *m
 			pmap_unwire(pmap, va);
 
 		pg = PHYS_TO_VM_PAGE(pa);
-		if (pg)
+		if (pg) {
+			uvm_pagelock(pg);
 			uvm_pageunwire(pg);
+			uvm_pageunlock(pg);
+		}
 	}
 
 	if (oentry != NULL) {

Index: src/sys/uvm/uvm_glue.c
diff -u src/sys/uvm/uvm_glue.c:1.174 src/sys/uvm/uvm_glue.c:1.175
--- src/sys/uvm/uvm_glue.c:1.174	Tue Dec 31 13:07:14 2019
+++ src/sys/uvm/uvm_glue.c	Tue Dec 31 22:42:51 2019
@@ -1,4 +1,4 @@
-/*	$NetBSD: uvm_glue.c,v 1.174 2019/12/31 13:07:14 ad Exp $	*/
+/*	$NetBSD: uvm_glue.c,v 1.175 2019/12/31 22:42:51 ad Exp $	*/
 
 /*
  * Copyright (c) 1997 Charles D. Cranor and Washington University.
@@ -62,7 +62,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: uvm_glue.c,v 1.174 2019/12/31 13:07:14 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: uvm_glue.c,v 1.175 2019/12/31 22:42:51 ad Exp $");
 
 #include "opt_kgdb.h"
 #include "opt_kstack.h"
@@ -86,6 +86,7 @@ __KERNEL_RCSID(0, "$NetBSD: uvm_glue.c,v
 #include <sys/asan.h>
 
 #include <uvm/uvm.h>
+#include <uvm/uvm_pdpolicy.h>
 #include <uvm/uvm_pgflcache.h>
 
 /*
@@ -516,3 +517,22 @@ uvm_scheduler(void)
 		(void)kpause("uvm", false, hz, NULL);
 	}
 }
+
+/*
+ * uvm_idle: called from the idle loop.
+ */
+
+void
+uvm_idle(void)
+{
+	struct cpu_info *ci = curcpu();
+	struct uvm_cpu *ucpu = ci->ci_data.cpu_uvm;
+
+	KASSERT(kpreempt_disabled());
+
+	if (!ci->ci_want_resched)
+		uvmpdpol_idle(ucpu);
+	if (!ci->ci_want_resched)
+		uvm_pageidlezero();
+
+}

Index: src/sys/uvm/uvm_loan.c
diff -u src/sys/uvm/uvm_loan.c:1.92 src/sys/uvm/uvm_loan.c:1.93
--- src/sys/uvm/uvm_loan.c:1.92	Wed Dec 18 20:38:14 2019
+++ src/sys/uvm/uvm_loan.c	Tue Dec 31 22:42:51 2019
@@ -1,4 +1,4 @@
-/*	$NetBSD: uvm_loan.c,v 1.92 2019/12/18 20:38:14 ad Exp $	*/
+/*	$NetBSD: uvm_loan.c,v 1.93 2019/12/31 22:42:51 ad Exp $	*/
 
 /*
  * Copyright (c) 1997 Charles D. Cranor and Washington University.
@@ -32,7 +32,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: uvm_loan.c,v 1.92 2019/12/18 20:38:14 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: uvm_loan.c,v 1.93 2019/12/31 22:42:51 ad Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -421,11 +421,11 @@ uvm_loananon(struct uvm_faultinfo *ufi, 
 	if (pg->loan_count == 0) {
 		pmap_page_protect(pg, VM_PROT_READ);
 	}
-	mutex_enter(&pg->interlock);
+	uvm_pagelock(pg);
 	pg->loan_count++;
 	KASSERT(pg->loan_count > 0);	/* detect wrap-around */
-	mutex_exit(&pg->interlock);
 	uvm_pageactivate(pg);
+	uvm_pageunlock(pg);
 	**output = pg;
 	(*output)++;
 
@@ -471,11 +471,11 @@ uvm_loanpage(struct vm_page **pgpp, int 
 		if (pg->loan_count == 0) {
 			pmap_page_protect(pg, VM_PROT_READ);
 		}
-		mutex_enter(&pg->interlock);
+		uvm_pagelock(pg);
 		pg->loan_count++;
 		KASSERT(pg->loan_count > 0);	/* detect wrap-around */
-		mutex_exit(&pg->interlock);
 		uvm_pageactivate(pg);
+		uvm_pageunlock(pg);
 	}
 
 	uvm_page_unbusy(pgpp, npages);
@@ -713,7 +713,9 @@ uvm_loanuobj(struct uvm_faultinfo *ufi, 
 				mutex_exit(uobj->vmobjlock);
 				return (0);
 			}
+			uvm_pagelock(pg);
 			uvm_pageactivate(pg);
+			uvm_pageunlock(pg);
 			pg->flags &= ~(PG_BUSY|PG_WANTED);
 			UVM_PAGE_OWN(pg, NULL);
 			mutex_exit(uobj->vmobjlock);
@@ -778,14 +780,14 @@ uvm_loanuobj(struct uvm_faultinfo *ufi, 
 	if (pg->loan_count == 0) {
 		pmap_page_protect(pg, VM_PROT_READ);
 	}
-	mutex_enter(&pg->interlock);
+	uvm_pagelock(pg);
 	pg->loan_count++;
 	KASSERT(pg->loan_count > 0);	/* detect wrap-around */
 	pg->uanon = anon;
 	anon->an_page = pg;
 	anon->an_lock = /* TODO: share amap lock */
-	mutex_exit(&pg->interlock);
 	uvm_pageactivate(pg);
+	uvm_pageunlock(pg);
 	if (pg->flags & PG_WANTED) {
 		wakeup(pg);
 	}
@@ -863,7 +865,9 @@ again:
 		/* got a zero'd page. */
 		pg->flags &= ~(PG_WANTED|PG_BUSY|PG_FAKE);
 		pg->flags |= PG_RDONLY;
+		uvm_pagelock(pg);
 		uvm_pageactivate(pg);
+		uvm_pageunlock(pg);
 		UVM_PAGE_OWN(pg, NULL);
 	}
 
@@ -909,11 +913,11 @@ again:
 	}
 	anon->an_page = pg;
 	pg->uanon = anon;
-	mutex_enter(&pg->interlock);
+	uvm_pagelock(pg);
 	pg->loan_count++;
 	KASSERT(pg->loan_count > 0);	/* detect wrap-around */
-	mutex_exit(&pg->interlock);
 	uvm_pageactivate(pg);
+	uvm_pageunlock(pg);
 	mutex_exit(&anon->an_lock);
 	mutex_exit(uvm_loanzero_object.vmobjlock);
 	**output = anon;
@@ -1063,11 +1067,13 @@ ulz_put(struct uvm_object *uobj, voff_t 
 	pg = uvm_pagelookup(uobj, 0);
 	KASSERT(pg != NULL);
 
+	uvm_pagelock(pg);
 	if (pg->uanon) {
 		uvm_pageactivate(pg);
 	} else {
 		uvm_pagedequeue(pg);
 	}
+	uvm_pageunlock(pg);
 
 	mutex_exit(uobj->vmobjlock);
 	return 0;
@@ -1147,6 +1153,8 @@ uvm_loanbreak(struct vm_page *uobjpage)
 	 * an anon (i.e. we are breaking an O->K
 	 * loan), then remove it from any pageq's.
 	 */
+
+	uvm_pagelock2(uobjpage, pg);
 	if (uobjpage->uanon == NULL)
 		uvm_pagedequeue(uobjpage);
 
@@ -1162,6 +1170,7 @@ uvm_loanbreak(struct vm_page *uobjpage)
 	 */
 
 	uvm_pageactivate(pg);
+	uvm_pageunlock2(uobjpage, pg);
 
 	/*
 	 * done!  loan is broken and "pg" is
@@ -1186,6 +1195,13 @@ uvm_loanbreak_anon(struct vm_anon *anon,
 	}
 
 	oldpg = anon->an_page;
+	/* copy old -> new */
+	uvm_pagecopy(oldpg, newpg);
+
+	/* force reload */
+	pmap_page_protect(oldpg, VM_PROT_NONE);
+
+	uvm_pagelock2(oldpg, newpg);
 	if (uobj == NULL) {
 		/*
 		 * we were the lender (A->K); need to remove the page from
@@ -1193,19 +1209,6 @@ uvm_loanbreak_anon(struct vm_anon *anon,
 		 */
 		uvm_pagedequeue(oldpg);
 	}
-
-	/* copy old -> new */
-	uvm_pagecopy(oldpg, newpg);
-
-	/* force reload */
-	pmap_page_protect(oldpg, VM_PROT_NONE);
-	if (newpg < oldpg) {
-		mutex_enter(&newpg->interlock);
-		mutex_enter(&oldpg->interlock);
-	} else {
-		mutex_enter(&oldpg->interlock);
-		mutex_enter(&newpg->interlock);
-	}
 	oldpg->uanon = NULL;
 	/* in case we owned */
 	oldpg->flags &= ~PG_ANON;
@@ -1220,9 +1223,8 @@ uvm_loanbreak_anon(struct vm_anon *anon,
 	newpg->uanon = anon;
 	newpg->flags |= PG_ANON;
 
-	mutex_exit(&newpg->interlock);
-	mutex_exit(&oldpg->interlock);
 	uvm_pageactivate(newpg);
+	uvm_pageunlock2(oldpg, newpg);
 
 	newpg->flags &= ~(PG_BUSY|PG_FAKE);
 	UVM_PAGE_OWN(newpg, NULL);
Index: src/sys/uvm/uvm_page.h
diff -u src/sys/uvm/uvm_page.h:1.92 src/sys/uvm/uvm_page.h:1.93
--- src/sys/uvm/uvm_page.h:1.92	Tue Dec 31 17:56:16 2019
+++ src/sys/uvm/uvm_page.h	Tue Dec 31 22:42:51 2019
@@ -1,4 +1,4 @@
-/*	$NetBSD: uvm_page.h,v 1.92 2019/12/31 17:56:16 ad Exp $	*/
+/*	$NetBSD: uvm_page.h,v 1.93 2019/12/31 22:42:51 ad Exp $	*/
 
 /*
  * Copyright (c) 1997 Charles D. Cranor and Washington University.
@@ -266,6 +266,24 @@ struct vm_page {
 	"\11AOBJ\12AOBJ\13READAHEAD\14FREE\15MARKER\16PAGER1\17ZERO"
 
 /*
+ * uvmpdpol state flags.
+ *
+ * => may only be changed with pg->interlock held.
+ * => changing them is the responsibility of uvmpdpol ..
+ * => .. but uvm_page needs to know about them in order to purge updates.
+ * => PQ_PRIVATE is private to the individual uvmpdpol implementation.
+ */
+
+#define	PQ_INTENT_A		0x00000000	/* intend activation */
+#define	PQ_INTENT_I		0x00000001	/* intend deactivation */
+#define	PQ_INTENT_E		0x00000002	/* intend enqueue */
+#define	PQ_INTENT_D		0x00000003	/* intend dequeue */
+#define	PQ_INTENT_MASK		0x00000003	/* mask of intended state */
+#define	PQ_INTENT_SET		0x00000004	/* not realized yet */
+#define	PQ_INTENT_QUEUED	0x00000008	/* queued for processing */
+#define	PQ_PRIVATE		0xfffffff0
+
+/*
  * physical memory layout structure
  *
  * MD vmparam.h must #define:
@@ -312,6 +330,10 @@ void uvm_pagedeactivate(struct vm_page *
 void uvm_pagedequeue(struct vm_page *);
 void uvm_pageenqueue(struct vm_page *);
 void uvm_pagefree(struct vm_page *);
+void uvm_pagelock(struct vm_page *);
+void uvm_pagelock2(struct vm_page *, struct vm_page *);
+void uvm_pageunlock(struct vm_page *);
+void uvm_pageunlock2(struct vm_page *, struct vm_page *);
 void uvm_page_unbusy(struct vm_page **, int);
 struct vm_page *uvm_pagelookup(struct uvm_object *, voff_t);
 void uvm_pageunwire(struct vm_page *);

Index: src/sys/uvm/uvm_map.c
diff -u src/sys/uvm/uvm_map.c:1.368 src/sys/uvm/uvm_map.c:1.369
--- src/sys/uvm/uvm_map.c:1.368	Fri Dec 27 10:17:57 2019
+++ src/sys/uvm/uvm_map.c	Tue Dec 31 22:42:51 2019
@@ -1,4 +1,4 @@
-/*	$NetBSD: uvm_map.c,v 1.368 2019/12/27 10:17:57 msaitoh Exp $	*/
+/*	$NetBSD: uvm_map.c,v 1.369 2019/12/31 22:42:51 ad Exp $	*/
 
 /*
  * Copyright (c) 1997 Charles D. Cranor and Washington University.
@@ -66,7 +66,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: uvm_map.c,v 1.368 2019/12/27 10:17:57 msaitoh Exp $");
+__KERNEL_RCSID(0, "$NetBSD: uvm_map.c,v 1.369 2019/12/31 22:42:51 ad Exp $");
 
 #include "opt_ddb.h"
 #include "opt_pax.h"
@@ -3949,7 +3949,9 @@ uvm_map_clean(struct vm_map *map, vaddr_
 					continue;
 				}
 				KASSERT(pg->uanon == anon);
+				uvm_pagelock(pg);
 				uvm_pagedeactivate(pg);
+				uvm_pageunlock(pg);
 				continue;
 
 			case PGO_FREE:

Index: src/sys/uvm/uvm_object.c
diff -u src/sys/uvm/uvm_object.c:1.18 src/sys/uvm/uvm_object.c:1.19
--- src/sys/uvm/uvm_object.c:1.18	Sun Dec 15 21:11:35 2019
+++ src/sys/uvm/uvm_object.c	Tue Dec 31 22:42:51 2019
@@ -1,4 +1,4 @@
-/*	$NetBSD: uvm_object.c,v 1.18 2019/12/15 21:11:35 ad Exp $	*/
+/*	$NetBSD: uvm_object.c,v 1.19 2019/12/31 22:42:51 ad Exp $	*/
 
 /*
  * Copyright (c) 2006, 2010, 2019 The NetBSD Foundation, Inc.
@@ -37,7 +37,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: uvm_object.c,v 1.18 2019/12/15 21:11:35 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: uvm_object.c,v 1.19 2019/12/31 22:42:51 ad Exp $");
 
 #ifdef _KERNEL_OPT
 #include "opt_ddb.h"
@@ -181,7 +181,9 @@ uvm_obj_wirepages(struct uvm_object *uob
 
 		/* Wire the pages */
 		for (i = 0; i < npages; i++) {
+			uvm_pagelock(pgs[i]);
 			uvm_pagewire(pgs[i]);
+			uvm_pageunlock(pgs[i]);
 			if (list != NULL)
 				TAILQ_INSERT_TAIL(list, pgs[i], pageq.queue);
 		}
@@ -223,7 +225,9 @@ uvm_obj_unwirepages(struct uvm_object *u
 		KASSERT(pg != NULL);
 		KASSERT(!(pg->flags & PG_RELEASED));
 
+		uvm_pagelock(pg);
 		uvm_pageunwire(pg);
+		uvm_pageunlock(pg);
 	}
 	mutex_exit(uobj->vmobjlock);
 }

Index: src/sys/uvm/uvm_page.c
diff -u src/sys/uvm/uvm_page.c:1.219 src/sys/uvm/uvm_page.c:1.220
--- src/sys/uvm/uvm_page.c:1.219	Tue Dec 31 13:07:14 2019
+++ src/sys/uvm/uvm_page.c	Tue Dec 31 22:42:51 2019
@@ -1,4 +1,4 @@
-/*	$NetBSD: uvm_page.c,v 1.219 2019/12/31 13:07:14 ad Exp $	*/
+/*	$NetBSD: uvm_page.c,v 1.220 2019/12/31 22:42:51 ad Exp $	*/
 
 /*-
  * Copyright (c) 2019 The NetBSD Foundation, Inc.
@@ -95,7 +95,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: uvm_page.c,v 1.219 2019/12/31 13:07:14 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: uvm_page.c,v 1.220 2019/12/31 22:42:51 ad Exp $");
 
 #include "opt_ddb.h"
 #include "opt_uvm.h"
@@ -984,6 +984,8 @@ uvm_cpu_attach(struct cpu_info *ci)
 		ucpu = ci->ci_data.cpu_uvm;
 	}
 
+	uvmpdpol_init_cpu(ucpu);
+
 	/*
 	 * Attach RNG source for this CPU's VM events
 	 */
@@ -1345,6 +1347,7 @@ uvm_pagealloc_strat(struct uvm_object *o
  * uvm_pagereplace: replace a page with another
  *
  * => object must be locked
+ * => page interlocks must be held
  */
 
 void
@@ -1358,25 +1361,17 @@ uvm_pagereplace(struct vm_page *oldpg, s
 	KASSERT((newpg->flags & PG_TABLED) == 0);
 	KASSERT(newpg->uobject == NULL);
 	KASSERT(mutex_owned(uobj->vmobjlock));
+	KASSERT(mutex_owned(&oldpg->interlock));
+	KASSERT(mutex_owned(&newpg->interlock));
 
 	newpg->offset = oldpg->offset;
 	pg = radix_tree_replace_node(&uobj->uo_pages,
 	    newpg->offset >> PAGE_SHIFT, newpg);
 	KASSERT(pg == oldpg);
 
-	/* take page interlocks during rename */
-	if (oldpg < newpg) {
-		mutex_enter(&oldpg->interlock);
-		mutex_enter(&newpg->interlock);
-	} else {
-		mutex_enter(&newpg->interlock);
-		mutex_enter(&oldpg->interlock);
-	}
 	newpg->uobject = uobj;
 	uvm_pageinsert_object(uobj, newpg);
 	uvm_pageremove_object(uobj, oldpg);
-	mutex_exit(&oldpg->interlock);
-	mutex_exit(&newpg->interlock);
 }
 
 /*
@@ -1502,7 +1497,7 @@ uvm_pagefree(struct vm_page *pg)
 		 * unbusy the page, and we're done.
 		 */
 
-		mutex_enter(&pg->interlock);
+		uvm_pagelock(pg);
 		locked = true;
 		if (pg->uobject != NULL) {
 			uvm_pageremove_object(pg->uobject, pg);
@@ -1526,15 +1521,15 @@ uvm_pagefree(struct vm_page *pg)
 #endif
 		if (pg->loan_count) {
 			KASSERT(pg->uobject == NULL);
-			mutex_exit(&pg->interlock);
 			if (pg->uanon == NULL) {
 				uvm_pagedequeue(pg);
 			}
+			uvm_pageunlock(pg);
 			return;
 		}
 	} else if (pg->uobject != NULL || pg->uanon != NULL ||
 	           pg->wire_count != 0) {
-		mutex_enter(&pg->interlock);
+		uvm_pagelock(pg);
 		locked = true;
 	} else {
 		locked = false;
@@ -1560,15 +1555,16 @@ uvm_pagefree(struct vm_page *pg)
 		atomic_dec_uint(&uvmexp.wired);
 	}
 	if (locked) {
-		mutex_exit(&pg->interlock);
+		/*
+		 * now remove the page from the queues.
+		 */
+		uvm_pagedequeue(pg);
+		uvm_pageunlock(pg);
+	} else {
+		KASSERT(!uvmpdpol_pageisqueued_p(pg));
 	}
 
 	/*
-	 * now remove the page from the queues.
-	 */
-	uvm_pagedequeue(pg);
-
-	/*
 	 * and put on free queue
 	 */
 
@@ -1744,6 +1740,7 @@ uvm_pagelookup(struct uvm_object *obj, v
  * uvm_pagewire: wire the page, thus removing it from the daemon's grasp
  *
  * => caller must lock objects
+ * => caller must hold pg->interlock
  */
 
 void
@@ -1751,6 +1748,7 @@ uvm_pagewire(struct vm_page *pg)
 {
 
 	KASSERT(uvm_page_owner_locked_p(pg));
+	KASSERT(mutex_owned(&pg->interlock));
 #if defined(READAHEAD_STATS)
 	if ((pg->flags & PG_READAHEAD) != 0) {
 		uvm_ra_hit.ev_count++;
@@ -1761,9 +1759,7 @@ uvm_pagewire(struct vm_page *pg)
 		uvm_pagedequeue(pg);
 		atomic_inc_uint(&uvmexp.wired);
 	}
-	mutex_enter(&pg->interlock);
 	pg->wire_count++;
-	mutex_exit(&pg->interlock);
 	KASSERT(pg->wire_count > 0);	/* detect wraparound */
 }
 
@@ -1772,6 +1768,7 @@ uvm_pagewire(struct vm_page *pg)
  *
  * => activate if wire count goes to zero.
  * => caller must lock objects
+ * => caller must hold pg->interlock
  */
 
 void
@@ -1781,9 +1778,8 @@ uvm_pageunwire(struct vm_page *pg)
 	KASSERT(uvm_page_owner_locked_p(pg));
 	KASSERT(pg->wire_count != 0);
 	KASSERT(!uvmpdpol_pageisqueued_p(pg));
-	mutex_enter(&pg->interlock);
+	KASSERT(mutex_owned(&pg->interlock));
 	pg->wire_count--;
-	mutex_exit(&pg->interlock);
 	if (pg->wire_count == 0) {
 		uvm_pageactivate(pg);
 		KASSERT(uvmexp.wired != 0);
@@ -1798,6 +1794,7 @@ uvm_pageunwire(struct vm_page *pg)
  * => caller must check to make sure page is not wired
  * => object that page belongs to must be locked (so we can adjust pg->flags)
  * => caller must clear the reference on the page before calling
+ * => caller must hold pg->interlock
  */
 
 void
@@ -1805,6 +1802,7 @@ uvm_pagedeactivate(struct vm_page *pg)
 {
 
 	KASSERT(uvm_page_owner_locked_p(pg));
+	KASSERT(mutex_owned(&pg->interlock));
 	if (pg->wire_count == 0) {
 		KASSERT(uvmpdpol_pageisqueued_p(pg));
 		uvmpdpol_pagedeactivate(pg);
@@ -1815,6 +1813,7 @@ uvm_pagedeactivate(struct vm_page *pg)
  * uvm_pageactivate: activate page
  *
  * => caller must lock objects
+ * => caller must hold pg->interlock
  */
 
 void
@@ -1822,6 +1821,7 @@ uvm_pageactivate(struct vm_page *pg)
 {
 
 	KASSERT(uvm_page_owner_locked_p(pg));
+	KASSERT(mutex_owned(&pg->interlock));
 #if defined(READAHEAD_STATS)
 	if ((pg->flags & PG_READAHEAD) != 0) {
 		uvm_ra_hit.ev_count++;
@@ -1837,12 +1837,14 @@ uvm_pageactivate(struct vm_page *pg)
  * uvm_pagedequeue: remove a page from any paging queue
  * 
  * => caller must lock objects
+ * => caller must hold pg->interlock
  */
 void
 uvm_pagedequeue(struct vm_page *pg)
 {
 
 	KASSERT(uvm_page_owner_locked_p(pg));
+	KASSERT(mutex_owned(&pg->interlock));
 	if (uvmpdpol_pageisqueued_p(pg)) {
 		uvmpdpol_pagedequeue(pg);
 	}
@@ -1853,18 +1855,103 @@ uvm_pagedequeue(struct vm_page *pg)
  * used where a page is not really demanded (yet).  eg. read-ahead
  *
  * => caller must lock objects
+ * => caller must hold pg->interlock
  */
 void
 uvm_pageenqueue(struct vm_page *pg)
 {
 
 	KASSERT(uvm_page_owner_locked_p(pg));
+	KASSERT(mutex_owned(&pg->interlock));
 	if (pg->wire_count == 0 && !uvmpdpol_pageisqueued_p(pg)) {
 		uvmpdpol_pageenqueue(pg);
 	}
 }
 
 /*
+ * uvm_pagelock: acquire page interlock
+ */
+void
+uvm_pagelock(struct vm_page *pg)
+{
+
+	mutex_enter(&pg->interlock);
+}
+
+/*
+ * uvm_pagelock2: acquire two page interlocks
+ */
+void
+uvm_pagelock2(struct vm_page *pg1, struct vm_page *pg2)
+{
+
+	if (pg1 < pg2) {
+		mutex_enter(&pg1->interlock);
+		mutex_enter(&pg2->interlock);
+	} else {
+		mutex_enter(&pg2->interlock);
+		mutex_enter(&pg1->interlock);
+	}
+}
+
+/*
+ * uvm_pageunlock: release page interlock, and if a page replacement intent
+ * is set on the page, pass it to uvmpdpol to make real.
+ * 
+ * => caller must hold pg->interlock
+ */
+void
+uvm_pageunlock(struct vm_page *pg)
+{
+
+	if ((pg->pqflags & PQ_INTENT_SET) == 0 ||
+	    (pg->pqflags & PQ_INTENT_QUEUED) != 0) {
+	    	mutex_exit(&pg->interlock);
+	    	return;
+	}
+	pg->pqflags |= PQ_INTENT_QUEUED;
+	mutex_exit(&pg->interlock);
+	uvmpdpol_pagerealize(pg);
+}
+
+/*
+ * uvm_pageunlock2: release two page interlocks, and for both pages if a
+ * page replacement intent is set on the page, pass it to uvmpdpol to make
+ * real.
+ * 
+ * => caller must hold the interlocks of both pg1 and pg2
+ */
+void
+uvm_pageunlock2(struct vm_page *pg1, struct vm_page *pg2)
+{
+
+	if ((pg1->pqflags & PQ_INTENT_SET) == 0 ||
+	    (pg1->pqflags & PQ_INTENT_QUEUED) != 0) {
+	    	mutex_exit(&pg1->interlock);
+	    	pg1 = NULL;
+	} else {
+		pg1->pqflags |= PQ_INTENT_QUEUED;
+		mutex_exit(&pg1->interlock);
+	}
+
+	if ((pg2->pqflags & PQ_INTENT_SET) == 0 ||
+	    (pg2->pqflags & PQ_INTENT_QUEUED) != 0) {
+	    	mutex_exit(&pg2->interlock);
+	    	pg2 = NULL;
+	} else {
+		pg2->pqflags |= PQ_INTENT_QUEUED;
+		mutex_exit(&pg2->interlock);
+	}
+
+	if (pg1 != NULL) {
+		uvmpdpol_pagerealize(pg1);
+	}
+	if (pg2 != NULL) {
+		uvmpdpol_pagerealize(pg2);
+	}
+}
+
+/*
  * uvm_pagezero: zero fill a page
  *
  * => if page is part of an object then the object should be locked

Index: src/sys/uvm/uvm_pager.c
diff -u src/sys/uvm/uvm_pager.c:1.118 src/sys/uvm/uvm_pager.c:1.119
--- src/sys/uvm/uvm_pager.c:1.118	Fri Dec 27 00:46:38 2019
+++ src/sys/uvm/uvm_pager.c	Tue Dec 31 22:42:51 2019
@@ -1,4 +1,4 @@
-/*	$NetBSD: uvm_pager.c,v 1.118 2019/12/27 00:46:38 ad Exp $	*/
+/*	$NetBSD: uvm_pager.c,v 1.119 2019/12/31 22:42:51 ad Exp $	*/
 
 /*
  * Copyright (c) 1997 Charles D. Cranor and Washington University.
@@ -32,7 +32,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: uvm_pager.c,v 1.118 2019/12/27 00:46:38 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: uvm_pager.c,v 1.119 2019/12/31 22:42:51 ad Exp $");
 
 #include "opt_uvmhist.h"
 #include "opt_readahead.h"
@@ -387,7 +387,9 @@ uvm_aio_aiodone_pages(struct vm_page **p
 					pageout_done++;
 				}
 				pg->flags &= ~PG_CLEAN;
+				uvm_pagelock(pg);
 				uvm_pageactivate(pg);
+				uvm_pageunlock(pg);
 				slot = 0;
 			} else
 				slot = SWSLOT_BAD;
@@ -423,7 +425,9 @@ uvm_aio_aiodone_pages(struct vm_page **p
 			uvm_ra_total.ev_count++;
 #endif /* defined(READAHEAD_STATS) */
 			KASSERT((pg->flags & PG_CLEAN) != 0);
+			uvm_pagelock(pg);
 			uvm_pageenqueue(pg);
+			uvm_pageunlock(pg);
 			pmap_clear_modify(pg);
 		}
 

Index: src/sys/uvm/uvm_pdaemon.c
diff -u src/sys/uvm/uvm_pdaemon.c:1.121 src/sys/uvm/uvm_pdaemon.c:1.122
--- src/sys/uvm/uvm_pdaemon.c:1.121	Tue Dec 31 13:07:14 2019
+++ src/sys/uvm/uvm_pdaemon.c	Tue Dec 31 22:42:51 2019
@@ -1,4 +1,4 @@
-/*	$NetBSD: uvm_pdaemon.c,v 1.121 2019/12/31 13:07:14 ad Exp $	*/
+/*	$NetBSD: uvm_pdaemon.c,v 1.122 2019/12/31 22:42:51 ad Exp $	*/
 
 /*
  * Copyright (c) 1997 Charles D. Cranor and Washington University.
@@ -66,7 +66,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: uvm_pdaemon.c,v 1.121 2019/12/31 13:07:14 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: uvm_pdaemon.c,v 1.122 2019/12/31 22:42:51 ad Exp $");
 
 #include "opt_uvmhist.h"
 #include "opt_readahead.h"
@@ -818,7 +818,9 @@ uvmpd_scan_queue(void)
 
 		if (swapcluster_allocslots(&swc)) {
 			dirtyreacts++;
+			uvm_pagelock(p);
 			uvm_pageactivate(p);
+			uvm_pageunlock(p);
 			mutex_exit(slock);
 			continue;
 		}
@@ -836,7 +838,9 @@ uvmpd_scan_queue(void)
 		p->flags |= PG_PAGEOUT;
 		uvmexp.pgswapout++;
 
+		uvm_pagelock(p);
 		uvm_pagedequeue(p);
+		uvm_pageunlock(p);
 
 		/*
 		 * add the new page to the cluster.
@@ -846,7 +850,9 @@ uvmpd_scan_queue(void)
 			p->flags &= ~(PG_BUSY|PG_PAGEOUT);
 			UVM_PAGE_OWN(p, NULL);
 			dirtyreacts++;
+			uvm_pagelock(p);
 			uvm_pageactivate(p);
+			uvm_pageunlock(p);
 			mutex_exit(slock);
 			continue;
 		}
@@ -862,7 +868,9 @@ uvmpd_scan_queue(void)
 		atomic_inc_uint(&uvmexp.pdpending);
 
 #else /* defined(VMSWAP) */
+		uvm_pagelock(p);
 		uvm_pageactivate(p);
+		uvm_pageunlock(p);
 		mutex_exit(slock);
 #endif /* defined(VMSWAP) */
 	}

Index: src/sys/uvm/uvm_pdpolicy.h
diff -u src/sys/uvm/uvm_pdpolicy.h:1.5 src/sys/uvm/uvm_pdpolicy.h:1.6
--- src/sys/uvm/uvm_pdpolicy.h:1.5	Mon Dec 30 18:08:38 2019
+++ src/sys/uvm/uvm_pdpolicy.h	Tue Dec 31 22:42:51 2019
@@ -1,4 +1,4 @@
-/*	$NetBSD: uvm_pdpolicy.h,v 1.5 2019/12/30 18:08:38 ad Exp $	*/
+/*	$NetBSD: uvm_pdpolicy.h,v 1.6 2019/12/31 22:42:51 ad Exp $	*/
 
 /*-
  * Copyright (c)2005, 2006 YAMAMOTO Takashi,
@@ -37,7 +37,9 @@ struct vm_anon;
  * don't use them directly from outside of /sys/uvm.
  */
 
+void uvmpdpol_idle(struct uvm_cpu *);
 void uvmpdpol_init(void);
+void uvmpdpol_init_cpu(struct uvm_cpu *);
 void uvmpdpol_reinit(void);
 void uvmpdpol_estimatepageable(int *, int *);
 bool uvmpdpol_needsscan_p(void);
@@ -47,6 +49,7 @@ void uvmpdpol_pagedeactivate(struct vm_p
 void uvmpdpol_pagedequeue(struct vm_page *);
 void uvmpdpol_pageenqueue(struct vm_page *);
 bool uvmpdpol_pageisqueued_p(struct vm_page *);
+void uvmpdpol_pagerealize(struct vm_page *);
 void uvmpdpol_anfree(struct vm_anon *);
 
 void uvmpdpol_tune(void);
@@ -57,4 +60,17 @@ void uvmpdpol_balancequeue(int);
 
 void uvmpdpol_sysctlsetup(void);
 
+/*
+ * uvmpdpol_set_intent: set an intended state for the page, taking care not
+ * to overwrite any of the other flags.
+ */
+
+static inline void
+uvmpdpol_set_intent(struct vm_page *pg, uint32_t i)
+{
+
+	KASSERT(mutex_owned(&pg->interlock));
+	pg->pqflags = PQ_INTENT_SET | (pg->pqflags & ~PQ_INTENT_MASK) | i;
+}
+
 #endif /* !_UVM_PDPOLICY_H_ */

Index: src/sys/uvm/uvm_pdpolicy_clock.c
diff -u src/sys/uvm/uvm_pdpolicy_clock.c:1.27 src/sys/uvm/uvm_pdpolicy_clock.c:1.28
--- src/sys/uvm/uvm_pdpolicy_clock.c:1.27	Tue Dec 31 13:07:14 2019
+++ src/sys/uvm/uvm_pdpolicy_clock.c	Tue Dec 31 22:42:51 2019
@@ -1,6 +1,35 @@
-/*	$NetBSD: uvm_pdpolicy_clock.c,v 1.27 2019/12/31 13:07:14 ad Exp $	*/
+/*	$NetBSD: uvm_pdpolicy_clock.c,v 1.28 2019/12/31 22:42:51 ad Exp $	*/
 /*	NetBSD: uvm_pdaemon.c,v 1.72 2006/01/05 10:47:33 yamt Exp $	*/
 
+/*-
+ * Copyright (c) 2019 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Andrew Doran.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
 /*
  * Copyright (c) 1997 Charles D. Cranor and Washington University.
  * Copyright (c) 1991, 1993, The Regents of the University of California.
@@ -69,12 +98,13 @@
 #else /* defined(PDSIM) */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: uvm_pdpolicy_clock.c,v 1.27 2019/12/31 13:07:14 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: uvm_pdpolicy_clock.c,v 1.28 2019/12/31 22:42:51 ad Exp $");
 
 #include <sys/param.h>
 #include <sys/proc.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
+#include <sys/kmem.h>
 
 #include <uvm/uvm.h>
 #include <uvm/uvm_pdpolicy.h>
@@ -83,9 +113,19 @@ __KERNEL_RCSID(0, "$NetBSD: uvm_pdpolicy
 
 #endif /* defined(PDSIM) */
 
-#define	PQ_TIME		0xfffffffc	/* time of last activation */
-#define PQ_INACTIVE	0x00000001	/* page is in inactive list */
-#define PQ_ACTIVE	0x00000002	/* page is in active list */
+/*
+ * per-CPU queue of pending page status changes.  128 entries makes for a
+ * 1kB queue on _LP64 and has been found to be a reasonable compromise that
+ * keeps lock contention events and wait times low, while not using too much
+ * memory nor allowing global state to fall too far behind.
+ */
+#if !defined(CLOCK_PDQ_SIZE)
+#define	CLOCK_PDQ_SIZE	128
+#endif /* !defined(CLOCK_PDQ_SIZE) */
+
+#define	PQ_TIME		0xffffffc0	/* time of last activation */
+#define PQ_INACTIVE	0x00000010	/* page is in inactive list */
+#define PQ_ACTIVE	0x00000020	/* page is in active list */
 
 #if !defined(CLOCK_INACTIVEPCT)
 #define	CLOCK_INACTIVEPCT	33
@@ -117,6 +157,8 @@ struct uvmpdpol_scanstate {
 static void	uvmpdpol_pageactivate_locked(struct vm_page *);
 static void	uvmpdpol_pagedeactivate_locked(struct vm_page *);
 static void	uvmpdpol_pagedequeue_locked(struct vm_page *);
+static bool	uvmpdpol_pagerealize_locked(struct vm_page *);
+static struct uvm_cpu *uvmpdpol_flush(void);
 
 static struct uvmpdpol_globalstate pdpol_state __cacheline_aligned;
 static struct uvmpdpol_scanstate pdpol_scanstate;
@@ -216,15 +258,12 @@ uvmpdpol_selectvictim(kmutex_t **plock)
 		/*
 		 * acquire interlock to stablize page identity.
 		 * if we have caught the page in a state of flux
-		 * and it should be dequeued, do it now and then
-		 * move on to the next.
+		 * deal with it and retry.
 		 */
 		mutex_enter(&pg->interlock);
-	        if ((pg->uobject == NULL && pg->uanon == NULL) ||
-	            pg->wire_count > 0) {
-	            	mutex_exit(&pg->interlock);
-	            	uvmpdpol_pagedequeue_locked(pg);
-	            	continue;
+		if (uvmpdpol_pagerealize_locked(pg)) {
+			mutex_exit(&pg->interlock);
+			continue;
 		}
 
 		/*
@@ -245,21 +284,21 @@ uvmpdpol_selectvictim(kmutex_t **plock)
 		anon = pg->uanon;
 		uobj = pg->uobject;
 		if (uobj && UVM_OBJ_IS_VTEXT(uobj) && ss->ss_execreact) {
-			mutex_exit(&pg->interlock);
 			uvmpdpol_pageactivate_locked(pg);
+			mutex_exit(&pg->interlock);
 			PDPOL_EVCNT_INCR(reactexec);
 			continue;
 		}
 		if (uobj && UVM_OBJ_IS_VNODE(uobj) &&
 		    !UVM_OBJ_IS_VTEXT(uobj) && ss->ss_filereact) {
-			mutex_exit(&pg->interlock);
 			uvmpdpol_pageactivate_locked(pg);
+			mutex_exit(&pg->interlock);
 			PDPOL_EVCNT_INCR(reactfile);
 			continue;
 		}
 		if ((anon || UVM_OBJ_IS_AOBJ(uobj)) && ss->ss_anonreact) {
-			mutex_exit(&pg->interlock);
 			uvmpdpol_pageactivate_locked(pg);
+			mutex_exit(&pg->interlock);
 			PDPOL_EVCNT_INCR(reactanon);
 			continue;
 		}
@@ -292,7 +331,9 @@ uvmpdpol_selectvictim(kmutex_t **plock)
 		 * next page.
 		 */
 		if (pmap_is_referenced(pg)) {
+			mutex_enter(&pg->interlock);
 			uvmpdpol_pageactivate_locked(pg);
+			mutex_exit(&pg->interlock);
 			uvmexp.pdreact++;
 			mutex_exit(lock);
 			continue;
@@ -339,15 +380,12 @@ uvmpdpol_balancequeue(int swap_shortage)
 		/*
 		 * acquire interlock to stablize page identity.
 		 * if we have caught the page in a state of flux
-		 * and it should be dequeued, do it now and then
-		 * move on to the next.
+		 * deal with it and retry.
 		 */
 		mutex_enter(&p->interlock);
-	        if ((p->uobject == NULL && p->uanon == NULL) ||
-	            p->wire_count > 0) {
-	            	mutex_exit(&p->interlock);
-	            	uvmpdpol_pagedequeue_locked(p);
-	            	continue;
+		if (uvmpdpol_pagerealize_locked(p)) {
+			mutex_exit(&p->interlock);
+			continue;
 		}
 
 		/*
@@ -384,7 +422,10 @@ uvmpdpol_balancequeue(int swap_shortage)
 		 * if there's a shortage of inactive pages, deactivate.
 		 */
 		if (inactive_shortage > 0) {
+			pmap_clear_reference(p);
+			mutex_enter(&p->interlock);
 			uvmpdpol_pagedeactivate_locked(p);
+			mutex_exit(&p->interlock);
 			uvmexp.pddeact++;
 			inactive_shortage--;
 		}
@@ -397,93 +438,118 @@ uvmpdpol_balancequeue(int swap_shortage)
 static void
 uvmpdpol_pagedeactivate_locked(struct vm_page *pg)
 {
+	struct uvmpdpol_globalstate *s __diagused = &pdpol_state;
 
-	KASSERT(uvm_page_owner_locked_p(pg));
+	KASSERT(mutex_owned(&s->lock));
+	KASSERT(mutex_owned(&pg->interlock));
+	KASSERT((pg->pqflags & (PQ_INTENT_MASK | PQ_INTENT_SET)) !=
+	    (PQ_INTENT_D | PQ_INTENT_SET));
 
 	if (pg->pqflags & PQ_ACTIVE) {
 		TAILQ_REMOVE(&pdpol_state.s_activeq, pg, pdqueue);
-		pg->pqflags &= ~(PQ_ACTIVE | PQ_TIME);
 		KASSERT(pdpol_state.s_active > 0);
 		pdpol_state.s_active--;
 	}
 	if ((pg->pqflags & PQ_INACTIVE) == 0) {
 		KASSERT(pg->wire_count == 0);
-		pmap_clear_reference(pg);
 		TAILQ_INSERT_TAIL(&pdpol_state.s_inactiveq, pg, pdqueue);
-		pg->pqflags |= PQ_INACTIVE;
 		pdpol_state.s_inactive++;
 	}
+	pg->pqflags = (pg->pqflags & PQ_INTENT_QUEUED) | PQ_INACTIVE;
 }
 
 void
 uvmpdpol_pagedeactivate(struct vm_page *pg)
 {
-	struct uvmpdpol_globalstate *s = &pdpol_state;
 
-	mutex_enter(&s->lock);
-	uvmpdpol_pagedeactivate_locked(pg);
-	mutex_exit(&s->lock);
+	KASSERT(uvm_page_owner_locked_p(pg));
+	KASSERT(mutex_owned(&pg->interlock));
+
+	/*
+	 * we have to clear the reference bit now, as when it comes time to
+	 * realize the intent we won't have the object locked any more.
+	 */
+	pmap_clear_reference(pg);
+	uvmpdpol_set_intent(pg, PQ_INTENT_I);
 }
 
 static void
 uvmpdpol_pageactivate_locked(struct vm_page *pg)
 {
+	struct uvmpdpol_globalstate *s __diagused = &pdpol_state;
+
+	KASSERT(mutex_owned(&s->lock));
+	KASSERT(mutex_owned(&pg->interlock));
+	KASSERT((pg->pqflags & (PQ_INTENT_MASK | PQ_INTENT_SET)) !=
+	    (PQ_INTENT_D | PQ_INTENT_SET));
 
 	uvmpdpol_pagedequeue_locked(pg);
 	TAILQ_INSERT_TAIL(&pdpol_state.s_activeq, pg, pdqueue);
-	pg->pqflags = PQ_ACTIVE | (hardclock_ticks & PQ_TIME);
 	pdpol_state.s_active++;
+	pg->pqflags = (pg->pqflags & PQ_INTENT_QUEUED) | PQ_ACTIVE |
+	    (hardclock_ticks & PQ_TIME);
 }
 
 void
 uvmpdpol_pageactivate(struct vm_page *pg)
 {
-	struct uvmpdpol_globalstate *s = &pdpol_state;
+	uint32_t pqflags;
 
-	/* Safety: PQ_ACTIVE clear also tells us if it is not enqueued. */
-	if ((pg->pqflags & PQ_ACTIVE) == 0 ||
-	    ((hardclock_ticks & PQ_TIME) - (pg->pqflags & PQ_TIME)) >= hz) {
-		mutex_enter(&s->lock);
-		uvmpdpol_pageactivate_locked(pg);
-		mutex_exit(&s->lock);
+	KASSERT(uvm_page_owner_locked_p(pg));
+	KASSERT(mutex_owned(&pg->interlock));
+
+	/*
+	 * if there is any intent set on the page, or the page is not
+	 * active, or the page was activated in the "distant" past, then
+	 * it needs to be activated anew.
+	 */
+	pqflags = pg->pqflags;
+	if ((pqflags & PQ_INTENT_SET) != 0 ||
+	    (pqflags & PQ_ACTIVE) == 0 ||
+	    ((hardclock_ticks & PQ_TIME) - (pqflags & PQ_TIME)) > hz) {
+		uvmpdpol_set_intent(pg, PQ_INTENT_A);
 	}
 }
 
 static void
 uvmpdpol_pagedequeue_locked(struct vm_page *pg)
 {
+	struct uvmpdpol_globalstate *s __diagused = &pdpol_state;
+
+	KASSERT(mutex_owned(&s->lock));
+	KASSERT(mutex_owned(&pg->interlock));
 
 	if (pg->pqflags & PQ_ACTIVE) {
 		TAILQ_REMOVE(&pdpol_state.s_activeq, pg, pdqueue);
-		pg->pqflags &= ~(PQ_ACTIVE | PQ_TIME);
+		KASSERT((pg->pqflags & PQ_INACTIVE) == 0);
 		KASSERT(pdpol_state.s_active > 0);
 		pdpol_state.s_active--;
 	} else if (pg->pqflags & PQ_INACTIVE) {
 		TAILQ_REMOVE(&pdpol_state.s_inactiveq, pg, pdqueue);
-		pg->pqflags &= ~PQ_INACTIVE;
 		KASSERT(pdpol_state.s_inactive > 0);
 		pdpol_state.s_inactive--;
 	}
+	pg->pqflags &= PQ_INTENT_QUEUED;
 }
 
 void
 uvmpdpol_pagedequeue(struct vm_page *pg)
 {
-	struct uvmpdpol_globalstate *s = &pdpol_state;
 
-	mutex_enter(&s->lock);
-	uvmpdpol_pagedequeue_locked(pg);
-	mutex_exit(&s->lock);
+	KASSERT(uvm_page_owner_locked_p(pg));
+	KASSERT(mutex_owned(&pg->interlock));
+
+	uvmpdpol_set_intent(pg, PQ_INTENT_D);
 }
 
 void
 uvmpdpol_pageenqueue(struct vm_page *pg)
 {
-	struct uvmpdpol_globalstate *s = &pdpol_state;
 
-	mutex_enter(&s->lock);
-	uvmpdpol_pageactivate_locked(pg);
-	mutex_exit(&s->lock);
+	KASSERT(uvm_page_owner_locked_p(pg));
+	KASSERT(mutex_owned(&pg->interlock));
+
+	uvmpdpol_set_intent(pg, PQ_INTENT_E);
 }
 
 void
@@ -494,9 +560,19 @@ uvmpdpol_anfree(struct vm_anon *an)
 bool
 uvmpdpol_pageisqueued_p(struct vm_page *pg)
 {
+	uint32_t pqflags;
 
-	/* Safe to test unlocked due to page life-cycle. */
-	return (pg->pqflags & (PQ_ACTIVE | PQ_INACTIVE)) != 0;
+	/*
+	 * if there's an intent set, we have to consider it.  otherwise,
+	 * return the actual state.  we may be called unlocked for the
+	 * purpose of assertions, which is safe due to the page lifecycle.
+	 */
+	pqflags = atomic_load_relaxed(&pg->pqflags);
+	if ((pqflags & PQ_INTENT_SET) != 0) {
+		return (pqflags & PQ_INTENT_MASK) != PQ_INTENT_D;
+	} else {
+		return (pqflags & (PQ_ACTIVE | PQ_INACTIVE)) != 0;
+	}
 }
 
 void
@@ -555,6 +631,16 @@ uvmpdpol_init(void)
 }
 
 void
+uvmpdpol_init_cpu(struct uvm_cpu *ucpu)
+{
+
+	ucpu->pdq =
+	    kmem_alloc(CLOCK_PDQ_SIZE * sizeof(struct vm_page *), KM_SLEEP);
+	ucpu->pdqhead = CLOCK_PDQ_SIZE;
+	ucpu->pdqtail = CLOCK_PDQ_SIZE;
+}
+
+void
 uvmpdpol_reinit(void)
 {
 }
@@ -563,7 +649,9 @@ bool
 uvmpdpol_needsscan_p(void)
 {
 
-	/* This must be an unlocked check: can be called from interrupt. */
+	/*
+	 * this must be an unlocked check: can be called from interrupt.
+	 */
 	return pdpol_state.s_inactive < pdpol_state.s_inactarg;
 }
 
@@ -577,6 +665,157 @@ uvmpdpol_tune(void)
 	mutex_exit(&s->lock);
 }
 
+/*
+ * uvmpdpol_pagerealize_locked: take the intended state set on an individual
+ * page and make it real.  return true if any work was done.
+ */
+static bool
+uvmpdpol_pagerealize_locked(struct vm_page *pg)
+{
+	struct uvmpdpol_globalstate *s __diagused = &pdpol_state;
+
+	KASSERT(mutex_owned(&s->lock));
+	KASSERT(mutex_owned(&pg->interlock));
+
+	switch (pg->pqflags & (PQ_INTENT_MASK | PQ_INTENT_SET)) {
+	case PQ_INTENT_A | PQ_INTENT_SET:
+	case PQ_INTENT_E | PQ_INTENT_SET:
+		uvmpdpol_pageactivate_locked(pg);
+		return true;
+	case PQ_INTENT_I | PQ_INTENT_SET:
+		uvmpdpol_pagedeactivate_locked(pg);
+		return true;
+	case PQ_INTENT_D | PQ_INTENT_SET:
+		uvmpdpol_pagedequeue_locked(pg);
+		return true;
+	default:
+		return false;
+	}
+}
+
+/*
+ * uvmpdpol_flush: return the current uvm_cpu with all of its pending
+ * updates flushed to the global queues.  this routine may block, and
+ * so can switch cpu.  the idea is to empty the queue on whatever cpu
+ * we finally end up on.
+ */
+static struct uvm_cpu *
+uvmpdpol_flush(void)
+{
+	struct uvmpdpol_globalstate *s __diagused = &pdpol_state;
+	struct uvm_cpu *ucpu;
+	struct vm_page *pg;
+
+	KASSERT(kpreempt_disabled());
+
+	mutex_enter(&s->lock);
+	for (;;) {
+		/*
+		 * prefer scanning forwards (even though mutex_enter() is
+		 * serializing) so as to not defeat any prefetch logic in
+		 * the CPU.  that means elsewhere enqueuing backwards, like
+		 * a stack, but not so important there as pages are being
+		 * added singularly.
+		 *
+		 * prefetch the next "struct vm_page" while working on the
+		 * current one.  this has a measurable and very positive
+		 * effect in reducing the amount of time spent here under
+		 * the global lock.
+		 */
+		ucpu = curcpu()->ci_data.cpu_uvm;
+		KASSERT(ucpu->pdqhead <= ucpu->pdqtail);
+		if (__predict_false(ucpu->pdqhead == ucpu->pdqtail)) {
+			break;
+		}
+		pg = ucpu->pdq[ucpu->pdqhead++];
+		if (__predict_true(ucpu->pdqhead != ucpu->pdqtail)) {
+			__builtin_prefetch(ucpu->pdq[ucpu->pdqhead]);
+		}
+		mutex_enter(&pg->interlock);
+		pg->pqflags &= ~PQ_INTENT_QUEUED;
+		(void)uvmpdpol_pagerealize_locked(pg);
+		mutex_exit(&pg->interlock);
+	}
+	mutex_exit(&s->lock);
+	return ucpu;
+}
+
+/*
+ * uvmpdpol_pagerealize: realize any intent set on the page.  in this
+ * implementation, that means putting the page on a per-CPU queue to be
+ * dealt with later.
+ */
+void
+uvmpdpol_pagerealize(struct vm_page *pg)
+{
+	struct uvm_cpu *ucpu;
+
+	/*
+	 * drain the per-CPU queue if full, then enter the page.
+	 */
+	kpreempt_disable();
+	ucpu = curcpu()->ci_data.cpu_uvm;
+	if (__predict_false(ucpu->pdqhead == 0)) {
+		ucpu = uvmpdpol_flush();
+	}
+	ucpu->pdq[--(ucpu->pdqhead)] = pg;
+	kpreempt_enable();
+}
+
+/*
+ * uvmpdpol_idle: called from the system idle loop.  periodically purge any
+ * pending updates back to the global queues.
+ */
+void
+uvmpdpol_idle(struct uvm_cpu *ucpu)
+{
+	struct uvmpdpol_globalstate *s = &pdpol_state;
+	struct vm_page *pg;
+
+	KASSERT(kpreempt_disabled());
+
+	/*
+	 * if no pages in the queue, we have nothing to do.
+	 */
+	if (ucpu->pdqhead == ucpu->pdqtail) {
+		ucpu->pdqtime = hardclock_ticks;
+		return;
+	}
+
+	/*
+	 * don't do this more than ~8 times a second as it would needlessly
+	 * exert pressure.
+	 */
+	if (hardclock_ticks - ucpu->pdqtime < (hz >> 3)) {
+		return;
+	}
+
+	/*
+	 * the idle LWP can't block, so we have to try for the lock.  if we
+	 * get it, purge the per-CPU pending update queue.  continually
+	 * check for a pending resched: in that case exit immediately.
+	 */
+	if (mutex_tryenter(&s->lock)) {
+		while (ucpu->pdqhead != ucpu->pdqtail) {
+			pg = ucpu->pdq[ucpu->pdqhead];
+			if (!mutex_tryenter(&pg->interlock)) {
+				break;
+			}
+			ucpu->pdqhead++;
+			pg->pqflags &= ~PQ_INTENT_QUEUED;
+			(void)uvmpdpol_pagerealize_locked(pg);
+			mutex_exit(&pg->interlock);
+			if (curcpu()->ci_want_resched) {
+				break;
+			}
+		}
+		if (ucpu->pdqhead == ucpu->pdqtail) {
+			ucpu->pdqtime = hardclock_ticks;
+		}
+		mutex_exit(&s->lock);
+	}
+}
+
 #if !defined(PDSIM)
 
 #include <sys/sysctl.h>	/* XXX SYSCTL_DESCR */

Index: src/sys/uvm/uvm_pdpolicy_clockpro.c
diff -u src/sys/uvm/uvm_pdpolicy_clockpro.c:1.21 src/sys/uvm/uvm_pdpolicy_clockpro.c:1.22
--- src/sys/uvm/uvm_pdpolicy_clockpro.c:1.21	Tue Dec 31 12:40:27 2019
+++ src/sys/uvm/uvm_pdpolicy_clockpro.c	Tue Dec 31 22:42:51 2019
@@ -1,4 +1,4 @@
-/*	$NetBSD: uvm_pdpolicy_clockpro.c,v 1.21 2019/12/31 12:40:27 ad Exp $	*/
+/*	$NetBSD: uvm_pdpolicy_clockpro.c,v 1.22 2019/12/31 22:42:51 ad Exp $	*/
 
 /*-
  * Copyright (c)2005, 2006 YAMAMOTO Takashi,
@@ -43,7 +43,7 @@
 #else /* defined(PDSIM) */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: uvm_pdpolicy_clockpro.c,v 1.21 2019/12/31 12:40:27 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: uvm_pdpolicy_clockpro.c,v 1.22 2019/12/31 22:42:51 ad Exp $");
 
 #include "opt_ddb.h"
 
@@ -121,13 +121,13 @@ PDPOL_EVCNT_DEFINE(speculativemiss)
 PDPOL_EVCNT_DEFINE(locksuccess)
 PDPOL_EVCNT_DEFINE(lockfail)
 
-#define	PQ_REFERENCED	0x000000001
-#define	PQ_HOT		0x000000002
-#define	PQ_TEST		0x000000004
-#define	PQ_INITIALREF	0x000000008
-#define	PQ_QMASK	0x000000070
-#define	PQ_QFACTOR	0x000000010
-#define	PQ_SPECULATIVE	0x000000080
+#define	PQ_REFERENCED	0x000000010
+#define	PQ_HOT		0x000000020
+#define	PQ_TEST		0x000000040
+#define	PQ_INITIALREF	0x000000080
+#define	PQ_QMASK	0x000000700
+#define	PQ_QFACTOR	0x000000100
+#define	PQ_SPECULATIVE	0x000000800
 
 #define	CLOCKPRO_NOQUEUE	0
 #define	CLOCKPRO_NEWQ		1	/* small queue to clear initial ref. */
@@ -141,6 +141,8 @@ PDPOL_EVCNT_DEFINE(lockfail)
 #define	CLOCKPRO_LISTQ		4
 #define	CLOCKPRO_NQUEUE		4
 
+static bool	uvmpdpol_pagerealize_locked(struct vm_page *);
+
 static inline void
 clockpro_setq(struct vm_page *pg, int qidx)
 {
@@ -1129,12 +1131,10 @@ done:;
 	return pg;
 }
 
-void
-uvmpdpol_pageactivate(struct vm_page *pg)
+static void
+uvmpdpol_pageactivate_locked(struct vm_page *pg)
 {
-	struct clockpro_state * const s = &clockpro;
 
-	mutex_enter(&s->lock);
 	if (!uvmpdpol_pageisqueued_p(pg)) {
 		KASSERT((pg->pqflags & PQ_SPECULATIVE) == 0);
 		pg->pqflags |= PQ_INITIALREF;
@@ -1147,54 +1147,111 @@ uvmpdpol_pageactivate(struct vm_page *pg
 		clockpro_pageenqueue(pg);
 	}
 	pg->pqflags |= PQ_REFERENCED;
-	mutex_exit(&s->lock);
 }
 
 void
-uvmpdpol_pagedeactivate(struct vm_page *pg)
+uvmpdpol_pageactivate(struct vm_page *pg)
+{
+
+	uvmpdpol_set_intent(pg, PQ_INTENT_A);
+}
+
+static void
+uvmpdpol_pagedeactivate_locked(struct vm_page *pg)
 {
-	struct clockpro_state * const s = &clockpro;
 
-	mutex_enter(&s->lock);
 	clockpro_clearreferencebit(pg, true);
-	mutex_exit(&s->lock);
 }
 
 void
-uvmpdpol_pagedequeue(struct vm_page *pg)
+uvmpdpol_pagedeactivate(struct vm_page *pg)
+{
+
+	uvmpdpol_set_intent(pg, PQ_INTENT_I);
+}
+
+static void
+uvmpdpol_pagedequeue_locked(struct vm_page *pg)
 {
-	struct clockpro_state * const s = &clockpro;
 
 	if (!uvmpdpol_pageisqueued_p(pg)) {
 		return;
 	}
-	mutex_enter(&s->lock);
 	clockpro_pagedequeue(pg);
 	pg->pqflags &= ~(PQ_INITIALREF|PQ_SPECULATIVE);
-	mutex_exit(&s->lock);
 }
 
 void
-uvmpdpol_pageenqueue(struct vm_page *pg)
+uvmpdpol_pagedequeue(struct vm_page *pg)
 {
 
-#if 1
-	struct clockpro_state * const s = &clockpro;
+	uvmpdpol_set_intent(pg, PQ_INTENT_D);
+}
+
+static void
+uvmpdpol_pageenqueue_locked(struct vm_page *pg)
+{
 
+#if 1
 	if (uvmpdpol_pageisqueued_p(pg)) {
 		return;
 	}
-	mutex_enter(&s->lock);
 	clockpro_clearreferencebit(pg, true);
 	pg->pqflags |= PQ_SPECULATIVE;
 	clockpro_pageenqueue(pg);
-	mutex_exit(&s->lock);
 #else
-	uvmpdpol_pageactivate(pg);
+	uvmpdpol_pageactivate_locked(pg);
 #endif
 }
 
 void
+uvmpdpol_pageenqueue(struct vm_page *pg)
+{
+
+	uvmpdpol_set_intent(pg, PQ_INTENT_E);
+}
+
+static bool
+uvmpdpol_pagerealize_locked(struct vm_page *pg)
+{
+	uint32_t pqflags;
+
+	KASSERT(mutex_owned(&clockpro.lock));
+	KASSERT(mutex_owned(&pg->interlock));
+
+	/* XXX this needs to be called from elsewhere, like uvmpdpol_clock. */
+
+	pqflags = pg->pqflags;
+	pg->pqflags &= ~(PQ_INTENT_SET | PQ_INTENT_QUEUED);
+	switch (pqflags & (PQ_INTENT_MASK | PQ_INTENT_SET)) {
+	case PQ_INTENT_A | PQ_INTENT_SET:
+		uvmpdpol_pageactivate_locked(pg);
+		return true;
+	case PQ_INTENT_E | PQ_INTENT_SET:
+		uvmpdpol_pageenqueue_locked(pg);
+		return true;
+	case PQ_INTENT_I | PQ_INTENT_SET:
+		uvmpdpol_pagedeactivate_locked(pg);
+		return true;
+	case PQ_INTENT_D | PQ_INTENT_SET:
+		uvmpdpol_pagedequeue_locked(pg);
+		return true;
+	default:
+		return false;
+	}
+}
+
+void
+uvmpdpol_pagerealize(struct vm_page *pg)
+{
+	struct clockpro_state * const s = &clockpro;
+
+	mutex_enter(&s->lock);
+	uvmpdpol_pagerealize_locked(pg);
+	mutex_exit(&s->lock);
+}
+
+void
 uvmpdpol_anfree(struct vm_anon *an)
 {
 	struct clockpro_state * const s = &clockpro;
@@ -1398,6 +1455,12 @@ uvmpdpol_tune(void)
 	mutex_exit(&s->lock);
 }
 
+void
+uvmpdpol_idle(struct uvm_cpu *ucpu)
+{
+
+}
+
 #if !defined(PDSIM)
 
 #include <sys/sysctl.h>	/* XXX SYSCTL_DESCR */

Reply via email to