Module Name:    src
Committed By:   ad
Date:           Fri Dec 27 12:51:57 UTC 2019

Modified Files:
        src/sys/arch/amd64/amd64: autoconf.c
        src/sys/arch/i386/i386: autoconf.c
        src/sys/ddb: db_command.c
        src/sys/dev/acpi: acpi_srat.c acpi_srat.h
        src/sys/kern: init_main.c
        src/sys/uvm: files.uvm uvm.h uvm_ddb.h uvm_extern.h uvm_glue.c
            uvm_init.c uvm_page.c uvm_page.h uvm_pglist.c uvm_pglist.h
Added Files:
        src/sys/uvm: uvm_pgflcache.c uvm_pgflcache.h

Log Message:
Redo the page allocator to perform better, especially on multi-core and
multi-socket systems.  Proposed on tech-kern.  While here:

- add rudimentary NUMA support - needs more work.
- remove the now-unused "listq" from vm_page.
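
For orientation, here is a minimal standalone C sketch of the new freelist
geometry: each freelist is now split into buckets (one per NUMA node or CPU
package), each bucket has its own spin lock and one list head per page
color, and allocation prefers the current CPU's bucket before falling back
to the others, roughly as uvm_pagealloc_pgfl()/uvm_pagealloc_pgb() do in
the uvm_page.c diff below.  Only the pgfl_buckets, pgb_nfree and pgb_colors
names are taken from the diff; everything else is a simplified stand-in,
not kernel code.

#include <stddef.h>
#include <sys/queue.h>

#define	NCOLORS		4	/* stands in for uvmexp.ncolors */
#define	NBUCKETS	2	/* stands in for uvm.bucketcount */

struct vm_page_sim {
	LIST_ENTRY(vm_page_sim)	list;
};

LIST_HEAD(pgflist_sim, vm_page_sim);

struct pgflbucket_sim {
	int			pgb_nfree;		/* free pages in this bucket */
	struct pgflist_sim	pgb_colors[NCOLORS];	/* one list per page color */
};

struct pgfreelist_sim {
	struct pgflbucket_sim	*pgfl_buckets[NBUCKETS];
};

/*
 * Take any color from one freelist, preferring the caller's bucket and
 * falling over to the other buckets only if it is empty.  In the kernel
 * each bucket is guarded by its own uvm_freelist_locks[] spin lock, and
 * a per-CPU cache (uvm_pgflcache) is consulted before any lock is taken.
 */
struct vm_page_sim *
alloc_from_freelist(struct pgfreelist_sim *pgfl, int prefbucket, int color)
{
	int b = prefbucket;

	do {
		struct pgflbucket_sim *pgb = pgfl->pgfl_buckets[b];

		if (pgb->pgb_nfree > 0) {
			for (int i = 0; i < NCOLORS; i++) {
				int c = (color + i) & (NCOLORS - 1);
				struct vm_page_sim *pg =
				    LIST_FIRST(&pgb->pgb_colors[c]);

				if (pg != NULL) {
					LIST_REMOVE(pg, list);
					pgb->pgb_nfree--;
					return pg;
				}
			}
		}
		b = (b + 1) % NBUCKETS;
	} while (b != prefbucket);

	return NULL;
}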


To generate a diff of this commit:
cvs rdiff -u -r1.28 -r1.29 src/sys/arch/amd64/amd64/autoconf.c
cvs rdiff -u -r1.105 -r1.106 src/sys/arch/i386/i386/autoconf.c
cvs rdiff -u -r1.165 -r1.166 src/sys/ddb/db_command.c
cvs rdiff -u -r1.7 -r1.8 src/sys/dev/acpi/acpi_srat.c
cvs rdiff -u -r1.4 -r1.5 src/sys/dev/acpi/acpi_srat.h
cvs rdiff -u -r1.512 -r1.513 src/sys/kern/init_main.c
cvs rdiff -u -r1.31 -r1.32 src/sys/uvm/files.uvm
cvs rdiff -u -r1.70 -r1.71 src/sys/uvm/uvm.h
cvs rdiff -u -r1.15 -r1.16 src/sys/uvm/uvm_ddb.h
cvs rdiff -u -r1.215 -r1.216 src/sys/uvm/uvm_extern.h
cvs rdiff -u -r1.172 -r1.173 src/sys/uvm/uvm_glue.c
cvs rdiff -u -r1.51 -r1.52 src/sys/uvm/uvm_init.c
cvs rdiff -u -r1.212 -r1.213 src/sys/uvm/uvm_page.c
cvs rdiff -u -r1.88 -r1.89 src/sys/uvm/uvm_page.h
cvs rdiff -u -r0 -r1.1 src/sys/uvm/uvm_pgflcache.c \
    src/sys/uvm/uvm_pgflcache.h
cvs rdiff -u -r1.77 -r1.78 src/sys/uvm/uvm_pglist.c
cvs rdiff -u -r1.8 -r1.9 src/sys/uvm/uvm_pglist.h

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
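
The commit also adds a "freelists" command to DDB's "show" table (see the
db_command.c diff below); an example invocation would be:

	db{0}> show freelists

which walks every freelist, bucket and color, printing the per-bucket free
page counts and lock addresses via the new uvm_page_print_freelists() in
uvm_page.c.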

Modified files:

Index: src/sys/arch/amd64/amd64/autoconf.c
diff -u src/sys/arch/amd64/amd64/autoconf.c:1.28 src/sys/arch/amd64/amd64/autoconf.c:1.29
--- src/sys/arch/amd64/amd64/autoconf.c:1.28	Sun Oct 22 00:59:28 2017
+++ src/sys/arch/amd64/amd64/autoconf.c	Fri Dec 27 12:51:56 2019
@@ -1,4 +1,4 @@
-/*	$NetBSD: autoconf.c,v 1.28 2017/10/22 00:59:28 maya Exp $	*/
+/*	$NetBSD: autoconf.c,v 1.29 2019/12/27 12:51:56 ad Exp $	*/
 
 /*-
  * Copyright (c) 1990 The Regents of the University of California.
@@ -46,7 +46,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: autoconf.c,v 1.28 2017/10/22 00:59:28 maya Exp $");
+__KERNEL_RCSID(0, "$NetBSD: autoconf.c,v 1.29 2019/12/27 12:51:56 ad Exp $");
 
 #include "opt_multiprocessor.h"
 #include "opt_intrdebug.h"
@@ -60,9 +60,14 @@ __KERNEL_RCSID(0, "$NetBSD: autoconf.c,v
 #include <machine/pte.h>
 #include <machine/cpufunc.h>
 
+#include "acpica.h"
 #include "ioapic.h"
 #include "lapic.h"
 
+#if NACPICA > 0
+#include <dev/acpi/acpi_srat.h>
+#endif
+
 #if NIOAPIC > 0
 #include <machine/i82093var.h>
 #endif
@@ -112,6 +117,11 @@ cpu_configure(void)
 	cpu_init_idle_lwps();
 #endif
 
+#if NACPICA > 0
+	/* Load NUMA memory regions into UVM. */
+	acpisrat_load_uvm();
+#endif
+
 	spl0();
 	lcr8(0);
 }

Index: src/sys/arch/i386/i386/autoconf.c
diff -u src/sys/arch/i386/i386/autoconf.c:1.105 src/sys/arch/i386/i386/autoconf.c:1.106
--- src/sys/arch/i386/i386/autoconf.c:1.105	Sun Oct 22 00:59:28 2017
+++ src/sys/arch/i386/i386/autoconf.c	Fri Dec 27 12:51:56 2019
@@ -1,4 +1,4 @@
-/*	$NetBSD: autoconf.c,v 1.105 2017/10/22 00:59:28 maya Exp $	*/
+/*	$NetBSD: autoconf.c,v 1.106 2019/12/27 12:51:56 ad Exp $	*/
 
 /*-
  * Copyright (c) 1990 The Regents of the University of California.
@@ -46,7 +46,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: autoconf.c,v 1.105 2017/10/22 00:59:28 maya Exp $");
+__KERNEL_RCSID(0, "$NetBSD: autoconf.c,v 1.106 2019/12/27 12:51:56 ad Exp $");
 
 #include "opt_intrdebug.h"
 #include "opt_multiprocessor.h"
@@ -65,9 +65,14 @@ __KERNEL_RCSID(0, "$NetBSD: autoconf.c,v
 #include <machine/cpufunc.h>
 #include <x86/fpu.h>
 
+#include "acpica.h"
 #include "ioapic.h"
 #include "lapic.h"
 
+#if NACPICA > 0
+#include <dev/acpi/acpi_srat.h>
+#endif
+
 #if NIOAPIC > 0
 #include <machine/i82093var.h>
 #endif
@@ -132,6 +137,11 @@ cpu_configure(void)
 	cpu_init_idle_lwps();
 #endif
 
+#if NACPICA > 0
+	/* Load NUMA memory regions into UVM. */
+	acpisrat_load_uvm();
+#endif
+
 	spl0();
 #if NLAPIC > 0
 	lapic_write_tpri(0);

Index: src/sys/ddb/db_command.c
diff -u src/sys/ddb/db_command.c:1.165 src/sys/ddb/db_command.c:1.166
--- src/sys/ddb/db_command.c:1.165	Sun Dec 15 20:29:08 2019
+++ src/sys/ddb/db_command.c	Fri Dec 27 12:51:56 2019
@@ -1,7 +1,8 @@
-/*	$NetBSD: db_command.c,v 1.165 2019/12/15 20:29:08 joerg Exp $	*/
+/*	$NetBSD: db_command.c,v 1.166 2019/12/27 12:51:56 ad Exp $	*/
 
 /*
- * Copyright (c) 1996, 1997, 1998, 1999, 2002, 2009 The NetBSD Foundation, Inc.
+ * Copyright (c) 1996, 1997, 1998, 1999, 2002, 2009, 2019
+ *     The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
@@ -60,7 +61,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: db_command.c,v 1.165 2019/12/15 20:29:08 joerg Exp $");
+__KERNEL_RCSID(0, "$NetBSD: db_command.c,v 1.166 2019/12/27 12:51:56 ad Exp $");
 
 #ifdef _KERNEL_OPT
 #include "opt_aio.h"
@@ -193,6 +194,7 @@ static void     db_help_print_cmd(db_exp
 static void	db_lock_print_cmd(db_expr_t, bool, db_expr_t, const char *);
 static void	db_show_all_locks(db_expr_t, bool, db_expr_t, const char *);
 static void	db_show_lockstats(db_expr_t, bool, db_expr_t, const char *);
+static void	db_show_all_freelists(db_expr_t, bool, db_expr_t, const char *);
 static void	db_mount_print_cmd(db_expr_t, bool, db_expr_t, const char *);
 static void	db_show_all_mount(db_expr_t, bool, db_expr_t, const char *);
 static void	db_mbuf_print_cmd(db_expr_t, bool, db_expr_t, const char *);
@@ -234,6 +236,8 @@ static const struct db_command db_show_c
 	    0 ,"Show all held locks", "[/t]", NULL) },
 	{ DDB_ADD_CMD("mount",	db_show_all_mount,	0,
 	    "Print all mount structures.", "[/f]", NULL) },
+	{ DDB_ADD_CMD("freelists",	db_show_all_freelists,
+	    0 ,"Show all freelists", NULL, NULL) },
 #ifdef AIO
 	/*added from all sub cmds*/
 	{ DDB_ADD_CMD("aio_jobs",	db_show_aio_jobs,	0,
@@ -1285,6 +1289,16 @@ db_show_all_locks(db_expr_t addr, bool h
 }
 
 static void
+db_show_all_freelists(db_expr_t addr, bool have_addr,
+    db_expr_t count, const char *modif)
+{
+
+#ifdef _KERNEL	/* XXX CRASH(8) */
+	uvm_page_print_freelists(db_printf);
+#endif
+}
+
+static void
 db_show_lockstats(db_expr_t addr, bool have_addr,
     db_expr_t count, const char *modif)
 {

Index: src/sys/dev/acpi/acpi_srat.c
diff -u src/sys/dev/acpi/acpi_srat.c:1.7 src/sys/dev/acpi/acpi_srat.c:1.8
--- src/sys/dev/acpi/acpi_srat.c:1.7	Sun Dec 22 22:18:04 2019
+++ src/sys/dev/acpi/acpi_srat.c	Fri Dec 27 12:51:57 2019
@@ -1,4 +1,4 @@
-/* $NetBSD: acpi_srat.c,v 1.7 2019/12/22 22:18:04 ad Exp $ */
+/* $NetBSD: acpi_srat.c,v 1.8 2019/12/27 12:51:57 ad Exp $ */
 
 /*
  * Copyright (c) 2009 The NetBSD Foundation, Inc.
@@ -30,7 +30,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: acpi_srat.c,v 1.7 2019/12/22 22:18:04 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: acpi_srat.c,v 1.8 2019/12/27 12:51:57 ad Exp $");
 
 #include <sys/param.h>
 #include <sys/kmem.h>
@@ -39,6 +39,8 @@ __KERNEL_RCSID(0, "$NetBSD: acpi_srat.c,
 #include <dev/acpi/acpivar.h>
 #include <dev/acpi/acpi_srat.h>
 
+#include <uvm/uvm_extern.h>
+
 static ACPI_TABLE_SRAT *srat;
 
 static uint32_t nnodes; /* Number of NUMA nodes */
@@ -472,6 +474,28 @@ acpisrat_dump(void)
 	}
 }
 
+void
+acpisrat_load_uvm(void)
+{
+	uint32_t i, j, nn, nm;
+	struct acpisrat_mem m;
+
+	nn = acpisrat_nodes();
+	aprint_debug("SRAT: %u NUMA nodes\n", nn);
+	for (i = 0; i < nn; i++) {
+		nm = acpisrat_node_memoryranges(i);
+		for (j = 0; j < nm; j++) {
+			acpisrat_mem(i, j, &m);
+			aprint_debug("SRAT: node %u memory range %u (0x%"
+			    PRIx64" - 0x%"PRIx64" flags %u)\n",
+			    m.nodeid, j, m.baseaddress,
+			    m.baseaddress + m.length, m.flags);
+			uvm_page_numa_load(trunc_page(m.baseaddress),
+			    trunc_page(m.length), m.nodeid);
+		}
+	}
+}
+
 /*
  * Get number of NUMA nodes.
  */

Index: src/sys/dev/acpi/acpi_srat.h
diff -u src/sys/dev/acpi/acpi_srat.h:1.4 src/sys/dev/acpi/acpi_srat.h:1.5
--- src/sys/dev/acpi/acpi_srat.h:1.4	Thu Dec 28 08:49:28 2017
+++ src/sys/dev/acpi/acpi_srat.h	Fri Dec 27 12:51:57 2019
@@ -1,4 +1,4 @@
-/* $NetBSD: acpi_srat.h,v 1.4 2017/12/28 08:49:28 maxv Exp $ */
+/* $NetBSD: acpi_srat.h,v 1.5 2019/12/27 12:51:57 ad Exp $ */
 
 /*
  * Copyright (c) 2009 The NetBSD Foundation, Inc.
@@ -68,6 +68,7 @@ int acpisrat_init(void);
 int acpisrat_refresh(void);
 int acpisrat_exit(void);
 void acpisrat_dump(void);
+void acpisrat_load_uvm(void);
 uint32_t acpisrat_nodes(void);
 uint32_t acpisrat_node_cpus(acpisrat_nodeid_t);
 uint32_t acpisrat_node_memoryranges(acpisrat_nodeid_t);

Index: src/sys/kern/init_main.c
diff -u src/sys/kern/init_main.c:1.512 src/sys/kern/init_main.c:1.513
--- src/sys/kern/init_main.c:1.512	Sun Dec 22 15:00:42 2019
+++ src/sys/kern/init_main.c	Fri Dec 27 12:51:57 2019
@@ -1,4 +1,4 @@
-/*	$NetBSD: init_main.c,v 1.512 2019/12/22 15:00:42 ad Exp $	*/
+/*	$NetBSD: init_main.c,v 1.513 2019/12/27 12:51:57 ad Exp $	*/
 
 /*-
  * Copyright (c) 2008, 2009, 2019 The NetBSD Foundation, Inc.
@@ -97,7 +97,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: init_main.c,v 1.512 2019/12/22 15:00:42 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: init_main.c,v 1.513 2019/12/27 12:51:57 ad Exp $");
 
 #include "opt_ddb.h"
 #include "opt_inet.h"
@@ -814,6 +814,10 @@ configure2(void)
 	for (CPU_INFO_FOREACH(cii, ci)) {
 		uvm_cpu_attach(ci);
 	}
+
+	/* Decide how to partition free memory. */
+	uvm_page_rebucket();
+
 	mp_online = true;
 #if defined(MULTIPROCESSOR)
 	cpu_boot_secondary_processors();

Index: src/sys/uvm/files.uvm
diff -u src/sys/uvm/files.uvm:1.31 src/sys/uvm/files.uvm:1.32
--- src/sys/uvm/files.uvm:1.31	Sun Dec 15 21:11:35 2019
+++ src/sys/uvm/files.uvm	Fri Dec 27 12:51:57 2019
@@ -1,4 +1,4 @@
-#	$NetBSD: files.uvm,v 1.31 2019/12/15 21:11:35 ad Exp $
+#	$NetBSD: files.uvm,v 1.32 2019/12/27 12:51:57 ad Exp $
 
 #
 # UVM options
@@ -42,6 +42,7 @@ file	uvm/uvm_pager.c			uvm
 file	uvm/uvm_pdaemon.c		uvm
 file	uvm/uvm_pdpolicy_clock.c	!pdpolicy_clockpro
 file	uvm/uvm_pdpolicy_clockpro.c	pdpolicy_clockpro
+file	uvm/uvm_pgflcache.c		uvm
 file	uvm/uvm_pglist.c		uvm
 file	uvm/uvm_physseg.c		uvm
 file	uvm/uvm_readahead.c		uvm

Index: src/sys/uvm/uvm.h
diff -u src/sys/uvm/uvm.h:1.70 src/sys/uvm/uvm.h:1.71
--- src/sys/uvm/uvm.h:1.70	Fri Dec 13 20:10:22 2019
+++ src/sys/uvm/uvm.h	Fri Dec 27 12:51:57 2019
@@ -1,4 +1,4 @@
-/*	$NetBSD: uvm.h,v 1.70 2019/12/13 20:10:22 ad Exp $	*/
+/*	$NetBSD: uvm.h,v 1.71 2019/12/27 12:51:57 ad Exp $	*/
 
 /*
  * Copyright (c) 1997 Charles D. Cranor and Washington University.
@@ -71,21 +71,19 @@
 #include <machine/vmparam.h>
 
 struct workqueue;
+struct pgflcache;
 
 /*
  * per-cpu data
  */
 
 struct uvm_cpu {
-	struct pgfreelist page_free[VM_NFREELIST]; /* unallocated pages */
-	int page_free_nextcolor;	/* next color to allocate from */
-	int page_idlezero_next;		/* which color to zero next */
-	bool page_idle_zero;		/* TRUE if we should try to zero
-					   pages in the idle loop */
-	int pages[PGFL_NQUEUES];	/* total of pages in page_free */
-	u_int emap_gen;			/* emap generation number */
-
-	krndsource_t rs;		/* entropy source */
+	struct pgflcache *pgflcache[VM_NFREELIST];/* cpu-local cached pages */
+	void		*pgflcachemem;		/* pointer to allocated mem */
+	size_t		pgflcachememsz;		/* size of allocated memory */
+	u_int		pgflcolor;		/* next color to allocate */
+	u_int		pgflbucket;		/* where to send our pages */
+	krndsource_t 	rs;			/* entropy source */
 };
 
 /*
@@ -98,7 +96,9 @@ struct uvm {
 
 		/* vm_page queues */
 	struct pgfreelist page_free[VM_NFREELIST]; /* unallocated pages */
-	bool page_init_done;		/* TRUE if uvm_page_init() finished */
+	u_int	bucketcount;
+	bool	page_init_done;		/* true if uvm_page_init() finished */
+	bool	numa_alloc;		/* use NUMA page allocation strategy */
 
 		/* page daemon trigger */
 	int pagedaemon;			/* daemon sleeps on this */
@@ -123,7 +123,6 @@ extern struct uvm_object *uvm_kernel_obj
  * locks (made globals for lockstat).
  */
 
-extern kmutex_t uvm_fpageqlock;		/* lock for free page q */
 extern kmutex_t uvm_kentry_lock;
 
 #endif /* _KERNEL */

Index: src/sys/uvm/uvm_ddb.h
diff -u src/sys/uvm/uvm_ddb.h:1.15 src/sys/uvm/uvm_ddb.h:1.16
--- src/sys/uvm/uvm_ddb.h:1.15	Tue May 17 04:18:07 2011
+++ src/sys/uvm/uvm_ddb.h	Fri Dec 27 12:51:57 2019
@@ -1,4 +1,4 @@
-/*	$NetBSD: uvm_ddb.h,v 1.15 2011/05/17 04:18:07 mrg Exp $	*/
+/*	$NetBSD: uvm_ddb.h,v 1.16 2019/12/27 12:51:57 ad Exp $	*/
 
 /*
  * Copyright (c) 1997 Charles D. Cranor and Washington University.
@@ -40,6 +40,7 @@ void	uvm_object_printit(struct uvm_objec
 void	uvm_page_printit(struct vm_page *, bool,
 	    void (*)(const char *, ...));
 void	uvm_page_printall(void (*)(const char *, ...));
+void	uvm_page_print_freelists(void (*)(const char *, ...));
 void	uvmexp_print(void (*)(const char *, ...));
 #endif /* DDB || DEBUGPRINT */
 

Index: src/sys/uvm/uvm_extern.h
diff -u src/sys/uvm/uvm_extern.h:1.215 src/sys/uvm/uvm_extern.h:1.216
--- src/sys/uvm/uvm_extern.h:1.215	Sat Dec 21 12:58:26 2019
+++ src/sys/uvm/uvm_extern.h	Fri Dec 27 12:51:57 2019
@@ -1,4 +1,4 @@
-/*	$NetBSD: uvm_extern.h,v 1.215 2019/12/21 12:58:26 ad Exp $	*/
+/*	$NetBSD: uvm_extern.h,v 1.216 2019/12/27 12:51:57 ad Exp $	*/
 
 /*
  * Copyright (c) 1997 Charles D. Cranor and Washington University.
@@ -210,6 +210,7 @@ b\32UNMAP\0\
 #define	UVM_PGA_STRAT_NORMAL	0	/* priority (low id to high) walk */
 #define	UVM_PGA_STRAT_ONLY	1	/* only specified free list */
 #define	UVM_PGA_STRAT_FALLBACK	2	/* ONLY falls back on NORMAL */
+#define	UVM_PGA_STRAT_NUMA	3	/* strongly prefer ideal bucket */
 
 /*
  * flags for uvm_pagealloc_strat()
@@ -736,6 +737,7 @@ void			uvm_obj_unwirepages(struct uvm_ob
 
 /* uvm_page.c */
 int			uvm_free(void);
+void			uvm_page_numa_load(paddr_t, paddr_t, u_int);
 struct vm_page		*uvm_pagealloc_strat(struct uvm_object *,
 			    voff_t, struct vm_anon *, int, int, int);
 #define	uvm_pagealloc(obj, off, anon, flags) \

Index: src/sys/uvm/uvm_glue.c
diff -u src/sys/uvm/uvm_glue.c:1.172 src/sys/uvm/uvm_glue.c:1.173
--- src/sys/uvm/uvm_glue.c:1.172	Sat Dec 21 13:00:25 2019
+++ src/sys/uvm/uvm_glue.c	Fri Dec 27 12:51:57 2019
@@ -1,4 +1,4 @@
-/*	$NetBSD: uvm_glue.c,v 1.172 2019/12/21 13:00:25 ad Exp $	*/
+/*	$NetBSD: uvm_glue.c,v 1.173 2019/12/27 12:51:57 ad Exp $	*/
 
 /*
  * Copyright (c) 1997 Charles D. Cranor and Washington University.
@@ -62,7 +62,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: uvm_glue.c,v 1.172 2019/12/21 13:00:25 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: uvm_glue.c,v 1.173 2019/12/27 12:51:57 ad Exp $");
 
 #include "opt_kgdb.h"
 #include "opt_kstack.h"
@@ -86,6 +86,7 @@ __KERNEL_RCSID(0, "$NetBSD: uvm_glue.c,v
 #include <sys/asan.h>
 
 #include <uvm/uvm.h>
+#include <uvm/uvm_pgflcache.h>
 
 /*
  * uvm_kernacc: test if kernel can access a memory region.
@@ -500,9 +501,17 @@ uvm_scheduler(void)
 	lwp_changepri(l, PRI_VM);
 	lwp_unlock(l);
 
+	/* Start the freelist cache. */
+	uvm_pgflcache_start();
+
 	for (;;) {
 		/* Update legacy stats for post-mortem debugging. */
 		uvm_update_uvmexp();
+
+		/* See if the pagedaemon needs to generate some free pages. */
+		uvm_kick_pdaemon();
+
+		/* Calculate process statistics. */
 		sched_pstats();
 		(void)kpause("uvm", false, hz, NULL);
 	}

Index: src/sys/uvm/uvm_init.c
diff -u src/sys/uvm/uvm_init.c:1.51 src/sys/uvm/uvm_init.c:1.52
--- src/sys/uvm/uvm_init.c:1.51	Fri Dec 13 20:10:22 2019
+++ src/sys/uvm/uvm_init.c	Fri Dec 27 12:51:57 2019
@@ -1,4 +1,4 @@
-/*	$NetBSD: uvm_init.c,v 1.51 2019/12/13 20:10:22 ad Exp $	*/
+/*	$NetBSD: uvm_init.c,v 1.52 2019/12/27 12:51:57 ad Exp $	*/
 
 /*
  * Copyright (c) 1997 Charles D. Cranor and Washington University.
@@ -32,7 +32,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: uvm_init.c,v 1.51 2019/12/13 20:10:22 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: uvm_init.c,v 1.52 2019/12/27 12:51:57 ad Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -64,7 +64,6 @@ const int * const uvmexp_pagemask = &uvm
 const int * const uvmexp_pageshift = &uvmexp.pageshift;
 #endif
 
-kmutex_t uvm_fpageqlock __cacheline_aligned;
 kmutex_t uvm_kentry_lock __cacheline_aligned;
 
 /*

Index: src/sys/uvm/uvm_page.c
diff -u src/sys/uvm/uvm_page.c:1.212 src/sys/uvm/uvm_page.c:1.213
--- src/sys/uvm/uvm_page.c:1.212	Sun Dec 22 16:37:36 2019
+++ src/sys/uvm/uvm_page.c	Fri Dec 27 12:51:57 2019
@@ -1,4 +1,33 @@
-/*	$NetBSD: uvm_page.c,v 1.212 2019/12/22 16:37:36 ad Exp $	*/
+/*	$NetBSD: uvm_page.c,v 1.213 2019/12/27 12:51:57 ad Exp $	*/
+
+/*-
+ * Copyright (c) 2019 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Andrew Doran.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
 
 /*
  * Copyright (c) 1997 Charles D. Cranor and Washington University.
@@ -66,7 +95,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: uvm_page.c,v 1.212 2019/12/22 16:37:36 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: uvm_page.c,v 1.213 2019/12/27 12:51:57 ad Exp $");
 
 #include "opt_ddb.h"
 #include "opt_uvm.h"
@@ -87,6 +116,7 @@ __KERNEL_RCSID(0, "$NetBSD: uvm_page.c,v
 #include <uvm/uvm.h>
 #include <uvm/uvm_ddb.h>
 #include <uvm/uvm_pdpolicy.h>
+#include <uvm/uvm_pgflcache.h>
 
 /*
  * Some supported CPUs in a given architecture don't support all
@@ -130,6 +160,25 @@ static vaddr_t      virtual_space_end;
  */
 
 static size_t recolored_pages_memsize /* = 0 */;
+static char *recolored_pages_mem;
+
+/*
+ * freelist locks - one per bucket.
+ */
+
+union uvm_freelist_lock	uvm_freelist_locks[PGFL_MAX_BUCKETS]
+    __cacheline_aligned;
+
+/*
+ * basic NUMA information.
+ */
+
+static struct uvm_page_numa_region {
+	struct uvm_page_numa_region	*next;
+	paddr_t				start;
+	paddr_t				size;
+	u_int				numa_id;
+} *uvm_page_numa_region;
 
 #ifdef DEBUG
 vaddr_t uvm_zerocheckkva;
@@ -243,15 +292,15 @@ uvm_pageremove_tree(struct uvm_object *u
 }
 
 static void
-uvm_page_init_buckets(struct pgfreelist *pgfl)
+uvm_page_init_bucket(struct pgfreelist *pgfl, struct pgflbucket *pgb, int num)
 {
-	int color, i;
+	int i;
 
-	for (color = 0; color < uvmexp.ncolors; color++) {
-		for (i = 0; i < PGFL_NQUEUES; i++) {
-			LIST_INIT(&pgfl->pgfl_buckets[color].pgfl_queues[i]);
-		}
+	pgb->pgb_nfree = 0;
+	for (i = 0; i < uvmexp.ncolors; i++) {
+		LIST_INIT(&pgb->pgb_colors[i]);
 	}
+	pgfl->pgfl_buckets[num] = pgb;
 }
 
 /*
@@ -263,18 +312,18 @@ uvm_page_init_buckets(struct pgfreelist 
 void
 uvm_page_init(vaddr_t *kvm_startp, vaddr_t *kvm_endp)
 {
-	static struct uvm_cpu boot_cpu;
-	psize_t freepages, pagecount, bucketcount, n;
-	struct pgflbucket *bucketarray, *cpuarray;
+	static struct uvm_cpu boot_cpu __cacheline_aligned;
+	psize_t freepages, pagecount, bucketsize, n;
+	struct pgflbucket *pgb;
 	struct vm_page *pagearray;
+	char *bucketarray;
 	uvm_physseg_t bank;
-	int lcv;
+	int fl, b;
 
 	KASSERT(ncpu <= 1);
-	CTASSERT(sizeof(pagearray->offset) >= sizeof(struct uvm_cpu *));
 
 	/*
-	 * init the page queues and free page queue lock, except the
+	 * init the page queues and free page queue locks, except the
 	 * free list; we allocate that later (with the initial vm_page
 	 * structures).
 	 */
@@ -282,7 +331,9 @@ uvm_page_init(vaddr_t *kvm_startp, vaddr
 	uvm.cpus[0] = &boot_cpu;
 	curcpu()->ci_data.cpu_uvm = &boot_cpu;
 	uvmpdpol_init();
-	mutex_init(&uvm_fpageqlock, MUTEX_DRIVER, IPL_VM);
+	for (b = 0; b < __arraycount(uvm_freelist_locks); b++) {
+		mutex_init(&uvm_freelist_locks[b].lock, MUTEX_DEFAULT, IPL_VM);
+	}
 
 	/*
 	 * allocate vm_page structures.
@@ -323,6 +374,9 @@ uvm_page_init(vaddr_t *kvm_startp, vaddr
 	uvmexp.colormask = uvmexp.ncolors - 1;
 	KASSERT((uvmexp.colormask & uvmexp.ncolors) == 0);
 
+	/* We always start with only 1 bucket. */
+	uvm.bucketcount = 1;
+
 	/*
 	 * we now know we have (PAGE_SIZE * freepages) bytes of memory we can
 	 * use.   for each page of memory we use we need a vm_page structure.
@@ -332,28 +386,28 @@ uvm_page_init(vaddr_t *kvm_startp, vaddr
 	 * truncation errors (since we can only allocate in terms of whole
 	 * pages).
 	 */
-
-	bucketcount = uvmexp.ncolors * VM_NFREELIST;
 	pagecount = ((freepages + 1) << PAGE_SHIFT) /
 	    (PAGE_SIZE + sizeof(struct vm_page));
-
-	bucketarray = (void *)uvm_pageboot_alloc((bucketcount *
-	    sizeof(struct pgflbucket) * 2) + (pagecount *
-	    sizeof(struct vm_page)));
-	cpuarray = bucketarray + bucketcount;
-	pagearray = (struct vm_page *)(bucketarray + bucketcount * 2);
-
-	for (lcv = 0; lcv < VM_NFREELIST; lcv++) {
-		uvm.page_free[lcv].pgfl_buckets =
-		    (bucketarray + (lcv * uvmexp.ncolors));
-		uvm_page_init_buckets(&uvm.page_free[lcv]);
-		uvm.cpus[0]->page_free[lcv].pgfl_buckets =
-		    (cpuarray + (lcv * uvmexp.ncolors));
-		uvm_page_init_buckets(&uvm.cpus[0]->page_free[lcv]);
+	bucketsize = offsetof(struct pgflbucket, pgb_colors[uvmexp.ncolors]);
+	bucketsize = roundup2(bucketsize, coherency_unit);
+	bucketarray = (void *)uvm_pageboot_alloc(
+	    bucketsize * VM_NFREELIST +
+	    pagecount * sizeof(struct vm_page));
+	pagearray = (struct vm_page *)
+	    (bucketarray + bucketsize * VM_NFREELIST);
+
+	for (fl = 0; fl < VM_NFREELIST; fl++) {
+		pgb = (struct pgflbucket *)(bucketarray + bucketsize * fl);
+		uvm_page_init_bucket(&uvm.page_free[fl], pgb, 0);
 	}
 	memset(pagearray, 0, pagecount * sizeof(struct vm_page));
 
 	/*
+	 * init the freelist cache in the disabled state.
+	 */
+	uvm_pgflcache_init();
+
+	/*
 	 * init the vm_page structures and put them in the correct place.
 	 */
 	/* First init the extent */
@@ -396,12 +450,6 @@ uvm_page_init(vaddr_t *kvm_startp, vaddr
 	uvmexp.reserve_kernel = vm_page_reserve_kernel;
 
 	/*
-	 * determine if we should zero pages in the idle loop.
-	 */
-
-	uvm.cpus[0]->page_idle_zero = vm_page_zero_enable;
-
-	/*
 	 * done!
 	 */
 
@@ -409,6 +457,34 @@ uvm_page_init(vaddr_t *kvm_startp, vaddr
 }
 
 /*
+ * uvm_pgfl_lock: lock all freelist buckets
+ */
+
+void
+uvm_pgfl_lock(void)
+{
+	int i;
+
+	for (i = 0; i < __arraycount(uvm_freelist_locks); i++) {
+		mutex_spin_enter(&uvm_freelist_locks[i].lock);
+	}
+}
+
+/*
+ * uvm_pgfl_unlock: unlock all freelist buckets
+ */
+
+void
+uvm_pgfl_unlock(void)
+{
+	int i;
+
+	for (i = 0; i < __arraycount(uvm_freelist_locks); i++) {
+		mutex_spin_exit(&uvm_freelist_locks[i].lock);
+	}
+}
+
+/*
  * uvm_setpagesize: set the page size
  *
  * => sets page_shift and page_mask from uvmexp.pagesize.
@@ -612,129 +688,301 @@ uvm_vm_page_to_phys(const struct vm_page
 }
 
 /*
- * uvm_page_recolor: Recolor the pages if the new bucket count is
- * larger than the old one.
+ * uvm_page_numa_load: load NUMA range description.
  */
-
 void
-uvm_page_recolor(int newncolors)
+uvm_page_numa_load(paddr_t start, paddr_t size, u_int numa_id)
 {
-	struct pgflbucket *bucketarray, *cpuarray, *oldbucketarray;
-	struct pgfreelist gpgfl, pgfl;
-	struct vm_page *pg;
-	vsize_t bucketcount;
-	size_t bucketmemsize, oldbucketmemsize;
-	int color, i, ocolors;
-	int lcv;
-	struct uvm_cpu *ucpu;
+	struct uvm_page_numa_region *d;
+
+	KASSERT(numa_id < PGFL_MAX_BUCKETS);
+
+	d = kmem_alloc(sizeof(*d), KM_SLEEP);
+	d->start = start;
+	d->size = size;
+	d->numa_id = numa_id;
+	d->next = uvm_page_numa_region;
+	uvm_page_numa_region = d;
+}
+
+/*
+ * uvm_page_numa_lookup: lookup NUMA node for the given page.
+ */
+static u_int
+uvm_page_numa_lookup(struct vm_page *pg)
+{
+	struct uvm_page_numa_region *d;
+	static bool warned;
+	paddr_t pa;
+
+	KASSERT(uvm.numa_alloc);
+	KASSERT(uvm_page_numa_region != NULL);
+
+	pa = VM_PAGE_TO_PHYS(pg);
+	for (d = uvm_page_numa_region; d != NULL; d = d->next) {
+		if (pa >= d->start && pa < d->start + d->size) {
+			return d->numa_id;
+		}
+	}
+
+	if (!warned) {
+		printf("uvm_page_numa_lookup: failed, first pg=%p pa=%p\n",
+		    pg, (void *)VM_PAGE_TO_PHYS(pg));
+		warned = true;
+	}
+
+	return 0;
+}
+
+/*
+ * uvm_page_redim: adjust freelist dimensions if they have changed.
+ */
+
+static void
+uvm_page_redim(int newncolors, int newnbuckets)
+{
+	struct pgfreelist npgfl;
+	struct pgflbucket *opgb, *npgb;
+	struct pgflist *ohead, *nhead;
+	struct vm_page *pg; 
+	size_t bucketsize, bucketmemsize, oldbucketmemsize;
+	int fl, ob, oc, nb, nc, obuckets, ocolors;
+	char *bucketarray, *oldbucketmem, *bucketmem;
 
 	KASSERT(((newncolors - 1) & newncolors) == 0);
 
-	if (newncolors <= uvmexp.ncolors)
+	/* Anything to do? */
+	if (newncolors <= uvmexp.ncolors &&
+	    newnbuckets == uvm.bucketcount) {
 		return;
-
+	}
 	if (uvm.page_init_done == false) {
 		uvmexp.ncolors = newncolors;
 		return;
 	}
 
-	bucketcount = newncolors * VM_NFREELIST;
-	bucketmemsize = bucketcount * sizeof(struct pgflbucket) * 2;
-	bucketarray = kmem_alloc(bucketmemsize, KM_SLEEP);
-	cpuarray = bucketarray + bucketcount;
+	bucketsize = offsetof(struct pgflbucket, pgb_colors[newncolors]);
+	bucketsize = roundup2(bucketsize, coherency_unit);
+	bucketmemsize = bucketsize * newnbuckets * VM_NFREELIST +
+	    coherency_unit - 1;
+	bucketmem = kmem_zalloc(bucketmemsize, KM_SLEEP);
+	bucketarray = (char *)roundup2((uintptr_t)bucketmem, coherency_unit);
+
+	ocolors = uvmexp.ncolors;
+	obuckets = uvm.bucketcount;
 
-	mutex_spin_enter(&uvm_fpageqlock);
+	/* Freelist cache mustn't be enabled. */
+	uvm_pgflcache_pause();
 
 	/* Make sure we should still do this. */
-	if (newncolors <= uvmexp.ncolors) {
-		mutex_spin_exit(&uvm_fpageqlock);
-		kmem_free(bucketarray, bucketmemsize);
+	uvm_pgfl_lock();
+	if (newncolors <= uvmexp.ncolors &&
+	    newnbuckets == uvm.bucketcount) {
+		uvm_pgfl_unlock();
+		kmem_free(bucketmem, bucketmemsize);
 		return;
 	}
 
-	oldbucketarray = uvm.page_free[0].pgfl_buckets;
-	ocolors = uvmexp.ncolors;
-
 	uvmexp.ncolors = newncolors;
 	uvmexp.colormask = uvmexp.ncolors - 1;
+	uvm.bucketcount = newnbuckets;
 
-	ucpu = curcpu()->ci_data.cpu_uvm;
-	for (lcv = 0; lcv < VM_NFREELIST; lcv++) {
-		gpgfl.pgfl_buckets = (bucketarray + (lcv * newncolors));
-		pgfl.pgfl_buckets = (cpuarray + (lcv * uvmexp.ncolors));
-		uvm_page_init_buckets(&gpgfl);
-		uvm_page_init_buckets(&pgfl);
-		for (color = 0; color < ocolors; color++) {
-			for (i = 0; i < PGFL_NQUEUES; i++) {
-				while ((pg = LIST_FIRST(&uvm.page_free[
-				    lcv].pgfl_buckets[color].pgfl_queues[i]))
-				    != NULL) {
-					LIST_REMOVE(pg, pageq.list); /* global */
-					LIST_REMOVE(pg, listq.list); /* cpu */
-					LIST_INSERT_HEAD(&gpgfl.pgfl_buckets[
-					    VM_PGCOLOR(pg)].pgfl_queues[
-					    i], pg, pageq.list);
-					LIST_INSERT_HEAD(&pgfl.pgfl_buckets[
-					    VM_PGCOLOR(pg)].pgfl_queues[
-					    i], pg, listq.list);
+	for (fl = 0; fl < VM_NFREELIST; fl++) {
+		/* Init new buckets in new freelist. */
+		memset(&npgfl, 0, sizeof(npgfl));
+		for (nb = 0; nb < newnbuckets; nb++) {
+			npgb = (struct pgflbucket *)bucketarray;
+			uvm_page_init_bucket(&npgfl, npgb, nb);
+			bucketarray += bucketsize;
+		}
+		/* Now transfer pages from the old freelist. */
+		for (nb = ob = 0; ob < obuckets; ob++) {
+			opgb = uvm.page_free[fl].pgfl_buckets[ob];
+			for (oc = 0; oc < ocolors; oc++) {
+				ohead = &opgb->pgb_colors[oc];
+				while ((pg = LIST_FIRST(ohead)) != NULL) {
+					LIST_REMOVE(pg, pageq.list);
+					/*
+					 * Here we decide on the NEW color &
+					 * bucket for the page.  For NUMA
+					 * we'll use the info that the
+					 * hardware gave us.  Otherwise we
+					 * just do a round-robin among the
+					 * buckets.
+					 */
+					KASSERT(
+					    uvm_page_get_bucket(pg) == ob);
+					KASSERT(fl ==
+					    uvm_page_get_freelist(pg));
+					if (uvm.numa_alloc) {
+						nb = uvm_page_numa_lookup(pg);
+					} else if (nb + 1 < newnbuckets) {
+						nb = nb + 1;
+					} else {
+						nb = 0;
+					}
+					uvm_page_set_bucket(pg, nb);
+					npgb = npgfl.pgfl_buckets[nb];
+					npgb->pgb_nfree++;
+					nc = VM_PGCOLOR(pg);
+					nhead = &npgb->pgb_colors[nc];
+					LIST_INSERT_HEAD(nhead, pg, pageq.list);
 				}
 			}
 		}
-		uvm.page_free[lcv].pgfl_buckets = gpgfl.pgfl_buckets;
-		ucpu->page_free[lcv].pgfl_buckets = pgfl.pgfl_buckets;
+		/* Install the new freelist. */
+		memcpy(&uvm.page_free[fl], &npgfl, sizeof(npgfl));
 	}
 
+	/* Unlock and free the old memory. */
 	oldbucketmemsize = recolored_pages_memsize;
-
+	oldbucketmem = recolored_pages_mem;
 	recolored_pages_memsize = bucketmemsize;
-	mutex_spin_exit(&uvm_fpageqlock);
+	recolored_pages_mem = bucketmem;
+	uvm_pgfl_unlock();
 
 	if (oldbucketmemsize) {
-		kmem_free(oldbucketarray, oldbucketmemsize);
+		kmem_free(oldbucketmem, oldbucketmemsize);
 	}
 
+	uvm_pgflcache_resume();
+
 	/*
 	 * this calls uvm_km_alloc() which may want to hold
-	 * uvm_fpageqlock.
+	 * uvm_freelist_lock.
 	 */
 	uvm_pager_realloc_emerg();
 }
 
 /*
+ * uvm_page_recolor: Recolor the pages if the new color count is
+ * larger than the old one.
+ */
+
+void
+uvm_page_recolor(int newncolors)
+{
+
+	uvm_page_redim(newncolors, uvm.bucketcount);
+}
+
+/*
+ * uvm_page_rebucket: Determine a bucket structure and redim the free
+ * lists to match.
+ */
+
+void
+uvm_page_rebucket(void)
+{
+	u_int min_numa, max_numa, npackage, shift;
+	struct cpu_info *ci, *ci2, *ci3;
+	CPU_INFO_ITERATOR cii;
+
+	/*
+	 * If we have more than one NUMA node, and the maximum NUMA node ID
+	 * is less than PGFL_MAX_BUCKETS, then we'll use NUMA distribution
+	 * for free pages.  uvm_pagefree() will not reassign pages to a
+	 * different bucket on free.
+	 */
+	min_numa = (u_int)-1;
+	max_numa = 0;
+	for (CPU_INFO_FOREACH(cii, ci)) {
+		if (ci->ci_numa_id < min_numa) {
+			min_numa = ci->ci_numa_id;
+		}
+		if (ci->ci_numa_id > max_numa) {
+			max_numa = ci->ci_numa_id;
+		}
+	}
+	if (min_numa != max_numa && max_numa < PGFL_MAX_BUCKETS) {
+#ifdef NUMA
+		/*
+		 * We can do this, and it seems to work well, but until
+		 * further experiments are done we'll stick with the cache
+		 * locality strategy.
+		 */
+		aprint_debug("UVM: using NUMA allocation scheme\n");
+		for (CPU_INFO_FOREACH(cii, ci)) {	
+			ci->ci_data.cpu_uvm->pgflbucket = ci->ci_numa_id;
+		}
+		uvm.numa_alloc = true;
+	 	uvm_page_redim(uvmexp.ncolors, max_numa + 1);
+	 	return;
+#endif
+	}
+
+	/*
+	 * Otherwise we'll go with a scheme to maximise L2/L3 cache locality
+	 * and minimise lock contention.  Count the total number of CPU
+	 * packages, and then try to distribute the buckets among CPU
+	 * packages evenly.  uvm_pagefree() will reassign pages to the
+	 * freeing CPU's preferred bucket on free.
+	 */
+	npackage = 0;
+	ci = curcpu();
+	ci2 = ci;
+	do {
+		npackage++;
+		ci2 = ci2->ci_sibling[CPUREL_PEER];
+	} while (ci2 != ci);
+	
+	/*
+	 * Figure out how to arrange the packages & buckets, and the total
+	 * number of buckets we need.  XXX 2 may not be the best factor.
+	 */
+	for (shift = 0; npackage > PGFL_MAX_BUCKETS; shift++) {
+		npackage >>= 1;
+	}
+ 	uvm_page_redim(uvmexp.ncolors, npackage);
+
+ 	/*
+ 	 * Now tell each CPU which bucket to use.  In the outer loop, scroll
+ 	 * through all CPU packages.
+ 	 */
+ 	npackage = 0;
+	ci = curcpu();
+	ci2 = ci;
+	do {
+		/*
+		 * In the inner loop, scroll through all CPUs in the package
+		 * and assign the same bucket ID.
+		 */
+		ci3 = ci2;
+		do {
+			ci3->ci_data.cpu_uvm->pgflbucket = npackage >> shift;
+			ci3 = ci3->ci_sibling[CPUREL_PACKAGE];
+		} while (ci3 != ci2);
+		npackage++;
+		ci2 = ci2->ci_sibling[CPUREL_PEER];
+	} while (ci2 != ci);
+
+	aprint_debug("UVM: using package allocation scheme, "
+	    "%d package(s) per bucket\n", 1 << shift);
+}
+
+/*
  * uvm_cpu_attach: initialize per-CPU data structures.
  */
 
 void
 uvm_cpu_attach(struct cpu_info *ci)
 {
-	struct pgflbucket *bucketarray;
-	struct pgfreelist pgfl;
 	struct uvm_cpu *ucpu;
-	vsize_t bucketcount;
-	int lcv;
 
-	if (CPU_IS_PRIMARY(ci)) {
-		/* Already done in uvm_page_init(). */
-		goto attachrnd;
-	}
-
-	/* Add more reserve pages for this CPU. */
-	uvmexp.reserve_kernel += vm_page_reserve_kernel;
-
-	/* Configure this CPU's free lists. */
-	bucketcount = uvmexp.ncolors * VM_NFREELIST;
-	bucketarray = kmem_alloc(bucketcount * sizeof(struct pgflbucket),
-	    KM_SLEEP);
-	ucpu = kmem_zalloc(sizeof(*ucpu), KM_SLEEP);
-	uvm.cpus[cpu_index(ci)] = ucpu;
-	ci->ci_data.cpu_uvm = ucpu;
-	for (lcv = 0; lcv < VM_NFREELIST; lcv++) {
-		pgfl.pgfl_buckets = (bucketarray + (lcv * uvmexp.ncolors));
-		uvm_page_init_buckets(&pgfl);
-		ucpu->page_free[lcv].pgfl_buckets = pgfl.pgfl_buckets;
+	/* Already done in uvm_page_init(). */
+	if (!CPU_IS_PRIMARY(ci)) {
+		/* Add more reserve pages for this CPU. */
+		uvmexp.reserve_kernel += vm_page_reserve_kernel;
+
+		/* Allocate per-CPU data structures. */
+		ucpu = kmem_zalloc(sizeof(struct uvm_cpu) + coherency_unit - 1,
+		    KM_SLEEP);
+		ucpu = (struct uvm_cpu *)roundup2((uintptr_t)ucpu,
+		    coherency_unit);
+		uvm.cpus[cpu_index(ci)] = ucpu;
+		ci->ci_data.cpu_uvm = ucpu;
 	}
 
-attachrnd:
 	/*
 	 * Attach RNG source for this CPU's VM events
 	 */
@@ -742,101 +990,140 @@ attachrnd:
 			  ci->ci_data.cpu_name, RND_TYPE_VM,
 			  RND_FLAG_COLLECT_TIME|RND_FLAG_COLLECT_VALUE|
 			  RND_FLAG_ESTIMATE_VALUE);
-
 }
 
 /*
- * uvm_free: return total number of free pages in system.
+ * uvm_free: fetch the total amount of free memory in pages.  This can have a
+ * detrimental effect on performance due to false sharing; don't call unless
+ * needed.
  */
 
 int
 uvm_free(void)
 {
+	struct pgfreelist *pgfl;
+	int fl, b, fpages;
 
-	return uvmexp.free;
+	fpages = 0;
+	for (fl = 0; fl < VM_NFREELIST; fl++) {
+		pgfl = &uvm.page_free[fl];
+		for (b = 0; b < uvm.bucketcount; b++) {
+			fpages += pgfl->pgfl_buckets[b]->pgb_nfree;
+		}
+	}
+	return fpages;
 }
 
 /*
- * uvm_pagealloc_pgfl: helper routine for uvm_pagealloc_strat
+ * uvm_pagealloc_pgb: helper routine that tries to allocate any color from a
+ * specific freelist and specific bucket only.
+ *
+ * => must be at IPL_VM or higher to protect per-CPU data structures.
  */
 
 static struct vm_page *
-uvm_pagealloc_pgfl(struct uvm_cpu *ucpu, int flist, int try1, int try2,
-    int *trycolorp)
+uvm_pagealloc_pgb(struct uvm_cpu *ucpu, int f, int b, int *trycolorp, int flags)
 {
-	struct pgflist *freeq;
+	int c, trycolor, colormask;
+	struct pgflbucket *pgb;
 	struct vm_page *pg;
-	int color, trycolor = *trycolorp;
-	struct pgfreelist *gpgfl, *pgfl;
+	kmutex_t *lock;
+
+	/*
+	 * Skip the bucket if empty, no lock needed.  There could be many
+	 * empty freelists/buckets.
+	 */
+	pgb = uvm.page_free[f].pgfl_buckets[b];
+	if (pgb->pgb_nfree == 0) {
+		return NULL;
+	}
 
-	KASSERT(mutex_owned(&uvm_fpageqlock));
+	/* Skip bucket if low on memory. */
+	lock = &uvm_freelist_locks[b].lock;
+	mutex_spin_enter(lock);
+	if (__predict_false(pgb->pgb_nfree <= uvmexp.reserve_kernel)) {
+		if ((flags & UVM_PGA_USERESERVE) == 0 ||
+		    (pgb->pgb_nfree <= uvmexp.reserve_pagedaemon &&
+		     curlwp != uvm.pagedaemon_lwp)) {
+			mutex_spin_exit(lock);
+		     	return NULL;
+		}
+	}
 
-	color = trycolor;
-	pgfl = &ucpu->page_free[flist];
-	gpgfl = &uvm.page_free[flist];
+	/* Try all page colors as needed. */
+	c = trycolor = *trycolorp;
+	colormask = uvmexp.colormask;
 	do {
-		/* cpu, try1 */
-		if ((pg = LIST_FIRST((freeq =
-		    &pgfl->pgfl_buckets[color].pgfl_queues[try1]))) != NULL) {
+		pg = LIST_FIRST(&pgb->pgb_colors[c]);
+		if (__predict_true(pg != NULL)) {
+			/*
+			 * Got a free page!  PG_FREE must be cleared under
+			 * lock because of uvm_pglistalloc().
+			 */
+			LIST_REMOVE(pg, pageq.list);
 			KASSERT(pg->flags & PG_FREE);
-			KASSERT(try1 == PGFL_ZEROS || !(pg->flags & PG_ZERO));
-			KASSERT(try1 == PGFL_UNKNOWN || (pg->flags & PG_ZERO));
-			KASSERT(ucpu == VM_FREE_PAGE_TO_CPU(pg));
-			VM_FREE_PAGE_TO_CPU(pg)->pages[try1]--;
-		    	CPU_COUNT(CPU_COUNT_CPUHIT, 1);
-			goto gotit;
-		}
-		/* global, try1 */
-		if ((pg = LIST_FIRST((freeq =
-		    &gpgfl->pgfl_buckets[color].pgfl_queues[try1]))) != NULL) {
-			KASSERT(pg->flags & PG_FREE);
-			KASSERT(try1 == PGFL_ZEROS || !(pg->flags & PG_ZERO));
-			KASSERT(try1 == PGFL_UNKNOWN || (pg->flags & PG_ZERO));
-			KASSERT(ucpu != VM_FREE_PAGE_TO_CPU(pg));
-			VM_FREE_PAGE_TO_CPU(pg)->pages[try1]--;
-		    	CPU_COUNT(CPU_COUNT_CPUMISS, 1);
-			goto gotit;
-		}
-		/* cpu, try2 */
-		if ((pg = LIST_FIRST((freeq =
-		    &pgfl->pgfl_buckets[color].pgfl_queues[try2]))) != NULL) {
-			KASSERT(pg->flags & PG_FREE);
-			KASSERT(try2 == PGFL_ZEROS || !(pg->flags & PG_ZERO));
-			KASSERT(try2 == PGFL_UNKNOWN || (pg->flags & PG_ZERO));
-			KASSERT(ucpu == VM_FREE_PAGE_TO_CPU(pg));
-			VM_FREE_PAGE_TO_CPU(pg)->pages[try2]--;
-		    	CPU_COUNT(CPU_COUNT_CPUHIT, 1);
-			goto gotit;
-		}
-		/* global, try2 */
-		if ((pg = LIST_FIRST((freeq =
-		    &gpgfl->pgfl_buckets[color].pgfl_queues[try2]))) != NULL) {
-			KASSERT(pg->flags & PG_FREE);
-			KASSERT(try2 == PGFL_ZEROS || !(pg->flags & PG_ZERO));
-			KASSERT(try2 == PGFL_UNKNOWN || (pg->flags & PG_ZERO));
-			KASSERT(ucpu != VM_FREE_PAGE_TO_CPU(pg));
-			VM_FREE_PAGE_TO_CPU(pg)->pages[try2]--;
-		    	CPU_COUNT(CPU_COUNT_CPUMISS, 1);
-			goto gotit;
+			pg->flags &= PG_ZERO;
+			pgb->pgb_nfree--;
+	
+			/*
+			 * While we have the bucket locked and our data
+			 * structures fresh in L1 cache, we have an ideal
+			 * opportunity to grab some pages for the freelist
+			 * cache without causing extra contention.  Only do
+			 * so if we found pages in this CPU's preferred
+			 * bucket.
+			 */
+			if (__predict_true(b == ucpu->pgflbucket)) {
+				uvm_pgflcache_fill(ucpu, f, b, c);
+			}
+			mutex_spin_exit(lock);
+			KASSERT(uvm_page_get_bucket(pg) == b);
+			CPU_COUNT(c == trycolor ?
+			    CPU_COUNT_COLORHIT : CPU_COUNT_COLORMISS, 1);
+			CPU_COUNT(CPU_COUNT_CPUMISS, 1);
+			*trycolorp = c;
+			return pg;
 		}
-		color = (color + 1) & uvmexp.colormask;
-	} while (color != trycolor);
+		c = (c + 1) & colormask;
+	} while (c != trycolor);
+	mutex_spin_exit(lock);
 
-	return (NULL);
+	return NULL;
+}
 
- gotit:
-	LIST_REMOVE(pg, pageq.list);	/* global list */
-	LIST_REMOVE(pg, listq.list);	/* per-cpu list */
-	uvmexp.free--;
-
-	if (color == trycolor)
-	    	CPU_COUNT(CPU_COUNT_COLORHIT, 1);
-	else {
-	    	CPU_COUNT(CPU_COUNT_COLORMISS, 1);
-		*trycolorp = color;
+/*
+ * uvm_pagealloc_pgfl: helper routine for uvm_pagealloc_strat that allocates
+ * any color from any bucket, in a specific freelist.
+ *
+ * => must be at IPL_VM or higher to protect per-CPU data structures.
+ */
+
+static struct vm_page *
+uvm_pagealloc_pgfl(struct uvm_cpu *ucpu, int f, int *trycolorp, int flags)
+{
+	int b, trybucket, bucketcount;
+	struct vm_page *pg;
+
+	/* Try for the exact thing in the per-CPU cache. */
+	if ((pg = uvm_pgflcache_alloc(ucpu, f, *trycolorp)) != NULL) {
+		CPU_COUNT(CPU_COUNT_CPUHIT, 1);
+		CPU_COUNT(CPU_COUNT_COLORHIT, 1);
+		return pg;
 	}
 
-	return (pg);
+	/* Walk through all buckets, trying our preferred bucket first. */
+	trybucket = ucpu->pgflbucket;
+	b = trybucket;
+	bucketcount = uvm.bucketcount;
+	do {
+		pg = uvm_pagealloc_pgb(ucpu, f, b, trycolorp, flags);
+		if (pg != NULL) {
+			return pg;
+		}
+		b = (b + 1 == bucketcount ? 0 : b + 1);
+	} while (b != trybucket);
+
+	return NULL;
 }
 
 /*
@@ -861,8 +1148,8 @@ struct vm_page *
 uvm_pagealloc_strat(struct uvm_object *obj, voff_t off, struct vm_anon *anon,
     int flags, int strat, int free_list)
 {
-	int try1, try2, zeroit = 0, color;
-	int lcv, error;
+	int zeroit = 0, color;
+	int lcv, error, s;
 	struct uvm_cpu *ucpu;
 	struct vm_page *pg;
 	lwp_t *l;
@@ -879,21 +1166,15 @@ uvm_pagealloc_strat(struct uvm_object *o
 	 * algorithm.
 	 */
 
+	s = splvm();
 	ucpu = curcpu()->ci_data.cpu_uvm;
 	if (flags & UVM_FLAG_COLORMATCH) {
 		color = atop(off) & uvmexp.colormask;
 	} else {
-		color = ucpu->page_free_nextcolor;
+		color = ucpu->pgflcolor;
 	}
 
 	/*
-	 * check to see if we need to generate some free pages waking
-	 * the pagedaemon.
-	 */
-
-	uvm_kick_pdaemon();
-
-	/*
 	 * fail if any of these conditions is true:
 	 * [1]  there really are no free pages, or
 	 * [2]  only kernel "reserved" pages remain and
@@ -903,55 +1184,40 @@ uvm_pagealloc_strat(struct uvm_object *o
 	 * we make kernel reserve pages available if called by a
 	 * kernel thread or a realtime thread.
 	 */
-	mutex_spin_enter(&uvm_fpageqlock);
 	l = curlwp;
 	if (__predict_true(l != NULL) && lwp_eprio(l) >= PRI_KTHREAD) {
 		flags |= UVM_PGA_USERESERVE;
 	}
-	if ((uvmexp.free <= uvmexp.reserve_kernel &&
-	    (flags & UVM_PGA_USERESERVE) == 0) ||
-	    (uvmexp.free <= uvmexp.reserve_pagedaemon &&
-	     curlwp != uvm.pagedaemon_lwp))
-		goto fail;
-
-#if PGFL_NQUEUES != 2
-#error uvm_pagealloc_strat needs to be updated
-#endif
 
-	/*
-	 * If we want a zero'd page, try the ZEROS queue first, otherwise
-	 * we try the UNKNOWN queue first.
-	 */
-	if (flags & UVM_PGA_ZERO) {
-		try1 = PGFL_ZEROS;
-		try2 = PGFL_UNKNOWN;
-	} else {
-		try1 = PGFL_UNKNOWN;
-		try2 = PGFL_ZEROS;
+	/* If the allocator's running in NUMA mode, go with NUMA strategy. */
+	if (uvm.numa_alloc && strat == UVM_PGA_STRAT_NORMAL) {
+		strat = UVM_PGA_STRAT_NUMA;
 	}
 
  again:
 	switch (strat) {
 	case UVM_PGA_STRAT_NORMAL:
-		/* Check freelists: descending priority (ascending id) order */
+		/* Check freelists: descending priority (ascending id) order. */
 		for (lcv = 0; lcv < VM_NFREELIST; lcv++) {
-			pg = uvm_pagealloc_pgfl(ucpu, lcv,
-			    try1, try2, &color);
-			if (pg != NULL)
+			pg = uvm_pagealloc_pgfl(ucpu, lcv, &color, flags);
+			if (pg != NULL) {
 				goto gotit;
+			}
 		}
 
-		/* No pages free! */
-		goto fail;
+		/* No pages free!  Have pagedaemon free some memory. */
+		splx(s);
+		uvm_kick_pdaemon();
+		return NULL;
 
 	case UVM_PGA_STRAT_ONLY:
 	case UVM_PGA_STRAT_FALLBACK:
 		/* Attempt to allocate from the specified free list. */
 		KASSERT(free_list >= 0 && free_list < VM_NFREELIST);
-		pg = uvm_pagealloc_pgfl(ucpu, free_list,
-		    try1, try2, &color);
-		if (pg != NULL)
+		pg = uvm_pagealloc_pgfl(ucpu, free_list, &color, flags);
+		if (pg != NULL) {
 			goto gotit;
+		}
 
 		/* Fall back, if possible. */
 		if (strat == UVM_PGA_STRAT_FALLBACK) {
@@ -959,8 +1225,33 @@ uvm_pagealloc_strat(struct uvm_object *o
 			goto again;
 		}
 
-		/* No pages free! */
-		goto fail;
+		/* No pages free!  Have pagedaemon free some memory. */
+		splx(s);
+		uvm_kick_pdaemon();
+		return NULL;
+
+	case UVM_PGA_STRAT_NUMA:
+		/*
+		 * NUMA strategy: allocating from the correct bucket is more
+		 * important than observing freelist priority.  Look only to
+		 * the current NUMA node; if that fails, we need to look to
+		 * other NUMA nodes, so retry with the normal strategy.
+		 */
+		for (lcv = 0; lcv < VM_NFREELIST; lcv++) {
+			pg = uvm_pgflcache_alloc(ucpu, lcv, color);
+			if (pg != NULL) {
+				CPU_COUNT(CPU_COUNT_CPUHIT, 1);
+				CPU_COUNT(CPU_COUNT_COLORHIT, 1);
+				goto gotit;
+			}
+			pg = uvm_pagealloc_pgb(ucpu, lcv,
+			    ucpu->pgflbucket, &color, flags);
+			if (pg != NULL) {
+				goto gotit;
+			}
+		}
+		strat = UVM_PGA_STRAT_NORMAL;
+		goto again;
 
 	default:
 		panic("uvm_pagealloc_strat: bad strat %d", strat);
@@ -973,11 +1264,11 @@ uvm_pagealloc_strat(struct uvm_object *o
 	 * the next color accordingly.
 	 */
 
-	ucpu->page_free_nextcolor = (color + 1) & uvmexp.colormask;
+	ucpu->pgflcolor = (color + 1) & uvmexp.colormask;
 
 	/*
-	 * update allocation statistics and remember if we have to
-	 * zero the page
+	 * while still at IPL_VM, update allocation statistics and remember
+	 * if we have to zero the page
 	 */
 
 	if (flags & UVM_PGA_ZERO) {
@@ -988,9 +1279,6 @@ uvm_pagealloc_strat(struct uvm_object *o
 		    	CPU_COUNT(CPU_COUNT_PGA_ZEROMISS, 1);
 			zeroit = 1;
 		}
-		if (ucpu->pages[PGFL_ZEROS] < ucpu->pages[PGFL_UNKNOWN]) {
-			ucpu->page_idle_zero = vm_page_zero_enable;
-		}
 	}
 	if (pg->flags & PG_ZERO) {
 	    	CPU_COUNT(CPU_COUNT_ZEROPAGES, -1);
@@ -998,12 +1286,9 @@ uvm_pagealloc_strat(struct uvm_object *o
 	if (anon) {
 		CPU_COUNT(CPU_COUNT_ANONPAGES, 1);
 	}
+	splx(s);
 	KASSERT((pg->flags & ~(PG_ZERO|PG_FREE)) == 0);
 
-	/* mark the page as allocated and then drop uvm_fpageqlock. */
-	pg->flags &= ~PG_FREE;
-	mutex_spin_exit(&uvm_fpageqlock);
-
 	/*
 	 * assign the page to the object.  as the page was free, we know
 	 * that pg->uobject and pg->uanon are NULL.  we only need to take
@@ -1050,10 +1335,6 @@ uvm_pagealloc_strat(struct uvm_object *o
 	}
 
 	return(pg);
-
- fail:
-	mutex_spin_exit(&uvm_fpageqlock);
-	return (NULL);
 }
 
 /*
@@ -1133,7 +1414,6 @@ uvm_pagezerocheck(struct vm_page *pg)
 	int *p, *ep;
 
 	KASSERT(uvm_zerocheckkva != 0);
-	KASSERT(mutex_owned(&uvm_fpageqlock));
 
 	/*
 	 * XXX assuming pmap_kenter_pa and pmap_kremove never call
@@ -1170,10 +1450,12 @@ uvm_pagezerocheck(struct vm_page *pg)
 void
 uvm_pagefree(struct vm_page *pg)
 {
-	struct pgflist *pgfl;
+	struct pgfreelist *pgfl;
+	struct pgflbucket *pgb;
 	struct uvm_cpu *ucpu;
-	int index, color, queue;
-	bool iszero, locked;
+	kmutex_t *lock;
+	int bucket, s;
+	bool locked;
 
 #ifdef DEBUG
 	if (pg->uobject == (void *)0xdeadbeef &&
@@ -1184,7 +1466,6 @@ uvm_pagefree(struct vm_page *pg)
 
 	KASSERT((pg->flags & PG_PAGEOUT) == 0);
 	KASSERT(!(pg->flags & PG_FREE));
-	//KASSERT(mutex_owned(&uvm_pageqlock) || !uvmpdpol_pageisqueued_p(pg));
 	KASSERT(pg->uobject == NULL || mutex_owned(pg->uobject->vmobjlock));
 	KASSERT(pg->uobject != NULL || pg->uanon == NULL ||
 		mutex_owned(pg->uanon->an_lock));
@@ -1285,44 +1566,46 @@ uvm_pagefree(struct vm_page *pg)
 	 * and put on free queue
 	 */
 
-	iszero = (pg->flags & PG_ZERO);
-	index = uvm_page_get_freelist(pg);
-	color = VM_PGCOLOR(pg);
-	queue = (iszero ? PGFL_ZEROS : PGFL_UNKNOWN);
-
 #ifdef DEBUG
 	pg->uobject = (void *)0xdeadbeef;
 	pg->uanon = (void *)0xdeadbeef;
-#endif
-
-	mutex_spin_enter(&uvm_fpageqlock);
-	pg->flags = PG_FREE;
-
-#ifdef DEBUG
-	if (iszero)
+	if (pg->flags & PG_ZERO)
 		uvm_pagezerocheck(pg);
 #endif /* DEBUG */
 
+	s = splvm();
+	ucpu = curcpu()->ci_data.cpu_uvm;
 
-	/* global list */
-	pgfl = &uvm.page_free[index].pgfl_buckets[color].pgfl_queues[queue];
-	LIST_INSERT_HEAD(pgfl, pg, pageq.list);
-	uvmexp.free++;
-	if (iszero) {
-	    	CPU_COUNT(CPU_COUNT_ZEROPAGES, 1);
+	/*
+	 * If we're using the NUMA strategy, we'll only cache this page if
+	 * it came from the local CPU's NUMA node.  Otherwise we're using
+	 * the L2/L3 cache locality strategy and we'll cache anything.
+	 */
+	if (uvm.numa_alloc) {
+		bucket = uvm_page_get_bucket(pg);
+	} else {
+		bucket = ucpu->pgflbucket;
+		uvm_page_set_bucket(pg, bucket);
 	}
 
-	/* per-cpu list */
-	ucpu = curcpu()->ci_data.cpu_uvm;
-	pg->offset = (uintptr_t)ucpu;
-	pgfl = &ucpu->page_free[index].pgfl_buckets[color].pgfl_queues[queue];
-	LIST_INSERT_HEAD(pgfl, pg, listq.list);
-	ucpu->pages[queue]++;
-	if (ucpu->pages[PGFL_ZEROS] < ucpu->pages[PGFL_UNKNOWN]) {
-		ucpu->page_idle_zero = vm_page_zero_enable;
+	/* Try to send the page to the per-CPU cache. */
+	if (bucket == ucpu->pgflbucket && uvm_pgflcache_free(ucpu, pg)) {
+		splx(s);
+		return;
 	}
 
-	mutex_spin_exit(&uvm_fpageqlock);
+	/* Didn't work.  Never mind, send it to a global bucket. */
+	pgfl = &uvm.page_free[uvm_page_get_freelist(pg)];
+	pgb = pgfl->pgfl_buckets[bucket];
+	lock = &uvm_freelist_locks[bucket].lock;
+
+	mutex_spin_enter(lock);
+	/* PG_FREE must be set under lock because of uvm_pglistalloc(). */
+	pg->flags = (pg->flags & PG_ZERO) | PG_FREE;
+	LIST_INSERT_HEAD(&pgb->pgb_colors[VM_PGCOLOR(pg)], pg, pageq.list);
+	pgb->pgb_nfree++;
+	mutex_spin_exit(lock);
+	splx(s);
 }
 
 /*
@@ -1411,116 +1694,22 @@ uvm_page_own(struct vm_page *pg, const c
 		    "page (%p)\n", pg);
 		panic("uvm_page_own");
 	}
-	if (!uvmpdpol_pageisqueued_p(pg)) {
-		KASSERT((pg->uanon == NULL && pg->uobject == NULL) ||
-		    pg->wire_count > 0);
-	} else {
-		KASSERT(pg->wire_count == 0);
-	}
 	pg->owner_tag = NULL;
 }
 #endif
 
 /*
  * uvm_pageidlezero: zero free pages while the system is idle.
- *
- * => try to complete one color bucket at a time, to reduce our impact
- *	on the CPU cache.
- * => we loop until we either reach the target or there is a lwp ready
- *      to run, or MD code detects a reason to break early.
  */
 void
 uvm_pageidlezero(void)
 {
-	struct vm_page *pg;
-	struct pgfreelist *pgfl, *gpgfl;
-	struct uvm_cpu *ucpu;
-	int free_list, firstbucket, nextbucket;
-	bool lcont = false;
 
-	ucpu = curcpu()->ci_data.cpu_uvm;
-	if (!ucpu->page_idle_zero ||
-	    ucpu->pages[PGFL_UNKNOWN] < uvmexp.ncolors) {
-	    	ucpu->page_idle_zero = false;
-		return;
-	}
-	if (!mutex_tryenter(&uvm_fpageqlock)) {
-		/* Contention: let other CPUs to use the lock. */
-		return;
-	}
-	firstbucket = ucpu->page_free_nextcolor;
-	nextbucket = firstbucket;
-	do {
-		for (free_list = 0; free_list < VM_NFREELIST; free_list++) {
-			if (sched_curcpu_runnable_p()) {
-				goto quit;
-			}
-			pgfl = &ucpu->page_free[free_list];
-			gpgfl = &uvm.page_free[free_list];
-			while ((pg = LIST_FIRST(&pgfl->pgfl_buckets[
-			    nextbucket].pgfl_queues[PGFL_UNKNOWN])) != NULL) {
-				if (lcont || sched_curcpu_runnable_p()) {
-					goto quit;
-				}
-				LIST_REMOVE(pg, pageq.list); /* global list */
-				LIST_REMOVE(pg, listq.list); /* per-cpu list */
-				ucpu->pages[PGFL_UNKNOWN]--;
-				uvmexp.free--;
-				KASSERT(pg->flags == PG_FREE);
-				pg->flags = 0;
-				mutex_spin_exit(&uvm_fpageqlock);
-#ifdef PMAP_PAGEIDLEZERO
-				if (!PMAP_PAGEIDLEZERO(VM_PAGE_TO_PHYS(pg))) {
-
-					/*
-					 * The machine-dependent code detected
-					 * some reason for us to abort zeroing
-					 * pages, probably because there is a
-					 * process now ready to run.
-					 */
-
-					mutex_spin_enter(&uvm_fpageqlock);
-					pg->flags = PG_FREE;
-					LIST_INSERT_HEAD(&gpgfl->pgfl_buckets[
-					    nextbucket].pgfl_queues[
-					    PGFL_UNKNOWN], pg, pageq.list);
-					LIST_INSERT_HEAD(&pgfl->pgfl_buckets[
-					    nextbucket].pgfl_queues[
-					    PGFL_UNKNOWN], pg, listq.list);
-					ucpu->pages[PGFL_UNKNOWN]++;
-					uvmexp.free++;
-				    	uvmexp.zeroaborts++;
-					goto quit;
-				}
-#else
-				pmap_zero_page(VM_PAGE_TO_PHYS(pg));
-#endif /* PMAP_PAGEIDLEZERO */
-				if (!mutex_tryenter(&uvm_fpageqlock)) {
-					lcont = true;
-					mutex_spin_enter(&uvm_fpageqlock);
-				} else {
-					lcont = false;
-				}
-				pg->flags = PG_FREE | PG_ZERO;
-				LIST_INSERT_HEAD(&gpgfl->pgfl_buckets[
-				    nextbucket].pgfl_queues[PGFL_ZEROS],
-				    pg, pageq.list);
-				LIST_INSERT_HEAD(&pgfl->pgfl_buckets[
-				    nextbucket].pgfl_queues[PGFL_ZEROS],
-				    pg, listq.list);
-				ucpu->pages[PGFL_ZEROS]++;
-				uvmexp.free++;
-			    	CPU_COUNT(CPU_COUNT_ZEROPAGES, 1);
-			}
-		}
-		if (ucpu->pages[PGFL_UNKNOWN] < uvmexp.ncolors) {
-			break;
-		}
-		nextbucket = (nextbucket + 1) & uvmexp.colormask;
-	} while (nextbucket != firstbucket);
-	ucpu->page_idle_zero = false;
- quit:
-	mutex_spin_exit(&uvm_fpageqlock);
+	/*
+	 * Disabled for the moment.  Previous strategy too cache heavy.  In
+	 * the future we may experiment with zeroing the pages held in the
+	 * per-CPU cache (uvm_pgflcache).
+	 */
 }
 
 /*
@@ -1800,6 +1989,7 @@ uvm_page_printit(struct vm_page *pg, boo
 {
 	struct vm_page *tpg;
 	struct uvm_object *uobj;
+	struct pgflbucket *pgb;
 	struct pgflist *pgl;
 	char pgbuf[128];
 
@@ -1848,14 +2038,9 @@ uvm_page_printit(struct vm_page *pg, boo
 	/* cross-verify page queue */
 	if (pg->flags & PG_FREE) {
 		int fl = uvm_page_get_freelist(pg);
-		int color = VM_PGCOLOR(pg);
-		pgl = &uvm.page_free[fl].pgfl_buckets[color].pgfl_queues[
-		    ((pg)->flags & PG_ZERO) ? PGFL_ZEROS : PGFL_UNKNOWN];
-	} else {
-		pgl = NULL;
-	}
-
-	if (pgl) {
+		int b = uvm_page_get_bucket(pg);
+		pgb = uvm.page_free[fl].pgfl_buckets[b];
+		pgl = &pgb->pgb_colors[VM_PGCOLOR(pg)];
 		(*pr)("  checking pageq list\n");
 		LIST_FOREACH(tpg, pgl, pageq.list) {
 			if (tpg == pg) {
@@ -1905,4 +2090,36 @@ uvm_page_printall(void (*pr)(const char 
 	}
 }
 
+/*
+ * uvm_page_print_freelists - print a summary of the freelists
+ */
+
+void
+uvm_page_print_freelists(void (*pr)(const char *, ...))
+{
+	struct pgfreelist *pgfl;
+	struct pgflbucket *pgb;
+	int fl, b, c;
+
+	(*pr)("There are %d freelists with %d buckets of %d colors.\n\n",
+	    VM_NFREELIST, uvm.bucketcount, uvmexp.ncolors);
+	    
+	for (fl = 0; fl < VM_NFREELIST; fl++) {
+		pgfl = &uvm.page_free[fl];
+		(*pr)("freelist(%d) @ %p\n", fl, pgfl);
+		for (b = 0; b < uvm.bucketcount; b++) {
+			pgb = uvm.page_free[fl].pgfl_buckets[b];
+			(*pr)("    bucket(%d) @ %p, nfree = %d, lock @ %p:\n",
+			    b, pgb, pgb->pgb_nfree,
+			    &uvm_freelist_locks[b].lock);
+			for (c = 0; c < uvmexp.ncolors; c++) {
+				(*pr)("        color(%d) @ %p, ", c,
+				    &pgb->pgb_colors[c]);
+				(*pr)("first page = %p\n",
+				    LIST_FIRST(&pgb->pgb_colors[c]));
+			}
+		}
+	}
+}
+
 #endif /* DDB || DEBUGPRINT */

Index: src/sys/uvm/uvm_page.h
diff -u src/sys/uvm/uvm_page.h:1.88 src/sys/uvm/uvm_page.h:1.89
--- src/sys/uvm/uvm_page.h:1.88	Sat Dec 21 14:41:44 2019
+++ src/sys/uvm/uvm_page.h	Fri Dec 27 12:51:57 2019
@@ -1,4 +1,4 @@
-/*	$NetBSD: uvm_page.h,v 1.88 2019/12/21 14:41:44 ad Exp $	*/
+/*	$NetBSD: uvm_page.h,v 1.89 2019/12/27 12:51:57 ad Exp $	*/
 
 /*
  * Copyright (c) 1997 Charles D. Cranor and Washington University.
@@ -119,7 +119,6 @@
  *
  * o free
  *   => pageq.list is entry on global free page queue
- *   => listq.list is entry on per-CPU free page queue
  *   => uanon is unused (or (void *)0xdeadbeef for DEBUG)
  *   => uobject is unused (or (void *)0xdeadbeef for DEBUG)
  *   => PG_FREE is set in flags
@@ -129,13 +128,11 @@
  *   => uobject is owner
  * o owned by a vm_anon
  *   => pageq is unused (XXX correct?)
- *   => listq is unused (XXX correct?)
  *   => uanon is owner
  *   => uobject is NULL
  *   => PG_ANON is set in flags
  * o allocated by uvm_pglistalloc
  *   => pageq.queue is entry on resulting pglist, owned by caller
- *   => listq is unused (XXX correct?)
  *   => uanon is unused
  *   => uobject is unused
  *
@@ -153,11 +150,6 @@ struct vm_page {
 						 * or uvm_pglistalloc output */
 		LIST_ENTRY(vm_page) list;	/* f: global free page queue */
 	} pageq;
-
-	union {
-		LIST_ENTRY(vm_page) list;	/* f: CPU free page queue */
-	} listq;
-
 	struct vm_anon		*uanon;		/* o,i: anon */
 	struct uvm_object	*uobject;	/* o,i: object */
 	voff_t			offset;		/* o: offset into object */
@@ -302,6 +294,7 @@ void uvm_page_own(struct vm_page *, cons
 bool uvm_page_physget(paddr_t *);
 #endif
 void uvm_page_recolor(int);
+void uvm_page_rebucket(void);
 void uvm_pageidlezero(void);
 
 void uvm_pageactivate(struct vm_page *);
@@ -318,6 +311,8 @@ void uvm_pagewire(struct vm_page *);
 void uvm_pagezero(struct vm_page *);
 bool uvm_pageismanaged(paddr_t);
 bool uvm_page_locked_p(struct vm_page *);
+void uvm_pgfl_lock(void);
+void uvm_pgfl_unlock(void);
 
 int uvm_page_lookup_freelist(struct vm_page *);
 
@@ -348,8 +343,12 @@ int uvm_direct_process(struct vm_page **
 #define	VM_PGCOLOR(pg) \
 	(atop(VM_PAGE_TO_PHYS((pg))) & uvmexp.colormask)
 #define	PHYS_TO_VM_PAGE(pa)	uvm_phys_to_vm_page(pa)
+
+/*
+ * VM_PAGE_IS_FREE() can't tell if the page is on global free list, or a
+ * per-CPU cache.  If you need to be certain, pause caching.
+ */
 #define VM_PAGE_IS_FREE(entry)  ((entry)->flags & PG_FREE)
-#define	VM_FREE_PAGE_TO_CPU(pg)	((struct uvm_cpu *)((uintptr_t)pg->offset))
 
 /*
 * Use the lower 10 bits of pg->phys_addr to cache some locators for

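The VM_PAGE_IS_FREE() caveat above is worth spelling out: with the per-CPU
cache enabled, PG_FREE can be set on a page that is sitting in a CPU's
pgflcache rather than on a global freelist.  A hedged sketch of the "be
certain" path the comment suggests (the helper name is illustrative and not
part of the commit):

#include <uvm/uvm.h>
#include <uvm/uvm_pgflcache.h>

/*
 * Illustrative only: decide whether a page is free and on the global
 * freelists, by pausing the per-CPU caches and taking the bucket locks.
 */
static bool
page_on_global_freelist(struct vm_page *pg)
{
	bool rv;

	uvm_pgflcache_pause();		/* spill per-CPU caches */
	uvm_pgfl_lock();		/* lock every bucket */
	rv = VM_PAGE_IS_FREE(pg);
	uvm_pgfl_unlock();
	uvm_pgflcache_resume();
	return rv;
}
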
Index: src/sys/uvm/uvm_pglist.c
diff -u src/sys/uvm/uvm_pglist.c:1.77 src/sys/uvm/uvm_pglist.c:1.78
--- src/sys/uvm/uvm_pglist.c:1.77	Sat Dec 21 14:50:34 2019
+++ src/sys/uvm/uvm_pglist.c	Fri Dec 27 12:51:57 2019
@@ -1,12 +1,12 @@
-/*	$NetBSD: uvm_pglist.c,v 1.77 2019/12/21 14:50:34 ad Exp $	*/
+/*	$NetBSD: uvm_pglist.c,v 1.78 2019/12/27 12:51:57 ad Exp $	*/
 
 /*-
- * Copyright (c) 1997 The NetBSD Foundation, Inc.
+ * Copyright (c) 1997, 2019 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
  * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
- * NASA Ames Research Center.
+ * NASA Ames Research Center, and by Andrew Doran.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -35,13 +35,14 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: uvm_pglist.c,v 1.77 2019/12/21 14:50:34 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: uvm_pglist.c,v 1.78 2019/12/27 12:51:57 ad Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 
 #include <uvm/uvm.h>
 #include <uvm/uvm_pdpolicy.h>
+#include <uvm/uvm_pgflcache.h>
 
 #ifdef VM_PAGE_ALLOC_MEMORY_STATS
 #define	STAT_INCR(v)	(v)++
@@ -79,34 +80,25 @@ u_long	uvm_pglistalloc_npages;
 static void
 uvm_pglist_add(struct vm_page *pg, struct pglist *rlist)
 {
-	int free_list __unused, color __unused, pgflidx;
+	struct pgfreelist *pgfl;
+	struct pgflbucket *pgb;
 
-	KASSERT(mutex_owned(&uvm_fpageqlock));
+	pgfl = &uvm.page_free[uvm_page_get_freelist(pg)];
+	pgb = pgfl->pgfl_buckets[uvm_page_get_bucket(pg)];
 
-#if PGFL_NQUEUES != 2
-#error uvm_pglistalloc needs to be updated
-#endif
-
-	free_list = uvm_page_get_freelist(pg);
-	color = VM_PGCOLOR(pg);
-	pgflidx = (pg->flags & PG_ZERO) ? PGFL_ZEROS : PGFL_UNKNOWN;
 #ifdef UVMDEBUG
 	struct vm_page *tp;
-	LIST_FOREACH(tp,
-	    &uvm.page_free[free_list].pgfl_buckets[color].pgfl_queues[pgflidx],
-	    pageq.list) {
+	LIST_FOREACH(tp, &pgb->pgb_colors[VM_PGCOLOR(pg)], pageq.list) {
 		if (tp == pg)
 			break;
 	}
 	if (tp == NULL)
 		panic("uvm_pglistalloc: page not on freelist");
 #endif
-	LIST_REMOVE(pg, pageq.list);	/* global */
-	LIST_REMOVE(pg, listq.list);	/* cpu */
-	uvmexp.free--;
+	LIST_REMOVE(pg, pageq.list);
+	pgb->pgb_nfree--;
 	if (pg->flags & PG_ZERO)
 		CPU_COUNT(CPU_COUNT_ZEROPAGES, -1);
-	VM_FREE_PAGE_TO_CPU(pg)->pages[pgflidx]--;
 	pg->flags = PG_CLEAN;
 	pg->uobject = NULL;
 	pg->uanon = NULL;
@@ -129,8 +121,6 @@ uvm_pglistalloc_c_ps(uvm_physseg_t psi, 
 	printf("pgalloc: contig %d pgs from psi %zd\n", num, ps - vm_physmem);
 #endif
 
-	KASSERT(mutex_owned(&uvm_fpageqlock));
-
 	low = atop(low);
 	high = atop(high);
 	alignment = atop(alignment);
@@ -316,7 +306,7 @@ uvm_pglistalloc_contig(int num, paddr_t 
 	/*
 	 * Block all memory allocation and lock the free list.
 	 */
-	mutex_spin_enter(&uvm_fpageqlock);
+	uvm_pgfl_lock();
 
 	/* Are there even any free pages? */
 	if (uvm_free() <= (uvmexp.reserve_pagedaemon + uvmexp.reserve_kernel))
@@ -352,7 +342,7 @@ out:
 	 * the pagedaemon.
 	 */
 
-	mutex_spin_exit(&uvm_fpageqlock);
+	uvm_pgfl_unlock();
 	uvm_kick_pdaemon();
 	return (error);
 }
@@ -368,7 +358,6 @@ uvm_pglistalloc_s_ps(uvm_physseg_t psi, 
 	printf("pgalloc: simple %d pgs from psi %zd\n", num, psi);
 #endif
 
-	KASSERT(mutex_owned(&uvm_fpageqlock));
 	KASSERT(uvm_physseg_get_start(psi) <= uvm_physseg_get_avail_start(psi));
 	KASSERT(uvm_physseg_get_start(psi) <= uvm_physseg_get_avail_end(psi));
 	KASSERT(uvm_physseg_get_avail_start(psi) <= uvm_physseg_get_end(psi));
@@ -461,7 +450,7 @@ again:
 	/*
 	 * Block all memory allocation and lock the free list.
 	 */
-	mutex_spin_enter(&uvm_fpageqlock);
+	uvm_pgfl_lock();
 	count++;
 
 	/* Are there even any free pages? */
@@ -493,7 +482,7 @@ out:
 	 * the pagedaemon.
 	 */
 
-	mutex_spin_exit(&uvm_fpageqlock);
+	uvm_pgfl_unlock();
 	uvm_kick_pdaemon();
 
 	if (error) {
@@ -539,6 +528,12 @@ uvm_pglistalloc(psize_t size, paddr_t lo
 
 	TAILQ_INIT(rlist);
 
+	/*
+	 * Turn off the caching of free pages - we need everything to be on
+	 * the global freelists.
+	 */
+	uvm_pgflcache_pause();
+
 	if ((nsegs < size >> PAGE_SHIFT) || (alignment != PAGE_SIZE) ||
 	    (boundary != 0))
 		res = uvm_pglistalloc_contig(num, low, high, alignment,
@@ -546,6 +541,8 @@ uvm_pglistalloc(psize_t size, paddr_t lo
 	else
 		res = uvm_pglistalloc_simple(num, low, high, rlist, waitok);
 
+	uvm_pgflcache_resume();
+
 	return (res);
 }
 
@@ -558,45 +555,34 @@ uvm_pglistalloc(psize_t size, paddr_t lo
 void
 uvm_pglistfree(struct pglist *list)
 {
-	struct uvm_cpu *ucpu;
+	struct pgfreelist *pgfl;
+	struct pgflbucket *pgb;
 	struct vm_page *pg;
-	int index, color, queue;
-	bool iszero;
+	int c, b;
 
 	/*
 	 * Lock the free list and free each page.
 	 */
 
-	mutex_spin_enter(&uvm_fpageqlock);
-	ucpu = curcpu()->ci_data.cpu_uvm;
+	uvm_pgfl_lock();
 	while ((pg = TAILQ_FIRST(list)) != NULL) {
-		KASSERT(!uvmpdpol_pageisqueued_p(pg));
 		TAILQ_REMOVE(list, pg, pageq.queue);
-		iszero = (pg->flags & PG_ZERO);
 		pg->flags = (pg->flags & PG_ZERO) | PG_FREE;
 #ifdef DEBUG
 		pg->uobject = (void *)0xdeadbeef;
 		pg->uanon = (void *)0xdeadbeef;
-#endif /* DEBUG */
-#ifdef DEBUG
-		if (iszero)
+		if (pg->flags & PG_ZERO)
 			uvm_pagezerocheck(pg);
 #endif /* DEBUG */
-		index = uvm_page_get_freelist(pg);
-		color = VM_PGCOLOR(pg);
-		queue = iszero ? PGFL_ZEROS : PGFL_UNKNOWN;
-		pg->offset = (uintptr_t)ucpu;
-		LIST_INSERT_HEAD(&uvm.page_free[index].pgfl_buckets[color].
-		    pgfl_queues[queue], pg, pageq.list);
-		LIST_INSERT_HEAD(&ucpu->page_free[index].pgfl_buckets[color].
-		    pgfl_queues[queue], pg, listq.list);
-		uvmexp.free++;
-		if (iszero)
+		c = VM_PGCOLOR(pg);
+		b = uvm_page_get_bucket(pg);
+		pgfl = &uvm.page_free[uvm_page_get_freelist(pg)];
+		pgb = pgfl->pgfl_buckets[b];
+		if (pg->flags & PG_ZERO)
 			CPU_COUNT(CPU_COUNT_ZEROPAGES, 1);
-		ucpu->pages[queue]++;
+		pgb->pgb_nfree++;
+		LIST_INSERT_HEAD(&pgb->pgb_colors[c], pg, pageq.list);
 		STAT_DECR(uvm_pglistalloc_npages);
 	}
-	if (ucpu->pages[PGFL_ZEROS] < ucpu->pages[PGFL_UNKNOWN])
-		ucpu->page_idle_zero = vm_page_zero_enable;
-	mutex_spin_exit(&uvm_fpageqlock);
+	uvm_pgfl_unlock();
 }

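The common thread in the uvm_pglist.c changes is the new indexing path: a
page's freelist, bucket and colour together identify a single struct pgflist,
and the bucket index also selects the lock.  A rough sketch of that lookup,
with a hypothetical helper name; it mirrors what uvm_pglist_add() and
uvm_pglistfree() now do inline:

#include <sys/mutex.h>
#include <uvm/uvm.h>

/*
 * Sketch only: find the global pglist and the lock covering a page.
 * The bucket lock is shared by same-numbered buckets of all freelists.
 */
static struct pgflist *
page_to_pglist(struct vm_page *pg, kmutex_t **lockp)
{
	struct pgflbucket *pgb;
	int fl, b, c;

	fl = uvm_page_get_freelist(pg);
	b = uvm_page_get_bucket(pg);
	c = VM_PGCOLOR(pg);
	pgb = uvm.page_free[fl].pgfl_buckets[b];
	*lockp = &uvm_freelist_locks[b].lock;
	return &pgb->pgb_colors[c];
}
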
Index: src/sys/uvm/uvm_pglist.h
diff -u src/sys/uvm/uvm_pglist.h:1.8 src/sys/uvm/uvm_pglist.h:1.9
--- src/sys/uvm/uvm_pglist.h:1.8	Sat Nov  6 15:48:00 2010
+++ src/sys/uvm/uvm_pglist.h	Fri Dec 27 12:51:57 2019
@@ -1,11 +1,11 @@
-/*	$NetBSD: uvm_pglist.h,v 1.8 2010/11/06 15:48:00 uebayasi Exp $	*/
+/*	$NetBSD: uvm_pglist.h,v 1.9 2019/12/27 12:51:57 ad Exp $	*/
 
 /*-
- * Copyright (c) 2000, 2001, 2008 The NetBSD Foundation, Inc.
+ * Copyright (c) 2000, 2001, 2008, 2019 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
- * by Jason R. Thorpe.
+ * by Jason R. Thorpe, and by Andrew Doran.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -41,19 +41,51 @@ TAILQ_HEAD(pglist, vm_page);
 LIST_HEAD(pgflist, vm_page);
 
 /*
- * A page free list consists of free pages of unknown contents and free
- * pages of all zeros.
+ * The global uvm.page_free list (uvm_page.c, uvm_pglist.c).  Free pages are
+ * stored according to freelist, bucket, and cache colour.
+ *
+ * pglist = &uvm.page_free[freelist].pgfl_buckets[bucket]->pgb_colors[color];
+ *
+ * Freelists provide a priority ordering of pages for allocation, based upon
+ * how valuable they are for special uses (e.g. device driver DMA).
+ *
+ * Pages are then grouped in buckets according to some common factor, for
+ * example L2/L3 cache locality.  Each bucket has its own lock, and the
+ * locks are shared among freelists for the same numbered buckets.
+ *
+ * Inside each bucket, pages are further distributed by cache color.
+ *
+ * We want these data structures to occupy as few cache lines as possible,
+ * as they will be highly contended.
  */
-#define	PGFL_UNKNOWN	0
-#define	PGFL_ZEROS	1
-#define	PGFL_NQUEUES	2
-
 struct pgflbucket {
-	struct pgflist pgfl_queues[PGFL_NQUEUES];
+	uintptr_t	pgb_nfree;	/* total # free pages, all colors */
+	struct pgflist	pgb_colors[1];	/* variable size array */
 };
 
+/*
+ * At the root, the freelists.  MD code decides the number and structure of
+ * these.  They are always arranged in descending order of allocation
+ * priority.
+ *
+ * 8 buckets should be enough to cover almost all current x86 systems (2019),
+ * given the way package/core/smt IDs are structured on x86.  For systems
+ * that report high package counts despite having a single physical CPU
+ * package (e.g. Ampere eMAG) a little bit of sharing isn't going to hurt
+ * in the least.
+ */
+#define	PGFL_MAX_BUCKETS	8
 struct pgfreelist {
-	struct pgflbucket *pgfl_buckets;
+	struct pgflbucket	*pgfl_buckets[PGFL_MAX_BUCKETS];
+};
+
+/*
+ * Lock for each bucket.
+ */
+union uvm_freelist_lock {
+        kmutex_t        lock;
+        uint8_t         padding[COHERENCY_UNIT];
 };
+extern union uvm_freelist_lock	uvm_freelist_locks[PGFL_MAX_BUCKETS];
 
 #endif /* _UVM_UVM_PGLIST_H_ */

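Because pgb_colors[] is declared with a single element and really sized at run
time, buckets must be allocated with room for uvmexp.ncolors lists.  A minimal
sketch of that sizing, assuming a kmem-backed allocation (the committed setup
is in uvm_page.c, which carves the buckets out differently and is not quoted
here):

#include <sys/kmem.h>
#include <uvm/uvm.h>

/*
 * Sketch only: allocate and initialize one variable-length bucket.
 */
static struct pgflbucket *
bucket_alloc(void)
{
	struct pgflbucket *pgb;
	size_t sz;
	int c;

	sz = offsetof(struct pgflbucket, pgb_colors[uvmexp.ncolors]);
	pgb = kmem_zalloc(sz, KM_SLEEP);
	pgb->pgb_nfree = 0;
	for (c = 0; c < uvmexp.ncolors; c++)
		LIST_INIT(&pgb->pgb_colors[c]);
	return pgb;
}
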
Added files:

Index: src/sys/uvm/uvm_pgflcache.c
diff -u /dev/null src/sys/uvm/uvm_pgflcache.c:1.1
--- /dev/null	Fri Dec 27 12:51:57 2019
+++ src/sys/uvm/uvm_pgflcache.c	Fri Dec 27 12:51:57 2019
@@ -0,0 +1,471 @@
+/*	$NetBSD: uvm_pgflcache.c,v 1.1 2019/12/27 12:51:57 ad Exp $	*/
+
+/*-
+ * Copyright (c) 2019 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Andrew Doran.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * uvm_pgflcache.c: page freelist cache.
+ *
+ * This implements a tiny per-CPU cache of pages that sits between the main
+ * page allocator and the freelists.  By allocating and freeing pages in
+ * batch, it reduces freelist contention by an order of magnitude.
+ *
+ * The cache can be paused & resumed at runtime so that UVM_HOTPLUG,
+ * uvm_pglistalloc() and uvm_page_redim() can have a consistent view of the
+ * world.  On systems with one CPU per physical package (e.g. a uniprocessor)
+ * the cache is not enabled.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: uvm_pgflcache.c,v 1.1 2019/12/27 12:51:57 ad Exp $");
+
+#include "opt_uvm.h"
+#include "opt_multiprocessor.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sched.h>
+#include <sys/kernel.h>
+#include <sys/vnode.h>
+#include <sys/proc.h>
+#include <sys/atomic.h>
+#include <sys/cpu.h>
+#include <sys/xcall.h>
+
+#include <uvm/uvm.h>
+#include <uvm/uvm_pglist.h>
+#include <uvm/uvm_pgflcache.h>
+
+/* There is no point doing any of this on a uniprocessor. */
+#ifdef MULTIPROCESSOR
+
+/*
+ * MAXPGS - maximum pages per color, per bucket.
+ * FILLPGS - number of pages to allocate at once, per color, per bucket.
+ *
+ * Why the chosen values:
+ *
+ * (1) In 2019, an average Intel system has 4kB pages and 8x L2 cache
+ * colors.  We make the assumption that most of the time allocation activity
+ * will be centered around one UVM freelist, so most of the time there will
+ * be no more than 224kB worth of cached pages per-CPU.  That's tiny, but
+ * enough to hugely reduce contention on the freelist locks, and give us a
+ * small pool of pages which if we're very lucky may have some L1/L2 cache
+ * locality, and do so without subtracting too much from the L2/L3 cache
+ * benefits of having per-package free lists in the page allocator.
+ *
+ * (2) With the chosen values on _LP64, the data structure for each color
+ * takes up a single cache line (64 bytes) giving this very low overhead
+ * even in the "miss" case.
+ *
+ * (3) We don't want to cause too much pressure by hiding away memory that
+ * could otherwise be put to good use.
+ */
+#define	MAXPGS		7
+#define	FILLPGS		6
+
+/* Variable size, according to # colors. */
+struct pgflcache {
+	struct pccolor {
+		intptr_t	count;
+		struct vm_page	*pages[MAXPGS];
+	} color[1];
+};
+
+static kmutex_t		uvm_pgflcache_lock;
+static kcondvar_t	uvm_pgflcache_cv;
+static int		uvm_pgflcache_sem;
+static bool		uvm_pgflcache_draining;
+
+/*
+ * uvm_pgflcache_fill: fill specified freelist/color from global list
+ *
+ * => must be called at IPL_VM
+ * => must be called with given bucket lock held
+ * => must only fill from the correct bucket for this CPU
+ */
+
+void
+uvm_pgflcache_fill(struct uvm_cpu *ucpu, int fl, int b, int c)
+{
+	struct pgflbucket *pgb;
+	struct pgflcache *pc;
+	struct pccolor *pcc;
+	struct pgflist *head;
+	struct vm_page *pg;
+	int count;
+
+	KASSERT(mutex_owned(&uvm_freelist_locks[b].lock));
+	KASSERT(ucpu->pgflbucket == b);
+
+	/* If caching is off, then bail out. */
+	if (__predict_false((pc = ucpu->pgflcache[fl]) == NULL)) {
+		return;
+	}
+
+	/* Fill only to the limit. */
+	pcc = &pc->color[c];
+	pgb = uvm.page_free[fl].pgfl_buckets[b];
+	head = &pgb->pgb_colors[c];
+	if (pcc->count >= FILLPGS) {
+		return;
+	}
+
+	/* Pull pages from the bucket until it's empty, or we are full. */
+	count = pcc->count;
+	pg = LIST_FIRST(head);
+	while (__predict_true(pg != NULL && count < FILLPGS)) {
+		KASSERT(pg->flags & PG_FREE);
+		KASSERT(uvm_page_get_bucket(pg) == b);
+		pcc->pages[count++] = pg;
+		pg = LIST_NEXT(pg, pageq.list);
+	}
+
+	/* Violate LIST abstraction to remove all pages at once. */
+	head->lh_first = pg;
+	if (__predict_true(pg != NULL)) {
+		pg->pageq.list.le_prev = &head->lh_first;
+	}
+	pgb->pgb_nfree -= (count - pcc->count);
+	pcc->count = count;
+}
+
+/*
+ * uvm_pgflcache_spill: spill specified freelist/color to global list
+ *
+ * => must be called at IPL_VM
+ * => mark __noinline so we don't pull it into uvm_pgflcache_free()
+ */
+
+static void __noinline
+uvm_pgflcache_spill(struct uvm_cpu *ucpu, int fl, int c)
+{
+	struct pgflbucket *pgb;
+	struct pgfreelist *pgfl;
+	struct pgflcache *pc;
+	struct pccolor *pcc;
+	struct pgflist *head;
+	kmutex_t *lock;
+	int b, adj;
+
+	pc = ucpu->pgflcache[fl];
+	pcc = &pc->color[c];
+	pgfl = &uvm.page_free[fl];
+	b = ucpu->pgflbucket;
+	pgb = pgfl->pgfl_buckets[b];
+	head = &pgb->pgb_colors[c];
+	lock = &uvm_freelist_locks[b].lock;
+
+	mutex_spin_enter(lock);
+	for (adj = pcc->count; pcc->count != 0;) {
+		pcc->count--;
+		KASSERT(pcc->pages[pcc->count] != NULL);
+		KASSERT(pcc->pages[pcc->count]->flags & PG_FREE);
+		LIST_INSERT_HEAD(head, pcc->pages[pcc->count], pageq.list);
+	}
+	pgb->pgb_nfree += adj;
+	mutex_spin_exit(lock);
+}
+
+/*
+ * uvm_pgflcache_alloc: try to allocate a cached page.
+ *
+ * => must be called at IPL_VM
+ * => allocate only from the given freelist and given page color
+ */
+
+struct vm_page *
+uvm_pgflcache_alloc(struct uvm_cpu *ucpu, int fl, int c)
+{
+	struct pgflcache *pc;
+	struct pccolor *pcc;
+	struct vm_page *pg;
+
+	/* If caching is off, then bail out. */
+	if (__predict_false((pc = ucpu->pgflcache[fl]) == NULL)) {
+		return NULL;
+	}
+
+	/* Very simple: if we have a page then return it. */
+	pcc = &pc->color[c];
+	if (__predict_false(pcc->count == 0)) {
+		return NULL;
+	}
+	pg = pcc->pages[--(pcc->count)];
+	KASSERT(pg != NULL);
+	KASSERT(pg->flags & PG_FREE);
+	KASSERT(uvm_page_get_freelist(pg) == fl);
+	KASSERT(uvm_page_get_bucket(pg) == ucpu->pgflbucket);
+	pg->flags &= PG_ZERO;
+	return pg;
+}
+
+/*
+ * uvm_pgflcache_free: cache a page, if possible.
+ *
+ * => must be called at IPL_VM
+ * => must only send pages for the correct bucket for this CPU
+ */
+
+bool
+uvm_pgflcache_free(struct uvm_cpu *ucpu, struct vm_page *pg)
+{
+	struct pgflcache *pc;
+	struct pccolor *pcc;
+	int fl, c;
+
+	KASSERT(uvm_page_get_bucket(pg) == ucpu->pgflbucket);
+
+	/* If caching is off, then bail out. */
+ 	fl = uvm_page_get_freelist(pg);
+	if (__predict_false((pc = ucpu->pgflcache[fl]) == NULL)) {
+		return false;
+	}
+
+	/* If the array is full, spill it first, then add the page to the array. */
+	c = VM_PGCOLOR(pg);
+	pcc = &pc->color[c];
+	KASSERT((pg->flags & PG_FREE) == 0);
+	if (__predict_false(pcc->count == MAXPGS)) {
+		uvm_pgflcache_spill(ucpu, fl, c);
+	}
+	pg->flags = (pg->flags & PG_ZERO) | PG_FREE;
+	pcc->pages[pcc->count] = pg;
+	pcc->count++;
+	return true;
+}
+
+/*
+ * uvm_pgflcache_init: allocate and initialize per-CPU data structures for
+ * the free page cache.  Don't set anything in motion - that's taken care
+ * of by uvm_pgflcache_resume().
+ */
+
+static void
+uvm_pgflcache_init_cpu(struct cpu_info *ci)
+{
+	struct uvm_cpu *ucpu;
+	size_t sz;
+
+	ucpu = ci->ci_data.cpu_uvm;
+	KASSERT(ucpu->pgflcachemem == NULL);
+	KASSERT(ucpu->pgflcache[0] == NULL);
+
+	sz = offsetof(struct pgflcache, color[uvmexp.ncolors]);
+	ucpu->pgflcachememsz =
+	    (roundup2(sz * VM_NFREELIST, coherency_unit) + coherency_unit - 1);
+	ucpu->pgflcachemem = kmem_zalloc(ucpu->pgflcachememsz, KM_SLEEP);
+}
+
+/*
+ * uvm_pgflcache_fini_cpu: dump all cached pages back to global free list
+ * and shut down caching on the CPU.  Called on each CPU in the system via
+ * xcall.
+ */
+
+static void
+uvm_pgflcache_fini_cpu(void *arg1 __unused, void *arg2 __unused)
+{
+	struct uvm_cpu *ucpu;
+	int fl, color, s;
+
+	ucpu = curcpu()->ci_data.cpu_uvm;
+	for (fl = 0; fl < VM_NFREELIST; fl++) {
+		s = splvm();
+		for (color = 0; color < uvmexp.ncolors; color++) {
+			uvm_pgflcache_spill(ucpu, fl, color);
+		}
+		ucpu->pgflcache[fl] = NULL;
+		splx(s);
+	}
+}
+
+/*
+ * uvm_pgflcache_pause: pause operation of the caches
+ */
+
+void
+uvm_pgflcache_pause(void)
+{
+	uint64_t where;
+
+	/* First one in starts draining.  Everyone else waits. */
+	mutex_enter(&uvm_pgflcache_lock);
+	if (uvm_pgflcache_sem++ == 0) {
+		uvm_pgflcache_draining = true;
+		mutex_exit(&uvm_pgflcache_lock);
+		where = xc_broadcast(0, uvm_pgflcache_fini_cpu, NULL, NULL);
+		xc_wait(where);
+		mutex_enter(&uvm_pgflcache_lock);
+		uvm_pgflcache_draining = false;
+		cv_broadcast(&uvm_pgflcache_cv);
+	} else {
+		while (uvm_pgflcache_draining) {
+			cv_wait(&uvm_pgflcache_cv, &uvm_pgflcache_lock);
+		}
+	}
+	mutex_exit(&uvm_pgflcache_lock);
+}
+
+/*
+ * uvm_pgflcache_resume: resume operation of the caches
+ */
+
+void
+uvm_pgflcache_resume(void)
+{
+	CPU_INFO_ITERATOR cii;
+	struct cpu_info *ci;
+	struct uvm_cpu *ucpu;
+	uintptr_t addr;
+	size_t sz;
+	int fl;
+
+	/* Last guy out takes care of business. */
+	mutex_enter(&uvm_pgflcache_lock);
+	KASSERT(!uvm_pgflcache_draining);
+	KASSERT(uvm_pgflcache_sem > 0);
+	if (uvm_pgflcache_sem-- > 1) {
+		mutex_exit(&uvm_pgflcache_lock);
+		return;
+	}
+
+	/*
+	 * Make sure dependent data structure updates are remotely visible.
+	 * Essentially this functions as a global memory barrier.
+	 */
+	xc_barrier(XC_HIGHPRI);
+
+	/*
+	 * Then set all of the pointers in place on each CPU.  As soon as
+	 * each pointer is set, caching is operational in that dimension.
+	 */
+	sz = offsetof(struct pgflcache, color[uvmexp.ncolors]);
+	for (CPU_INFO_FOREACH(cii, ci)) {
+		ucpu = ci->ci_data.cpu_uvm;
+		addr = roundup2((uintptr_t)ucpu->pgflcachemem, coherency_unit);
+		for (fl = 0; fl < VM_NFREELIST; fl++) {
+			ucpu->pgflcache[fl] = (struct pgflcache *)addr;
+			addr += sz;
+		}
+	}
+	mutex_exit(&uvm_pgflcache_lock);
+}
+
+/*
+ * uvm_pgflcache_start: start operation of the cache.
+ *
+ * => called once only, when init(8) is about to be started
+ */
+
+void
+uvm_pgflcache_start(void)
+{
+	CPU_INFO_ITERATOR cii;
+	struct cpu_info *ci;
+
+	KASSERT(uvm_pgflcache_sem > 0);
+
+	/*
+	 * There's not much point doing this if every CPU has its own
+	 * bucket (and that includes the uniprocessor case).
+	 */
+	if (ncpu == uvm.bucketcount) {
+		return;
+	}
+
+	/* Create each CPU's buckets. */
+	for (CPU_INFO_FOREACH(cii, ci)) {
+		uvm_pgflcache_init_cpu(ci);
+	}
+
+	/* Kick it into action. */
+	uvm_pgflcache_resume();
+}
+
+/*
+ * uvm_pgflcache_init: set up data structures for the free page cache.
+ */
+
+void
+uvm_pgflcache_init(void)
+{
+
+	uvm_pgflcache_sem = 1;
+	mutex_init(&uvm_pgflcache_lock, MUTEX_DEFAULT, IPL_NONE);
+	cv_init(&uvm_pgflcache_cv, "flcache");
+}
+
+#else	/* MULTIPROCESSOR */
+
+struct vm_page *
+uvm_pgflcache_alloc(struct uvm_cpu *ucpu, int fl, int c)
+{
+
+	return NULL;
+}
+
+bool
+uvm_pgflcache_free(struct uvm_cpu *ucpu, struct vm_page *pg)
+{
+
+	return false;
+}
+
+void
+uvm_pgflcache_fill(struct uvm_cpu *ucpu, int fl, int b, int c)
+{
+
+}
+
+void
+uvm_pgflcache_pause(void)
+{
+
+}
+
+void
+uvm_pgflcache_resume(void)
+{
+
+}
+
+void
+uvm_pgflcache_start(void)
+{
+
+}
+
+void
+uvm_pgflcache_init(void)
+{
+
+}
+
+#endif	/* MULTIPROCESSOR */
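
The intended consumers of uvm_pgflcache_alloc()/_fill()/_free() are the page
allocation and free paths in uvm_page.c (changed by this commit but only
partly quoted above).  A hedged sketch of the allocation fast path, assuming
the caller already knows the freelist and colour it wants; the function name
and structure are illustrative, not the committed code:

#include <sys/cpu.h>
#include <sys/mutex.h>
#include <uvm/uvm.h>
#include <uvm/uvm_pgflcache.h>

/*
 * Sketch only: try the per-CPU cache first; on a miss, refill the
 * cache from this CPU's bucket under the bucket lock and retry.
 */
static struct vm_page *
cached_pagealloc(int fl, int c)
{
	struct uvm_cpu *ucpu;
	struct vm_page *pg;
	kmutex_t *lock;
	int s, b;

	s = splvm();			/* the cache runs at IPL_VM */
	ucpu = curcpu()->ci_data.cpu_uvm;
	pg = uvm_pgflcache_alloc(ucpu, fl, c);
	if (pg == NULL) {
		b = ucpu->pgflbucket;
		lock = &uvm_freelist_locks[b].lock;
		mutex_spin_enter(lock);
		uvm_pgflcache_fill(ucpu, fl, b, c);
		mutex_spin_exit(lock);
		pg = uvm_pgflcache_alloc(ucpu, fl, c);
	}
	splx(s);
	return pg;
}
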
Index: src/sys/uvm/uvm_pgflcache.h
diff -u /dev/null src/sys/uvm/uvm_pgflcache.h:1.1
--- /dev/null	Fri Dec 27 12:51:57 2019
+++ src/sys/uvm/uvm_pgflcache.h	Fri Dec 27 12:51:57 2019
@@ -0,0 +1,43 @@
+/*	$NetBSD: uvm_pgflcache.h,v 1.1 2019/12/27 12:51:57 ad Exp $	*/
+
+/*-
+ * Copyright (c) 2019 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Andrew Doran.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#if !defined(_UVM_PGFLCACHE_H_)
+#define	_UVM_PGFLCACHE_H_
+
+struct vm_page	*uvm_pgflcache_alloc(struct uvm_cpu *, int, int);
+void		uvm_pgflcache_fill(struct uvm_cpu *, int, int, int);
+bool		uvm_pgflcache_free(struct uvm_cpu *, struct vm_page *);
+void		uvm_pgflcache_init(void);
+void		uvm_pgflcache_pause(void);
+void		uvm_pgflcache_resume(void);
+void		uvm_pgflcache_start(void);
+
+#endif /* !_UVM_PGFLCACHE_H_ */
