Module Name: src
Committed By: ad
Date: Fri Dec 27 12:51:57 UTC 2019
Modified Files:
src/sys/arch/amd64/amd64: autoconf.c
src/sys/arch/i386/i386: autoconf.c
src/sys/ddb: db_command.c
src/sys/dev/acpi: acpi_srat.c acpi_srat.h
src/sys/kern: init_main.c
src/sys/uvm: files.uvm uvm.h uvm_ddb.h uvm_extern.h uvm_glue.c
uvm_init.c uvm_page.c uvm_page.h uvm_pglist.c uvm_pglist.h
Added Files:
src/sys/uvm: uvm_pgflcache.c uvm_pgflcache.h
Log Message:
Redo the page allocator to perform better, especially on multi-core and
multi-socket systems. Proposed on tech-kern. While here:
- add rudimentary NUMA support - needs more work.
- remove now unused "listq" from vm_page.
To generate a diff of this commit:
cvs rdiff -u -r1.28 -r1.29 src/sys/arch/amd64/amd64/autoconf.c
cvs rdiff -u -r1.105 -r1.106 src/sys/arch/i386/i386/autoconf.c
cvs rdiff -u -r1.165 -r1.166 src/sys/ddb/db_command.c
cvs rdiff -u -r1.7 -r1.8 src/sys/dev/acpi/acpi_srat.c
cvs rdiff -u -r1.4 -r1.5 src/sys/dev/acpi/acpi_srat.h
cvs rdiff -u -r1.512 -r1.513 src/sys/kern/init_main.c
cvs rdiff -u -r1.31 -r1.32 src/sys/uvm/files.uvm
cvs rdiff -u -r1.70 -r1.71 src/sys/uvm/uvm.h
cvs rdiff -u -r1.15 -r1.16 src/sys/uvm/uvm_ddb.h
cvs rdiff -u -r1.215 -r1.216 src/sys/uvm/uvm_extern.h
cvs rdiff -u -r1.172 -r1.173 src/sys/uvm/uvm_glue.c
cvs rdiff -u -r1.51 -r1.52 src/sys/uvm/uvm_init.c
cvs rdiff -u -r1.212 -r1.213 src/sys/uvm/uvm_page.c
cvs rdiff -u -r1.88 -r1.89 src/sys/uvm/uvm_page.h
cvs rdiff -u -r0 -r1.1 src/sys/uvm/uvm_pgflcache.c \
src/sys/uvm/uvm_pgflcache.h
cvs rdiff -u -r1.77 -r1.78 src/sys/uvm/uvm_pglist.c
cvs rdiff -u -r1.8 -r1.9 src/sys/uvm/uvm_pglist.h
Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
Modified files:
Index: src/sys/arch/amd64/amd64/autoconf.c
diff -u src/sys/arch/amd64/amd64/autoconf.c:1.28 src/sys/arch/amd64/amd64/autoconf.c:1.29
--- src/sys/arch/amd64/amd64/autoconf.c:1.28 Sun Oct 22 00:59:28 2017
+++ src/sys/arch/amd64/amd64/autoconf.c Fri Dec 27 12:51:56 2019
@@ -1,4 +1,4 @@
-/* $NetBSD: autoconf.c,v 1.28 2017/10/22 00:59:28 maya Exp $ */
+/* $NetBSD: autoconf.c,v 1.29 2019/12/27 12:51:56 ad Exp $ */
/*-
* Copyright (c) 1990 The Regents of the University of California.
@@ -46,7 +46,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: autoconf.c,v 1.28 2017/10/22 00:59:28 maya Exp $");
+__KERNEL_RCSID(0, "$NetBSD: autoconf.c,v 1.29 2019/12/27 12:51:56 ad Exp $");
#include "opt_multiprocessor.h"
#include "opt_intrdebug.h"
@@ -60,9 +60,14 @@ __KERNEL_RCSID(0, "$NetBSD: autoconf.c,v
#include <machine/pte.h>
#include <machine/cpufunc.h>
+#include "acpica.h"
#include "ioapic.h"
#include "lapic.h"
+#if NACPICA > 0
+#include <dev/acpi/acpi_srat.h>
+#endif
+
#if NIOAPIC > 0
#include <machine/i82093var.h>
#endif
@@ -112,6 +117,11 @@ cpu_configure(void)
cpu_init_idle_lwps();
#endif
+#if NACPICA > 0
+ /* Load NUMA memory regions into UVM. */
+ acpisrat_load_uvm();
+#endif
+
spl0();
lcr8(0);
}
Index: src/sys/arch/i386/i386/autoconf.c
diff -u src/sys/arch/i386/i386/autoconf.c:1.105 src/sys/arch/i386/i386/autoconf.c:1.106
--- src/sys/arch/i386/i386/autoconf.c:1.105 Sun Oct 22 00:59:28 2017
+++ src/sys/arch/i386/i386/autoconf.c Fri Dec 27 12:51:56 2019
@@ -1,4 +1,4 @@
-/* $NetBSD: autoconf.c,v 1.105 2017/10/22 00:59:28 maya Exp $ */
+/* $NetBSD: autoconf.c,v 1.106 2019/12/27 12:51:56 ad Exp $ */
/*-
* Copyright (c) 1990 The Regents of the University of California.
@@ -46,7 +46,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: autoconf.c,v 1.105 2017/10/22 00:59:28 maya Exp $");
+__KERNEL_RCSID(0, "$NetBSD: autoconf.c,v 1.106 2019/12/27 12:51:56 ad Exp $");
#include "opt_intrdebug.h"
#include "opt_multiprocessor.h"
@@ -65,9 +65,14 @@ __KERNEL_RCSID(0, "$NetBSD: autoconf.c,v
#include <machine/cpufunc.h>
#include <x86/fpu.h>
+#include "acpica.h"
#include "ioapic.h"
#include "lapic.h"
+#if NACPICA > 0
+#include <dev/acpi/acpi_srat.h>
+#endif
+
#if NIOAPIC > 0
#include <machine/i82093var.h>
#endif
@@ -132,6 +137,11 @@ cpu_configure(void)
cpu_init_idle_lwps();
#endif
+#if NACPICA > 0
+ /* Load NUMA memory regions into UVM. */
+ acpisrat_load_uvm();
+#endif
+
spl0();
#if NLAPIC > 0
lapic_write_tpri(0);
Index: src/sys/ddb/db_command.c
diff -u src/sys/ddb/db_command.c:1.165 src/sys/ddb/db_command.c:1.166
--- src/sys/ddb/db_command.c:1.165 Sun Dec 15 20:29:08 2019
+++ src/sys/ddb/db_command.c Fri Dec 27 12:51:56 2019
@@ -1,7 +1,8 @@
-/* $NetBSD: db_command.c,v 1.165 2019/12/15 20:29:08 joerg Exp $ */
+/* $NetBSD: db_command.c,v 1.166 2019/12/27 12:51:56 ad Exp $ */
/*
- * Copyright (c) 1996, 1997, 1998, 1999, 2002, 2009 The NetBSD Foundation, Inc.
+ * Copyright (c) 1996, 1997, 1998, 1999, 2002, 2009, 2019
+ * The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
@@ -60,7 +61,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: db_command.c,v 1.165 2019/12/15 20:29:08 joerg Exp $");
+__KERNEL_RCSID(0, "$NetBSD: db_command.c,v 1.166 2019/12/27 12:51:56 ad Exp $");
#ifdef _KERNEL_OPT
#include "opt_aio.h"
@@ -193,6 +194,7 @@ static void db_help_print_cmd(db_exp
static void db_lock_print_cmd(db_expr_t, bool, db_expr_t, const char *);
static void db_show_all_locks(db_expr_t, bool, db_expr_t, const char *);
static void db_show_lockstats(db_expr_t, bool, db_expr_t, const char *);
+static void db_show_all_freelists(db_expr_t, bool, db_expr_t, const char *);
static void db_mount_print_cmd(db_expr_t, bool, db_expr_t, const char *);
static void db_show_all_mount(db_expr_t, bool, db_expr_t, const char *);
static void db_mbuf_print_cmd(db_expr_t, bool, db_expr_t, const char *);
@@ -234,6 +236,8 @@ static const struct db_command db_show_c
0 ,"Show all held locks", "[/t]", NULL) },
{ DDB_ADD_CMD("mount", db_show_all_mount, 0,
"Print all mount structures.", "[/f]", NULL) },
+ { DDB_ADD_CMD("freelists", db_show_all_freelists,
+ 0 ,"Show all freelists", NULL, NULL) },
#ifdef AIO
/*added from all sub cmds*/
{ DDB_ADD_CMD("aio_jobs", db_show_aio_jobs, 0,
@@ -1285,6 +1289,16 @@ db_show_all_locks(db_expr_t addr, bool h
}
static void
+db_show_all_freelists(db_expr_t addr, bool have_addr,
+ db_expr_t count, const char *modif)
+{
+
+#ifdef _KERNEL /* XXX CRASH(8) */
+ uvm_page_print_freelists(db_printf);
+#endif
+}
+
+static void
db_show_lockstats(db_expr_t addr, bool have_addr,
db_expr_t count, const char *modif)
{
Index: src/sys/dev/acpi/acpi_srat.c
diff -u src/sys/dev/acpi/acpi_srat.c:1.7 src/sys/dev/acpi/acpi_srat.c:1.8
--- src/sys/dev/acpi/acpi_srat.c:1.7 Sun Dec 22 22:18:04 2019
+++ src/sys/dev/acpi/acpi_srat.c Fri Dec 27 12:51:57 2019
@@ -1,4 +1,4 @@
-/* $NetBSD: acpi_srat.c,v 1.7 2019/12/22 22:18:04 ad Exp $ */
+/* $NetBSD: acpi_srat.c,v 1.8 2019/12/27 12:51:57 ad Exp $ */
/*
* Copyright (c) 2009 The NetBSD Foundation, Inc.
@@ -30,7 +30,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: acpi_srat.c,v 1.7 2019/12/22 22:18:04 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: acpi_srat.c,v 1.8 2019/12/27 12:51:57 ad Exp $");
#include <sys/param.h>
#include <sys/kmem.h>
@@ -39,6 +39,8 @@ __KERNEL_RCSID(0, "$NetBSD: acpi_srat.c,
#include <dev/acpi/acpivar.h>
#include <dev/acpi/acpi_srat.h>
+#include <uvm/uvm_extern.h>
+
static ACPI_TABLE_SRAT *srat;
static uint32_t nnodes; /* Number of NUMA nodes */
@@ -472,6 +474,28 @@ acpisrat_dump(void)
}
}
+void
+acpisrat_load_uvm(void)
+{
+ uint32_t i, j, nn, nm;
+ struct acpisrat_mem m;
+
+ nn = acpisrat_nodes();
+ aprint_debug("SRAT: %u NUMA nodes\n", nn);
+ for (i = 0; i < nn; i++) {
+ nm = acpisrat_node_memoryranges(i);
+ for (j = 0; j < nm; j++) {
+ acpisrat_mem(i, j, &m);
+ aprint_debug("SRAT: node %u memory range %u (0x%"
+ PRIx64" - 0x%"PRIx64" flags %u)\n",
+ m.nodeid, j, m.baseaddress,
+ m.baseaddress + m.length, m.flags);
+ uvm_page_numa_load(trunc_page(m.baseaddress),
+ trunc_page(m.length), m.nodeid);
+ }
+ }
+}
+
/*
* Get number of NUMA nodes.
*/
Index: src/sys/dev/acpi/acpi_srat.h
diff -u src/sys/dev/acpi/acpi_srat.h:1.4 src/sys/dev/acpi/acpi_srat.h:1.5
--- src/sys/dev/acpi/acpi_srat.h:1.4 Thu Dec 28 08:49:28 2017
+++ src/sys/dev/acpi/acpi_srat.h Fri Dec 27 12:51:57 2019
@@ -1,4 +1,4 @@
-/* $NetBSD: acpi_srat.h,v 1.4 2017/12/28 08:49:28 maxv Exp $ */
+/* $NetBSD: acpi_srat.h,v 1.5 2019/12/27 12:51:57 ad Exp $ */
/*
* Copyright (c) 2009 The NetBSD Foundation, Inc.
@@ -68,6 +68,7 @@ int acpisrat_init(void);
int acpisrat_refresh(void);
int acpisrat_exit(void);
void acpisrat_dump(void);
+void acpisrat_load_uvm(void);
uint32_t acpisrat_nodes(void);
uint32_t acpisrat_node_cpus(acpisrat_nodeid_t);
uint32_t acpisrat_node_memoryranges(acpisrat_nodeid_t);
Index: src/sys/kern/init_main.c
diff -u src/sys/kern/init_main.c:1.512 src/sys/kern/init_main.c:1.513
--- src/sys/kern/init_main.c:1.512 Sun Dec 22 15:00:42 2019
+++ src/sys/kern/init_main.c Fri Dec 27 12:51:57 2019
@@ -1,4 +1,4 @@
-/* $NetBSD: init_main.c,v 1.512 2019/12/22 15:00:42 ad Exp $ */
+/* $NetBSD: init_main.c,v 1.513 2019/12/27 12:51:57 ad Exp $ */
/*-
* Copyright (c) 2008, 2009, 2019 The NetBSD Foundation, Inc.
@@ -97,7 +97,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: init_main.c,v 1.512 2019/12/22 15:00:42 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: init_main.c,v 1.513 2019/12/27 12:51:57 ad Exp $");
#include "opt_ddb.h"
#include "opt_inet.h"
@@ -814,6 +814,10 @@ configure2(void)
for (CPU_INFO_FOREACH(cii, ci)) {
uvm_cpu_attach(ci);
}
+
+ /* Decide how to partition free memory. */
+ uvm_page_rebucket();
+
mp_online = true;
#if defined(MULTIPROCESSOR)
cpu_boot_secondary_processors();
Index: src/sys/uvm/files.uvm
diff -u src/sys/uvm/files.uvm:1.31 src/sys/uvm/files.uvm:1.32
--- src/sys/uvm/files.uvm:1.31 Sun Dec 15 21:11:35 2019
+++ src/sys/uvm/files.uvm Fri Dec 27 12:51:57 2019
@@ -1,4 +1,4 @@
-# $NetBSD: files.uvm,v 1.31 2019/12/15 21:11:35 ad Exp $
+# $NetBSD: files.uvm,v 1.32 2019/12/27 12:51:57 ad Exp $
#
# UVM options
@@ -42,6 +42,7 @@ file uvm/uvm_pager.c uvm
file uvm/uvm_pdaemon.c uvm
file uvm/uvm_pdpolicy_clock.c !pdpolicy_clockpro
file uvm/uvm_pdpolicy_clockpro.c pdpolicy_clockpro
+file uvm/uvm_pgflcache.c uvm
file uvm/uvm_pglist.c uvm
file uvm/uvm_physseg.c uvm
file uvm/uvm_readahead.c uvm
Index: src/sys/uvm/uvm.h
diff -u src/sys/uvm/uvm.h:1.70 src/sys/uvm/uvm.h:1.71
--- src/sys/uvm/uvm.h:1.70 Fri Dec 13 20:10:22 2019
+++ src/sys/uvm/uvm.h Fri Dec 27 12:51:57 2019
@@ -1,4 +1,4 @@
-/* $NetBSD: uvm.h,v 1.70 2019/12/13 20:10:22 ad Exp $ */
+/* $NetBSD: uvm.h,v 1.71 2019/12/27 12:51:57 ad Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
@@ -71,21 +71,19 @@
#include <machine/vmparam.h>
struct workqueue;
+struct pgflcache;
/*
* per-cpu data
*/
struct uvm_cpu {
- struct pgfreelist page_free[VM_NFREELIST]; /* unallocated pages */
- int page_free_nextcolor; /* next color to allocate from */
- int page_idlezero_next; /* which color to zero next */
- bool page_idle_zero; /* TRUE if we should try to zero
- pages in the idle loop */
- int pages[PGFL_NQUEUES]; /* total of pages in page_free */
- u_int emap_gen; /* emap generation number */
-
- krndsource_t rs; /* entropy source */
+ struct pgflcache *pgflcache[VM_NFREELIST];/* cpu-local cached pages */
+ void *pgflcachemem; /* pointer to allocated mem */
+ size_t pgflcachememsz; /* size of allocated memory */
+ u_int pgflcolor; /* next color to allocate */
+ u_int pgflbucket; /* where to send our pages */
+ krndsource_t rs; /* entropy source */
};
/*
@@ -98,7 +96,9 @@ struct uvm {
/* vm_page queues */
struct pgfreelist page_free[VM_NFREELIST]; /* unallocated pages */
- bool page_init_done; /* TRUE if uvm_page_init() finished */
+ u_int bucketcount;
+ bool page_init_done; /* true if uvm_page_init() finished */
+ bool numa_alloc; /* use NUMA page allocation strategy */
/* page daemon trigger */
int pagedaemon; /* daemon sleeps on this */
@@ -123,7 +123,6 @@ extern struct uvm_object *uvm_kernel_obj
* locks (made globals for lockstat).
*/
-extern kmutex_t uvm_fpageqlock; /* lock for free page q */
extern kmutex_t uvm_kentry_lock;
#endif /* _KERNEL */
Index: src/sys/uvm/uvm_ddb.h
diff -u src/sys/uvm/uvm_ddb.h:1.15 src/sys/uvm/uvm_ddb.h:1.16
--- src/sys/uvm/uvm_ddb.h:1.15 Tue May 17 04:18:07 2011
+++ src/sys/uvm/uvm_ddb.h Fri Dec 27 12:51:57 2019
@@ -1,4 +1,4 @@
-/* $NetBSD: uvm_ddb.h,v 1.15 2011/05/17 04:18:07 mrg Exp $ */
+/* $NetBSD: uvm_ddb.h,v 1.16 2019/12/27 12:51:57 ad Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
@@ -40,6 +40,7 @@ void uvm_object_printit(struct uvm_objec
void uvm_page_printit(struct vm_page *, bool,
void (*)(const char *, ...));
void uvm_page_printall(void (*)(const char *, ...));
+void uvm_page_print_freelists(void (*)(const char *, ...));
void uvmexp_print(void (*)(const char *, ...));
#endif /* DDB || DEBUGPRINT */
Index: src/sys/uvm/uvm_extern.h
diff -u src/sys/uvm/uvm_extern.h:1.215 src/sys/uvm/uvm_extern.h:1.216
--- src/sys/uvm/uvm_extern.h:1.215 Sat Dec 21 12:58:26 2019
+++ src/sys/uvm/uvm_extern.h Fri Dec 27 12:51:57 2019
@@ -1,4 +1,4 @@
-/* $NetBSD: uvm_extern.h,v 1.215 2019/12/21 12:58:26 ad Exp $ */
+/* $NetBSD: uvm_extern.h,v 1.216 2019/12/27 12:51:57 ad Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
@@ -210,6 +210,7 @@ b\32UNMAP\0\
#define UVM_PGA_STRAT_NORMAL 0 /* priority (low id to high) walk */
#define UVM_PGA_STRAT_ONLY 1 /* only specified free list */
#define UVM_PGA_STRAT_FALLBACK 2 /* ONLY falls back on NORMAL */
+#define UVM_PGA_STRAT_NUMA 3 /* strongly prefer ideal bucket */
/*
* flags for uvm_pagealloc_strat()
@@ -736,6 +737,7 @@ void uvm_obj_unwirepages(struct uvm_ob
/* uvm_page.c */
int uvm_free(void);
+void uvm_page_numa_load(paddr_t, paddr_t, u_int);
struct vm_page *uvm_pagealloc_strat(struct uvm_object *,
voff_t, struct vm_anon *, int, int, int);
#define uvm_pagealloc(obj, off, anon, flags) \
Index: src/sys/uvm/uvm_glue.c
diff -u src/sys/uvm/uvm_glue.c:1.172 src/sys/uvm/uvm_glue.c:1.173
--- src/sys/uvm/uvm_glue.c:1.172 Sat Dec 21 13:00:25 2019
+++ src/sys/uvm/uvm_glue.c Fri Dec 27 12:51:57 2019
@@ -1,4 +1,4 @@
-/* $NetBSD: uvm_glue.c,v 1.172 2019/12/21 13:00:25 ad Exp $ */
+/* $NetBSD: uvm_glue.c,v 1.173 2019/12/27 12:51:57 ad Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
@@ -62,7 +62,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: uvm_glue.c,v 1.172 2019/12/21 13:00:25 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: uvm_glue.c,v 1.173 2019/12/27 12:51:57 ad Exp $");
#include "opt_kgdb.h"
#include "opt_kstack.h"
@@ -86,6 +86,7 @@ __KERNEL_RCSID(0, "$NetBSD: uvm_glue.c,v
#include <sys/asan.h>
#include <uvm/uvm.h>
+#include <uvm/uvm_pgflcache.h>
/*
* uvm_kernacc: test if kernel can access a memory region.
@@ -500,9 +501,17 @@ uvm_scheduler(void)
lwp_changepri(l, PRI_VM);
lwp_unlock(l);
+ /* Start the freelist cache. */
+ uvm_pgflcache_start();
+
for (;;) {
/* Update legacy stats for post-mortem debugging. */
uvm_update_uvmexp();
+
+ /* See if the pagedaemon needs to generate some free pages. */
+ uvm_kick_pdaemon();
+
+ /* Calculate process statistics. */
sched_pstats();
(void)kpause("uvm", false, hz, NULL);
}
Index: src/sys/uvm/uvm_init.c
diff -u src/sys/uvm/uvm_init.c:1.51 src/sys/uvm/uvm_init.c:1.52
--- src/sys/uvm/uvm_init.c:1.51 Fri Dec 13 20:10:22 2019
+++ src/sys/uvm/uvm_init.c Fri Dec 27 12:51:57 2019
@@ -1,4 +1,4 @@
-/* $NetBSD: uvm_init.c,v 1.51 2019/12/13 20:10:22 ad Exp $ */
+/* $NetBSD: uvm_init.c,v 1.52 2019/12/27 12:51:57 ad Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
@@ -32,7 +32,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: uvm_init.c,v 1.51 2019/12/13 20:10:22 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: uvm_init.c,v 1.52 2019/12/27 12:51:57 ad Exp $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -64,7 +64,6 @@ const int * const uvmexp_pagemask = &uvm
const int * const uvmexp_pageshift = &uvmexp.pageshift;
#endif
-kmutex_t uvm_fpageqlock __cacheline_aligned;
kmutex_t uvm_kentry_lock __cacheline_aligned;
/*
Index: src/sys/uvm/uvm_page.c
diff -u src/sys/uvm/uvm_page.c:1.212 src/sys/uvm/uvm_page.c:1.213
--- src/sys/uvm/uvm_page.c:1.212 Sun Dec 22 16:37:36 2019
+++ src/sys/uvm/uvm_page.c Fri Dec 27 12:51:57 2019
@@ -1,4 +1,33 @@
-/* $NetBSD: uvm_page.c,v 1.212 2019/12/22 16:37:36 ad Exp $ */
+/* $NetBSD: uvm_page.c,v 1.213 2019/12/27 12:51:57 ad Exp $ */
+
+/*-
+ * Copyright (c) 2019 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Andrew Doran.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
@@ -66,7 +95,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: uvm_page.c,v 1.212 2019/12/22 16:37:36 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: uvm_page.c,v 1.213 2019/12/27 12:51:57 ad Exp $");
#include "opt_ddb.h"
#include "opt_uvm.h"
@@ -87,6 +116,7 @@ __KERNEL_RCSID(0, "$NetBSD: uvm_page.c,v
#include <uvm/uvm.h>
#include <uvm/uvm_ddb.h>
#include <uvm/uvm_pdpolicy.h>
+#include <uvm/uvm_pgflcache.h>
/*
* Some supported CPUs in a given architecture don't support all
@@ -130,6 +160,25 @@ static vaddr_t virtual_space_end;
*/
static size_t recolored_pages_memsize /* = 0 */;
+static char *recolored_pages_mem;
+
+/*
+ * freelist locks - one per bucket.
+ */
+
+union uvm_freelist_lock uvm_freelist_locks[PGFL_MAX_BUCKETS]
+ __cacheline_aligned;
+
+/*
+ * basic NUMA information.
+ */
+
+static struct uvm_page_numa_region {
+ struct uvm_page_numa_region *next;
+ paddr_t start;
+ paddr_t size;
+ u_int numa_id;
+} *uvm_page_numa_region;
#ifdef DEBUG
vaddr_t uvm_zerocheckkva;
@@ -243,15 +292,15 @@ uvm_pageremove_tree(struct uvm_object *u
}
static void
-uvm_page_init_buckets(struct pgfreelist *pgfl)
+uvm_page_init_bucket(struct pgfreelist *pgfl, struct pgflbucket *pgb, int num)
{
- int color, i;
+ int i;
- for (color = 0; color < uvmexp.ncolors; color++) {
- for (i = 0; i < PGFL_NQUEUES; i++) {
- LIST_INIT(&pgfl->pgfl_buckets[color].pgfl_queues[i]);
- }
+ pgb->pgb_nfree = 0;
+ for (i = 0; i < uvmexp.ncolors; i++) {
+ LIST_INIT(&pgb->pgb_colors[i]);
}
+ pgfl->pgfl_buckets[num] = pgb;
}
/*
@@ -263,18 +312,18 @@ uvm_page_init_buckets(struct pgfreelist
void
uvm_page_init(vaddr_t *kvm_startp, vaddr_t *kvm_endp)
{
- static struct uvm_cpu boot_cpu;
- psize_t freepages, pagecount, bucketcount, n;
- struct pgflbucket *bucketarray, *cpuarray;
+ static struct uvm_cpu boot_cpu __cacheline_aligned;
+ psize_t freepages, pagecount, bucketsize, n;
+ struct pgflbucket *pgb;
struct vm_page *pagearray;
+ char *bucketarray;
uvm_physseg_t bank;
- int lcv;
+ int fl, b;
KASSERT(ncpu <= 1);
- CTASSERT(sizeof(pagearray->offset) >= sizeof(struct uvm_cpu *));
/*
- * init the page queues and free page queue lock, except the
+ * init the page queues and free page queue locks, except the
* free list; we allocate that later (with the initial vm_page
* structures).
*/
@@ -282,7 +331,9 @@ uvm_page_init(vaddr_t *kvm_startp, vaddr
uvm.cpus[0] = &boot_cpu;
curcpu()->ci_data.cpu_uvm = &boot_cpu;
uvmpdpol_init();
- mutex_init(&uvm_fpageqlock, MUTEX_DRIVER, IPL_VM);
+ for (b = 0; b < __arraycount(uvm_freelist_locks); b++) {
+ mutex_init(&uvm_freelist_locks[b].lock, MUTEX_DEFAULT, IPL_VM);
+ }
/*
* allocate vm_page structures.
@@ -323,6 +374,9 @@ uvm_page_init(vaddr_t *kvm_startp, vaddr
uvmexp.colormask = uvmexp.ncolors - 1;
KASSERT((uvmexp.colormask & uvmexp.ncolors) == 0);
+ /* We always start with only 1 bucket. */
+ uvm.bucketcount = 1;
+
/*
* we now know we have (PAGE_SIZE * freepages) bytes of memory we can
* use. for each page of memory we use we need a vm_page structure.
@@ -332,28 +386,28 @@ uvm_page_init(vaddr_t *kvm_startp, vaddr
* truncation errors (since we can only allocate in terms of whole
* pages).
*/
-
- bucketcount = uvmexp.ncolors * VM_NFREELIST;
pagecount = ((freepages + 1) << PAGE_SHIFT) /
(PAGE_SIZE + sizeof(struct vm_page));
-
- bucketarray = (void *)uvm_pageboot_alloc((bucketcount *
- sizeof(struct pgflbucket) * 2) + (pagecount *
- sizeof(struct vm_page)));
- cpuarray = bucketarray + bucketcount;
- pagearray = (struct vm_page *)(bucketarray + bucketcount * 2);
-
- for (lcv = 0; lcv < VM_NFREELIST; lcv++) {
- uvm.page_free[lcv].pgfl_buckets =
- (bucketarray + (lcv * uvmexp.ncolors));
- uvm_page_init_buckets(&uvm.page_free[lcv]);
- uvm.cpus[0]->page_free[lcv].pgfl_buckets =
- (cpuarray + (lcv * uvmexp.ncolors));
- uvm_page_init_buckets(&uvm.cpus[0]->page_free[lcv]);
+ bucketsize = offsetof(struct pgflbucket, pgb_colors[uvmexp.ncolors]);
+ bucketsize = roundup2(bucketsize, coherency_unit);
+ bucketarray = (void *)uvm_pageboot_alloc(
+ bucketsize * VM_NFREELIST +
+ pagecount * sizeof(struct vm_page));
+ pagearray = (struct vm_page *)
+ (bucketarray + bucketsize * VM_NFREELIST);
+
+ for (fl = 0; fl < VM_NFREELIST; fl++) {
+ pgb = (struct pgflbucket *)(bucketarray + bucketsize * fl);
+ uvm_page_init_bucket(&uvm.page_free[fl], pgb, 0);
}
memset(pagearray, 0, pagecount * sizeof(struct vm_page));
/*
+ * init the freelist cache in the disabled state.
+ */
+ uvm_pgflcache_init();
+
+ /*
* init the vm_page structures and put them in the correct place.
*/
/* First init the extent */
@@ -396,12 +450,6 @@ uvm_page_init(vaddr_t *kvm_startp, vaddr
uvmexp.reserve_kernel = vm_page_reserve_kernel;
/*
- * determine if we should zero pages in the idle loop.
- */
-
- uvm.cpus[0]->page_idle_zero = vm_page_zero_enable;
-
- /*
* done!
*/
@@ -409,6 +457,34 @@ uvm_page_init(vaddr_t *kvm_startp, vaddr
}
/*
+ * uvm_pgfl_lock: lock all freelist buckets
+ */
+
+void
+uvm_pgfl_lock(void)
+{
+ int i;
+
+ for (i = 0; i < __arraycount(uvm_freelist_locks); i++) {
+ mutex_spin_enter(&uvm_freelist_locks[i].lock);
+ }
+}
+
+/*
+ * uvm_pgfl_unlock: unlock all freelist buckets
+ */
+
+void
+uvm_pgfl_unlock(void)
+{
+ int i;
+
+ for (i = 0; i < __arraycount(uvm_freelist_locks); i++) {
+ mutex_spin_exit(&uvm_freelist_locks[i].lock);
+ }
+}
+
+/*
* uvm_setpagesize: set the page size
*
* => sets page_shift and page_mask from uvmexp.pagesize.
@@ -612,129 +688,301 @@ uvm_vm_page_to_phys(const struct vm_page
}
/*
- * uvm_page_recolor: Recolor the pages if the new bucket count is
- * larger than the old one.
+ * uvm_page_numa_load: load NUMA range description.
*/
-
void
-uvm_page_recolor(int newncolors)
+uvm_page_numa_load(paddr_t start, paddr_t size, u_int numa_id)
{
- struct pgflbucket *bucketarray, *cpuarray, *oldbucketarray;
- struct pgfreelist gpgfl, pgfl;
- struct vm_page *pg;
- vsize_t bucketcount;
- size_t bucketmemsize, oldbucketmemsize;
- int color, i, ocolors;
- int lcv;
- struct uvm_cpu *ucpu;
+ struct uvm_page_numa_region *d;
+
+ KASSERT(numa_id < PGFL_MAX_BUCKETS);
+
+ d = kmem_alloc(sizeof(*d), KM_SLEEP);
+ d->start = start;
+ d->size = size;
+ d->numa_id = numa_id;
+ d->next = uvm_page_numa_region;
+ uvm_page_numa_region = d;
+}
+
+/*
+ * uvm_page_numa_lookup: lookup NUMA node for the given page.
+ */
+static u_int
+uvm_page_numa_lookup(struct vm_page *pg)
+{
+ struct uvm_page_numa_region *d;
+ static bool warned;
+ paddr_t pa;
+
+ KASSERT(uvm.numa_alloc);
+ KASSERT(uvm_page_numa_region != NULL);
+
+ pa = VM_PAGE_TO_PHYS(pg);
+ for (d = uvm_page_numa_region; d != NULL; d = d->next) {
+ if (pa >= d->start && pa < d->start + d->size) {
+ return d->numa_id;
+ }
+ }
+
+ if (!warned) {
+ printf("uvm_page_numa_lookup: failed, first pg=%p pa=%p\n",
+ pg, (void *)VM_PAGE_TO_PHYS(pg));
+ warned = true;
+ }
+
+ return 0;
+}
+
+/*
+ * uvm_page_redim: adjust freelist dimensions if they have changed.
+ */
+
+static void
+uvm_page_redim(int newncolors, int newnbuckets)
+{
+ struct pgfreelist npgfl;
+ struct pgflbucket *opgb, *npgb;
+ struct pgflist *ohead, *nhead;
+ struct vm_page *pg;
+ size_t bucketsize, bucketmemsize, oldbucketmemsize;
+ int fl, ob, oc, nb, nc, obuckets, ocolors;
+ char *bucketarray, *oldbucketmem, *bucketmem;
KASSERT(((newncolors - 1) & newncolors) == 0);
- if (newncolors <= uvmexp.ncolors)
+ /* Anything to do? */
+ if (newncolors <= uvmexp.ncolors &&
+ newnbuckets == uvm.bucketcount) {
return;
-
+ }
if (uvm.page_init_done == false) {
uvmexp.ncolors = newncolors;
return;
}
- bucketcount = newncolors * VM_NFREELIST;
- bucketmemsize = bucketcount * sizeof(struct pgflbucket) * 2;
- bucketarray = kmem_alloc(bucketmemsize, KM_SLEEP);
- cpuarray = bucketarray + bucketcount;
+ bucketsize = offsetof(struct pgflbucket, pgb_colors[newncolors]);
+ bucketsize = roundup2(bucketsize, coherency_unit);
+ bucketmemsize = bucketsize * newnbuckets * VM_NFREELIST +
+ coherency_unit - 1;
+ bucketmem = kmem_zalloc(bucketmemsize, KM_SLEEP);
+ bucketarray = (char *)roundup2((uintptr_t)bucketmem, coherency_unit);
+
+ ocolors = uvmexp.ncolors;
+ obuckets = uvm.bucketcount;
- mutex_spin_enter(&uvm_fpageqlock);
+ /* Freelist cache mustn't be enabled. */
+ uvm_pgflcache_pause();
/* Make sure we should still do this. */
- if (newncolors <= uvmexp.ncolors) {
- mutex_spin_exit(&uvm_fpageqlock);
- kmem_free(bucketarray, bucketmemsize);
+ uvm_pgfl_lock();
+ if (newncolors <= uvmexp.ncolors &&
+ newnbuckets == uvm.bucketcount) {
+ uvm_pgfl_unlock();
+ kmem_free(bucketmem, bucketmemsize);
return;
}
- oldbucketarray = uvm.page_free[0].pgfl_buckets;
- ocolors = uvmexp.ncolors;
-
uvmexp.ncolors = newncolors;
uvmexp.colormask = uvmexp.ncolors - 1;
+ uvm.bucketcount = newnbuckets;
- ucpu = curcpu()->ci_data.cpu_uvm;
- for (lcv = 0; lcv < VM_NFREELIST; lcv++) {
- gpgfl.pgfl_buckets = (bucketarray + (lcv * newncolors));
- pgfl.pgfl_buckets = (cpuarray + (lcv * uvmexp.ncolors));
- uvm_page_init_buckets(&gpgfl);
- uvm_page_init_buckets(&pgfl);
- for (color = 0; color < ocolors; color++) {
- for (i = 0; i < PGFL_NQUEUES; i++) {
- while ((pg = LIST_FIRST(&uvm.page_free[
- lcv].pgfl_buckets[color].pgfl_queues[i]))
- != NULL) {
- LIST_REMOVE(pg, pageq.list); /* global */
- LIST_REMOVE(pg, listq.list); /* cpu */
- LIST_INSERT_HEAD(&gpgfl.pgfl_buckets[
- VM_PGCOLOR(pg)].pgfl_queues[
- i], pg, pageq.list);
- LIST_INSERT_HEAD(&pgfl.pgfl_buckets[
- VM_PGCOLOR(pg)].pgfl_queues[
- i], pg, listq.list);
+ for (fl = 0; fl < VM_NFREELIST; fl++) {
+ /* Init new buckets in new freelist. */
+ memset(&npgfl, 0, sizeof(npgfl));
+ for (nb = 0; nb < newnbuckets; nb++) {
+ npgb = (struct pgflbucket *)bucketarray;
+ uvm_page_init_bucket(&npgfl, npgb, nb);
+ bucketarray += bucketsize;
+ }
+ /* Now transfer pages from the old freelist. */
+ for (nb = ob = 0; ob < obuckets; ob++) {
+ opgb = uvm.page_free[fl].pgfl_buckets[ob];
+ for (oc = 0; oc < ocolors; oc++) {
+ ohead = &opgb->pgb_colors[oc];
+ while ((pg = LIST_FIRST(ohead)) != NULL) {
+ LIST_REMOVE(pg, pageq.list);
+ /*
+ * Here we decide on the NEW color &
+ * bucket for the page. For NUMA
+ * we'll use the info that the
+ * hardware gave us. Otherwise we
+ * just do a round-robin among the
+ * buckets.
+ */
+ KASSERT(
+ uvm_page_get_bucket(pg) == ob);
+ KASSERT(fl ==
+ uvm_page_get_freelist(pg));
+ if (uvm.numa_alloc) {
+ nb = uvm_page_numa_lookup(pg);
+ } else if (nb + 1 < newnbuckets) {
+ nb = nb + 1;
+ } else {
+ nb = 0;
+ }
+ uvm_page_set_bucket(pg, nb);
+ npgb = npgfl.pgfl_buckets[nb];
+ npgb->pgb_nfree++;
+ nc = VM_PGCOLOR(pg);
+ nhead = &npgb->pgb_colors[nc];
+ LIST_INSERT_HEAD(nhead, pg, pageq.list);
}
}
}
- uvm.page_free[lcv].pgfl_buckets = gpgfl.pgfl_buckets;
- ucpu->page_free[lcv].pgfl_buckets = pgfl.pgfl_buckets;
+ /* Install the new freelist. */
+ memcpy(&uvm.page_free[fl], &npgfl, sizeof(npgfl));
}
+ /* Unlock and free the old memory. */
oldbucketmemsize = recolored_pages_memsize;
-
+ oldbucketmem = recolored_pages_mem;
recolored_pages_memsize = bucketmemsize;
- mutex_spin_exit(&uvm_fpageqlock);
+ recolored_pages_mem = bucketmem;
+ uvm_pgfl_unlock();
if (oldbucketmemsize) {
- kmem_free(oldbucketarray, oldbucketmemsize);
+ kmem_free(oldbucketmem, oldbucketmemsize);
}
+ uvm_pgflcache_resume();
+
/*
* this calls uvm_km_alloc() which may want to hold
- * uvm_fpageqlock.
+ * one of the uvm_freelist_locks.
*/
uvm_pager_realloc_emerg();
}
/*
+ * uvm_page_recolor: Recolor the pages if the new color count is
+ * larger than the old one.
+ */
+
+void
+uvm_page_recolor(int newncolors)
+{
+
+ uvm_page_redim(newncolors, uvm.bucketcount);
+}
+
+/*
+ * uvm_page_rebucket: Determine a bucket structure and redim the free
+ * lists to match.
+ */
+
+void
+uvm_page_rebucket(void)
+{
+ u_int min_numa, max_numa, npackage, shift;
+ struct cpu_info *ci, *ci2, *ci3;
+ CPU_INFO_ITERATOR cii;
+
+ /*
+ * If we have more than one NUMA node, and the maximum NUMA node ID
+ * is less than PGFL_MAX_BUCKETS, then we'll use NUMA distribution
+ * for free pages. uvm_pagefree() will not reassign pages to a
+ * different bucket on free.
+ */
+ min_numa = (u_int)-1;
+ max_numa = 0;
+ for (CPU_INFO_FOREACH(cii, ci)) {
+ if (ci->ci_numa_id < min_numa) {
+ min_numa = ci->ci_numa_id;
+ }
+ if (ci->ci_numa_id > max_numa) {
+ max_numa = ci->ci_numa_id;
+ }
+ }
+ if (min_numa != max_numa && max_numa < PGFL_MAX_BUCKETS) {
+#ifdef NUMA
+ /*
+ * We can do this, and it seems to work well, but until
+ * further experiments are done we'll stick with the cache
+ * locality strategy.
+ */
+ aprint_debug("UVM: using NUMA allocation scheme\n");
+ for (CPU_INFO_FOREACH(cii, ci)) {
+ ci->ci_data.cpu_uvm->pgflbucket = ci->ci_numa_id;
+ }
+ uvm.numa_alloc = true;
+ uvm_page_redim(uvmexp.ncolors, max_numa + 1);
+ return;
+#endif
+ }
+
+ /*
+ * Otherwise we'll go with a scheme to maximise L2/L3 cache locality
+ * and minimise lock contention. Count the total number of CPU
+ * packages, and then try to distribute the buckets among CPU
+ * packages evenly. uvm_pagefree() will reassign pages to the
+ * freeing CPU's preferred bucket on free.
+ */
+ npackage = 0;
+ ci = curcpu();
+ ci2 = ci;
+ do {
+ npackage++;
+ ci2 = ci2->ci_sibling[CPUREL_PEER];
+ } while (ci2 != ci);
+
+ /*
+ * Figure out how to arrange the packages & buckets, and the total
+ * number of buckets we need. XXX 2 may not be the best factor.
+ */
+ for (shift = 0; npackage > PGFL_MAX_BUCKETS; shift++) {
+ npackage >>= 1;
+ }
+ uvm_page_redim(uvmexp.ncolors, npackage);
+
+ /*
+ * Now tell each CPU which bucket to use. In the outer loop, scroll
+ * through all CPU packages.
+ */
+ npackage = 0;
+ ci = curcpu();
+ ci2 = ci;
+ do {
+ /*
+ * In the inner loop, scroll through all CPUs in the package
+ * and assign the same bucket ID.
+ */
+ ci3 = ci2;
+ do {
+ ci3->ci_data.cpu_uvm->pgflbucket = npackage >> shift;
+ ci3 = ci3->ci_sibling[CPUREL_PACKAGE];
+ } while (ci3 != ci2);
+ npackage++;
+ ci2 = ci2->ci_sibling[CPUREL_PEER];
+ } while (ci2 != ci);
+
+ aprint_debug("UVM: using package allocation scheme, "
+ "%d package(s) per bucket\n", 1 << shift);
+}
+
+/*
* uvm_cpu_attach: initialize per-CPU data structures.
*/
void
uvm_cpu_attach(struct cpu_info *ci)
{
- struct pgflbucket *bucketarray;
- struct pgfreelist pgfl;
struct uvm_cpu *ucpu;
- vsize_t bucketcount;
- int lcv;
- if (CPU_IS_PRIMARY(ci)) {
- /* Already done in uvm_page_init(). */
- goto attachrnd;
- }
-
- /* Add more reserve pages for this CPU. */
- uvmexp.reserve_kernel += vm_page_reserve_kernel;
-
- /* Configure this CPU's free lists. */
- bucketcount = uvmexp.ncolors * VM_NFREELIST;
- bucketarray = kmem_alloc(bucketcount * sizeof(struct pgflbucket),
- KM_SLEEP);
- ucpu = kmem_zalloc(sizeof(*ucpu), KM_SLEEP);
- uvm.cpus[cpu_index(ci)] = ucpu;
- ci->ci_data.cpu_uvm = ucpu;
- for (lcv = 0; lcv < VM_NFREELIST; lcv++) {
- pgfl.pgfl_buckets = (bucketarray + (lcv * uvmexp.ncolors));
- uvm_page_init_buckets(&pgfl);
- ucpu->page_free[lcv].pgfl_buckets = pgfl.pgfl_buckets;
+ /* For the primary CPU this was already done in uvm_page_init(). */
+ if (!CPU_IS_PRIMARY(ci)) {
+ /* Add more reserve pages for this CPU. */
+ uvmexp.reserve_kernel += vm_page_reserve_kernel;
+
+ /* Allocate per-CPU data structures. */
+ ucpu = kmem_zalloc(sizeof(struct uvm_cpu) + coherency_unit - 1,
+ KM_SLEEP);
+ ucpu = (struct uvm_cpu *)roundup2((uintptr_t)ucpu,
+ coherency_unit);
+ uvm.cpus[cpu_index(ci)] = ucpu;
+ ci->ci_data.cpu_uvm = ucpu;
}
-attachrnd:
/*
* Attach RNG source for this CPU's VM events
*/
@@ -742,101 +990,140 @@ attachrnd:
ci->ci_data.cpu_name, RND_TYPE_VM,
RND_FLAG_COLLECT_TIME|RND_FLAG_COLLECT_VALUE|
RND_FLAG_ESTIMATE_VALUE);
-
}
/*
- * uvm_free: return total number of free pages in system.
+ * uvm_free: fetch the total amount of free memory in pages. This can have a
+ * detrimental effect on performance due to false sharing; don't call unless
+ * needed.
*/
int
uvm_free(void)
{
+ struct pgfreelist *pgfl;
+ int fl, b, fpages;
- return uvmexp.free;
+ fpages = 0;
+ for (fl = 0; fl < VM_NFREELIST; fl++) {
+ pgfl = &uvm.page_free[fl];
+ for (b = 0; b < uvm.bucketcount; b++) {
+ fpages += pgfl->pgfl_buckets[b]->pgb_nfree;
+ }
+ }
+ return fpages;
}
/*
- * uvm_pagealloc_pgfl: helper routine for uvm_pagealloc_strat
+ * uvm_pagealloc_pgb: helper routine that tries to allocate any color from a
+ * specific freelist and specific bucket only.
+ *
+ * => must be at IPL_VM or higher to protect per-CPU data structures.
*/
static struct vm_page *
-uvm_pagealloc_pgfl(struct uvm_cpu *ucpu, int flist, int try1, int try2,
- int *trycolorp)
+uvm_pagealloc_pgb(struct uvm_cpu *ucpu, int f, int b, int *trycolorp, int flags)
{
- struct pgflist *freeq;
+ int c, trycolor, colormask;
+ struct pgflbucket *pgb;
struct vm_page *pg;
- int color, trycolor = *trycolorp;
- struct pgfreelist *gpgfl, *pgfl;
+ kmutex_t *lock;
+
+ /*
+ * Skip the bucket if empty, no lock needed. There could be many
+ * empty freelists/buckets.
+ */
+ pgb = uvm.page_free[f].pgfl_buckets[b];
+ if (pgb->pgb_nfree == 0) {
+ return NULL;
+ }
- KASSERT(mutex_owned(&uvm_fpageqlock));
+ /* Skip bucket if low on memory. */
+ lock = &uvm_freelist_locks[b].lock;
+ mutex_spin_enter(lock);
+ if (__predict_false(pgb->pgb_nfree <= uvmexp.reserve_kernel)) {
+ if ((flags & UVM_PGA_USERESERVE) == 0 ||
+ (pgb->pgb_nfree <= uvmexp.reserve_pagedaemon &&
+ curlwp != uvm.pagedaemon_lwp)) {
+ mutex_spin_exit(lock);
+ return NULL;
+ }
+ }
- color = trycolor;
- pgfl = &ucpu->page_free[flist];
- gpgfl = &uvm.page_free[flist];
+ /* Try all page colors as needed. */
+ c = trycolor = *trycolorp;
+ colormask = uvmexp.colormask;
do {
- /* cpu, try1 */
- if ((pg = LIST_FIRST((freeq =
- &pgfl->pgfl_buckets[color].pgfl_queues[try1]))) != NULL) {
+ pg = LIST_FIRST(&pgb->pgb_colors[c]);
+ if (__predict_true(pg != NULL)) {
+ /*
+ * Got a free page! PG_FREE must be cleared under
+ * lock because of uvm_pglistalloc().
+ */
+ LIST_REMOVE(pg, pageq.list);
KASSERT(pg->flags & PG_FREE);
- KASSERT(try1 == PGFL_ZEROS || !(pg->flags & PG_ZERO));
- KASSERT(try1 == PGFL_UNKNOWN || (pg->flags & PG_ZERO));
- KASSERT(ucpu == VM_FREE_PAGE_TO_CPU(pg));
- VM_FREE_PAGE_TO_CPU(pg)->pages[try1]--;
- CPU_COUNT(CPU_COUNT_CPUHIT, 1);
- goto gotit;
- }
- /* global, try1 */
- if ((pg = LIST_FIRST((freeq =
- &gpgfl->pgfl_buckets[color].pgfl_queues[try1]))) != NULL) {
- KASSERT(pg->flags & PG_FREE);
- KASSERT(try1 == PGFL_ZEROS || !(pg->flags & PG_ZERO));
- KASSERT(try1 == PGFL_UNKNOWN || (pg->flags & PG_ZERO));
- KASSERT(ucpu != VM_FREE_PAGE_TO_CPU(pg));
- VM_FREE_PAGE_TO_CPU(pg)->pages[try1]--;
- CPU_COUNT(CPU_COUNT_CPUMISS, 1);
- goto gotit;
- }
- /* cpu, try2 */
- if ((pg = LIST_FIRST((freeq =
- &pgfl->pgfl_buckets[color].pgfl_queues[try2]))) != NULL) {
- KASSERT(pg->flags & PG_FREE);
- KASSERT(try2 == PGFL_ZEROS || !(pg->flags & PG_ZERO));
- KASSERT(try2 == PGFL_UNKNOWN || (pg->flags & PG_ZERO));
- KASSERT(ucpu == VM_FREE_PAGE_TO_CPU(pg));
- VM_FREE_PAGE_TO_CPU(pg)->pages[try2]--;
- CPU_COUNT(CPU_COUNT_CPUHIT, 1);
- goto gotit;
- }
- /* global, try2 */
- if ((pg = LIST_FIRST((freeq =
- &gpgfl->pgfl_buckets[color].pgfl_queues[try2]))) != NULL) {
- KASSERT(pg->flags & PG_FREE);
- KASSERT(try2 == PGFL_ZEROS || !(pg->flags & PG_ZERO));
- KASSERT(try2 == PGFL_UNKNOWN || (pg->flags & PG_ZERO));
- KASSERT(ucpu != VM_FREE_PAGE_TO_CPU(pg));
- VM_FREE_PAGE_TO_CPU(pg)->pages[try2]--;
- CPU_COUNT(CPU_COUNT_CPUMISS, 1);
- goto gotit;
+ pg->flags &= PG_ZERO;
+ pgb->pgb_nfree--;
+
+ /*
+ * While we have the bucket locked and our data
+ * structures fresh in L1 cache, we have an ideal
+ * opportunity to grab some pages for the freelist
+ * cache without causing extra contention. Only do
+ * so if we found pages in this CPU's preferred
+ * bucket.
+ */
+ if (__predict_true(b == ucpu->pgflbucket)) {
+ uvm_pgflcache_fill(ucpu, f, b, c);
+ }
+ mutex_spin_exit(lock);
+ KASSERT(uvm_page_get_bucket(pg) == b);
+ CPU_COUNT(c == trycolor ?
+ CPU_COUNT_COLORHIT : CPU_COUNT_COLORMISS, 1);
+ CPU_COUNT(CPU_COUNT_CPUMISS, 1);
+ *trycolorp = c;
+ return pg;
}
- color = (color + 1) & uvmexp.colormask;
- } while (color != trycolor);
+ c = (c + 1) & colormask;
+ } while (c != trycolor);
+ mutex_spin_exit(lock);
- return (NULL);
+ return NULL;
+}
- gotit:
- LIST_REMOVE(pg, pageq.list); /* global list */
- LIST_REMOVE(pg, listq.list); /* per-cpu list */
- uvmexp.free--;
-
- if (color == trycolor)
- CPU_COUNT(CPU_COUNT_COLORHIT, 1);
- else {
- CPU_COUNT(CPU_COUNT_COLORMISS, 1);
- *trycolorp = color;
+/*
+ * uvm_pagealloc_pgfl: helper routine for uvm_pagealloc_strat that allocates
+ * any color from any bucket, in a specific freelist.
+ *
+ * => must be at IPL_VM or higher to protect per-CPU data structures.
+ */
+
+static struct vm_page *
+uvm_pagealloc_pgfl(struct uvm_cpu *ucpu, int f, int *trycolorp, int flags)
+{
+ int b, trybucket, bucketcount;
+ struct vm_page *pg;
+
+ /* Try for the exact thing in the per-CPU cache. */
+ if ((pg = uvm_pgflcache_alloc(ucpu, f, *trycolorp)) != NULL) {
+ CPU_COUNT(CPU_COUNT_CPUHIT, 1);
+ CPU_COUNT(CPU_COUNT_COLORHIT, 1);
+ return pg;
}
- return (pg);
+ /* Walk through all buckets, trying our preferred bucket first. */
+ trybucket = ucpu->pgflbucket;
+ b = trybucket;
+ bucketcount = uvm.bucketcount;
+ do {
+ pg = uvm_pagealloc_pgb(ucpu, f, b, trycolorp, flags);
+ if (pg != NULL) {
+ return pg;
+ }
+ b = (b + 1 == bucketcount ? 0 : b + 1);
+ } while (b != trybucket);
+
+ return NULL;
}
/*
@@ -861,8 +1148,8 @@ struct vm_page *
uvm_pagealloc_strat(struct uvm_object *obj, voff_t off, struct vm_anon *anon,
int flags, int strat, int free_list)
{
- int try1, try2, zeroit = 0, color;
- int lcv, error;
+ int zeroit = 0, color;
+ int lcv, error, s;
struct uvm_cpu *ucpu;
struct vm_page *pg;
lwp_t *l;
@@ -879,21 +1166,15 @@ uvm_pagealloc_strat(struct uvm_object *o
* algorithm.
*/
+ s = splvm();
ucpu = curcpu()->ci_data.cpu_uvm;
if (flags & UVM_FLAG_COLORMATCH) {
color = atop(off) & uvmexp.colormask;
} else {
- color = ucpu->page_free_nextcolor;
+ color = ucpu->pgflcolor;
}
/*
- * check to see if we need to generate some free pages waking
- * the pagedaemon.
- */
-
- uvm_kick_pdaemon();
-
- /*
* fail if any of these conditions is true:
* [1] there really are no free pages, or
* [2] only kernel "reserved" pages remain and
@@ -903,55 +1184,40 @@ uvm_pagealloc_strat(struct uvm_object *o
* we make kernel reserve pages available if called by a
* kernel thread or a realtime thread.
*/
- mutex_spin_enter(&uvm_fpageqlock);
l = curlwp;
if (__predict_true(l != NULL) && lwp_eprio(l) >= PRI_KTHREAD) {
flags |= UVM_PGA_USERESERVE;
}
- if ((uvmexp.free <= uvmexp.reserve_kernel &&
- (flags & UVM_PGA_USERESERVE) == 0) ||
- (uvmexp.free <= uvmexp.reserve_pagedaemon &&
- curlwp != uvm.pagedaemon_lwp))
- goto fail;
-
-#if PGFL_NQUEUES != 2
-#error uvm_pagealloc_strat needs to be updated
-#endif
- /*
- * If we want a zero'd page, try the ZEROS queue first, otherwise
- * we try the UNKNOWN queue first.
- */
- if (flags & UVM_PGA_ZERO) {
- try1 = PGFL_ZEROS;
- try2 = PGFL_UNKNOWN;
- } else {
- try1 = PGFL_UNKNOWN;
- try2 = PGFL_ZEROS;
+ /* If the allocator's running in NUMA mode, go with NUMA strategy. */
+ if (uvm.numa_alloc && strat == UVM_PGA_STRAT_NORMAL) {
+ strat = UVM_PGA_STRAT_NUMA;
}
again:
switch (strat) {
case UVM_PGA_STRAT_NORMAL:
- /* Check freelists: descending priority (ascending id) order */
+ /* Check freelists: descending priority (ascending id) order. */
for (lcv = 0; lcv < VM_NFREELIST; lcv++) {
- pg = uvm_pagealloc_pgfl(ucpu, lcv,
- try1, try2, &color);
- if (pg != NULL)
+ pg = uvm_pagealloc_pgfl(ucpu, lcv, &color, flags);
+ if (pg != NULL) {
goto gotit;
+ }
}
- /* No pages free! */
- goto fail;
+ /* No pages free! Have pagedaemon free some memory. */
+ splx(s);
+ uvm_kick_pdaemon();
+ return NULL;
case UVM_PGA_STRAT_ONLY:
case UVM_PGA_STRAT_FALLBACK:
/* Attempt to allocate from the specified free list. */
KASSERT(free_list >= 0 && free_list < VM_NFREELIST);
- pg = uvm_pagealloc_pgfl(ucpu, free_list,
- try1, try2, &color);
- if (pg != NULL)
+ pg = uvm_pagealloc_pgfl(ucpu, free_list, &color, flags);
+ if (pg != NULL) {
goto gotit;
+ }
/* Fall back, if possible. */
if (strat == UVM_PGA_STRAT_FALLBACK) {
@@ -959,8 +1225,33 @@ uvm_pagealloc_strat(struct uvm_object *o
goto again;
}
- /* No pages free! */
- goto fail;
+ /* No pages free! Have pagedaemon free some memory. */
+ splx(s);
+ uvm_kick_pdaemon();
+ return NULL;
+
+ case UVM_PGA_STRAT_NUMA:
+ /*
+ * NUMA strategy: allocating from the correct bucket is more
+ * important than observing freelist priority. Look only to
+ * the current NUMA node; if that fails, we need to look to
+ * other NUMA nodes, so retry with the normal strategy.
+ */
+ for (lcv = 0; lcv < VM_NFREELIST; lcv++) {
+ pg = uvm_pgflcache_alloc(ucpu, lcv, color);
+ if (pg != NULL) {
+ CPU_COUNT(CPU_COUNT_CPUHIT, 1);
+ CPU_COUNT(CPU_COUNT_COLORHIT, 1);
+ goto gotit;
+ }
+ pg = uvm_pagealloc_pgb(ucpu, lcv,
+ ucpu->pgflbucket, &color, flags);
+ if (pg != NULL) {
+ goto gotit;
+ }
+ }
+ strat = UVM_PGA_STRAT_NORMAL;
+ goto again;
default:
panic("uvm_pagealloc_strat: bad strat %d", strat);
@@ -973,11 +1264,11 @@ uvm_pagealloc_strat(struct uvm_object *o
* the next color accordingly.
*/
- ucpu->page_free_nextcolor = (color + 1) & uvmexp.colormask;
+ ucpu->pgflcolor = (color + 1) & uvmexp.colormask;
/*
- * update allocation statistics and remember if we have to
- * zero the page
+ * while still at IPL_VM, update allocation statistics and remember
+ * if we have to zero the page
*/
if (flags & UVM_PGA_ZERO) {
@@ -988,9 +1279,6 @@ uvm_pagealloc_strat(struct uvm_object *o
CPU_COUNT(CPU_COUNT_PGA_ZEROMISS, 1);
zeroit = 1;
}
- if (ucpu->pages[PGFL_ZEROS] < ucpu->pages[PGFL_UNKNOWN]) {
- ucpu->page_idle_zero = vm_page_zero_enable;
- }
}
if (pg->flags & PG_ZERO) {
CPU_COUNT(CPU_COUNT_ZEROPAGES, -1);
@@ -998,12 +1286,9 @@ uvm_pagealloc_strat(struct uvm_object *o
if (anon) {
CPU_COUNT(CPU_COUNT_ANONPAGES, 1);
}
+ splx(s);
KASSERT((pg->flags & ~(PG_ZERO|PG_FREE)) == 0);
- /* mark the page as allocated and then drop uvm_fpageqlock. */
- pg->flags &= ~PG_FREE;
- mutex_spin_exit(&uvm_fpageqlock);
-
/*
* assign the page to the object. as the page was free, we know
* that pg->uobject and pg->uanon are NULL. we only need to take
@@ -1050,10 +1335,6 @@ uvm_pagealloc_strat(struct uvm_object *o
}
return(pg);
-
- fail:
- mutex_spin_exit(&uvm_fpageqlock);
- return (NULL);
}
/*
@@ -1133,7 +1414,6 @@ uvm_pagezerocheck(struct vm_page *pg)
int *p, *ep;
KASSERT(uvm_zerocheckkva != 0);
- KASSERT(mutex_owned(&uvm_fpageqlock));
/*
* XXX assuming pmap_kenter_pa and pmap_kremove never call
@@ -1170,10 +1450,12 @@ uvm_pagezerocheck(struct vm_page *pg)
void
uvm_pagefree(struct vm_page *pg)
{
- struct pgflist *pgfl;
+ struct pgfreelist *pgfl;
+ struct pgflbucket *pgb;
struct uvm_cpu *ucpu;
- int index, color, queue;
- bool iszero, locked;
+ kmutex_t *lock;
+ int bucket, s;
+ bool locked;
#ifdef DEBUG
if (pg->uobject == (void *)0xdeadbeef &&
@@ -1184,7 +1466,6 @@ uvm_pagefree(struct vm_page *pg)
KASSERT((pg->flags & PG_PAGEOUT) == 0);
KASSERT(!(pg->flags & PG_FREE));
- //KASSERT(mutex_owned(&uvm_pageqlock) || !uvmpdpol_pageisqueued_p(pg));
KASSERT(pg->uobject == NULL || mutex_owned(pg->uobject->vmobjlock));
KASSERT(pg->uobject != NULL || pg->uanon == NULL ||
mutex_owned(pg->uanon->an_lock));
@@ -1285,44 +1566,46 @@ uvm_pagefree(struct vm_page *pg)
* and put on free queue
*/
- iszero = (pg->flags & PG_ZERO);
- index = uvm_page_get_freelist(pg);
- color = VM_PGCOLOR(pg);
- queue = (iszero ? PGFL_ZEROS : PGFL_UNKNOWN);
-
#ifdef DEBUG
pg->uobject = (void *)0xdeadbeef;
pg->uanon = (void *)0xdeadbeef;
-#endif
-
- mutex_spin_enter(&uvm_fpageqlock);
- pg->flags = PG_FREE;
-
-#ifdef DEBUG
- if (iszero)
+ if (pg->flags & PG_ZERO)
uvm_pagezerocheck(pg);
#endif /* DEBUG */
+ s = splvm();
+ ucpu = curcpu()->ci_data.cpu_uvm;
- /* global list */
- pgfl = &uvm.page_free[index].pgfl_buckets[color].pgfl_queues[queue];
- LIST_INSERT_HEAD(pgfl, pg, pageq.list);
- uvmexp.free++;
- if (iszero) {
- CPU_COUNT(CPU_COUNT_ZEROPAGES, 1);
+ /*
+ * If we're using the NUMA strategy, we'll only cache this page if
+ * it came from the local CPU's NUMA node. Otherwise we're using
+ * the L2/L3 cache locality strategy and we'll cache anything.
+ */
+ if (uvm.numa_alloc) {
+ bucket = uvm_page_get_bucket(pg);
+ } else {
+ bucket = ucpu->pgflbucket;
+ uvm_page_set_bucket(pg, bucket);
}
- /* per-cpu list */
- ucpu = curcpu()->ci_data.cpu_uvm;
- pg->offset = (uintptr_t)ucpu;
- pgfl = &ucpu->page_free[index].pgfl_buckets[color].pgfl_queues[queue];
- LIST_INSERT_HEAD(pgfl, pg, listq.list);
- ucpu->pages[queue]++;
- if (ucpu->pages[PGFL_ZEROS] < ucpu->pages[PGFL_UNKNOWN]) {
- ucpu->page_idle_zero = vm_page_zero_enable;
+ /* Try to send the page to the per-CPU cache. */
+ if (bucket == ucpu->pgflbucket && uvm_pgflcache_free(ucpu, pg)) {
+ splx(s);
+ return;
}
- mutex_spin_exit(&uvm_fpageqlock);
+ /* Didn't work. Never mind, send it to a global bucket. */
+ pgfl = &uvm.page_free[uvm_page_get_freelist(pg)];
+ pgb = pgfl->pgfl_buckets[bucket];
+ lock = &uvm_freelist_locks[bucket].lock;
+
+ mutex_spin_enter(lock);
+ /* PG_FREE must be set under lock because of uvm_pglistalloc(). */
+ pg->flags = (pg->flags & PG_ZERO) | PG_FREE;
+ LIST_INSERT_HEAD(&pgb->pgb_colors[VM_PGCOLOR(pg)], pg, pageq.list);
+ pgb->pgb_nfree++;
+ mutex_spin_exit(lock);
+ splx(s);
}
/*
@@ -1411,116 +1694,22 @@ uvm_page_own(struct vm_page *pg, const c
"page (%p)\n", pg);
panic("uvm_page_own");
}
- if (!uvmpdpol_pageisqueued_p(pg)) {
- KASSERT((pg->uanon == NULL && pg->uobject == NULL) ||
- pg->wire_count > 0);
- } else {
- KASSERT(pg->wire_count == 0);
- }
pg->owner_tag = NULL;
}
#endif
/*
* uvm_pageidlezero: zero free pages while the system is idle.
- *
- * => try to complete one color bucket at a time, to reduce our impact
- * on the CPU cache.
- * => we loop until we either reach the target or there is a lwp ready
- * to run, or MD code detects a reason to break early.
*/
void
uvm_pageidlezero(void)
{
- struct vm_page *pg;
- struct pgfreelist *pgfl, *gpgfl;
- struct uvm_cpu *ucpu;
- int free_list, firstbucket, nextbucket;
- bool lcont = false;
- ucpu = curcpu()->ci_data.cpu_uvm;
- if (!ucpu->page_idle_zero ||
- ucpu->pages[PGFL_UNKNOWN] < uvmexp.ncolors) {
- ucpu->page_idle_zero = false;
- return;
- }
- if (!mutex_tryenter(&uvm_fpageqlock)) {
- /* Contention: let other CPUs to use the lock. */
- return;
- }
- firstbucket = ucpu->page_free_nextcolor;
- nextbucket = firstbucket;
- do {
- for (free_list = 0; free_list < VM_NFREELIST; free_list++) {
- if (sched_curcpu_runnable_p()) {
- goto quit;
- }
- pgfl = &ucpu->page_free[free_list];
- gpgfl = &uvm.page_free[free_list];
- while ((pg = LIST_FIRST(&pgfl->pgfl_buckets[
- nextbucket].pgfl_queues[PGFL_UNKNOWN])) != NULL) {
- if (lcont || sched_curcpu_runnable_p()) {
- goto quit;
- }
- LIST_REMOVE(pg, pageq.list); /* global list */
- LIST_REMOVE(pg, listq.list); /* per-cpu list */
- ucpu->pages[PGFL_UNKNOWN]--;
- uvmexp.free--;
- KASSERT(pg->flags == PG_FREE);
- pg->flags = 0;
- mutex_spin_exit(&uvm_fpageqlock);
-#ifdef PMAP_PAGEIDLEZERO
- if (!PMAP_PAGEIDLEZERO(VM_PAGE_TO_PHYS(pg))) {
-
- /*
- * The machine-dependent code detected
- * some reason for us to abort zeroing
- * pages, probably because there is a
- * process now ready to run.
- */
-
- mutex_spin_enter(&uvm_fpageqlock);
- pg->flags = PG_FREE;
- LIST_INSERT_HEAD(&gpgfl->pgfl_buckets[
- nextbucket].pgfl_queues[
- PGFL_UNKNOWN], pg, pageq.list);
- LIST_INSERT_HEAD(&pgfl->pgfl_buckets[
- nextbucket].pgfl_queues[
- PGFL_UNKNOWN], pg, listq.list);
- ucpu->pages[PGFL_UNKNOWN]++;
- uvmexp.free++;
- uvmexp.zeroaborts++;
- goto quit;
- }
-#else
- pmap_zero_page(VM_PAGE_TO_PHYS(pg));
-#endif /* PMAP_PAGEIDLEZERO */
- if (!mutex_tryenter(&uvm_fpageqlock)) {
- lcont = true;
- mutex_spin_enter(&uvm_fpageqlock);
- } else {
- lcont = false;
- }
- pg->flags = PG_FREE | PG_ZERO;
- LIST_INSERT_HEAD(&gpgfl->pgfl_buckets[
- nextbucket].pgfl_queues[PGFL_ZEROS],
- pg, pageq.list);
- LIST_INSERT_HEAD(&pgfl->pgfl_buckets[
- nextbucket].pgfl_queues[PGFL_ZEROS],
- pg, listq.list);
- ucpu->pages[PGFL_ZEROS]++;
- uvmexp.free++;
- CPU_COUNT(CPU_COUNT_ZEROPAGES, 1);
- }
- }
- if (ucpu->pages[PGFL_UNKNOWN] < uvmexp.ncolors) {
- break;
- }
- nextbucket = (nextbucket + 1) & uvmexp.colormask;
- } while (nextbucket != firstbucket);
- ucpu->page_idle_zero = false;
- quit:
- mutex_spin_exit(&uvm_fpageqlock);
+ /*
+ * Disabled for the moment. Previous strategy too cache heavy. In
+ * the future we may experiment with zeroing the pages held in the
+ * per-CPU cache (uvm_pgflcache).
+ */
}
/*
@@ -1800,6 +1989,7 @@ uvm_page_printit(struct vm_page *pg, boo
{
struct vm_page *tpg;
struct uvm_object *uobj;
+ struct pgflbucket *pgb;
struct pgflist *pgl;
char pgbuf[128];
@@ -1848,14 +2038,9 @@ uvm_page_printit(struct vm_page *pg, boo
/* cross-verify page queue */
if (pg->flags & PG_FREE) {
int fl = uvm_page_get_freelist(pg);
- int color = VM_PGCOLOR(pg);
- pgl = &uvm.page_free[fl].pgfl_buckets[color].pgfl_queues[
- ((pg)->flags & PG_ZERO) ? PGFL_ZEROS : PGFL_UNKNOWN];
- } else {
- pgl = NULL;
- }
-
- if (pgl) {
+ int b = uvm_page_get_bucket(pg);
+ pgb = uvm.page_free[fl].pgfl_buckets[b];
+ pgl = &pgb->pgb_colors[VM_PGCOLOR(pg)];
(*pr)(" checking pageq list\n");
LIST_FOREACH(tpg, pgl, pageq.list) {
if (tpg == pg) {
@@ -1905,4 +2090,36 @@ uvm_page_printall(void (*pr)(const char
}
}
+/*
+ * uvm_page_print_freelists - print a summary of the freelists via pr():
+ * every bucket's free count (a uintptr_t, hence %lu) and color list heads.
+ */
+void
+uvm_page_print_freelists(void (*pr)(const char *, ...))
+{
+ struct pgfreelist *pgfl;
+ struct pgflbucket *pgb;
+ int fl, b, c;
+
+ (*pr)("There are %d freelists with %d buckets of %d colors.\n\n",
+ VM_NFREELIST, uvm.bucketcount, uvmexp.ncolors);
+
+ for (fl = 0; fl < VM_NFREELIST; fl++) {
+ pgfl = &uvm.page_free[fl];
+ (*pr)("freelist(%d) @ %p\n", fl, pgfl);
+ for (b = 0; b < uvm.bucketcount; b++) {
+ pgb = uvm.page_free[fl].pgfl_buckets[b];
+ (*pr)(" bucket(%d) @ %p, nfree = %lu, lock @ %p:\n",
+ b, pgb, (unsigned long)pgb->pgb_nfree,
+ &uvm_freelist_locks[b].lock);
+ for (c = 0; c < uvmexp.ncolors; c++) {
+ (*pr)(" color(%d) @ %p, ", c,
+ &pgb->pgb_colors[c]);
+ (*pr)("first page = %p\n",
+ LIST_FIRST(&pgb->pgb_colors[c]));
+ }
+ }
+ }
+}
+
#endif /* DDB || DEBUGPRINT */
Index: src/sys/uvm/uvm_page.h
diff -u src/sys/uvm/uvm_page.h:1.88 src/sys/uvm/uvm_page.h:1.89
--- src/sys/uvm/uvm_page.h:1.88 Sat Dec 21 14:41:44 2019
+++ src/sys/uvm/uvm_page.h Fri Dec 27 12:51:57 2019
@@ -1,4 +1,4 @@
-/* $NetBSD: uvm_page.h,v 1.88 2019/12/21 14:41:44 ad Exp $ */
+/* $NetBSD: uvm_page.h,v 1.89 2019/12/27 12:51:57 ad Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
@@ -119,7 +119,6 @@
*
* o free
* => pageq.list is entry on global free page queue
- * => listq.list is entry on per-CPU free page queue
* => uanon is unused (or (void *)0xdeadbeef for DEBUG)
* => uobject is unused (or (void *)0xdeadbeef for DEBUG)
* => PG_FREE is set in flags
@@ -129,13 +128,11 @@
* => uobject is owner
* o owned by a vm_anon
* => pageq is unused (XXX correct?)
- * => listq is unused (XXX correct?)
* => uanon is owner
* => uobject is NULL
* => PG_ANON is set in flags
* o allocated by uvm_pglistalloc
* => pageq.queue is entry on resulting pglist, owned by caller
- * => listq is unused (XXX correct?)
* => uanon is unused
* => uobject is unused
*
@@ -153,11 +150,6 @@ struct vm_page {
* or uvm_pglistalloc output */
LIST_ENTRY(vm_page) list; /* f: global free page queue */
} pageq;
-
- union {
- LIST_ENTRY(vm_page) list; /* f: CPU free page queue */
- } listq;
-
struct vm_anon *uanon; /* o,i: anon */
struct uvm_object *uobject; /* o,i: object */
voff_t offset; /* o: offset into object */
@@ -302,6 +294,7 @@ void uvm_page_own(struct vm_page *, cons
bool uvm_page_physget(paddr_t *);
#endif
void uvm_page_recolor(int);
+void uvm_page_rebucket(void);
void uvm_pageidlezero(void);
void uvm_pageactivate(struct vm_page *);
@@ -318,6 +311,8 @@ void uvm_pagewire(struct vm_page *);
void uvm_pagezero(struct vm_page *);
bool uvm_pageismanaged(paddr_t);
bool uvm_page_locked_p(struct vm_page *);
+void uvm_pgfl_lock(void);
+void uvm_pgfl_unlock(void);
int uvm_page_lookup_freelist(struct vm_page *);
@@ -348,8 +343,12 @@ int uvm_direct_process(struct vm_page **
#define VM_PGCOLOR(pg) \
(atop(VM_PAGE_TO_PHYS((pg))) & uvmexp.colormask)
#define PHYS_TO_VM_PAGE(pa) uvm_phys_to_vm_page(pa)
+
+/*
+ * VM_PAGE_IS_FREE() can't tell if the page is on the global free list or in
+ * a per-CPU cache. If you need to be certain, pause caching.
+ */
#define VM_PAGE_IS_FREE(entry) ((entry)->flags & PG_FREE)
-#define VM_FREE_PAGE_TO_CPU(pg) ((struct uvm_cpu *)((uintptr_t)pg->offset))
/*
* Use the lower 10 bits of pg->phys_addr to cache some some locators for
Index: src/sys/uvm/uvm_pglist.c
diff -u src/sys/uvm/uvm_pglist.c:1.77 src/sys/uvm/uvm_pglist.c:1.78
--- src/sys/uvm/uvm_pglist.c:1.77 Sat Dec 21 14:50:34 2019
+++ src/sys/uvm/uvm_pglist.c Fri Dec 27 12:51:57 2019
@@ -1,12 +1,12 @@
-/* $NetBSD: uvm_pglist.c,v 1.77 2019/12/21 14:50:34 ad Exp $ */
+/* $NetBSD: uvm_pglist.c,v 1.78 2019/12/27 12:51:57 ad Exp $ */
/*-
- * Copyright (c) 1997 The NetBSD Foundation, Inc.
+ * Copyright (c) 1997, 2019 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
- * NASA Ames Research Center.
+ * NASA Ames Research Center, and by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -35,13 +35,14 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: uvm_pglist.c,v 1.77 2019/12/21 14:50:34 ad Exp $");
+__KERNEL_RCSID(0, "$NetBSD: uvm_pglist.c,v 1.78 2019/12/27 12:51:57 ad Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <uvm/uvm.h>
#include <uvm/uvm_pdpolicy.h>
+#include <uvm/uvm_pgflcache.h>
#ifdef VM_PAGE_ALLOC_MEMORY_STATS
#define STAT_INCR(v) (v)++
@@ -79,34 +80,25 @@ u_long uvm_pglistalloc_npages;
static void
uvm_pglist_add(struct vm_page *pg, struct pglist *rlist)
{
- int free_list __unused, color __unused, pgflidx;
+ struct pgfreelist *pgfl;
+ struct pgflbucket *pgb;
- KASSERT(mutex_owned(&uvm_fpageqlock));
+ pgfl = &uvm.page_free[uvm_page_get_freelist(pg)];
+ pgb = pgfl->pgfl_buckets[uvm_page_get_bucket(pg)];
-#if PGFL_NQUEUES != 2
-#error uvm_pglistalloc needs to be updated
-#endif
-
- free_list = uvm_page_get_freelist(pg);
- color = VM_PGCOLOR(pg);
- pgflidx = (pg->flags & PG_ZERO) ? PGFL_ZEROS : PGFL_UNKNOWN;
#ifdef UVMDEBUG
struct vm_page *tp;
- LIST_FOREACH(tp,
- &uvm.page_free[free_list].pgfl_buckets[color].pgfl_queues[pgflidx],
- pageq.list) {
+ LIST_FOREACH(tp, &pgb->pgb_colors[VM_PGCOLOR(pg)], pageq.list) {
if (tp == pg)
break;
}
if (tp == NULL)
panic("uvm_pglistalloc: page not on freelist");
#endif
- LIST_REMOVE(pg, pageq.list); /* global */
- LIST_REMOVE(pg, listq.list); /* cpu */
- uvmexp.free--;
+ LIST_REMOVE(pg, pageq.list);
+ pgb->pgb_nfree--;
if (pg->flags & PG_ZERO)
CPU_COUNT(CPU_COUNT_ZEROPAGES, -1);
- VM_FREE_PAGE_TO_CPU(pg)->pages[pgflidx]--;
pg->flags = PG_CLEAN;
pg->uobject = NULL;
pg->uanon = NULL;
@@ -129,8 +121,6 @@ uvm_pglistalloc_c_ps(uvm_physseg_t psi,
printf("pgalloc: contig %d pgs from psi %zd\n", num, ps - vm_physmem);
#endif
- KASSERT(mutex_owned(&uvm_fpageqlock));
-
low = atop(low);
high = atop(high);
alignment = atop(alignment);
@@ -316,7 +306,7 @@ uvm_pglistalloc_contig(int num, paddr_t
/*
* Block all memory allocation and lock the free list.
*/
- mutex_spin_enter(&uvm_fpageqlock);
+ uvm_pgfl_lock();
/* Are there even any free pages? */
if (uvm_free() <= (uvmexp.reserve_pagedaemon + uvmexp.reserve_kernel))
@@ -352,7 +342,7 @@ out:
* the pagedaemon.
*/
- mutex_spin_exit(&uvm_fpageqlock);
+ uvm_pgfl_unlock();
uvm_kick_pdaemon();
return (error);
}
@@ -368,7 +358,6 @@ uvm_pglistalloc_s_ps(uvm_physseg_t psi,
printf("pgalloc: simple %d pgs from psi %zd\n", num, psi);
#endif
- KASSERT(mutex_owned(&uvm_fpageqlock));
KASSERT(uvm_physseg_get_start(psi) <= uvm_physseg_get_avail_start(psi));
KASSERT(uvm_physseg_get_start(psi) <= uvm_physseg_get_avail_end(psi));
KASSERT(uvm_physseg_get_avail_start(psi) <= uvm_physseg_get_end(psi));
@@ -461,7 +450,7 @@ again:
/*
* Block all memory allocation and lock the free list.
*/
- mutex_spin_enter(&uvm_fpageqlock);
+ uvm_pgfl_lock();
count++;
/* Are there even any free pages? */
@@ -493,7 +482,7 @@ out:
* the pagedaemon.
*/
- mutex_spin_exit(&uvm_fpageqlock);
+ uvm_pgfl_unlock();
uvm_kick_pdaemon();
if (error) {
@@ -539,6 +528,12 @@ uvm_pglistalloc(psize_t size, paddr_t lo
TAILQ_INIT(rlist);
+ /*
+ * Turn off the caching of free pages - we need everything to be on
+ * the global freelists.
+ */
+ uvm_pgflcache_pause();
+
if ((nsegs < size >> PAGE_SHIFT) || (alignment != PAGE_SIZE) ||
(boundary != 0))
res = uvm_pglistalloc_contig(num, low, high, alignment,
@@ -546,6 +541,8 @@ uvm_pglistalloc(psize_t size, paddr_t lo
else
res = uvm_pglistalloc_simple(num, low, high, rlist, waitok);
+ uvm_pgflcache_resume();
+
return (res);
}
@@ -558,45 +555,34 @@ uvm_pglistalloc(psize_t size, paddr_t lo
void
uvm_pglistfree(struct pglist *list)
{
- struct uvm_cpu *ucpu;
+ struct pgfreelist *pgfl;
+ struct pgflbucket *pgb;
struct vm_page *pg;
- int index, color, queue;
- bool iszero;
+ int c, b;
/*
* Lock the free list and free each page.
*/
- mutex_spin_enter(&uvm_fpageqlock);
- ucpu = curcpu()->ci_data.cpu_uvm;
+ uvm_pgfl_lock();
while ((pg = TAILQ_FIRST(list)) != NULL) {
- KASSERT(!uvmpdpol_pageisqueued_p(pg));
TAILQ_REMOVE(list, pg, pageq.queue);
- iszero = (pg->flags & PG_ZERO);
pg->flags = (pg->flags & PG_ZERO) | PG_FREE;
#ifdef DEBUG
pg->uobject = (void *)0xdeadbeef;
pg->uanon = (void *)0xdeadbeef;
-#endif /* DEBUG */
-#ifdef DEBUG
- if (iszero)
+ if (pg->flags & PG_ZERO)
uvm_pagezerocheck(pg);
#endif /* DEBUG */
- index = uvm_page_get_freelist(pg);
- color = VM_PGCOLOR(pg);
- queue = iszero ? PGFL_ZEROS : PGFL_UNKNOWN;
- pg->offset = (uintptr_t)ucpu;
- LIST_INSERT_HEAD(&uvm.page_free[index].pgfl_buckets[color].
- pgfl_queues[queue], pg, pageq.list);
- LIST_INSERT_HEAD(&ucpu->page_free[index].pgfl_buckets[color].
- pgfl_queues[queue], pg, listq.list);
- uvmexp.free++;
- if (iszero)
+ c = VM_PGCOLOR(pg);
+ b = uvm_page_get_bucket(pg);
+ pgfl = &uvm.page_free[uvm_page_get_freelist(pg)];
+ pgb = pgfl->pgfl_buckets[b];
+ if (pg->flags & PG_ZERO)
CPU_COUNT(CPU_COUNT_ZEROPAGES, 1);
- ucpu->pages[queue]++;
+ pgb->pgb_nfree++;
+ LIST_INSERT_HEAD(&pgb->pgb_colors[c], pg, pageq.list);
STAT_DECR(uvm_pglistalloc_npages);
}
- if (ucpu->pages[PGFL_ZEROS] < ucpu->pages[PGFL_UNKNOWN])
- ucpu->page_idle_zero = vm_page_zero_enable;
- mutex_spin_exit(&uvm_fpageqlock);
+ uvm_pgfl_unlock();
}
Index: src/sys/uvm/uvm_pglist.h
diff -u src/sys/uvm/uvm_pglist.h:1.8 src/sys/uvm/uvm_pglist.h:1.9
--- src/sys/uvm/uvm_pglist.h:1.8 Sat Nov 6 15:48:00 2010
+++ src/sys/uvm/uvm_pglist.h Fri Dec 27 12:51:57 2019
@@ -1,11 +1,11 @@
-/* $NetBSD: uvm_pglist.h,v 1.8 2010/11/06 15:48:00 uebayasi Exp $ */
+/* $NetBSD: uvm_pglist.h,v 1.9 2019/12/27 12:51:57 ad Exp $ */
/*-
- * Copyright (c) 2000, 2001, 2008 The NetBSD Foundation, Inc.
+ * Copyright (c) 2000, 2001, 2008, 2019 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
- * by Jason R. Thorpe.
+ * by Jason R. Thorpe, and by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -41,19 +41,51 @@ TAILQ_HEAD(pglist, vm_page);
LIST_HEAD(pgflist, vm_page);
/*
- * A page free list consists of free pages of unknown contents and free
- * pages of all zeros.
+ * The global uvm.page_free list (uvm_page.c, uvm_pglist.c). Free pages are
+ * stored according to freelist, bucket, and cache colour.
+ *
+ * pglist = &uvm.page_free[freelist].pgfl_buckets[bucket]->pgb_colors[color];
+ *
+ * Freelists provide a priority ordering of pages for allocation, based upon
+ * how valuable they are for special uses (e.g. device driver DMA).
+ *
+ * Pages are then grouped in buckets according to some common factor, for
+ * example L2/L3 cache locality. Each bucket has its own lock, and the
+ * locks are shared among freelists for the same numbered buckets.
+ *
+ * Inside each bucket, pages are further distributed by cache color.
+ *
+ * We want these data structures to occupy as few cache lines as possible,
+ * as they will be highly contended.
*/
-#define PGFL_UNKNOWN 0
-#define PGFL_ZEROS 1
-#define PGFL_NQUEUES 2
-
struct pgflbucket {
- struct pgflist pgfl_queues[PGFL_NQUEUES];
+ uintptr_t pgb_nfree; /* total # free pages, all colors */
+ struct pgflist pgb_colors[1]; /* variable size array */
};
+/*
+ * At the root, the freelists. MD code decides the number and structure of
+ * these. They are always arranged in descending order of allocation
+ * priority.
+ *
+ * 8 buckets should be enough to cover almost all current x86 systems (2019),
+ * given the way package/core/smt IDs are structured on x86. For systems
+ * that report high package counts despite having a single physical CPU
+ * package (e.g. Ampere eMAG) a little bit of sharing isn't going to hurt
+ * in the least.
+ */
+#define PGFL_MAX_BUCKETS 8
struct pgfreelist {
- struct pgflbucket *pgfl_buckets;
+ struct pgflbucket *pgfl_buckets[PGFL_MAX_BUCKETS];
+};
+
+/*
+ * Lock for each bucket.
+ */
+union uvm_freelist_lock {
+ kmutex_t lock;
+ uint8_t padding[COHERENCY_UNIT];
};
+extern union uvm_freelist_lock uvm_freelist_locks[PGFL_MAX_BUCKETS];
#endif /* _UVM_UVM_PGLIST_H_ */
Added files:
Index: src/sys/uvm/uvm_pgflcache.c
diff -u /dev/null src/sys/uvm/uvm_pgflcache.c:1.1
--- /dev/null Fri Dec 27 12:51:57 2019
+++ src/sys/uvm/uvm_pgflcache.c Fri Dec 27 12:51:57 2019
@@ -0,0 +1,471 @@
+/* $NetBSD: uvm_pgflcache.c,v 1.1 2019/12/27 12:51:57 ad Exp $ */
+
+/*-
+ * Copyright (c) 2019 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Andrew Doran.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * uvm_pgflcache.c: page freelist cache.
+ *
+ * This implements a tiny per-CPU cache of pages that sits between the main
+ * page allocator and the freelists. By allocating and freeing pages in
+ * batch, it reduces freelist contention by an order of magnitude.
+ *
+ * The cache can be paused & resumed at runtime so that UVM_HOTPLUG,
+ * uvm_pglistalloc() and uvm_page_redim() can have a consistent view of the
+ * world. On systems with one CPU per physical package (e.g. a uniprocessor)
+ * the cache is not enabled.
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: uvm_pgflcache.c,v 1.1 2019/12/27 12:51:57 ad Exp $");
+
+#include "opt_uvm.h"
+#include "opt_multiprocessor.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sched.h>
+#include <sys/kernel.h>
+#include <sys/vnode.h>
+#include <sys/proc.h>
+#include <sys/atomic.h>
+#include <sys/cpu.h>
+#include <sys/xcall.h>
+
+#include <uvm/uvm.h>
+#include <uvm/uvm_pglist.h>
+#include <uvm/uvm_pgflcache.h>
+
+/* There is no point doing any of this on a uniprocessor. */
+#ifdef MULTIPROCESSOR
+
+/*
+ * MAXPGS - maximum pages per color, per bucket.
+ * FILLPGS - number of pages to allocate at once, per color, per bucket.
+ *
+ * Why the chosen values:
+ *
+ * (1) In 2019, an average Intel system has 4kB pages and 8x L2 cache
+ * colors. We make the assumption that most of the time allocation activity
+ * will be centered around one UVM freelist, so most of the time there will
+ * be no more than 224kB worth of cached pages per-CPU. That's tiny, but
+ * enough to hugely reduce contention on the freelist locks, and give us a
+ * small pool of pages which if we're very lucky may have some L1/L2 cache
+ * locality, and do so without subtracting too much from the L2/L3 cache
+ * benefits of having per-package free lists in the page allocator.
+ *
+ * (2) With the chosen values on _LP64, the data structure for each color
+ * takes up a single cache line (64 bytes) giving this very low overhead
+ * even in the "miss" case.
+ *
+ * (3) We don't want to cause too much pressure by hiding away memory that
+ * could otherwise be put to good use.
+ */
+#define MAXPGS 7
+#define FILLPGS 6
+
+/* Variable size, according to # colors. */
+struct pgflcache {
+ struct pccolor {
+ intptr_t count;
+ struct vm_page *pages[MAXPGS];
+ } color[1];
+};
+
+static kmutex_t uvm_pgflcache_lock;
+static kcondvar_t uvm_pgflcache_cv;
+static int uvm_pgflcache_sem;
+static bool uvm_pgflcache_draining;
+
+/*
+ * uvm_pgflcache_fill: fill specified freelist/color from global list
+ *
+ * => must be called at IPL_VM
+ * => must be called with given bucket lock held
+ * => must only fill from the correct bucket for this CPU
+ */
+
+void
+uvm_pgflcache_fill(struct uvm_cpu *ucpu, int fl, int b, int c)
+{
+ struct pgflbucket *pgb;
+ struct pgflcache *pc;
+ struct pccolor *pcc;
+ struct pgflist *head;
+ struct vm_page *pg;
+ int count;
+
+ KASSERT(mutex_owned(&uvm_freelist_locks[b].lock));
+ KASSERT(ucpu->pgflbucket == b);
+
+ /* If caching is off, then bail out. */
+ if (__predict_false((pc = ucpu->pgflcache[fl]) == NULL)) {
+ return;
+ }
+
+ /* Fill only to the limit. */
+ pcc = &pc->color[c];
+ pgb = uvm.page_free[fl].pgfl_buckets[b];
+ head = &pgb->pgb_colors[c];
+ if (pcc->count >= FILLPGS) {
+ return;
+ }
+
+ /* Pull pages from the bucket until it's empty, or we are full. */
+ count = pcc->count;
+ pg = LIST_FIRST(head);
+ while (__predict_true(pg != NULL && count < FILLPGS)) {
+ KASSERT(pg->flags & PG_FREE);
+ KASSERT(uvm_page_get_bucket(pg) == b);
+ pcc->pages[count++] = pg;
+ pg = LIST_NEXT(pg, pageq.list);
+ }
+
+ /* Violate LIST abstraction to remove all pages at once. */
+ head->lh_first = pg;
+ if (__predict_true(pg != NULL)) {
+ pg->pageq.list.le_prev = &head->lh_first;
+ }
+ pgb->pgb_nfree -= (count - pcc->count);
+ pcc->count = count;
+}
+
+/*
+ * uvm_pgflcache_spill: spill specified freelist/color to global list
+ *
+ * => must be called at IPL_VM
+ * => mark __noinline so we don't pull it into uvm_pgflcache_free()
+ */
+
+static void __noinline
+uvm_pgflcache_spill(struct uvm_cpu *ucpu, int fl, int c)
+{
+ struct pgflbucket *pgb;
+ struct pgfreelist *pgfl;
+ struct pgflcache *pc;
+ struct pccolor *pcc;
+ struct pgflist *head;
+ kmutex_t *lock;
+ int b, adj;
+
+ pc = ucpu->pgflcache[fl];
+ pcc = &pc->color[c];
+ pgfl = &uvm.page_free[fl];
+ b = ucpu->pgflbucket;
+ pgb = pgfl->pgfl_buckets[b];
+ head = &pgb->pgb_colors[c];
+ lock = &uvm_freelist_locks[b].lock;
+
+ mutex_spin_enter(lock);
+ for (adj = pcc->count; pcc->count != 0;) {
+ pcc->count--;
+ KASSERT(pcc->pages[pcc->count] != NULL);
+ KASSERT(pcc->pages[pcc->count]->flags & PG_FREE);
+ LIST_INSERT_HEAD(head, pcc->pages[pcc->count], pageq.list);
+ }
+ pgb->pgb_nfree += adj;
+ mutex_spin_exit(lock);
+}
+
+/*
+ * uvm_pgflcache_alloc: try to allocate a cached page.
+ *
+ * => must be called at IPL_VM
+ * => allocate only from the given freelist and given page color
+ */
+
+struct vm_page *
+uvm_pgflcache_alloc(struct uvm_cpu *ucpu, int fl, int c)
+{
+ struct pgflcache *pc;
+ struct pccolor *pcc;
+ struct vm_page *pg;
+
+ /* If caching is off, then bail out. */
+ if (__predict_false((pc = ucpu->pgflcache[fl]) == NULL)) {
+ return NULL;
+ }
+
+ /* Very simple: if we have a page then return it. */
+ pcc = &pc->color[c];
+ if (__predict_false(pcc->count == 0)) {
+ return NULL;
+ }
+ pg = pcc->pages[--(pcc->count)];
+ KASSERT(pg != NULL);
+ KASSERT(pg->flags & PG_FREE);
+ KASSERT(uvm_page_get_freelist(pg) == fl);
+ KASSERT(uvm_page_get_bucket(pg) == ucpu->pgflbucket);
+ pg->flags &= PG_ZERO;
+ return pg;
+}
+
+/*
+ * uvm_pgflcache_free: cache a page, if possible.
+ *
+ * => must be called at IPL_VM
+ * => must only send pages for the correct bucket for this CPU
+ */
+
+bool
+uvm_pgflcache_free(struct uvm_cpu *ucpu, struct vm_page *pg)
+{
+ struct pgflcache *pc;
+ struct pccolor *pcc;
+ int fl, c;
+
+ KASSERT(uvm_page_get_bucket(pg) == ucpu->pgflbucket);
+
+ /* If caching is off, then bail out. */
+ fl = uvm_page_get_freelist(pg);
+ if (__predict_false((pc = ucpu->pgflcache[fl]) == NULL)) {
+ return false;
+ }
+
+ /* If the array is full spill it first, then add page to array. */
+ c = VM_PGCOLOR(pg);
+ pcc = &pc->color[c];
+ KASSERT((pg->flags & PG_FREE) == 0);
+ if (__predict_false(pcc->count == MAXPGS)) {
+ uvm_pgflcache_spill(ucpu, fl, c);
+ }
+ pg->flags = (pg->flags & PG_ZERO) | PG_FREE;
+ pcc->pages[pcc->count] = pg;
+ pcc->count++;
+ return true;
+}
+
+/*
+ * uvm_pgflcache_init_cpu: allocate and initialize per-CPU data structures for
+ * the free page cache. Don't set anything in motion - that's taken care
+ * of by uvm_pgflcache_resume().
+ */
+
+static void
+uvm_pgflcache_init_cpu(struct cpu_info *ci)
+{
+ struct uvm_cpu *ucpu;
+ size_t sz;
+
+ ucpu = ci->ci_data.cpu_uvm;
+ KASSERT(ucpu->pgflcachemem == NULL);
+ KASSERT(ucpu->pgflcache[0] == NULL);
+
+ sz = offsetof(struct pgflcache, color[uvmexp.ncolors]);
+ ucpu->pgflcachememsz =
+ (roundup2(sz * VM_NFREELIST, coherency_unit) + coherency_unit - 1);
+ ucpu->pgflcachemem = kmem_zalloc(ucpu->pgflcachememsz, KM_SLEEP);
+}
+
+/*
+ * uvm_pgflcache_fini_cpu: dump all cached pages back to global free list
+ * and shut down caching on the CPU. Called on each CPU in the system via
+ * xcall.
+ */
+
+static void
+uvm_pgflcache_fini_cpu(void *arg1 __unused, void *arg2 __unused)
+{
+ struct uvm_cpu *ucpu;
+ int fl, color, s;
+
+ ucpu = curcpu()->ci_data.cpu_uvm;
+ for (fl = 0; fl < VM_NFREELIST; fl++) {
+ s = splvm();
+ for (color = 0; color < uvmexp.ncolors; color++) {
+ uvm_pgflcache_spill(ucpu, fl, color);
+ }
+ ucpu->pgflcache[fl] = NULL;
+ splx(s);
+ }
+}
+
+/*
+ * uvm_pgflcache_pause: pause operation of the caches
+ */
+
+void
+uvm_pgflcache_pause(void)
+{
+ uint64_t where;
+
+ /* First one in starts draining. Everyone else waits. */
+ mutex_enter(&uvm_pgflcache_lock);
+ if (uvm_pgflcache_sem++ == 0) {
+ uvm_pgflcache_draining = true;
+ mutex_exit(&uvm_pgflcache_lock);
+ where = xc_broadcast(0, uvm_pgflcache_fini_cpu, NULL, NULL);
+ xc_wait(where);
+ mutex_enter(&uvm_pgflcache_lock);
+ uvm_pgflcache_draining = false;
+ cv_broadcast(&uvm_pgflcache_cv);
+ } else {
+ while (uvm_pgflcache_draining) {
+ cv_wait(&uvm_pgflcache_cv, &uvm_pgflcache_lock);
+ }
+ }
+ mutex_exit(&uvm_pgflcache_lock);
+}
+
+/*
+ * uvm_pgflcache_resume: resume operation of the caches
+ */
+
+void
+uvm_pgflcache_resume(void)
+{
+ CPU_INFO_ITERATOR cii;
+ struct cpu_info *ci;
+ struct uvm_cpu *ucpu;
+ uintptr_t addr;
+ size_t sz;
+ int fl;
+
+ /* Last guy out takes care of business. */
+ mutex_enter(&uvm_pgflcache_lock);
+ KASSERT(!uvm_pgflcache_draining);
+ KASSERT(uvm_pgflcache_sem > 0);
+ if (uvm_pgflcache_sem-- > 1) {
+ mutex_exit(&uvm_pgflcache_lock);
+ return;
+ }
+
+ /*
+ * Make sure dependent data structure updates are remotely visible.
+ * Essentially this functions as a global memory barrier.
+ */
+ xc_barrier(XC_HIGHPRI);
+
+ /*
+ * Then set all of the pointers in place on each CPU. As soon as
+ * each pointer is set, caching is operational in that dimension.
+ */
+ sz = offsetof(struct pgflcache, color[uvmexp.ncolors]);
+ for (CPU_INFO_FOREACH(cii, ci)) {
+ ucpu = ci->ci_data.cpu_uvm;
+ addr = roundup2((uintptr_t)ucpu->pgflcachemem, coherency_unit);
+ for (fl = 0; fl < VM_NFREELIST; fl++) {
+ ucpu->pgflcache[fl] = (struct pgflcache *)addr;
+ addr += sz;
+ }
+ }
+ mutex_exit(&uvm_pgflcache_lock);
+}
+
+/*
+ * uvm_pgflcache_start: start operation of the cache.
+ *
+ * => called once only, when init(8) is about to be started
+ */
+
+void
+uvm_pgflcache_start(void)
+{
+ CPU_INFO_ITERATOR cii;
+ struct cpu_info *ci;
+
+ KASSERT(uvm_pgflcache_sem > 0);
+
+ /*
+ * There's not much point doing this if every CPU has its own
+ * bucket (and that includes the uniprocessor case).
+ */
+ if (ncpu == uvm.bucketcount) {
+ return;
+ }
+
+ /* Allocate each CPU's cache memory. */
+ for (CPU_INFO_FOREACH(cii, ci)) {
+ uvm_pgflcache_init_cpu(ci);
+ }
+
+ /* Kick it into action. */
+ uvm_pgflcache_resume();
+}
+
+/*
+ * uvm_pgflcache_init: set up data structures for the free page cache.
+ */
+
+void
+uvm_pgflcache_init(void)
+{
+
+ uvm_pgflcache_sem = 1;
+ mutex_init(&uvm_pgflcache_lock, MUTEX_DEFAULT, IPL_NONE);
+ cv_init(&uvm_pgflcache_cv, "flcache");
+}
+
+#else /* MULTIPROCESSOR */
+
+struct vm_page *
+uvm_pgflcache_alloc(struct uvm_cpu *ucpu, int fl, int c)
+{
+
+ return NULL;
+}
+
+bool
+uvm_pgflcache_free(struct uvm_cpu *ucpu, struct vm_page *pg)
+{
+
+ return false;
+}
+
+void
+uvm_pgflcache_fill(struct uvm_cpu *ucpu, int fl, int b, int c)
+{
+
+}
+
+void
+uvm_pgflcache_pause(void)
+{
+
+}
+
+void
+uvm_pgflcache_resume(void)
+{
+
+}
+
+void
+uvm_pgflcache_start(void)
+{
+
+}
+
+void
+uvm_pgflcache_init(void)
+{
+
+}
+
+#endif /* MULTIPROCESSOR */
Index: src/sys/uvm/uvm_pgflcache.h
diff -u /dev/null src/sys/uvm/uvm_pgflcache.h:1.1
--- /dev/null Fri Dec 27 12:51:57 2019
+++ src/sys/uvm/uvm_pgflcache.h Fri Dec 27 12:51:57 2019
@@ -0,0 +1,43 @@
+/* $NetBSD: uvm_pgflcache.h,v 1.1 2019/12/27 12:51:57 ad Exp $ */
+
+/*-
+ * Copyright (c) 2019 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Andrew Doran.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#if !defined(_UVM_PGFLCACHE_H_)
+#define _UVM_PGFLCACHE_H_
+
+struct vm_page *uvm_pgflcache_alloc(struct uvm_cpu *, int, int);
+void uvm_pgflcache_fill(struct uvm_cpu *, int, int, int);
+bool uvm_pgflcache_free(struct uvm_cpu *, struct vm_page *);
+void uvm_pgflcache_init(void);
+void uvm_pgflcache_pause(void);
+void uvm_pgflcache_resume(void);
+void uvm_pgflcache_start(void);
+
+#endif /* !_UVM_PGFLCACHE_H_ */