Module Name:    src
Committed By:   pooka
Date:           Wed Sep  8 21:02:12 UTC 2010

Modified Files:
        src/sys/rump/librump/rumpkern: vm.c

Log Message:
Improve pagedaemon performance:
  * page out vnode objects
  * drain kmem/kernel_map
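
In outline, the reworked uvm_pageout() loop (full diff below) now
escalates through the following steps.  This is a condensed paraphrase
of the committed code for orientation, not the literal function body:

	for (;;) {
		/* sleep until a waiter appears or we pass ~90% of the limit */
		cv_timedwait(&pdaemoncv, &pdaemonmtx, 0);
		if (pdaemon_waiters == 0 && !NEED_PAGEDAEMON())
			continue;

		/* 1. return completely idle vm_page cache pages to the host */
		pool_cache_reclaim(&pagecache);

		/*
		 * 2. push out vnode pages, least recently looked up first:
		 *    uobj->pgops->pgo_put(uobj, off, off + PAGE_SIZE,
		 *        PGO_CLEANIT|PGO_FREE)
		 *    on up to PAGEDAEMON_OBJCHUNK pages, then reclaim the
		 *    page cache again.
		 */

		/* 3. run the kernel_map and kmem_map reclaim callbacks */
		callback_run_roundrobin(&kernel_map_store.vmk_reclaim_callback,
		    NULL);
		callback_run_roundrobin(&kmem_map_store.vmk_reclaim_callback,
		    NULL);

		/* 4. drain every pool in the system */
		pool_drain_start(&pp_first, &where);
		/* ... pool_drain_end() over all pools ... */
	}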

As long as there is a reasonable memory hard limit (>600kB or so),
a rump kernel can now survive file system metadata access for an
arbitrarily large file system (provided, of course, that the file
system does not use wired kernel memory for metadata ...).
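
For reference, imposing such a hard limit from a rump client could
look roughly like the sketch below.  It is illustrative only: it
assumes the RUMP_MEMLIMIT environment variable is how this librump
version picks up the limit, and the actual mounting of an on-disk
file system (which is what the metadata claim above is about) is
elided.  Such a program would typically be linked against librumpvfs,
librump and librumpuser.

#include <err.h>
#include <stdlib.h>

#include <rump/rump.h>
#include <rump/rump_syscalls.h>

int
main(void)
{

	/* assumption: RUMP_MEMLIMIT sets the hard limit (here ~1MB) */
	if (setenv("RUMP_MEMLIMIT", "1048576", 1) == -1)
		err(1, "setenv");

	if (rump_init() != 0)
		errx(1, "rump_init");

	/* ... mount a file system image and walk its metadata here ... */

	if (rump_sys_mkdir("/mnt", 0777) == -1)
		err(1, "mkdir");
	return 0;
}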

Data handling still needs a little give-and-take fine-tuning.  The
general problem is that a single vm object can easily own all of the
vm pages in a rump kernel.  Now, if a thread wants to allocate memory
while holding that object locked, there is very little the pagedaemon
can do to avoid deadlock.  I think the problem can be solved, though,
by making an object release a page when it wants to allocate one,
provided that a) the system is short on memory and b) too many pages
belong to the object.  That still does not take care of the
pathological situation where 1000 threads hold an object with 1 page
of memory locked and try to allocate more, but then again, running
1000 threads with <1MB of memory is an unlikely scenario.  Ultimately,
I call upon the fundamental interaction which is the basis of why any
operating system works: luck.
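
For concreteness, the proposed give-back-before-you-take fix might
look something like the following, written against the structures
this commit introduces.  It is a hypothetical sketch and not part of
the commit: maybe_release_own_page() and OBJ_PAGE_SHARE_MAX are
made-up names, and the threshold is arbitrary.

/*
 * Hypothetical sketch: give a page back before taking one.  Assumed
 * to be called from uvm_pagealloc_strat() with uobj->vmobjlock held,
 * before a new page is allocated for uobj.
 */
#define OBJ_PAGE_SHARE_MAX	90	/* percent; arbitrary threshold */

static void
maybe_release_own_page(struct uvm_object *uobj)
{
	struct vm_page *pg;

	/* only vnode pages can be paged out in a rump kernel */
	if (!UVM_OBJ_IS_VNODE(uobj))
		return;

	/* a) only when the system is short on memory */
	if (!NEED_PAGEDAEMON())
		return;

	/* b) only when this object owns "too many" of the queued pages */
	if (uobj->uo_npages * 100 < vmpage_onqueue * OBJ_PAGE_SHARE_MAX)
		return;

	/* flush and free one of the object's own pages to make room */
	pg = TAILQ_FIRST(&uobj->memq);
	if (pg != NULL && (pg->flags & PG_BUSY) == 0) {
		uobj->pgops->pgo_put(uobj, pg->offset,
		    pg->offset + PAGE_SIZE, PGO_CLEANIT|PGO_FREE);
		/* pgo_put released vmobjlock; reacquire for the caller */
		mutex_enter(&uobj->vmobjlock);
	}
}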


To generate a diff of this commit:
cvs rdiff -u -r1.91 -r1.92 src/sys/rump/librump/rumpkern/vm.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/rump/librump/rumpkern/vm.c
diff -u src/sys/rump/librump/rumpkern/vm.c:1.91 src/sys/rump/librump/rumpkern/vm.c:1.92
--- src/sys/rump/librump/rumpkern/vm.c:1.91	Tue Sep  7 21:11:10 2010
+++ src/sys/rump/librump/rumpkern/vm.c	Wed Sep  8 21:02:11 2010
@@ -1,4 +1,4 @@
-/*	$NetBSD: vm.c,v 1.91 2010/09/07 21:11:10 pooka Exp $	*/
+/*	$NetBSD: vm.c,v 1.92 2010/09/08 21:02:11 pooka Exp $	*/
 
 /*
  * Copyright (c) 2007-2010 Antti Kantee.  All Rights Reserved.
@@ -41,7 +41,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: vm.c,v 1.91 2010/09/07 21:11:10 pooka Exp $");
+__KERNEL_RCSID(0, "$NetBSD: vm.c,v 1.92 2010/09/08 21:02:11 pooka Exp $");
 
 #include <sys/param.h>
 #include <sys/atomic.h>
@@ -84,6 +84,26 @@
 
 unsigned long rump_physmemlimit = RUMPMEM_UNLIMITED;
 static unsigned long curphysmem;
+static unsigned long dddlim;		/* 90% of memory limit used */
+#define NEED_PAGEDAEMON() \
+    (rump_physmemlimit != RUMPMEM_UNLIMITED && curphysmem > dddlim)
+
+/*
+ * Try to free two pages worth of pages from objects.
+ * If this successfully frees a full page cache page, we'll
+ * free the released page plus PAGE_SIZE^2/sizeof(vm_page).
+ */
+#define PAGEDAEMON_OBJCHUNK (2*PAGE_SIZE / sizeof(struct vm_page))
+
+/*
+ * Keep a list of least recently used pages.  Since the only way a
+ * rump kernel can "access" a page is via lookup, we put the page
+ * at the back of queue every time a lookup for it is done.  If the
+ * page is in front of this global queue and we're short of memory, 
+ * it's a candidate for pageout.
+ */
+static struct pglist vmpage_lruqueue;
+static unsigned vmpage_onqueue;
 
 static int
 pg_compare_key(const struct rb_node *n, const void *key)
@@ -135,13 +155,18 @@
 
 static struct pool_cache pagecache;
 
-/* called with the object locked */
+/*
+ * Called with the object locked.  We don't support anons.
+ */
 struct vm_page *
 uvm_pagealloc_strat(struct uvm_object *uobj, voff_t off, struct vm_anon *anon,
 	int flags, int strat, int free_list)
 {
 	struct vm_page *pg;
 
+	KASSERT(uobj && mutex_owned(&uobj->vmobjlock));
+	KASSERT(anon == NULL);
+
 	pg = pool_cache_get(&pagecache, PR_WAITOK);
 	pg->offset = off;
 	pg->uobject = uobj;
@@ -154,6 +179,17 @@
 	TAILQ_INSERT_TAIL(&uobj->memq, pg, listq.queue);
 	rb_tree_insert_node(&uobj->rb_tree, &pg->rb_node);
 
+	/*
+	 * Put vnodes on the LRU page queue.  We can't flush others,
+	 * so don't bother with them.
+	 */
+	if (UVM_OBJ_IS_VNODE(uobj)) {
+		atomic_inc_uint(&vmpage_onqueue);
+		mutex_enter(&uvm_pageqlock);
+		TAILQ_INSERT_TAIL(&vmpage_lruqueue, pg, pageq.queue);
+		mutex_exit(&uvm_pageqlock);
+	}
+
 	uobj->uo_npages++;
 
 	return pg;
@@ -169,12 +205,21 @@
 {
 	struct uvm_object *uobj = pg->uobject;
 
+	KASSERT(mutex_owned(&uvm_pageqlock));
+
 	if (pg->flags & PG_WANTED)
 		wakeup(pg);
 
+	TAILQ_REMOVE(&uobj->memq, pg, listq.queue);
+
 	uobj->uo_npages--;
 	rb_tree_remove_node(&uobj->rb_tree, &pg->rb_node);
-	TAILQ_REMOVE(&uobj->memq, pg, listq.queue);
+
+	if (UVM_OBJ_IS_VNODE(uobj)) {
+		TAILQ_REMOVE(&vmpage_lruqueue, pg, pageq.queue);
+		atomic_dec_uint(&vmpage_onqueue);
+	}
+
 	pool_cache_put(&pagecache, pg);
 }
 
@@ -207,11 +252,14 @@
 		CTASSERT(sizeof(buf) >= HUMANIZE_BYTES);
 		format_bytes(buf, HUMANIZE_BYTES, rump_physmemlimit);
 #undef HUMANIZE_BYTES
+		dddlim = 9 * (rump_physmemlimit / 10);
 	} else {
 		strlcpy(buf, "unlimited (host limit)", sizeof(buf));
 	}
 	aprint_verbose("total memory = %s\n", buf);
 
+	TAILQ_INIT(&vmpage_lruqueue);
+
 	uvmexp.free = 1024*1024; /* XXX: arbitrary & not updated */
 
 	mutex_init(&pagermtx, MUTEX_DEFAULT, 0);
@@ -416,8 +464,17 @@
 struct vm_page *
 uvm_pagelookup(struct uvm_object *uobj, voff_t off)
 {
+	struct vm_page *pg;
 
-	return (struct vm_page *)rb_tree_find_node(&uobj->rb_tree, &off);
+	pg = (struct vm_page *)rb_tree_find_node(&uobj->rb_tree, &off);
+	if (pg && UVM_OBJ_IS_VNODE(pg->uobject)) {
+		mutex_enter(&uvm_pageqlock);
+		TAILQ_REMOVE(&vmpage_lruqueue, pg, pageq.queue);
+		TAILQ_INSERT_TAIL(&vmpage_lruqueue, pg, pageq.queue);
+		mutex_exit(&uvm_pageqlock);
+	}
+
+	return pg;
 }
 
 void
@@ -755,30 +812,125 @@
 }
 
 /*
- * Under-construction page mistress.  This is lacking vfs support, namely:
- *
- *  1) draining vfs buffers
- *  2) paging out pages in vm vnode objects
- *     (we will not page out anon memory on the basis that
- *     that's the task of the host)
+ * The Diabolical pageDaemon Director (DDD).
  */
-
 void
 uvm_pageout(void *arg)
 {
+	struct vm_page *pg;
 	struct pool *pp, *pp_first;
 	uint64_t where;
 	int timo = 0;
-	bool succ;
+	int cleaned, skip, skipped;
+	bool succ = false;
 
 	mutex_enter(&pdaemonmtx);
 	for (;;) {
-		cv_timedwait(&pdaemoncv, &pdaemonmtx, timo);
+		if (succ) {
+			kernel_map->flags &= ~VM_MAP_WANTVA;
+			kmem_map->flags &= ~VM_MAP_WANTVA;
+			timo = 0;
+		}
+		succ = false;
+
+		/*
+		 * Wake up everyone regardless of perceived success.
+		 * They will just resleep if we're stil out of juice.
+		 * They will just resleep if we're still out of juice.
+		if (pdaemon_waiters) {
+			pdaemon_waiters = 0;
+			cv_broadcast(&oomwait);
+		}
+
+		cv_timedwait(&pdaemoncv, &pdaemonmtx, 0);
 		uvmexp.pdwoke++;
+
+		/* tell the world that we are hungry */
 		kernel_map->flags |= VM_MAP_WANTVA;
+		kmem_map->flags |= VM_MAP_WANTVA;
+
+		if (pdaemon_waiters == 0 && !NEED_PAGEDAEMON())
+			continue;
 		mutex_exit(&pdaemonmtx);
 
-		succ = false;
+		/*
+		 * step one: reclaim the page cache.  this should give
+		 * us the biggest earnings since whole pages are released
+		 * into backing memory.
+		 */
+		pool_cache_reclaim(&pagecache);
+		if (!NEED_PAGEDAEMON()) {
+			succ = true;
+			mutex_enter(&pdaemonmtx);
+			continue;
+		}
+
+		/*
+		 * Ok, so that didn't help.  Next, try to hunt memory
+		 * by pushing out vnode pages.  The pages might contain
+		 * useful cached data, but we need the memory.
+		 */
+		cleaned = 0;
+		skip = 0;
+ again:
+		mutex_enter(&uvm_pageqlock);
+		while (cleaned < PAGEDAEMON_OBJCHUNK) {
+			skipped = 0;
+			TAILQ_FOREACH(pg, &vmpage_lruqueue, pageq.queue) {
+				struct uvm_object *uobj;
+
+				/*
+				 * skip over pages we _might_ have tried
+				 * to handle earlier.  they might not be
+				 * exactly the same ones, but I'm not too
+				 * concerned.
+				 */
+				if (skipped++ < skip)
+					continue;
+
+				uobj = pg->uobject;
+				if (mutex_tryenter(&uobj->vmobjlock)) {
+					if ((pg->flags & PG_BUSY) == 0) {
+						mutex_exit(&uvm_pageqlock);
+						uobj->pgops->pgo_put(uobj,
+						    pg->offset,
+						    pg->offset + PAGE_SIZE,
+						    PGO_CLEANIT|PGO_FREE);
+						cleaned++;
+						goto again;
+					}
+					mutex_exit(&uobj->vmobjlock);
+				}
+
+				skip++;
+			}
+			break;
+		}
+		mutex_exit(&uvm_pageqlock);
+
+		/*
+		 * And of course we need to reclaim the page cache
+		 * again to actually release memory.
+		 */
+		pool_cache_reclaim(&pagecache);
+		if (!NEED_PAGEDAEMON()) {
+			succ = true;
+			mutex_enter(&pdaemonmtx);
+			continue;
+		}
+
+		/*
+		 * Still not there?  sleeves come off right about now.
+		 * First: do reclaim on kernel/kmem map.
+		 */
+		callback_run_roundrobin(&kernel_map_store.vmk_reclaim_callback,
+		    NULL);
+		callback_run_roundrobin(&kmem_map_store.vmk_reclaim_callback,
+		    NULL);
+
+		/*
+		 * And then drain the pools.  Wipe them out ... all of them.
+		 */
+
 		pool_drain_start(&pp_first, &where);
 		pp = pp_first;
 		for (;;) {
@@ -792,44 +944,38 @@
 				break;
 			}
 		}
-		mutex_enter(&pdaemonmtx);
+
+		/*
+		 * Need to use PYEC on our bag of tricks.
+		 * Unfortunately, the wife just borrowed it.
+		 */
 
 		if (!succ) {
 			rumpuser_dprintf("pagedaemoness: failed to reclaim "
 			    "memory ... sleeping (deadlock?)\n");
-			timo = hz;
-			continue;
+			kpause("dpdd", false, hz, NULL);
 		}
-		kernel_map->flags &= ~VM_MAP_WANTVA;
-		timo = 0;
 
-		if (pdaemon_waiters) {
-			pdaemon_waiters = 0;
-			cv_broadcast(&oomwait);
-		}
+		mutex_enter(&pdaemonmtx);
 	}
 
 	panic("you can swap out any time you like, but you can never leave");
 }
 
-/*
- * In a regular kernel the pagedaemon is activated when memory becomes
- * low.  In a virtual rump kernel we do not know exactly how much memory
- * we have available -- it depends on the conditions on the host.
- * Therefore, we cannot preemptively kick the pagedaemon.  Rather, we
- * wait until things we desperate and we're forced to uvm_wait().
- *
- * The alternative would be to allocate a huge chunk of memory at
- * startup, but that solution has a number of problems including
- * being a resource hog, failing anyway due to host memory overcommit
- * and core dump size.
- */
-
 void
 uvm_kick_pdaemon()
 {
 
-	/* nada */
+	/*
+	 * Wake up the diabolical pagedaemon director if we are over
+	 * 90% of the memory limit.  This is a complete and utter
+	 * stetson-harrison decision which you are allowed to finetune.
+	 * Don't bother locking.  If we have some unflushed caches,
+	 * other waker-uppers will deal with the issue.
+	 */
+	if (NEED_PAGEDAEMON()) {
+		cv_signal(&pdaemoncv);
+	}
 }
 
 void *
@@ -838,6 +984,8 @@
 	unsigned long newmem;
 	void *rv;
 
+	uvm_kick_pdaemon(); /* ouch */
+
 	/* first we must be within the limit */
  limitagain:
 	if (rump_physmemlimit != RUMPMEM_UNLIMITED) {
