Module Name:    src
Committed By:   christos
Date:           Sat Sep 24 20:11:12 UTC 2016

Modified Files:
        src/lib/libc/db/btree: bt_close.c bt_conv.c bt_debug.c bt_delete.c
            bt_open.c bt_overflow.c bt_page.c bt_put.c bt_search.c bt_seq.c
            bt_split.c extern.h
        src/lib/libc/db/mpool: mpool.c
        src/lib/libc/db/recno: rec_open.c rec_search.c

Log Message:
Merge the recursive tree traversal changes from the mit kerberos tree. This
Also make the tracefile customizable. Unfortunately we can't merge any of
the hash changes because they have a different on-disk format. That does not
matter really because we've fixed most of the problems...


To generate a diff of this commit:
cvs rdiff -u -r1.15 -r1.16 src/lib/libc/db/btree/bt_close.c
cvs rdiff -u -r1.14 -r1.15 src/lib/libc/db/btree/bt_conv.c
cvs rdiff -u -r1.16 -r1.17 src/lib/libc/db/btree/bt_debug.c
cvs rdiff -u -r1.17 -r1.18 src/lib/libc/db/btree/bt_delete.c \
    src/lib/libc/db/btree/bt_search.c
cvs rdiff -u -r1.27 -r1.28 src/lib/libc/db/btree/bt_open.c
cvs rdiff -u -r1.20 -r1.21 src/lib/libc/db/btree/bt_overflow.c \
    src/lib/libc/db/btree/bt_put.c src/lib/libc/db/btree/bt_split.c
cvs rdiff -u -r1.13 -r1.14 src/lib/libc/db/btree/bt_page.c
cvs rdiff -u -r1.18 -r1.19 src/lib/libc/db/btree/bt_seq.c
cvs rdiff -u -r1.12 -r1.13 src/lib/libc/db/btree/extern.h
cvs rdiff -u -r1.21 -r1.22 src/lib/libc/db/mpool/mpool.c
cvs rdiff -u -r1.20 -r1.21 src/lib/libc/db/recno/rec_open.c
cvs rdiff -u -r1.14 -r1.15 src/lib/libc/db/recno/rec_search.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/lib/libc/db/btree/bt_close.c
diff -u src/lib/libc/db/btree/bt_close.c:1.15 src/lib/libc/db/btree/bt_close.c:1.16
--- src/lib/libc/db/btree/bt_close.c:1.15	Wed Aug 31 02:23:51 2016
+++ src/lib/libc/db/btree/bt_close.c	Sat Sep 24 16:11:12 2016
@@ -1,4 +1,4 @@
-/*	$NetBSD: bt_close.c,v 1.15 2016/08/31 06:23:51 christos Exp $	*/
+/*	$NetBSD: bt_close.c,v 1.16 2016/09/24 20:11:12 christos Exp $	*/
 
 /*-
  * Copyright (c) 1990, 1993, 1994
@@ -37,7 +37,7 @@
 #endif
 
 #include <sys/cdefs.h>
-__RCSID("$NetBSD: bt_close.c,v 1.15 2016/08/31 06:23:51 christos Exp $");
+__RCSID("$NetBSD: bt_close.c,v 1.16 2016/09/24 20:11:12 christos Exp $");
 
 #include "namespace.h"
 
@@ -164,7 +164,7 @@ bt_meta(BTREE *t)
 	BTMETA m;
 	void *p;
 
-	if ((p = mpool_get(t->bt_mp, P_META, 0)) == NULL)
+	if ((p = mpool_getf(t->bt_mp, P_META, 0)) == NULL)
 		return (RET_ERROR);
 
 	/* Fill in metadata. */

Index: src/lib/libc/db/btree/bt_conv.c
diff -u src/lib/libc/db/btree/bt_conv.c:1.14 src/lib/libc/db/btree/bt_conv.c:1.15
--- src/lib/libc/db/btree/bt_conv.c:1.14	Wed Sep 10 13:52:35 2008
+++ src/lib/libc/db/btree/bt_conv.c	Sat Sep 24 16:11:12 2016
@@ -1,4 +1,4 @@
-/*	$NetBSD: bt_conv.c,v 1.14 2008/09/10 17:52:35 joerg Exp $	*/
+/*	$NetBSD: bt_conv.c,v 1.15 2016/09/24 20:11:12 christos Exp $	*/
 
 /*-
  * Copyright (c) 1990, 1993, 1994
@@ -37,10 +37,11 @@
 #endif
 
 #include <sys/cdefs.h>
-__RCSID("$NetBSD: bt_conv.c,v 1.14 2008/09/10 17:52:35 joerg Exp $");
+__RCSID("$NetBSD: bt_conv.c,v 1.15 2016/09/24 20:11:12 christos Exp $");
 
 #include <assert.h>
 #include <stdio.h>
+#include <memory.h>
 
 #include <db.h>
 #include "btree.h"
@@ -64,6 +65,7 @@ __bt_pgin(void *t, pgno_t pg, void *pp)
 	indx_t i, top;
 	uint8_t flags;
 	char *p;
+	uint32_t ksize;
 
 	if (!F_ISSET(((BTREE *)t), B_NEEDSWAP))
 		return;
@@ -101,6 +103,7 @@ __bt_pgin(void *t, pgno_t pg, void *pp)
 			M_16_SWAP(h->linp[i]);
 			p = (char *)(void *)GETBLEAF(h, i);
 			P_32_SWAP(p);
+			memcpy(&ksize, p, sizeof(ksize));
 			p += sizeof(uint32_t);
 			P_32_SWAP(p);
 			p += sizeof(uint32_t);
@@ -113,7 +116,7 @@ __bt_pgin(void *t, pgno_t pg, void *pp)
 					P_32_SWAP(p);
 				}
 				if (flags & P_BIGDATA) {
-					p += sizeof(uint32_t);
+					p += ksize;
 					P_32_SWAP(p);
 					p += sizeof(pgno_t);
 					P_32_SWAP(p);
@@ -129,6 +132,7 @@ __bt_pgout(void *t, pgno_t pg, void *pp)
 	indx_t i, top;
 	uint8_t flags;
 	char *p;
+	uint32_t ksize;
 
 	if (!F_ISSET(((BTREE *)t), B_NEEDSWAP))
 		return;
@@ -157,6 +161,7 @@ __bt_pgout(void *t, pgno_t pg, void *pp)
 	else if ((h->flags & P_TYPE) == P_BLEAF)
 		for (i = 0; i < top; i++) {
 			p = (char *)(void *)GETBLEAF(h, i);
+			ksize = GETBLEAF(h, i)->ksize;
 			P_32_SWAP(p);
 			p += sizeof(uint32_t);
 			P_32_SWAP(p);
@@ -170,7 +175,7 @@ __bt_pgout(void *t, pgno_t pg, void *pp)
 					P_32_SWAP(p);
 				}
 				if (flags & P_BIGDATA) {
-					p += sizeof(uint32_t);
+					p += ksize;
 					P_32_SWAP(p);
 					p += sizeof(pgno_t);
 					P_32_SWAP(p);

Index: src/lib/libc/db/btree/bt_debug.c
diff -u src/lib/libc/db/btree/bt_debug.c:1.16 src/lib/libc/db/btree/bt_debug.c:1.17
--- src/lib/libc/db/btree/bt_debug.c:1.16	Sun Jul 17 16:47:39 2011
+++ src/lib/libc/db/btree/bt_debug.c	Sat Sep 24 16:11:12 2016
@@ -1,4 +1,4 @@
-/*	$NetBSD: bt_debug.c,v 1.16 2011/07/17 20:47:39 christos Exp $	*/
+/*	$NetBSD: bt_debug.c,v 1.17 2016/09/24 20:11:12 christos Exp $	*/
 
 /*-
  * Copyright (c) 1990, 1993, 1994
@@ -37,7 +37,7 @@
 #endif
 
 #include <sys/cdefs.h>
-__RCSID("$NetBSD: bt_debug.c,v 1.16 2011/07/17 20:47:39 christos Exp $");
+__RCSID("$NetBSD: bt_debug.c,v 1.17 2016/09/24 20:11:12 christos Exp $");
 
 #include <assert.h>
 #include <stdio.h>
@@ -47,6 +47,29 @@ __RCSID("$NetBSD: bt_debug.c,v 1.16 2011
 #include <db.h>
 #include "btree.h"
 
+#if defined(DEBUG) || defined(STATISTICS)
+
+static FILE *tracefp;
+
+/*
+ * __bt_dinit --
+ *     initialize debugging.
+ */
+static void
+__bt_dinit(void)
+{
+	if (tracefp != NULL)
+		return;
+
+#ifndef TRACE_TO_STDERR
+	if ((tracefp = fopen("/tmp/__bt_debug", "w")) != NULL)
+		return;
+#endif
+	tracefp = stderr;
+}
+#endif
+
+
 #ifdef DEBUG
 /*
  * BT_DUMP -- Dump the tree
@@ -62,15 +85,17 @@ __bt_dump(DB *dbp)
 	pgno_t i;
 	const char *sep;
 
+	__bt_dinit();
+
 	t = dbp->internal;
-	(void)fprintf(stderr, "%s: pgsz %d",
+	(void)fprintf(tracefp, "%s: pgsz %d",
 	    F_ISSET(t, B_INMEM) ? "memory" : "disk", t->bt_psize);
 	if (F_ISSET(t, R_RECNO))
-		(void)fprintf(stderr, " keys %lu", (unsigned long) t->bt_nrecs);
+		(void)fprintf(tracefp, " keys %lu", (unsigned long) t->bt_nrecs);
 #undef X
 #define	X(flag, name) \
 	if (F_ISSET(t, flag)) { \
-		(void)fprintf(stderr, "%s%s", sep, name); \
+		(void)fprintf(tracefp, "%s%s", sep, name); \
 		sep = ", "; \
 	}
 	if (t->flags != 0) {
@@ -81,14 +106,15 @@ __bt_dump(DB *dbp)
 		X(B_RDONLY,	"RDONLY");
 		X(R_RECNO,	"RECNO");
 		X(B_METADIRTY,"METADIRTY");
-		(void)fprintf(stderr, ")\n");
+		(void)fprintf(tracefp, ")\n");
 	}
 #undef X
 
-	for (i = P_ROOT; (h = mpool_get(t->bt_mp, i, 0)) != NULL; ++i) {
+	for (i = P_ROOT; i < t->bt_mp->npages &&
+	    (h = mpool_getf(t->bt_mp, i, MPOOL_IGNOREPIN)) != NULL; ++i)
 		__bt_dpage(h);
-		(void)mpool_put(t->bt_mp, h, 0);
-	}
+
+	(void)fflush(tracefp);
 }
 
 /*
@@ -104,24 +130,26 @@ __bt_dmpage(PAGE *h)
 	const char *sep;
 
 	m = (BTMETA *)(void *)h;
-	(void)fprintf(stderr, "magic %lx\n", (unsigned long) m->magic);
-	(void)fprintf(stderr, "version %lu\n", (unsigned long) m->version);
-	(void)fprintf(stderr, "psize %lu\n", (unsigned long) m->psize);
-	(void)fprintf(stderr, "free %lu\n", (unsigned long) m->free);
-	(void)fprintf(stderr, "nrecs %lu\n", (unsigned long) m->nrecs);
-	(void)fprintf(stderr, "flags %lu", (unsigned long) m->flags);
+	(void)fprintf(tracefp, "magic %lx\n", (unsigned long) m->magic);
+	(void)fprintf(tracefp, "version %lu\n", (unsigned long) m->version);
+	(void)fprintf(tracefp, "psize %lu\n", (unsigned long) m->psize);
+	(void)fprintf(tracefp, "free %lu\n", (unsigned long) m->free);
+	(void)fprintf(tracefp, "nrecs %lu\n", (unsigned long) m->nrecs);
+	(void)fprintf(tracefp, "flags %lu", (unsigned long) m->flags);
 #undef X
 #define	X(flag, name) \
 	if (m->flags & flag) { \
-		(void)fprintf(stderr, "%s%s", sep, name); \
+		(void)fprintf(tracefp, "%s%s", sep, name); \
 		sep = ", "; \
 	}
 	if (m->flags) {
 		sep = " (";
 		X(B_NODUPS,	"NODUPS");
 		X(R_RECNO,	"RECNO");
-		(void)fprintf(stderr, ")");
+		(void)fprintf(tracefp, ")");
 	}
+	(void)fprintf(tracefp, "\n");
+	(void)fflush(tracefp);
 }
 
 static pgno_t
@@ -152,11 +180,14 @@ __bt_dnpage(DB *dbp, pgno_t pgno)
 	BTREE *t;
 	PAGE *h;
 
+	__bt_dinit();
+
 	t = dbp->internal;
-	if ((h = mpool_get(t->bt_mp, pgno, 0)) != NULL) {
+	if ((h = mpool_getf(t->bt_mp, pgno, MPOOL_IGNOREPIN)) != NULL) {
 		__bt_dpage(h);
 		(void)mpool_put(t->bt_mp, h, 0);
 	}
+	(void)fflush(tracefp);
 }
 
 /*
@@ -175,11 +206,13 @@ __bt_dpage(PAGE *h)
 	indx_t cur, top;
 	const char *sep;
 
-	(void)fprintf(stderr, "    page %d: (", h->pgno);
+	__bt_dinit();
+
+	(void)fprintf(tracefp, "    page %d: (", h->pgno);
 #undef X
 #define	X(flag, name) \
 	if (h->flags & flag) { \
-		(void)fprintf(stderr, "%s%s", sep, name); \
+		(void)fprintf(tracefp, "%s%s", sep, name); \
 		sep = ", "; \
 	}
 	sep = "";
@@ -189,68 +222,69 @@ __bt_dpage(PAGE *h)
 	X(P_RLEAF,	"RLEAF")
 	X(P_OVERFLOW,	"OVERFLOW")
 	X(P_PRESERVE,	"PRESERVE");
-	(void)fprintf(stderr, ")\n");
+	(void)fprintf(tracefp, ")\n");
 #undef X
 
-	(void)fprintf(stderr, "\tprev %2d next %2d", h->prevpg, h->nextpg);
+	(void)fprintf(tracefp, "\tprev %2d next %2d", h->prevpg, h->nextpg);
 	if (h->flags & P_OVERFLOW)
 		return;
 
 	top = NEXTINDEX(h);
-	(void)fprintf(stderr, " lower %3d upper %3d nextind %d\n",
+	(void)fprintf(tracefp, " lower %3d upper %3d nextind %d\n",
 	    h->lower, h->upper, top);
 	for (cur = 0; cur < top; cur++) {
-		(void)fprintf(stderr, "\t[%03d] %4d ", cur, h->linp[cur]);
+		(void)fprintf(tracefp, "\t[%03d] %4d ", cur, h->linp[cur]);
 		switch (h->flags & P_TYPE) {
 		case P_BINTERNAL:
 			bi = GETBINTERNAL(h, cur);
-			(void)fprintf(stderr,
+			(void)fprintf(tracefp,
 			    "size %03d pgno %03d", bi->ksize, bi->pgno);
 			if (bi->flags & P_BIGKEY)
-				(void)fprintf(stderr, " (indirect)");
+				(void)fprintf(tracefp, " (indirect)");
 			else if (bi->ksize)
-				(void)fprintf(stderr,
+				(void)fprintf(tracefp,
 				    " {%.*s}", (int)bi->ksize, bi->bytes);
 			break;
 		case P_RINTERNAL:
 			ri = GETRINTERNAL(h, cur);
-			(void)fprintf(stderr, "entries %03d pgno %03d",
+			(void)fprintf(tracefp, "entries %03d pgno %03d",
 				ri->nrecs, ri->pgno);
 			break;
 		case P_BLEAF:
 			bl = GETBLEAF(h, cur);
 			if (bl->flags & P_BIGKEY)
-				(void)fprintf(stderr,
+				(void)fprintf(tracefp,
 				    "big key page %lu size %u/",
 				    (unsigned long) __bt_pgno_t(bl->bytes),
 				    __bt_uint32_t(bl->bytes + sizeof(pgno_t)));
 			else if (bl->ksize)
-				(void)fprintf(stderr, "%s/", bl->bytes);
+				(void)fprintf(tracefp, "%s/", bl->bytes);
 			if (bl->flags & P_BIGDATA)
-				(void)fprintf(stderr,
+				(void)fprintf(tracefp,
 				    "big data page %lu size %u",
 				    (unsigned long)
 				    __bt_pgno_t(bl->bytes + bl->ksize),
 				    __bt_uint32_t(bl->bytes + bl->ksize +
 				    sizeof(pgno_t)));
 			else if (bl->dsize)
-				(void)fprintf(stderr, "%.*s",
+				(void)fprintf(tracefp, "%.*s",
 				    (int)bl->dsize, bl->bytes + bl->ksize);
 			break;
 		case P_RLEAF:
 			rl = GETRLEAF(h, cur);
 			if (rl->flags & P_BIGDATA)
-				(void)fprintf(stderr,
+				(void)fprintf(tracefp,
 				    "big data page %lu size %u",
 				    (unsigned long) __bt_pgno_t(rl->bytes),
 				    __bt_uint32_t(rl->bytes + sizeof(pgno_t)));
 			else if (rl->dsize)
-				(void)fprintf(stderr,
+				(void)fprintf(tracefp,
 				    "%.*s", (int)rl->dsize, rl->bytes);
 			break;
 		}
-		(void)fprintf(stderr, "\n");
+		(void)fprintf(tracefp, "\n");
 	}
+	(void)fflush(tracefp);
 }
 #endif
 
@@ -272,10 +306,13 @@ __bt_stat(DB *dbp)
 	unsigned long ifree, lfree, nkeys;
 	int levels;
 
+	__bt_dinit();
+
 	t = dbp->internal;
 	pcont = pinternal = pleaf = 0;
 	nkeys = ifree = lfree = 0;
-	for (i = P_ROOT; (h = mpool_get(t->bt_mp, i, 0)) != NULL; ++i) {
+	for (i = P_ROOT; i < t->bt_mp->npages &&
+	    (h = mpool_getf(t->bt_mp, i, MPOOL_IGNOREPIN)) != NULL; ++i)
 		switch (h->flags & P_TYPE) {
 		case P_BINTERNAL:
 		case P_RINTERNAL:
@@ -297,7 +334,7 @@ __bt_stat(DB *dbp)
 
 	/* Count the levels of the tree. */
 	for (i = P_ROOT, levels = 0 ;; ++levels) {
-		h = mpool_get(t->bt_mp, i, 0);
+		h = mpool_getf(t->bt_mp, i, MPOOL_IGNOREPIN);
 		if (h->flags & (P_BLEAF|P_RLEAF)) {
 			if (levels == 0)
 				levels = 1;
@@ -310,32 +347,33 @@ __bt_stat(DB *dbp)
 		(void)mpool_put(t->bt_mp, h, 0);
 	}
 
-	(void)fprintf(stderr, "%d level%s with %ld keys",
+	(void)fprintf(tracefp, "%d level%s with %ld keys",
 	    levels, levels == 1 ? "" : "s", nkeys);
 	if (F_ISSET(t, R_RECNO))
-		(void)fprintf(stderr, " (%ld header count)", (long)t->bt_nrecs);
-	(void)fprintf(stderr,
+		(void)fprintf(tracefp, " (%ld header count)", (long)t->bt_nrecs);
+	(void)fprintf(tracefp,
 	    "\n%lu pages (leaf %ld, internal %ld, overflow %ld)\n",
 	    (long)pinternal + pleaf + pcont, (long)pleaf, (long)pinternal,
 	    (long)pcont);
-	(void)fprintf(stderr, "%ld cache hits, %ld cache misses\n",
+	(void)fprintf(tracefp, "%ld cache hits, %ld cache misses\n",
 	    bt_cache_hit, bt_cache_miss);
-	(void)fprintf(stderr, "%ld splits (%ld root splits, %ld sort splits)\n",
+	(void)fprintf(tracefp, "%ld splits (%ld root splits, %ld sort splits)\n",
 	    bt_split, bt_rootsplit, bt_sortsplit);
 	pleaf *= t->bt_psize - BTDATAOFF;
 	if (pleaf)
-		(void)fprintf(stderr,
+		(void)fprintf(tracefp,
 		    "%.0f%% leaf fill (%ld bytes used, %ld bytes free)\n",
 		    ((double)(pleaf - lfree) / pleaf) * 100,
 		    pleaf - lfree, lfree);
 	pinternal *= t->bt_psize - BTDATAOFF;
 	if (pinternal)
-		(void)fprintf(stderr,
+		(void)fprintf(tracefp,
 		    "%.0f%% internal fill (%ld bytes used, %ld bytes free\n",
 		    ((double)(pinternal - ifree) / pinternal) * 100,
 		    pinternal - ifree, ifree);
 	if (bt_pfxsaved)
-		(void)fprintf(stderr, "prefix checking removed %lu bytes.\n",
+		(void)fprintf(tracefp, "prefix checking removed %lu bytes.\n",
 		    bt_pfxsaved);
+	(void)fflush(tracefp);
 }
 #endif

Index: src/lib/libc/db/btree/bt_delete.c
diff -u src/lib/libc/db/btree/bt_delete.c:1.17 src/lib/libc/db/btree/bt_delete.c:1.18
--- src/lib/libc/db/btree/bt_delete.c:1.17	Wed Jan 28 21:02:36 2009
+++ src/lib/libc/db/btree/bt_delete.c	Sat Sep 24 16:11:12 2016
@@ -1,4 +1,4 @@
-/*	$NetBSD: bt_delete.c,v 1.17 2009/01/29 02:02:36 lukem Exp $	*/
+/*	$NetBSD: bt_delete.c,v 1.18 2016/09/24 20:11:12 christos Exp $	*/
 
 /*-
  * Copyright (c) 1990, 1993, 1994
@@ -37,7 +37,7 @@
 #endif
 
 #include <sys/cdefs.h>
-__RCSID("$NetBSD: bt_delete.c,v 1.17 2009/01/29 02:02:36 lukem Exp $");
+__RCSID("$NetBSD: bt_delete.c,v 1.18 2016/09/24 20:11:12 christos Exp $");
 
 #include "namespace.h"
 #include <sys/types.h>
@@ -53,7 +53,6 @@ __RCSID("$NetBSD: bt_delete.c,v 1.17 200
 static int __bt_bdelete(BTREE *, const DBT *);
 static int __bt_curdel(BTREE *, const DBT *, PAGE *, u_int);
 static int __bt_pdelete(BTREE *, PAGE *);
-static int __bt_relink(BTREE *, PAGE *);
 static int __bt_stkacq(BTREE *, PAGE **, CURSOR *);
 
 /*
@@ -97,7 +96,7 @@ __bt_delete(const DB *dbp, const DBT *ke
 		if (F_ISSET(c, CURS_INIT)) {
 			if (F_ISSET(c, CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE))
 				return (RET_SPECIAL);
-			if ((h = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL)
+			if ((h = mpool_getf(t->bt_mp, c->pg.pgno, 0)) == NULL)
 				return (RET_ERROR);
 
 			/*
@@ -181,7 +180,7 @@ __bt_stkacq(BTREE *t, PAGE **hp, CURSOR 
 		/* Move up the stack. */
 		for (level = 0; (parent = BT_POP(t)) != NULL; ++level) {
 			/* Get the parent page. */
-			if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
+			if ((h = mpool_getf(t->bt_mp, parent->pgno, 0)) == NULL)
 				return (1);
 
 			/* Move to the next index. */
@@ -204,12 +203,12 @@ __bt_stkacq(BTREE *t, PAGE **hp, CURSOR 
 			mpool_put(t->bt_mp, h, 0);
 
 			/* Get the next level down. */
-			if ((h = mpool_get(t->bt_mp, pgno, 0)) == NULL)
+			if ((h = mpool_getf(t->bt_mp, pgno, 0)) == NULL)
 				return (1);
 			idx = 0;
 		}
 		mpool_put(t->bt_mp, h, 0);
-		if ((h = mpool_get(t->bt_mp, nextpg, 0)) == NULL)
+		if ((h = mpool_getf(t->bt_mp, nextpg, 0)) == NULL)
 			return (1);
 	}
 
@@ -236,7 +235,7 @@ __bt_stkacq(BTREE *t, PAGE **hp, CURSOR 
 		/* Move up the stack. */
 		for (level = 0; (parent = BT_POP(t)) != NULL; ++level) {
 			/* Get the parent page. */
-			if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
+			if ((h = mpool_getf(t->bt_mp, parent->pgno, 0)) == NULL)
 				return (1);
 
 			/* Move to the next index. */
@@ -258,20 +257,20 @@ __bt_stkacq(BTREE *t, PAGE **hp, CURSOR 
 			mpool_put(t->bt_mp, h, 0);
 
 			/* Get the next level down. */
-			if ((h = mpool_get(t->bt_mp, pgno, 0)) == NULL)
+			if ((h = mpool_getf(t->bt_mp, pgno, 0)) == NULL)
 				return (1);
 
 			idx = NEXTINDEX(h) - 1;
 			BT_PUSH(t, pgno, idx);
 		}
 		mpool_put(t->bt_mp, h, 0);
-		if ((h = mpool_get(t->bt_mp, prevpg, 0)) == NULL)
+		if ((h = mpool_getf(t->bt_mp, prevpg, 0)) == NULL)
 			return (1);
 	}
 	
 
 ret:	mpool_put(t->bt_mp, h, 0);
-	return ((*hp = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL);
+	return ((*hp = mpool_getf(t->bt_mp, c->pg.pgno, 0)) == NULL);
 }
 
 /*
@@ -394,7 +393,7 @@ __bt_pdelete(BTREE *t, PAGE *h)
 	 */
 	while ((parent = BT_POP(t)) != NULL) {
 		/* Get the parent page. */
-		if ((pg = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
+		if ((pg = mpool_getf(t->bt_mp, parent->pgno, 0)) == NULL)
 			return (RET_ERROR);
 		
 		idx = parent->index;
@@ -577,7 +576,7 @@ __bt_curdel(BTREE *t, const DBT *key, PA
 		}
 		/* Check previous key if at the beginning of the page. */
 		if (idx == 0 && h->prevpg != P_INVALID) {
-			if ((pg = mpool_get(t->bt_mp, h->prevpg, 0)) == NULL)
+			if ((pg = mpool_getf(t->bt_mp, h->prevpg, 0)) == NULL)
 				return (RET_ERROR);
 			e.page = pg;
 			e.index = NEXTINDEX(pg) - 1;
@@ -589,7 +588,7 @@ __bt_curdel(BTREE *t, const DBT *key, PA
 		}
 		/* Check next key if at the end of the page. */
 		if (idx == (unsigned)(NEXTINDEX(h) - 1) && h->nextpg != P_INVALID) {
-			if ((pg = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL)
+			if ((pg = mpool_getf(t->bt_mp, h->nextpg, 0)) == NULL)
 				return (RET_ERROR);
 			e.page = pg;
 			e.index = 0;
@@ -621,19 +620,19 @@ dup2:				c->pg.pgno = e.page->pgno;
  *	t:	tree
  *	h:	page to be deleted
  */
-static int
+int
 __bt_relink(BTREE *t, PAGE *h)
 {
 	PAGE *pg;
 
 	if (h->nextpg != P_INVALID) {
-		if ((pg = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL)
+		if ((pg = mpool_getf(t->bt_mp, h->nextpg, 0)) == NULL)
 			return (RET_ERROR);
 		pg->prevpg = h->prevpg;
 		mpool_put(t->bt_mp, pg, MPOOL_DIRTY);
 	}
 	if (h->prevpg != P_INVALID) {
-		if ((pg = mpool_get(t->bt_mp, h->prevpg, 0)) == NULL)
+		if ((pg = mpool_getf(t->bt_mp, h->prevpg, 0)) == NULL)
 			return (RET_ERROR);
 		pg->nextpg = h->nextpg;
 		mpool_put(t->bt_mp, pg, MPOOL_DIRTY);
Index: src/lib/libc/db/btree/bt_search.c
diff -u src/lib/libc/db/btree/bt_search.c:1.17 src/lib/libc/db/btree/bt_search.c:1.18
--- src/lib/libc/db/btree/bt_search.c:1.17	Thu Sep 11 08:58:00 2008
+++ src/lib/libc/db/btree/bt_search.c	Sat Sep 24 16:11:12 2016
@@ -1,4 +1,4 @@
-/*	$NetBSD: bt_search.c,v 1.17 2008/09/11 12:58:00 joerg Exp $	*/
+/*	$NetBSD: bt_search.c,v 1.18 2016/09/24 20:11:12 christos Exp $	*/
 
 /*-
  * Copyright (c) 1990, 1993, 1994
@@ -37,7 +37,7 @@
 #endif
 
 #include <sys/cdefs.h>
-__RCSID("$NetBSD: bt_search.c,v 1.17 2008/09/11 12:58:00 joerg Exp $");
+__RCSID("$NetBSD: bt_search.c,v 1.18 2016/09/24 20:11:12 christos Exp $");
 
 #include "namespace.h"
 #include <sys/types.h>
@@ -75,7 +75,7 @@ __bt_search(BTREE *t, const DBT *key, in
 
 	BT_CLR(t);
 	for (pg = P_ROOT;;) {
-		if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
+		if ((h = mpool_getf(t->bt_mp, pg, 0)) == NULL)
 			return (NULL);
 
 		/* Do a binary search on the current page. */
@@ -150,23 +150,65 @@ next:		BT_PUSH(t, h->pgno, idx);
 static int
 __bt_snext(BTREE *t, PAGE *h, const DBT *key, int *exactp)
 {
+	BINTERNAL *bi;
 	EPG e;
+	EPGNO *parent;
+	indx_t idx = 0;
+	pgno_t pgno;
+	int level;
 
 	/*
 	 * Get the next page.  The key is either an exact
 	 * match, or not as good as the one we already have.
 	 */
-	if ((e.page = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL)
-		return (0);
+	if ((e.page = mpool_getf(t->bt_mp, h->nextpg, 0)) == NULL)
+		return 0;
 	e.index = 0;
-	if (__bt_cmp(t, key, &e) == 0) {
+	if (__bt_cmp(t, key, &e) != 0) {
+		mpool_put(t->bt_mp, e.page, 0);
+		return 0;
+	}
+	mpool_put(t->bt_mp, h, 0);
+	t->bt_cur = e;
+	*exactp = 1;
+
+	/*
+	 * Adjust the stack for the movement.
+	 *
+	 * Move up the stack.
+	 */
+	for (level = 0; (parent = BT_POP(t)) != NULL; ++level) {
+		/* Get the parent page. */
+		if ((h = mpool_getf(t->bt_mp, parent->pgno, 0)) == NULL)
+			return 0;
+
+		/* Move to the next index. */
+		if (parent->index != NEXTINDEX(h) - 1) {
+			idx = parent->index + 1;
+			BT_PUSH(t, h->pgno, idx);
+			break;
+		}
+
 		mpool_put(t->bt_mp, h, 0);
-		t->bt_cur = e;
-		*exactp = 1;
-		return (1);
 	}
-	mpool_put(t->bt_mp, e.page, 0);
-	return (0);
+
+	/* Restore the stack. */
+	while (level--) {
+		/* Push the next level down onto the stack. */
+		bi = GETBINTERNAL(h, idx);
+		pgno = bi->pgno;
+		BT_PUSH(t, pgno, 0);
+
+		/* Lose the currently pinned page. */
+		mpool_put(t->bt_mp, h, 0);
+
+		/* Get the next level down. */
+		if ((h = mpool_getf(t->bt_mp, pgno, 0)) == NULL)
+			return 0;
+		idx = 0;
+	}
+	mpool_put(t->bt_mp, h, 0);
+	return 1;
 }
 
 /*
@@ -185,21 +227,64 @@ __bt_snext(BTREE *t, PAGE *h, const DBT 
 static int
 __bt_sprev(BTREE *t, PAGE *h, const DBT *key, int *exactp)
 {
+	BINTERNAL *bi;
 	EPG e;
+	EPGNO *parent;
+	indx_t idx = 0;
+	pgno_t pgno;
+	int level;
 
 	/*
 	 * Get the previous page.  The key is either an exact
 	 * match, or not as good as the one we already have.
 	 */
-	if ((e.page = mpool_get(t->bt_mp, h->prevpg, 0)) == NULL)
-		return (0);
+	if ((e.page = mpool_getf(t->bt_mp, h->prevpg, 0)) == NULL)
+		return 0;
 	e.index = NEXTINDEX(e.page) - 1;
-	if (__bt_cmp(t, key, &e) == 0) {
+	if (__bt_cmp(t, key, &e) != 0) {
+		mpool_put(t->bt_mp, e.page, 0);
+		return 0;
+	}
+
+	mpool_put(t->bt_mp, h, 0);
+	t->bt_cur = e;
+	*exactp = 1;
+
+	/*
+	 * Adjust the stack for the movement.
+	 *
+	 * Move up the stack.
+	 */
+	for (level = 0; (parent = BT_POP(t)) != NULL; ++level) {
+		/* Get the parent page. */
+		if ((h = mpool_getf(t->bt_mp, parent->pgno, 0)) == NULL)
+			return 1;
+
+		/* Move to the next index. */
+		if (parent->index != 0) {
+			idx = parent->index - 1;
+			BT_PUSH(t, h->pgno, idx);
+			break;
+		}
 		mpool_put(t->bt_mp, h, 0);
-		t->bt_cur = e;
-		*exactp = 1;
-		return (1);
 	}
-	mpool_put(t->bt_mp, e.page, 0);
-	return (0);
+
+	/* Restore the stack. */
+	while (level--) {
+		/* Push the next level down onto the stack. */
+		bi = GETBINTERNAL(h, idx);
+		pgno = bi->pgno;
+
+		/* Lose the currently pinned page. */
+		mpool_put(t->bt_mp, h, 0);
+
+		/* Get the next level down. */
+		if ((h = mpool_getf(t->bt_mp, pgno, 0)) == NULL)
+			return 1;
+
+		idx = NEXTINDEX(h) - 1;
+		BT_PUSH(t, pgno, idx);
+	}
+	mpool_put(t->bt_mp, h, 0);
+	return 1;
 }

Index: src/lib/libc/db/btree/bt_open.c
diff -u src/lib/libc/db/btree/bt_open.c:1.27 src/lib/libc/db/btree/bt_open.c:1.28
--- src/lib/libc/db/btree/bt_open.c:1.27	Sat Nov 30 19:22:48 2013
+++ src/lib/libc/db/btree/bt_open.c	Sat Sep 24 16:11:12 2016
@@ -1,4 +1,4 @@
-/*	$NetBSD: bt_open.c,v 1.27 2013/12/01 00:22:48 christos Exp $	*/
+/*	$NetBSD: bt_open.c,v 1.28 2016/09/24 20:11:12 christos Exp $	*/
 
 /*-
  * Copyright (c) 1990, 1993, 1994
@@ -37,7 +37,7 @@
 #endif
 
 #include <sys/cdefs.h>
-__RCSID("$NetBSD: bt_open.c,v 1.27 2013/12/01 00:22:48 christos Exp $");
+__RCSID("$NetBSD: bt_open.c,v 1.28 2016/09/24 20:11:12 christos Exp $");
 
 /*
  * Implementation of btree access method for 4.4BSD.
@@ -354,18 +354,25 @@ nroot(BTREE *t)
 	PAGE *meta, *root;
 	pgno_t npg;
 
-	if ((meta = mpool_get(t->bt_mp, 0, 0)) != NULL) {
-		mpool_put(t->bt_mp, meta, 0);
-		return (RET_SUCCESS);
+	if ((root = mpool_getf(t->bt_mp, 1, 0)) != NULL) {
+		if (root->lower == 0 &&
+		    root->pgno == 0 &&
+		    root->linp[0] == 0) {
+			mpool_delete(t->bt_mp, root);
+			errno = EINVAL;
+		} else {
+			mpool_put(t->bt_mp, root, 0);
+			return RET_SUCCESS;
+		}
 	}
 	if (errno != EINVAL)		/* It's OK to not exist. */
 		return (RET_ERROR);
 	errno = 0;
 
-	if ((meta = mpool_new(t->bt_mp, &npg)) == NULL)
+	if ((meta = mpool_newf(t->bt_mp, &npg, MPOOL_PAGE_NEXT)) == NULL)
 		return (RET_ERROR);
 
-	if ((root = mpool_new(t->bt_mp, &npg)) == NULL)
+	if ((root = mpool_newf(t->bt_mp, &npg, MPOOL_PAGE_NEXT)) == NULL)
 		return (RET_ERROR);
 
 	if (npg != P_ROOT)

Index: src/lib/libc/db/btree/bt_overflow.c
diff -u src/lib/libc/db/btree/bt_overflow.c:1.20 src/lib/libc/db/btree/bt_overflow.c:1.21
--- src/lib/libc/db/btree/bt_overflow.c:1.20	Sat Dec 14 13:04:56 2013
+++ src/lib/libc/db/btree/bt_overflow.c	Sat Sep 24 16:11:12 2016
@@ -1,4 +1,4 @@
-/*	$NetBSD: bt_overflow.c,v 1.20 2013/12/14 18:04:56 christos Exp $	*/
+/*	$NetBSD: bt_overflow.c,v 1.21 2016/09/24 20:11:12 christos Exp $	*/
 
 /*-
  * Copyright (c) 1990, 1993, 1994
@@ -37,7 +37,7 @@
 #endif
 
 #include <sys/cdefs.h>
-__RCSID("$NetBSD: bt_overflow.c,v 1.20 2013/12/14 18:04:56 christos Exp $");
+__RCSID("$NetBSD: bt_overflow.c,v 1.21 2016/09/24 20:11:12 christos Exp $");
 
 #include "namespace.h"
 #include <sys/param.h>
@@ -112,7 +112,7 @@ __ovfl_get(BTREE *t, void *p, size_t *ss
 	_DBFIT(temp, uint32_t);
 	plen = (uint32_t)temp;
 	for (p = *buf;; p = (char *)p + nb, pg = h->nextpg) {
-		if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
+		if ((h = mpool_getf(t->bt_mp, pg, 0)) == NULL)
 			return (RET_ERROR);
 
 		nb = MIN(sz, plen);
@@ -208,7 +208,7 @@ __ovfl_delete(BTREE *t, void *p)
 	if (pg == P_INVALID || sz == 0)
 		abort();
 #endif
-	if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
+	if ((h = mpool_getf(t->bt_mp, pg, 0)) == NULL)
 		return (RET_ERROR);
 
 	/* Don't delete chains used by internal pages. */
@@ -226,7 +226,7 @@ __ovfl_delete(BTREE *t, void *p)
 		__bt_free(t, h);
 		if (sz <= plen)
 			break;
-		if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
+		if ((h = mpool_getf(t->bt_mp, pg, 0)) == NULL)
 			return (RET_ERROR);
 	}
 	return (RET_SUCCESS);
Index: src/lib/libc/db/btree/bt_put.c
diff -u src/lib/libc/db/btree/bt_put.c:1.20 src/lib/libc/db/btree/bt_put.c:1.21
--- src/lib/libc/db/btree/bt_put.c:1.20	Sun Jun 26 18:20:31 2011
+++ src/lib/libc/db/btree/bt_put.c	Sat Sep 24 16:11:12 2016
@@ -1,4 +1,4 @@
-/*	$NetBSD: bt_put.c,v 1.20 2011/06/26 22:20:31 christos Exp $	*/
+/*	$NetBSD: bt_put.c,v 1.21 2016/09/24 20:11:12 christos Exp $	*/
 
 /*-
  * Copyright (c) 1990, 1993, 1994
@@ -37,7 +37,7 @@
 #endif
 
 #include <sys/cdefs.h>
-__RCSID("$NetBSD: bt_put.c,v 1.20 2011/06/26 22:20:31 christos Exp $");
+__RCSID("$NetBSD: bt_put.c,v 1.21 2016/09/24 20:11:12 christos Exp $");
 
 #include "namespace.h"
 #include <sys/types.h>
@@ -152,7 +152,7 @@ storekey:		if (__ovfl_put(t, key, &pg) =
 
 	/* Replace the cursor. */
 	if (flags == R_CURSOR) {
-		if ((h = mpool_get(t->bt_mp, t->bt_cursor.pg.pgno, 0)) == NULL)
+		if ((h = mpool_getf(t->bt_mp, t->bt_cursor.pg.pgno, 0)) == NULL)
 			return (RET_ERROR);
 		idx = t->bt_cursor.pg.index;
 		goto delete;
@@ -271,7 +271,7 @@ bt_fast(BTREE *t, const DBT *key, const 
 	uint32_t nbytes;
 	int cmp;
 
-	if ((h = mpool_get(t->bt_mp, t->bt_last.pgno, 0)) == NULL) {
+	if ((h = mpool_getf(t->bt_mp, t->bt_last.pgno, 0)) == NULL) {
 		t->bt_order = NOT;
 		return (NULL);
 	}
Index: src/lib/libc/db/btree/bt_split.c
diff -u src/lib/libc/db/btree/bt_split.c:1.20 src/lib/libc/db/btree/bt_split.c:1.21
--- src/lib/libc/db/btree/bt_split.c:1.20	Mon Jun 20 05:11:17 2011
+++ src/lib/libc/db/btree/bt_split.c	Sat Sep 24 16:11:12 2016
@@ -1,4 +1,4 @@
-/*	$NetBSD: bt_split.c,v 1.20 2011/06/20 09:11:17 mrg Exp $	*/
+/*	$NetBSD: bt_split.c,v 1.21 2016/09/24 20:11:12 christos Exp $	*/
 
 /*-
  * Copyright (c) 1990, 1993, 1994
@@ -37,7 +37,7 @@
 #endif
 
 #include <sys/cdefs.h>
-__RCSID("$NetBSD: bt_split.c,v 1.20 2011/06/20 09:11:17 mrg Exp $");
+__RCSID("$NetBSD: bt_split.c,v 1.21 2016/09/24 20:11:12 christos Exp $");
 
 #include "namespace.h"
 #include <sys/types.h>
@@ -153,7 +153,7 @@ __bt_split(BTREE *t, PAGE *sp, const DBT
 		rchild = r;
 
 		/* Get the parent page. */
-		if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
+		if ((h = mpool_getf(t->bt_mp, parent->pgno, 0)) == NULL)
 			goto err2;
 
 	 	/*
@@ -401,7 +401,7 @@ bt_page(BTREE *t, PAGE *h, PAGE **lp, PA
 
 	/* Fix up the previous pointer of the page after the split page. */
 	if (h->nextpg != P_INVALID) {
-		if ((tp = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) {
+		if ((tp = mpool_getf(t->bt_mp, h->nextpg, 0)) == NULL) {
 			free(l);
 			/* XXX mpool_free(t->bt_mp, r->pgno); */
 			return (NULL);
@@ -799,7 +799,7 @@ bt_preserve(BTREE *t, pgno_t pg)
 {
 	PAGE *h;
 
-	if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
+	if ((h = mpool_getf(t->bt_mp, pg, 0)) == NULL)
 		return (RET_ERROR);
 	h->flags |= P_PRESERVE;
 	mpool_put(t->bt_mp, h, MPOOL_DIRTY);

Index: src/lib/libc/db/btree/bt_page.c
diff -u src/lib/libc/db/btree/bt_page.c:1.13 src/lib/libc/db/btree/bt_page.c:1.14
--- src/lib/libc/db/btree/bt_page.c:1.13	Thu Sep 11 08:58:00 2008
+++ src/lib/libc/db/btree/bt_page.c	Sat Sep 24 16:11:12 2016
@@ -1,4 +1,4 @@
-/*	$NetBSD: bt_page.c,v 1.13 2008/09/11 12:58:00 joerg Exp $	*/
+/*	$NetBSD: bt_page.c,v 1.14 2016/09/24 20:11:12 christos Exp $	*/
 
 /*-
  * Copyright (c) 1990, 1993, 1994
@@ -34,7 +34,7 @@
 #endif
 
 #include <sys/cdefs.h>
-__RCSID("$NetBSD: bt_page.c,v 1.13 2008/09/11 12:58:00 joerg Exp $");
+__RCSID("$NetBSD: bt_page.c,v 1.14 2016/09/24 20:11:12 christos Exp $");
 
 #include "namespace.h"
 #include <sys/types.h>
@@ -89,7 +89,7 @@ __bt_new(BTREE *t, pgno_t *npg)
 	PAGE *h;
 
 	if (t->bt_free != P_INVALID &&
-	    (h = mpool_get(t->bt_mp, t->bt_free, 0)) != NULL) {
+	    (h = mpool_getf(t->bt_mp, t->bt_free, 0)) != NULL) {
 		*npg = t->bt_free;
 		t->bt_free = h->nextpg;
 		F_SET(t, B_METADIRTY);

Index: src/lib/libc/db/btree/bt_seq.c
diff -u src/lib/libc/db/btree/bt_seq.c:1.18 src/lib/libc/db/btree/bt_seq.c:1.19
--- src/lib/libc/db/btree/bt_seq.c:1.18	Wed Sep  4 09:03:22 2013
+++ src/lib/libc/db/btree/bt_seq.c	Sat Sep 24 16:11:12 2016
@@ -1,4 +1,4 @@
-/*	$NetBSD: bt_seq.c,v 1.18 2013/09/04 13:03:22 ryoon Exp $	*/
+/*	$NetBSD: bt_seq.c,v 1.19 2016/09/24 20:11:12 christos Exp $	*/
 
 /*-
  * Copyright (c) 1990, 1993, 1994
@@ -37,7 +37,7 @@
 #endif
 
 #include <sys/cdefs.h>
-__RCSID("$NetBSD: bt_seq.c,v 1.18 2013/09/04 13:03:22 ryoon Exp $");
+__RCSID("$NetBSD: bt_seq.c,v 1.19 2016/09/24 20:11:12 christos Exp $");
 
 #include "namespace.h"
 #include <sys/types.h>
@@ -54,6 +54,8 @@ __RCSID("$NetBSD: bt_seq.c,v 1.18 2013/0
 static int __bt_first(BTREE *, const DBT *, EPG *, int *);
 static int __bt_seqadv(BTREE *, EPG *, int);
 static int __bt_seqset(BTREE *, EPG *, DBT *, int);
+static int __bt_rseq_next(BTREE *, EPG *);
+static int __bt_rseq_prev(BTREE *, EPG *);
 
 /*
  * Sequential scan support.
@@ -71,7 +73,7 @@ static int __bt_seqset(BTREE *, EPG *, D
  *	dbp:	pointer to access method
  *	key:	key for positioning and return value
  *	data:	data return value
- *	flags:	R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV.
+ *	flags:	R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV, R_RNEXT, R_RPREV.
  *
  * Returns:
  *	RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key.
@@ -99,6 +101,8 @@ __bt_seq(const DB *dbp, DBT *key, DBT *d
 	switch (flags) {
 	case R_NEXT:
 	case R_PREV:
+	case R_RNEXT:
+	case R_RPREV:
 		if (F_ISSET(&t->bt_cursor, CURS_INIT)) {
 			status = __bt_seqadv(t, &e, (int)flags);
 			break;
@@ -140,7 +144,7 @@ __bt_seq(const DB *dbp, DBT *key, DBT *d
  *	t:	tree
  *	ep:	storage for returned key
  *	key:	key for initial scan position
- *	flags:	R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV
+ *	flags:	R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV, R_RNEXT, R_RPREV.
  *
  * Side effects:
  *	Pins the page the cursor references.
@@ -173,9 +177,11 @@ __bt_seqset(BTREE *t, EPG *ep, DBT *key,
 		return (__bt_first(t, key, ep, &exact));
 	case R_FIRST:				/* First record. */
 	case R_NEXT:
+	case R_RNEXT:
+		BT_CLR(t);
 		/* Walk down the left-hand side of the tree. */
 		for (pg = P_ROOT;;) {
-			if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
+			if ((h = mpool_getf(t->bt_mp, pg, 0)) == NULL)
 				return (RET_ERROR);
 
 			/* Check for an empty tree. */
@@ -187,6 +193,7 @@ __bt_seqset(BTREE *t, EPG *ep, DBT *key,
 			if (h->flags & (P_BLEAF | P_RLEAF))
 				break;
 			pg = GETBINTERNAL(h, 0)->pgno;
+			BT_PUSH(t, h->pgno, 0);
 			mpool_put(t->bt_mp, h, 0);
 		}
 		ep->page = h;
@@ -194,9 +201,11 @@ __bt_seqset(BTREE *t, EPG *ep, DBT *key,
 		break;
 	case R_LAST:				/* Last record. */
 	case R_PREV:
+	case R_RPREV:
+		BT_CLR(t);
 		/* Walk down the right-hand side of the tree. */
 		for (pg = P_ROOT;;) {
-			if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
+			if ((h = mpool_getf(t->bt_mp, pg, 0)) == NULL)
 				return (RET_ERROR);
 
 			/* Check for an empty tree. */
@@ -208,6 +217,7 @@ __bt_seqset(BTREE *t, EPG *ep, DBT *key,
 			if (h->flags & (P_BLEAF | P_RLEAF))
 				break;
 			pg = GETBINTERNAL(h, NEXTINDEX(h) - 1)->pgno;
+			BT_PUSH(t, h->pgno, NEXTINDEX(h) - 1);
 			mpool_put(t->bt_mp, h, 0);
 		}
 
@@ -224,7 +234,7 @@ __bt_seqset(BTREE *t, EPG *ep, DBT *key,
  *
  * Parameters:
  *	t:	tree
- *	flags:	R_NEXT, R_PREV
+ *	flags:	R_NEXT, R_PREV, R_RNEXT, R_RPREV
  *
  * Side effects:
  *	Pins the page the new key/data record is on.
@@ -239,7 +249,7 @@ __bt_seqadv(BTREE *t, EPG *ep, int flags
 	PAGE *h;
 	indx_t idx = 0;	/* pacify gcc */
 	pgno_t pg;
-	int exact;
+	int exact, rval;
 
 	/*
 	 * There are a couple of states that we can be in.  The cursor has
@@ -248,18 +258,48 @@ __bt_seqadv(BTREE *t, EPG *ep, int flags
 	c = &t->bt_cursor;
 
 	/*
-	 * The cursor was deleted where there weren't any duplicate records,
-	 * so the key was saved.  Find out where that key would go in the
-	 * current tree.  It doesn't matter if the returned key is an exact
-	 * match or not -- if it's an exact match, the record was added after
-	 * the delete so we can just return it.  If not, as long as there's
-	 * a record there, return it.
+	 * The cursor was deleted and there weren't any duplicate records,
+	 * so the cursor's key was saved.  Find out where that key would
+	 * be in the current tree.  If the returned key is an exact match,
+	 * it means that a key/data pair was inserted into the tree after
+	 * the delete.  We could reasonably return the key, but the problem
+	 * is that this is the access pattern we'll see if the user is
+	 * doing seq(..., R_NEXT)/put(..., 0) pairs, i.e. the put deletes
+	 * the cursor record and then replaces it, so the cursor was saved,
+	 * and we'll simply return the same "new" record until the user
+	 * notices and doesn't do a put() of it.  Since the key is an exact
+	 * match, we could as easily put the new record before the cursor,
+	 * and we've made no guarantee to return it.  So, move forward or
+	 * back a record if it's an exact match.
+	 *
+	 * XXX
+	 * In the current implementation, put's to the cursor are done with
+	 * delete/add pairs.  This has two consequences.  First, it means
+	 * that seq(..., R_NEXT)/put(..., R_CURSOR) pairs are going to exhibit
+	 * the same behavior as above.  Second, you can return the same key
+	 * twice if you have duplicate records.  The scenario is that the
+	 * cursor record is deleted, moving the cursor forward or backward
+	 * to a duplicate.  The add then inserts the new record at a location
+	 * ahead of the cursor because duplicates aren't sorted in any way,
+	 * and the new record is later returned.  This has to be fixed at some
+	 * point.
 	 */
-	if (F_ISSET(c, CURS_ACQUIRE))
-		return (__bt_first(t, &c->key, ep, &exact));
+	if (F_ISSET(c, CURS_ACQUIRE)) {
+		if ((rval = __bt_first(t, &c->key, ep, &exact)) == RET_ERROR)
+			return RET_ERROR;
+		if (!exact)
+			return rval;
+		/*
+		 * XXX
+		 * Kluge -- get, release, get the page.
+		 */
+		c->pg.pgno = ep->page->pgno;
+		c->pg.index = ep->index;
+		mpool_put(t->bt_mp, ep->page, 0);
+	}
 
 	/* Get the page referenced by the cursor. */
-	if ((h = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL)
+	if ((h = mpool_getf(t->bt_mp, c->pg.pgno, 0)) == NULL)
 		return (RET_ERROR);
 
 	/*
@@ -268,6 +308,7 @@ __bt_seqadv(BTREE *t, EPG *ep, int flags
 	 */
 	switch (flags) {
 	case R_NEXT:			/* Next record. */
+	case R_RNEXT:
 		/*
 		 * The cursor was deleted in duplicate records, and moved
 		 * forward to a record that has yet to be returned.  Clear
@@ -277,16 +318,22 @@ __bt_seqadv(BTREE *t, EPG *ep, int flags
 			goto usecurrent;
 		idx = c->pg.index;
 		if (++idx == NEXTINDEX(h)) {
+			if (flags == R_RNEXT) {
+				ep->page = h;
+				ep->index = idx;
+				return __bt_rseq_next(t, ep);
+			}
 			pg = h->nextpg;
 			mpool_put(t->bt_mp, h, 0);
 			if (pg == P_INVALID)
-				return (RET_SPECIAL);
-			if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
-				return (RET_ERROR);
+				return RET_SPECIAL;
+			if ((h = mpool_getf(t->bt_mp, pg, 0)) == NULL)
+				return RET_ERROR;
 			idx = 0;
 		}
 		break;
 	case R_PREV:			/* Previous record. */
+	case R_RPREV:
 		/*
 		 * The cursor was deleted in duplicate records, and moved
 		 * backward to a record that has yet to be returned.  Clear
@@ -300,12 +347,17 @@ usecurrent:		F_CLR(c, CURS_AFTER | CURS_
 		}
 		idx = c->pg.index;
 		if (idx == 0) {
+			if (flags == R_RPREV) {
+				ep->page = h;
+				ep->index = idx;
+				return __bt_rseq_prev(t, ep);
+			}
 			pg = h->prevpg;
 			mpool_put(t->bt_mp, h, 0);
 			if (pg == P_INVALID)
-				return (RET_SPECIAL);
-			if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
-				return (RET_ERROR);
+				return RET_SPECIAL;
+			if ((h = mpool_getf(t->bt_mp, pg, 0)) == NULL)
+				return RET_ERROR;
 			idx = NEXTINDEX(h) - 1;
 		} else
 			--idx;
@@ -316,6 +368,83 @@ usecurrent:		F_CLR(c, CURS_AFTER | CURS_
 	ep->index = idx;
 	return (RET_SUCCESS);
 }
+/*
+ * Get the first item on the next page, but by going up and down the tree.
+ */
+static int
+__bt_rseq_next(BTREE *t, EPG *ep)
+{
+	PAGE *h;
+	indx_t idx;
+	EPGNO *up;
+	pgno_t pg;
+
+	h = ep->page;
+	idx = ep->index;
+	do {
+		/* Move up the tree. */
+		up = BT_POP(t);
+		mpool_put(t->bt_mp, h, 0);
+		/* Did we hit the right edge of the root? */
+		if (up == NULL)
+			return RET_SPECIAL;
+		if ((h = mpool_getf(t->bt_mp, up->pgno, 0)) == NULL)
+			return RET_ERROR;
+		idx = up->index;
+	} while (++idx == NEXTINDEX(h));
+
+	while (!(h->flags & (P_BLEAF | P_RLEAF))) {
+		/* Move back down the tree. */
+		BT_PUSH(t, h->pgno, idx);
+		pg = GETBINTERNAL(h, idx)->pgno;
+		mpool_put(t->bt_mp, h, 0);
+		if ((h = mpool_getf(t->bt_mp, pg, 0)) == NULL)
+			return RET_ERROR;
+		idx = 0;
+	}
+	ep->page = h;
+	ep->index = idx;
+	return RET_SUCCESS;
+}
+
+/*
+ * Get the last item on the previous page, but by going up and down the tree.
+ */
+static int
+__bt_rseq_prev(BTREE *t, EPG *ep)
+{
+	PAGE *h;
+	indx_t idx;
+	EPGNO *up;
+	pgno_t pg;
+
+	h = ep->page;
+	idx = ep->index;
+	do {
+		/* Move up the tree. */
+		up = BT_POP(t);
+		mpool_put(t->bt_mp, h, 0);
+		/* Did we hit the left edge of the root? */
+		if (up == NULL)
+			return RET_SPECIAL;
+		if ((h = mpool_getf(t->bt_mp, up->pgno, 0)) == NULL)
+			return RET_ERROR;
+		idx = up->index;
+	} while (idx == 0);
+	--idx;
+	while (!(h->flags & (P_BLEAF | P_RLEAF))) {
+		/* Move back down the tree. */
+		BT_PUSH(t, h->pgno, idx);
+		pg = GETBINTERNAL(h, idx)->pgno;
+		mpool_put(t->bt_mp, h, 0);
+		if ((h = mpool_getf(t->bt_mp, pg, 0)) == NULL)
+			return RET_ERROR;
+		idx = NEXTINDEX(h) - 1;
+	}
+	ep->page = h;
+	ep->index = idx;
+	return RET_SUCCESS;
+}
 
 /*
  * __bt_first --
@@ -334,7 +463,7 @@ usecurrent:		F_CLR(c, CURS_AFTER | CURS_
 static int
 __bt_first(BTREE *t, const DBT *key, EPG *erval, int *exactp)
 {
-	PAGE *h;
+	PAGE *h, *hprev;
 	EPG *ep, save;
 	pgno_t pg;
 
@@ -347,13 +476,13 @@ __bt_first(BTREE *t, const DBT *key, EPG
 	 * page) and return it.
 	 */
 	if ((ep = __bt_search(t, key, exactp)) == NULL)
-		return (0);
+		return RET_SPECIAL;
 	if (*exactp) {
 		if (F_ISSET(t, B_NODUPS)) {
 			*erval = *ep;
 			return (RET_SUCCESS);
 		}
-			
+
 		/*
 		 * Walk backwards, as long as the entry matches and there are
 		 * keys left in the tree.  Save a copy of each match in case
@@ -378,10 +507,14 @@ __bt_first(BTREE *t, const DBT *key, EPG
 					break;
 				if (h->pgno != save.page->pgno)
 					mpool_put(t->bt_mp, h, 0);
-				if ((h = mpool_get(t->bt_mp,
-				    h->prevpg, 0)) == NULL)
-					return (RET_ERROR);
-				ep->page = h;
+				if ((hprev = mpool_getf(t->bt_mp,
+				    h->prevpg, 0)) == NULL) {
+					if (h->pgno == save.page->pgno)
+						mpool_put(t->bt_mp,
+						    save.page, 0);
+ 					return RET_ERROR;
+				}
+				ep->page = h = hprev;
 				ep->index = NEXTINDEX(h);
 			}
 			--ep->index;
@@ -406,7 +539,7 @@ __bt_first(BTREE *t, const DBT *key, EPG
 		mpool_put(t->bt_mp, h, 0);
 		if (pg == P_INVALID)
 			return (RET_SPECIAL);
-		if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
+		if ((h = mpool_getf(t->bt_mp, pg, 0)) == NULL)
 			return (RET_ERROR);
 		ep->index = 0;
 		ep->page = h;

Index: src/lib/libc/db/btree/extern.h
diff -u src/lib/libc/db/btree/extern.h:1.12 src/lib/libc/db/btree/extern.h:1.13
--- src/lib/libc/db/btree/extern.h:1.12	Fri Sep 26 07:41:06 2008
+++ src/lib/libc/db/btree/extern.h	Sat Sep 24 16:11:12 2016
@@ -1,4 +1,4 @@
-/*	$NetBSD: extern.h,v 1.12 2008/09/26 11:41:06 tsutsui Exp $	*/
+/*	$NetBSD: extern.h,v 1.13 2016/09/24 20:11:12 christos Exp $	*/
 
 /*-
  * Copyright (c) 1991, 1993, 1994
@@ -48,6 +48,7 @@ void	 __bt_pgin(void *, pgno_t, void *);
 void	 __bt_pgout(void *, pgno_t, void *);
 int	 __bt_push(BTREE *, pgno_t, int);
 int	 __bt_put(const DB *dbp, DBT *, const DBT *, unsigned int);
+int	 __bt_relink(BTREE *, PAGE *);
 int	 __bt_ret(BTREE *, EPG *, DBT *, DBT *, DBT *, DBT *, int);
 EPG	*__bt_search(BTREE *, const DBT *, int *);
 int	 __bt_seq(const DB *, DBT *, DBT *, unsigned int);

Index: src/lib/libc/db/mpool/mpool.c
diff -u src/lib/libc/db/mpool/mpool.c:1.21 src/lib/libc/db/mpool/mpool.c:1.22
--- src/lib/libc/db/mpool/mpool.c:1.21	Sat Dec 14 13:04:00 2013
+++ src/lib/libc/db/mpool/mpool.c	Sat Sep 24 16:11:12 2016
@@ -1,4 +1,4 @@
-/*	$NetBSD: mpool.c,v 1.21 2013/12/14 18:04:00 christos Exp $	*/
+/*	$NetBSD: mpool.c,v 1.22 2016/09/24 20:11:12 christos Exp $	*/
 
 /*-
  * Copyright (c) 1990, 1993, 1994
@@ -34,7 +34,7 @@
 #endif
 
 #include <sys/cdefs.h>
-__RCSID("$NetBSD: mpool.c,v 1.21 2013/12/14 18:04:00 christos Exp $");
+__RCSID("$NetBSD: mpool.c,v 1.22 2016/09/24 20:11:12 christos Exp $");
 
 #include "namespace.h"
 #include <sys/queue.h>
@@ -55,7 +55,9 @@ __RCSID("$NetBSD: mpool.c,v 1.21 2013/12
 __weak_alias(mpool_close,_mpool_close)
 __weak_alias(mpool_filter,_mpool_filter)
 __weak_alias(mpool_get,_mpool_get)
+__weak_alias(mpool_getf,_mpool_getf)
 __weak_alias(mpool_new,_mpool_new)
+__weak_alias(mpool_newf,_mpool_newf)
 __weak_alias(mpool_open,_mpool_open)
 __weak_alias(mpool_put,_mpool_put)
 __weak_alias(mpool_sync,_mpool_sync)
@@ -121,7 +123,7 @@ mpool_filter(MPOOL *mp, void (*pgin)(voi
  *	Get a new page of memory.
  */
 void *
-mpool_new( MPOOL *mp, pgno_t *pgnoaddr)
+mpool_newf(MPOOL *mp, pgno_t *pgnoaddr, unsigned int flags)
 {
 	struct _hqh *head;
 	BKT *bp;
@@ -140,8 +142,14 @@ mpool_new( MPOOL *mp, pgno_t *pgnoaddr)
 	 */
 	if ((bp = mpool_bkt(mp)) == NULL)
 		return NULL;
-	*pgnoaddr = bp->pgno = mp->npages++;
-	bp->flags = MPOOL_PINNED;
+
+	if (flags == MPOOL_PAGE_REQUEST) {
+		mp->npages++;
+		bp->pgno = *pgnoaddr;
+	} else
+		bp->pgno = *pgnoaddr = mp->npages++;
+
+	bp->flags = MPOOL_PINNED | MPOOL_INUSE;
 
 	head = &mp->hqh[HASHKEY(bp->pgno)];
 	TAILQ_INSERT_HEAD(head, bp, hq);
@@ -149,13 +157,44 @@ mpool_new( MPOOL *mp, pgno_t *pgnoaddr)
 	return bp->page;
 }
 
+void *
+mpool_new(MPOOL *mp, pgno_t *pgnoaddr)
+{
+	return mpool_newf(mp, pgnoaddr, 0);
+}
+
+int
+mpool_delete(MPOOL *mp, void *page)
+{
+	struct _hqh *head;
+	BKT *bp;
+
+	bp = (void *)((char *)page - sizeof(BKT));
+
+#ifdef DEBUG
+	if (!(bp->flags & MPOOL_PINNED)) {
+	       (void)fprintf(stderr,
+		   "%s: page %d not pinned\n", __func__, bp->pgno);
+	       abort();
+	}
+#endif
+
+	/* Remove from the hash and lru queues. */
+	head = &mp->hqh[HASHKEY(bp->pgno)];
+	TAILQ_REMOVE(head, bp, hq);
+	TAILQ_REMOVE(&mp->lqh, bp, q);
+
+	free(bp);
+	return RET_SUCCESS;
+}
+
 /*
  * mpool_get
  *	Get a page.
  */
 /*ARGSUSED*/
 void *
-mpool_get(MPOOL *mp, pgno_t pgno, u_int flags)
+mpool_getf(MPOOL *mp, pgno_t pgno, unsigned int flags)
 {
 	struct _hqh *head;
 	BKT *bp;
@@ -175,7 +214,7 @@ mpool_get(MPOOL *mp, pgno_t pgno, u_int 
 	/* Check for a page that is cached. */
 	if ((bp = mpool_look(mp, pgno)) != NULL) {
 #ifdef DEBUG
-		if (bp->flags & MPOOL_PINNED) {
+		if (!(flags & MPOOL_IGNOREPIN) && bp->flags & MPOOL_PINNED) {
 			(void)fprintf(stderr,
 			    "mpool_get: page %d already pinned\n", bp->pgno);
 			abort();
@@ -192,7 +231,8 @@ mpool_get(MPOOL *mp, pgno_t pgno, u_int 
 		TAILQ_INSERT_TAIL(&mp->lqh, bp, q);
 
 		/* Return a pinned page. */
-		bp->flags |= MPOOL_PINNED;
+		if (!(flags & MPOOL_IGNOREPIN))
+			bp->flags |= MPOOL_PINNED;
 		return bp->page;
 	}
 
@@ -205,15 +245,32 @@ mpool_get(MPOOL *mp, pgno_t pgno, u_int 
 	++mp->pageread;
 #endif
 	off = mp->pagesize * pgno;
+	if (off / mp->pagesize != pgno) {
+		/* Run past the end of the file, or at least the part we
+		   can address without large-file support?  */
+		errno = E2BIG;
+ 		return NULL;
+	}
+
 	if ((nr = pread(mp->fd, bp->page, (size_t)mp->pagesize, off)) != (int)mp->pagesize) {
-		if (nr >= 0)
+		if (nr > 0) {
 			errno = EFTYPE;
-		return NULL;
+			return NULL;
+		} else if (nr == 0) {
+			/*
+			 * A zero-length reads, means you need to create a
+			 * new page.
+			 */
+			memset(bp->page, 0, mp->pagesize);
+		} else
+			return NULL;
 	}
 
 	/* Set the page number, pin the page. */
 	bp->pgno = pgno;
-	bp->flags = MPOOL_PINNED;
+	if (!(flags & MPOOL_IGNOREPIN))
+		bp->flags = MPOOL_PINNED;
+bp->flags |= MPOOL_INUSE;
 
 	/*
 	 * Add the page to the head of the hash chain and the tail
@@ -230,6 +287,12 @@ mpool_get(MPOOL *mp, pgno_t pgno, u_int 
 	return bp->page;
 }
 
+void *
+mpool_get(MPOOL *mp, pgno_t pgno)
+{
+	return mpool_getf(mp, pgno, 0);
+}
+
 /*
  * mpool_put
  *	Return a page.
@@ -252,7 +315,8 @@ mpool_put(MPOOL *mp, void *page, u_int f
 	}
 #endif
 	bp->flags &= ~MPOOL_PINNED;
-	bp->flags |= flags & MPOOL_DIRTY;
+	if (flags & MPOOL_DIRTY)
+		bp->flags |= flags & MPOOL_DIRTY;
 	return (RET_SUCCESS);
 }
 
@@ -371,6 +435,13 @@ mpool_write(MPOOL *mp, BKT *bp)
 		(mp->pgout)(mp->pgcookie, bp->pgno, bp->page);
 
 	off = mp->pagesize * bp->pgno;
+	if (off / mp->pagesize != bp->pgno) {
+		/* Run past the end of the file, or at least the part we
+		   can address without large-file support?  */
+		errno = E2BIG;
+		return RET_ERROR;
+	}
+
 	if (pwrite(mp->fd, bp->page, (size_t)mp->pagesize, off) !=
 	    (ssize_t)mp->pagesize)
 		return RET_ERROR;

Index: src/lib/libc/db/recno/rec_open.c
diff -u src/lib/libc/db/recno/rec_open.c:1.20 src/lib/libc/db/recno/rec_open.c:1.21
--- src/lib/libc/db/recno/rec_open.c:1.20	Sat Nov 30 19:22:48 2013
+++ src/lib/libc/db/recno/rec_open.c	Sat Sep 24 16:11:12 2016
@@ -1,4 +1,4 @@
-/*	$NetBSD: rec_open.c,v 1.20 2013/12/01 00:22:48 christos Exp $	*/
+/*	$NetBSD: rec_open.c,v 1.21 2016/09/24 20:11:12 christos Exp $	*/
 
 /*-
  * Copyright (c) 1990, 1993, 1994
@@ -37,7 +37,7 @@
 #endif
 
 #include <sys/cdefs.h>
-__RCSID("$NetBSD: rec_open.c,v 1.20 2013/12/01 00:22:48 christos Exp $");
+__RCSID("$NetBSD: rec_open.c,v 1.21 2016/09/24 20:11:12 christos Exp $");
 
 #include "namespace.h"
 #include <sys/types.h>
@@ -197,7 +197,7 @@ slow:			if ((t->bt_rfp = fdopen(rfd, "r"
 	dbp->sync = __rec_sync;
 
 	/* If the root page was created, reset the flags. */
-	if ((h = mpool_get(t->bt_mp, P_ROOT, 0)) == NULL)
+	if ((h = mpool_getf(t->bt_mp, P_ROOT, 0)) == NULL)
 		goto err;
 	if ((h->flags & P_TYPE) == P_BLEAF) {
 		F_CLR(h, P_TYPE);

Index: src/lib/libc/db/recno/rec_search.c
diff -u src/lib/libc/db/recno/rec_search.c:1.14 src/lib/libc/db/recno/rec_search.c:1.15
--- src/lib/libc/db/recno/rec_search.c:1.14	Thu Sep 11 08:58:00 2008
+++ src/lib/libc/db/recno/rec_search.c	Sat Sep 24 16:11:12 2016
@@ -1,4 +1,4 @@
-/*	$NetBSD: rec_search.c,v 1.14 2008/09/11 12:58:00 joerg Exp $	*/
+/*	$NetBSD: rec_search.c,v 1.15 2016/09/24 20:11:12 christos Exp $	*/
 
 /*-
  * Copyright (c) 1990, 1993
@@ -34,7 +34,7 @@
 #endif
 
 #include <sys/cdefs.h>
-__RCSID("$NetBSD: rec_search.c,v 1.14 2008/09/11 12:58:00 joerg Exp $");
+__RCSID("$NetBSD: rec_search.c,v 1.15 2016/09/24 20:11:12 christos Exp $");
 
 #include "namespace.h"
 #include <sys/types.h>
@@ -77,7 +77,7 @@ __rec_search(BTREE *t, recno_t recno, en
 
 	BT_CLR(t);
 	for (pg = P_ROOT, total = 0;;) {
-		if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
+		if ((h = mpool_getf(t->bt_mp, pg, 0)) == NULL)
 			goto err;
 		if (h->flags & P_RLEAF) {
 			t->bt_cur.page = h;
@@ -113,7 +113,7 @@ __rec_search(BTREE *t, recno_t recno, en
 err:	sverrno = errno;
 	if (op != SEARCH)
 		while  ((parent = BT_POP(t)) != NULL) {
-			if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
+			if ((h = mpool_getf(t->bt_mp, parent->pgno, 0)) == NULL)
 				break;
 			if (op == SINSERT)
 				--GETRINTERNAL(h, parent->index)->nrecs;

Reply via email to