On 08/15/2014 02:02 AM, Alvaro Herrera wrote:
Alvaro Herrera wrote:
Heikki Linnakangas wrote:

I'm sure this still needs some cleanup, but here's the patch, based
on your v14. Now that I know what this approach looks like, I still
like it much better. The insert and update code is somewhat more
complicated, because you have to be careful to lock the old page,
new page, and revmap page in the right order. But it's not too bad,
and it gets rid of all the complexity in vacuum.

It seems there is some issue here, because pageinspect tells me the
index is not growing properly for some reason.  minmax_revmap_data gives
me this array of TIDs after a bunch of insert/vacuum/delete/ etc:

I fixed this issue, and did a lot more rework and bugfixing.  Here's
v15, based on v14-heikki2.

So, the other design change I've been advocating is to store the revmap in the first N blocks, instead of having the two-level structure with array pages and revmap pages.

Attached is a patch for that, to be applied after v15. When the revmap needs to be expanded, all the tuples on it are moved elsewhere one-by-one. That adds some latency to the unfortunate guy who needs to do that, but as the patch stands, the revmap is only ever extended by VACUUM or CREATE INDEX, so I think that's fine. Like with my previous patch, the point is to demonstrate how much simpler the code becomes this way; I'm sure there are bugs and cleanup still necessary.

PS. Spotted one oversight in patch v15: callers of mm_doupdate must check the return value, and retry the operation if it returns false.

- Heikki

commit ce4df0e9dbd43f7e3d4fdf3f7920301f81f17d63
Author: Heikki Linnakangas <heikki.linnakan...@iki.fi>
Date:   Fri Aug 15 18:32:19 2014 +0300

    Get rid of array pages. Instead, move all tuples out of the way

diff --git a/contrib/pageinspect/mmfuncs.c b/contrib/pageinspect/mmfuncs.c
index 6cd559a..51cc9e2 100644
--- a/contrib/pageinspect/mmfuncs.c
+++ b/contrib/pageinspect/mmfuncs.c
@@ -74,9 +74,6 @@ minmax_page_type(PG_FUNCTION_ARGS)
 		case MINMAX_PAGETYPE_META:
 			type = "meta";
 			break;
-		case MINMAX_PAGETYPE_REVMAP_ARRAY:
-			type = "revmap array";
-			break;
 		case MINMAX_PAGETYPE_REVMAP:
 			type = "revmap";
 			break;
@@ -343,11 +340,9 @@ minmax_metapage_info(PG_FUNCTION_ARGS)
 	Page		page;
 	MinmaxMetaPageData *meta;
 	TupleDesc	tupdesc;
-	Datum		values[3];
-	bool		nulls[3];
-	ArrayBuildState *astate = NULL;
+	Datum		values[4];
+	bool		nulls[4];
 	HeapTuple	htup;
-	int			i;
 
 	page = verify_minmax_page(raw_page, MINMAX_PAGETYPE_META, "metapage");
 
@@ -361,22 +356,8 @@ minmax_metapage_info(PG_FUNCTION_ARGS)
 	MemSet(nulls, 0, sizeof(nulls));
 	values[0] = CStringGetTextDatum(psprintf("0x%08X", meta->minmaxMagic));
 	values[1] = Int32GetDatum(meta->minmaxVersion);
-
-	/* Extract (possibly empty) list of revmap array page numbers. */
-	for (i = 0; i < MAX_REVMAP_ARRAYPAGES; i++)
-	{
-		BlockNumber	blkno;
-
-		blkno = meta->revmapArrayPages[i];
-		if (blkno == InvalidBlockNumber)
-			break;	/* XXX or continue? */
-		astate = accumArrayResult(astate, Int64GetDatum((int64) blkno),
-								  false, INT8OID, CurrentMemoryContext);
-	}
-	if (astate == NULL)
-		nulls[2] = true;
-	else
-		values[2] = makeArrayResult(astate, CurrentMemoryContext);
+	values[2] = Int32GetDatum(meta->pagesPerRange);
+	values[3] = Int64GetDatum(meta->lastRevmapPage);
 
 	htup = heap_form_tuple(tupdesc, values, nulls);
 
@@ -384,34 +365,6 @@ minmax_metapage_info(PG_FUNCTION_ARGS)
 }
 
 /*
- * Return the BlockNumber array stored in a revmap array page
- */
-Datum
-minmax_revmap_array_data(PG_FUNCTION_ARGS)
-{
-	bytea	   *raw_page = PG_GETARG_BYTEA_P(0);
-	Page		page;
-	ArrayBuildState *astate = NULL;
-	RevmapArrayContents *contents;
-	Datum		blkarr;
-	int			i;
-
-	page = verify_minmax_page(raw_page, MINMAX_PAGETYPE_REVMAP_ARRAY,
-							  "revmap array");
-
-	contents = (RevmapArrayContents *) PageGetContents(page);
-
-	for (i = 0; i < contents->rma_nblocks; i++)
-		astate = accumArrayResult(astate,
-								  Int64GetDatum((int64) contents->rma_blocks[i]),
-								  false, INT8OID, CurrentMemoryContext);
-	Assert(astate != NULL);
-
-	blkarr = makeArrayResult(astate, CurrentMemoryContext);
-	PG_RETURN_DATUM(blkarr);
-}
-
-/*
  * Return the TID array stored in a minmax revmap page
  */
 Datum
@@ -437,7 +390,7 @@ minmax_revmap_data(PG_FUNCTION_ARGS)
 	/* Extract values from the revmap page */
 	contents = (RevmapContents *) PageGetContents(page);
 	MemSet(nulls, 0, sizeof(nulls));
-	values[0] = Int64GetDatum((uint64) contents->rmr_logblk);
+	values[0] = Int64GetDatum((uint64) 0);
 
 	/* Extract (possibly empty) list of TIDs in this page. */
 	for (i = 0; i < REGULAR_REVMAP_PAGE_MAXITEMS; i++)
diff --git a/contrib/pageinspect/pageinspect--1.2.sql b/contrib/pageinspect/pageinspect--1.2.sql
index 56c9ba8..cba90ca 100644
--- a/contrib/pageinspect/pageinspect--1.2.sql
+++ b/contrib/pageinspect/pageinspect--1.2.sql
@@ -110,7 +110,7 @@ LANGUAGE C STRICT;
 -- minmax_metapage_info()
 --
 CREATE FUNCTION minmax_metapage_info(IN page bytea, OUT magic text,
-	OUT version integer, OUT revmap_array_pages BIGINT[])
+	OUT version integer, OUT pagesperrange integer, OUT lastrevmappage bigint)
 AS 'MODULE_PATHNAME', 'minmax_metapage_info'
 LANGUAGE C STRICT;
 
@@ -128,16 +128,9 @@ AS 'MODULE_PATHNAME', 'minmax_page_items'
 LANGUAGE C STRICT;
 
 --
--- minmax_revmap_array_data()
-CREATE FUNCTION minmax_revmap_array_data(IN page bytea,
-	OUT revmap_pages BIGINT[])
-AS 'MODULE_PATHNAME', 'minmax_revmap_array_data'
-LANGUAGE C STRICT;
-
---
 -- minmax_revmap_data()
 CREATE FUNCTION minmax_revmap_data(IN page bytea,
-	OUT logblk BIGINT, OUT pages tid[])
+	OUT dummy bigint, OUT pages tid[])
 AS 'MODULE_PATHNAME', 'minmax_revmap_data'
 LANGUAGE C STRICT;
 
diff --git a/src/backend/access/minmax/minmax.c b/src/backend/access/minmax/minmax.c
index addb3a0..18f85d7 100644
--- a/src/backend/access/minmax/minmax.c
+++ b/src/backend/access/minmax/minmax.c
@@ -34,9 +34,11 @@
 #include "storage/freespace.h"
 #include "storage/indexfsm.h"
 #include "storage/lmgr.h"
+#include "storage/smgr.h"
 #include "utils/datum.h"
 #include "utils/lsyscache.h"
 #include "utils/memutils.h"
+#include "utils/rel.h"
 #include "utils/syscache.h"
 
 
@@ -76,8 +78,8 @@ static void summarize_range(MMBuildState *mmstate, Relation heapRel,
 static bool mm_doupdate(Relation idxrel, BlockNumber pagesPerRange,
 			mmRevmapAccess *rmAccess, BlockNumber heapBlk,
 			Buffer oldbuf, OffsetNumber oldoff,
-			MMTuple *origtup, Size origsz,
-			MMTuple *newtup, Size newsz,
+			const MMTuple *origtup, Size origsz,
+			const MMTuple *newtup, Size newsz,
 			bool samepage, bool *extended);
 static void mm_doinsert(Relation idxrel, BlockNumber pagesPerRange,
 			mmRevmapAccess *rmAccess, Buffer *buffer, BlockNumber heapblkno,
@@ -85,6 +87,7 @@ static void mm_doinsert(Relation idxrel, BlockNumber pagesPerRange,
 static Buffer mm_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
 				   bool *extended);
 static void form_and_insert_tuple(MMBuildState *mmstate);
+static Size mm_page_get_freespace(Page page);
 
 
 /*
@@ -536,11 +539,15 @@ mmbuild(PG_FUNCTION_ARGS)
 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 				 errmsg("expression indexes not supported")));
 
+	/*
+	 * Critical section not required, because on error the creation of the
+	 * whole relation will be rolled back.
+	 */
+
 	meta = ReadBuffer(index, P_NEW);
 	Assert(BufferGetBlockNumber(meta) == MINMAX_METAPAGE_BLKNO);
 	LockBuffer(meta, BUFFER_LOCK_EXCLUSIVE);
 
-	START_CRIT_SECTION();
 	mm_metapage_init(BufferGetPage(meta), MinmaxGetPagesPerRange(index),
 					 MINMAX_CURRENT_VERSION);
 	MarkBufferDirty(meta);
@@ -568,17 +575,11 @@ mmbuild(PG_FUNCTION_ARGS)
 	}
 
 	UnlockReleaseBuffer(meta);
-	END_CRIT_SECTION();
-
-	/*
-	 * Set up an empty revmap, and get access to it
-	 */
-	mmRevmapCreate(index);
-	rmAccess = mmRevmapAccessInit(index, &pagesPerRange);
 
 	/*
 	 * Initialize our state, including the deformed tuple state.
 	 */
+	rmAccess = mmRevmapAccessInit(index, &pagesPerRange);
 	mmstate = initialize_mm_buildstate(index, rmAccess, pagesPerRange);
 
 	/*
@@ -664,10 +665,11 @@ mmvacuumcleanup(PG_FUNCTION_ARGS)
 	heapRel = heap_open(IndexGetRelation(RelationGetRelid(info->index), false),
 						AccessShareLock);
 
+	rmAccess = mmRevmapAccessInit(info->index, &pagesPerRange);
+
 	/*
 	 * Scan the revmap to find unsummarized items.
 	 */
-	rmAccess = mmRevmapAccessInit(info->index, &pagesPerRange);
 	buf = InvalidBuffer;
 	heapNumBlocks = RelationGetNumberOfBlocks(heapRel);
 	for (heapBlk = 0; heapBlk < heapNumBlocks; heapBlk += pagesPerRange)
@@ -751,13 +753,32 @@ mm_page_init(Page page, uint16 type)
 }
 
 /*
+ * Return the amount of free space on a regular minmax index page.
+ *
+ * If the page is not a regular page, or has been marked with the
+ * MINMAX_EVACUATE_PAGE flag, returns 0.
+ */
+static Size
+mm_page_get_freespace(Page page)
+{
+	MinmaxSpecialSpace *spc = (MinmaxSpecialSpace *) PageGetSpecialPointer(page);
+
+	/* Not a regular page: report no usable space. */
+	if (!MINMAX_IS_REGULAR_PAGE(page))
+		return 0;
+	/* Marked for evacuation: report no usable space either. */
+	if ((spc->flags & MINMAX_EVACUATE_PAGE) != 0)
+		return 0;
+	return PageGetFreeSpace(page);
+}
+
+/*
  * Initialize a new minmax index' metapage.
  */
 void
 mm_metapage_init(Page page, BlockNumber pagesPerRange, uint16 version)
 {
 	MinmaxMetaPageData	*metadata;
-	int			i;
 
 	mm_page_init(page, MINMAX_PAGETYPE_META);
 
@@ -766,8 +787,7 @@ mm_metapage_init(Page page, BlockNumber pagesPerRange, uint16 version)
 	metadata->minmaxMagic = MINMAX_META_MAGIC;
 	metadata->pagesPerRange = pagesPerRange;
 	metadata->minmaxVersion = version;
-	for (i = 0; i < MAX_REVMAP_ARRAYPAGES; i++)
-		metadata->revmapArrayPages[i] = InvalidBlockNumber;
+	metadata->lastRevmapPage = 0;
 }
 
 /*
@@ -875,7 +895,7 @@ terminate_mm_buildstate(MMBuildState *mmstate)
 		page = BufferGetPage(mmstate->currentInsertBuf);
 		RecordPageWithFreeSpace(mmstate->irel,
 								BufferGetBlockNumber(mmstate->currentInsertBuf),
-								PageGetFreeSpace(page));
+								mm_page_get_freespace(page));
 		ReleaseBuffer(mmstate->currentInsertBuf);
 	}
 	vacuumfsm = mmstate->extended;
@@ -938,8 +958,8 @@ static bool
 mm_doupdate(Relation idxrel, BlockNumber pagesPerRange,
 			mmRevmapAccess *rmAccess, BlockNumber heapBlk,
 			Buffer oldbuf, OffsetNumber oldoff,
-			MMTuple *origtup, Size origsz,
-			MMTuple *newtup, Size newsz,
+			const MMTuple *origtup, Size origsz,
+			const MMTuple *newtup, Size newsz,
 			bool samepage, bool *extended)
 {
 	Page		oldpage;
@@ -947,11 +967,15 @@ mm_doupdate(Relation idxrel, BlockNumber pagesPerRange,
 	MMTuple	   *oldtup;
 	Size		oldsz;
 	Buffer		newbuf;
+	MinmaxSpecialSpace *special;
 
 	if (!samepage)
 	{
 		/* need a page on which to put the item */
 		newbuf = mm_getinsertbuffer(idxrel, oldbuf, newsz, extended);
+		if (!BufferIsValid(newbuf))
+			return false;
+
 		/*
 		 * Note: it's possible (though unlikely) that the returned newbuf is
 		 * the same as oldbuf, if mm_getinsertbuffer determined that the old
@@ -985,6 +1009,8 @@ mm_doupdate(Relation idxrel, BlockNumber pagesPerRange,
 		return false;
 	}
 
+	special = (MinmaxSpecialSpace *) PageGetSpecialPointer(oldpage);
+
 	/*
 	 * Great, the old tuple is intact.  We can proceed with the update.
 	 *
@@ -994,7 +1020,8 @@ mm_doupdate(Relation idxrel, BlockNumber pagesPerRange,
 	 * the caller told us there isn't, if a concurrent updated moved a tuple
 	 * elsewhere or replaced a tuple with a smaller one.
 	 */
-	if (newsz <= origsz || PageGetExactFreeSpace(oldpage) >= (origsz - newsz))
+	if ((special->flags & MINMAX_EVACUATE_PAGE) == 0 &&
+		(newsz <= origsz || PageGetExactFreeSpace(oldpage) >= (origsz - newsz)))
 	{
 		if (BufferIsValid(newbuf))
 			UnlockReleaseBuffer(newbuf);
@@ -1151,34 +1178,44 @@ mm_doinsert(Relation idxrel, BlockNumber pagesPerRange,
 
 	itemsz = MAXALIGN(itemsz);
 
+	/*
+	 * Lock the revmap page for the update. Note that this may require
+	 * extending the revmap, which in turn may require moving the currently
+	 * pinned index block out of the way.
+	 */
+	revmapbuf = mmLockRevmapPageForUpdate(rmAccess, heapBlk);
+
+	/*
+	 * Obtain a locked buffer to insert the new tuple.  Note mm_getinsertbuffer
+	 * ensures there's enough space in the returned buffer.
+	 */
 	if (BufferIsValid(*buffer))
 	{
 		page = BufferGetPage(*buffer);
 		LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
-		if (PageGetFreeSpace(page) < itemsz)
+
+		/*
+		 * It's possible that another backend (or ourselves!) extended the
+		 * revmap over the page we held a pin on, so we cannot assume that
+		 * it's still a regular page.
+		 */
+		if (mm_page_get_freespace(page) < itemsz)
 		{
 			UnlockReleaseBuffer(*buffer);
 			*buffer = InvalidBuffer;
 		}
 	}
-
-	/*
-	 * Obtain a locked buffer to insert the new tuple.  Note mm_getinsertbuffer
-	 * ensures there's enough space in the returned buffer.
-	 */
 	if (!BufferIsValid(*buffer))
 	{
 		*buffer = mm_getinsertbuffer(idxrel, InvalidBuffer, itemsz, extended);
+		Assert(BufferIsValid(*buffer));
 		page = BufferGetPage(*buffer);
-		Assert(PageGetFreeSpace(page) >= itemsz);
+		Assert(mm_page_get_freespace(page) >= itemsz);
 	}
 
 	page = BufferGetPage(*buffer);
 	blk = BufferGetBlockNumber(*buffer);
 
-	/* lock the revmap for the update */
-	revmapbuf = mmLockRevmapPageForUpdate(rmAccess, heapBlk);
-
 	START_CRIT_SECTION();
 	off = PageAddItem(page, (Item) tup, itemsz, InvalidOffsetNumber,
 					  false, false);
@@ -1233,12 +1270,116 @@ mm_doinsert(Relation idxrel, BlockNumber pagesPerRange,
 }
 
 /*
+ * Checks whether a regular minmax index page still contains any tuples.
+ *
+ * If it does, the page is marked for "evacuation", meaning that no new tuples
+ * will be added to it, and true is returned; otherwise returns false.
+ */
+bool
+mm_start_evacuating_page(Relation idxRel, Buffer buf)
+{
+	Page		pg = BufferGetPage(buf);
+	OffsetNumber lastoff;
+	OffsetNumber cur;
+	bool		inuse = false;
+	MinmaxSpecialSpace *spc;
+
+	/* A new (uninitialized) page is reported as not needing evacuation. */
+	if (PageIsNew(pg))
+		return false;
+
+	spc = (MinmaxSpecialSpace *) PageGetSpecialPointer(pg);
+
+	/* Scan the line pointers, stopping at the first one that is in use. */
+	lastoff = PageGetMaxOffsetNumber(pg);
+	for (cur = FirstOffsetNumber; cur <= lastoff && !inuse; cur++)
+		inuse = ItemIdIsUsed(PageGetItemId(pg, cur));
+
+	if (!inuse)
+		return false;
+
+	/*
+	 * The page has at least one item.  Set MINMAX_EVACUATE_PAGE to prevent
+	 * other backends from adding more stuff to it, and dirty the buffer.
+	 */
+	spc->flags |= MINMAX_EVACUATE_PAGE;
+	MarkBufferDirtyHint(buf, true);
+
+	return true;
+}
+
+/*
+ * Move all tuples out of a page that has been marked MINMAX_EVACUATE_PAGE.
+ *
+ * The caller must hold an exclusive lock on the page.  The lock is dropped
+ * while each tuple is moved; on return, both lock and pin are released.
+ */
+void
+mm_evacuate_page(Relation idxRel, Buffer buf)
+{
+	OffsetNumber off;
+	OffsetNumber maxoff;
+	MinmaxSpecialSpace *special PG_USED_FOR_ASSERTS_ONLY;
+	Page		page;
+	mmRevmapAccess *rmAccess;
+	BlockNumber pagesPerRange;
+
+	rmAccess = mmRevmapAccessInit(idxRel, &pagesPerRange);
+
+	page = BufferGetPage(buf);
+	special = (MinmaxSpecialSpace *) PageGetSpecialPointer(page);
+
+	Assert(special->flags & MINMAX_EVACUATE_PAGE);
+
+	maxoff = PageGetMaxOffsetNumber(page);
+	for (off = FirstOffsetNumber; off <= maxoff; off++)
+	{
+		MMTuple	   *tup;
+		Size		sz;
+		ItemId		lp;
+		bool		extended = false;
+
+		lp = PageGetItemId(page, off);
+		if (ItemIdIsUsed(lp))
+		{
+			tup = (MMTuple *) PageGetItem(page, lp);
+			sz = ItemIdGetLength(lp);
+			/* work on a copy: the page lock is released just below */
+			tup = minmax_copy_tuple(tup, sz);
+
+			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+
+			if (!mm_doupdate(idxRel, pagesPerRange, rmAccess, tup->mt_blkno, buf,
+							 off, tup, sz, tup, sz, false, &extended))
+				off--; /* cancel the loop increment so this offset is retried */
+
+			LockBuffer(buf, BUFFER_LOCK_SHARE);
+
+			if (extended)
+				IndexFreeSpaceMapVacuum(idxRel);
+
+			/* It's possible that someone extended the revmap over this page */
+			if (!MINMAX_IS_REGULAR_PAGE(page))
+				break;
+		}
+	}
+
+	mmRevmapAccessTerminate(rmAccess);
+
+	UnlockReleaseBuffer(buf);
+}
+
+/*
  * Return a pinned and locked buffer which can be used to insert an index item
  * of size itemsz.  If oldbuf is a valid buffer, it is also locked (in a order
  * determined to avoid deadlocks.)
  *
  * If there's no existing page with enough free space to accomodate the new
  * item, the relation is extended.  If this happens, *extended is set to true.
+ *
+ * If we find that the old page is no longer a regular index page (because
+ * of a revmap extension), the old buffer is unlocked and we return
+ * InvalidBuffer.
  */
 static Buffer
 mm_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
@@ -1261,7 +1402,9 @@ mm_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
 	 * if we have to restart here, neither buffer is locked and buf is not
 	 * a pinned buffer.
 	 */
-	newblk = GetPageWithFreeSpace(irel, itemsz);
+	newblk = RelationGetTargetBlock(irel);
+	if (newblk == InvalidBlockNumber)
+		newblk = GetPageWithFreeSpace(irel, itemsz);
 	for (;;)
 	{
 		Buffer		buf;
@@ -1298,14 +1441,19 @@ mm_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
 			buf = ReadBuffer(irel, newblk);
 		}
 
-		if (BufferIsValid(oldbuf) && newblk < oldblk)
+		if (BufferIsValid(oldbuf) && oldblk < newblk)
+		{
 			LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
+			if (!MINMAX_IS_REGULAR_PAGE(BufferGetPage(oldbuf)))
+			{
+				LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
+				ReleaseBuffer(buf);
+				return InvalidBuffer;
+			}
+		}
 
 		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
 
-		if (BufferIsValid(oldbuf) && newblk > oldblk)
-			LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
-
 		if (extensionLockHeld)
 			UnlockRelationForExtension(irel, ExclusiveLock);
 
@@ -1319,13 +1467,21 @@ mm_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
 		 * Check that the new page has enough free space, and return it if it
 		 * does; otherwise start over.  Note that we allow for the FSM to be
 		 * out of date here, and in that case we update it and move on.
+		 *
+		 * (mm_page_get_freespace also checks that the FSM didn't hand us a
+		 * page that has since been repurposed for the revmap.)
 		 */
-		freespace = PageGetFreeSpace(page);
-
+		freespace = mm_page_get_freespace(page);
 		if (freespace >= itemsz)
 		{
 			if (extended)
 				*was_extended = true;
+			RelationSetTargetBlock(irel, BufferGetBlockNumber(buf));
+
+			/* Lock the old buffer if not locked already */
+			if (BufferIsValid(oldbuf) && newblk < oldblk)
+				LockBuffer(oldbuf, BUFFER_LOCK_EXCLUSIVE);
+
 			return buf;
 		}
 
@@ -1352,7 +1508,7 @@ mm_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz,
 
 		if (newblk != oldblk)
 			UnlockReleaseBuffer(buf);
-		if (BufferIsValid(oldbuf))
+		if (BufferIsValid(oldbuf) && oldblk < newblk)
 			LockBuffer(oldbuf, BUFFER_LOCK_UNLOCK);
 
 		newblk = RecordAndGetPageWithFreeSpace(irel, newblk, freespace, itemsz);
diff --git a/src/backend/access/minmax/mmrevmap.c b/src/backend/access/minmax/mmrevmap.c
index 923490e..48df2cd 100644
--- a/src/backend/access/minmax/mmrevmap.c
+++ b/src/backend/access/minmax/mmrevmap.c
@@ -8,14 +8,10 @@
  * into a table that violates the previously recorded min/max values, a new
  * tuple is inserted into the index and the revmap is updated to point to it.
  *
- * The pages of the revmap are interspersed in the index's main fork.  The
- * first revmap page is always the index's page number one (that is,
- * immediately after the metapage).  Subsequent revmap pages are allocated as
- * they are needed; their locations are tracked by "array pages".  The metapage
- * contains a large BlockNumber array, which correspond to array pages.  Thus,
- * to find the second revmap page, we read the metapage and obtain the block
- * number of the first array page; we then read that page, and the first
- * element in it is the revmap page we're looking for.
+ * The pages of the revmap are at the beginning of the index, starting at
+ * block 1, immediately after the metapage.  When the revmap needs to be
+ * expanded, all tuples on the regular minmax page at that block (if any) are
+ * moved out of the way.
  *
  * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
@@ -41,7 +37,7 @@
 
 
 /*
- * In regular revmap pages, each item stores an ItemPointerData.  These defines
+ * In revmap pages, each item stores an ItemPointerData.  These defines
  * let one find the logical revmap page number and index number of the revmap
  * item for the given heap block number.
  */
@@ -50,29 +46,19 @@
 #define HEAPBLK_TO_REVMAP_INDEX(pagesPerRange, heapBlk) \
 	((heapBlk / pagesPerRange) % REGULAR_REVMAP_PAGE_MAXITEMS)
 
-/*
- * In array revmap pages, each item stores a BlockNumber.  These defines let
- * one find the page and index number of a given revmap block number.  Note
- * that the first revmap page (revmap logical page number 0) is always stored
- * in physical block number 1, so array pages do not store that one.
- */
-#define MAPBLK_TO_RMARRAY_BLK(rmBlk)	((rmBlk - 1) / ARRAY_REVMAP_PAGE_MAXITEMS)
-#define MAPBLK_TO_RMARRAY_INDEX(rmBlk)	((rmBlk - 1) % ARRAY_REVMAP_PAGE_MAXITEMS)
-
 
 struct mmRevmapAccess
 {
 	Relation	idxrel;
 	BlockNumber pagesPerRange;
+	BlockNumber lastRevmapPage;		/* cached from the metapage */
 	Buffer		metaBuf;
 	Buffer		currBuf;
-	Buffer		currArrayBuf;
-	BlockNumber *revmapArrayPages;
 };
 /* typedef appears in minmax_revmap.h */
 
 
-static Buffer mm_getnewbuffer(Relation irel);
+static void rm_extend(mmRevmapAccess *rmAccess);
 
 /*
  * Initialize an access object for a reverse range map, which can be used to
@@ -94,8 +80,7 @@ mmRevmapAccessInit(Relation idxrel, BlockNumber *pagesPerRange)
 	rmAccess->idxrel = idxrel;
 	rmAccess->pagesPerRange = metadata->pagesPerRange;
 	rmAccess->currBuf = InvalidBuffer;
-	rmAccess->currArrayBuf = InvalidBuffer;
-	rmAccess->revmapArrayPages = NULL;
+	rmAccess->lastRevmapPage = InvalidBlockNumber;
 
 	if (pagesPerRange)
 		*pagesPerRange = metadata->pagesPerRange;
@@ -109,30 +94,24 @@ mmRevmapAccessInit(Relation idxrel, BlockNumber *pagesPerRange)
 void
 mmRevmapAccessTerminate(mmRevmapAccess *rmAccess)
 {
-	if (rmAccess->revmapArrayPages != NULL)
-		pfree(rmAccess->revmapArrayPages);
 	if (rmAccess->metaBuf != InvalidBuffer)
 		ReleaseBuffer(rmAccess->metaBuf);
 	if (rmAccess->currBuf != InvalidBuffer)
 		ReleaseBuffer(rmAccess->currBuf);
-	if (rmAccess->currArrayBuf != InvalidBuffer)
-		ReleaseBuffer(rmAccess->currArrayBuf);
 	pfree(rmAccess);
 }
 
 /*
- * Lock the metapage as specified by called, and update the given rmAccess with
- * the metapage data.  The metapage buffer is locked when this function
- * returns; it's the caller's responsibility to unlock it.
+ * Read the metapage and update the given rmAccess with the metapage data.
  */
 static void
-rmaccess_get_metapage(mmRevmapAccess *rmAccess, int lockmode)
+rmaccess_read_metapage(mmRevmapAccess *rmAccess)
 {
 	MinmaxMetaPageData *metadata;
 	MinmaxSpecialSpace *special PG_USED_FOR_ASSERTS_ONLY;
 	Page		metapage;
 
-	LockBuffer(rmAccess->metaBuf, lockmode);
+	LockBuffer(rmAccess->metaBuf, BUFFER_LOCK_SHARE);
 	metapage = BufferGetPage(rmAccess->metaBuf);
 
 #ifdef USE_ASSERT_CHECKING
@@ -141,51 +120,11 @@ rmaccess_get_metapage(mmRevmapAccess *rmAccess, int lockmode)
 	Assert(special->type == MINMAX_PAGETYPE_META);
 #endif
 
-	/* first time through? allocate the array */
-	if (rmAccess->revmapArrayPages == NULL)
-		rmAccess->revmapArrayPages =
-			palloc(sizeof(BlockNumber) * MAX_REVMAP_ARRAYPAGES);
-
 	metadata = (MinmaxMetaPageData *) PageGetContents(metapage);
-	memcpy(rmAccess->revmapArrayPages, metadata->revmapArrayPages,
-		   sizeof(BlockNumber) * MAX_REVMAP_ARRAYPAGES);
-}
-
-/*
- * Update the metapage, so that item arrayBlkIdx in the array of revmap array
- * pages points to block number newPgBlkno.
- */
-static void
-update_minmax_metapg(Relation idxrel, Buffer meta, uint32 arrayBlkIdx,
-					 BlockNumber newPgBlkno)
-{
-	MinmaxMetaPageData *metadata;
-
-	metadata = (MinmaxMetaPageData *) PageGetContents(BufferGetPage(meta));
-
-	START_CRIT_SECTION();
-	metadata->revmapArrayPages[arrayBlkIdx] = newPgBlkno;
-	MarkBufferDirty(meta);
-	if (RelationNeedsWAL(idxrel))
-	{
-		xl_minmax_metapg_set	xlrec;
-		XLogRecPtr	recptr;
-		XLogRecData	rdata;
 
-		xlrec.node = idxrel->rd_node;
-		xlrec.blkidx = arrayBlkIdx;
-		xlrec.newpg = newPgBlkno;
+	rmAccess->lastRevmapPage = metadata->lastRevmapPage;
 
-		rdata.data = (char *) &xlrec;
-		rdata.len = SizeOfMinmaxMetapgSet;
-		rdata.buffer = InvalidBuffer;
-		rdata.buffer_std = false;
-		rdata.next = NULL;
-
-		recptr = XLogInsert(RM_MINMAX_ID, XLOG_MINMAX_METAPG_SET, &rdata);
-		PageSetLSN(BufferGetPage(meta), recptr);
-	}
-	END_CRIT_SECTION();
+	LockBuffer(rmAccess->metaBuf, BUFFER_LOCK_UNLOCK);
 }
 
 /*
@@ -200,250 +139,140 @@ update_minmax_metapg(Relation idxrel, Buffer meta, uint32 arrayBlkIdx,
 static BlockNumber
 rm_get_phys_blkno(mmRevmapAccess *rmAccess, BlockNumber mapBlk, bool extend)
 {
-	int		arrayBlkIdx;
-	BlockNumber arrayBlk;
-	RevmapArrayContents *contents;
-	int		revmapIdx;
 	BlockNumber targetblk;
 
+	if (rmAccess->lastRevmapPage == InvalidBlockNumber)
+		rmaccess_read_metapage(rmAccess);
+
 	/* the first revmap page is always block number 1 */
-	if (mapBlk == 0)
-		return (BlockNumber) 1;
+	targetblk = mapBlk + 1;
 
-	/*
-	 * For all other cases, take the long route of checking the metapage and
-	 * revmap array pages.
-	 */
+	if (targetblk <= rmAccess->lastRevmapPage)
+		return targetblk;
 
-	/*
-	 * Copy the revmap array from the metapage into private storage, if not
-	 * done already in this scan.
-	 */
-	if (rmAccess->revmapArrayPages == NULL)
-	{
-		rmaccess_get_metapage(rmAccess, BUFFER_LOCK_SHARE);
-		LockBuffer(rmAccess->metaBuf, BUFFER_LOCK_UNLOCK);
-	}
+	if (!extend)
+		return InvalidBlockNumber;
 
-	/*
-	 * Consult the metapage array; if the array page we need is not set there,
-	 * we need to extend the index to allocate the array page, and update the
-	 * metapage array.
-	 */
-	arrayBlkIdx = MAPBLK_TO_RMARRAY_BLK(mapBlk);
-	if (arrayBlkIdx > MAX_REVMAP_ARRAYPAGES)
-		elog(ERROR, "non-existant revmap array page requested");
+	/* Extend the revmap */
+	while (targetblk > rmAccess->lastRevmapPage)
+		rm_extend(rmAccess);
 
-	arrayBlk = rmAccess->revmapArrayPages[arrayBlkIdx];
-	if (arrayBlk == InvalidBlockNumber)
-	{
-		/* if not asked to extend, there's no further work to do here */
-		if (!extend)
-			return InvalidBlockNumber;
-
-		/*
-		 * If we need to create a new array page, check the metapage again;
-		 * someone might have created it after the last time we read the
-		 * metapage.  This time we acquire an exclusive lock, since we may need
-		 * to extend.  Lock before doing the physical relation extension, to
-		 * avoid leaving an unused page around in case someone does this
-		 * concurrently.  Note that, unfortunately, we will be keeping the lock
-		 * on the metapage alongside the relation extension lock, while doing a
-		 * syscall involving disk I/O.  Extending to add a new revmap array page
-		 * is fairly infrequent, so it shouldn't be too bad.
-		 *
-		 * XXX it is possible to extend the relation unconditionally before
-		 * locking the metapage, and later if we find that someone else had
-		 * already added this page, save the page in FSM as MaxFSMRequestSize.
-		 * That would be better for concurrency.  Explore someday.
-		 */
-		rmaccess_get_metapage(rmAccess, BUFFER_LOCK_EXCLUSIVE);
+	return targetblk;
+}
 
-		if (rmAccess->revmapArrayPages[arrayBlkIdx] == InvalidBlockNumber)
-		{
-			BlockNumber	newPgBlkno;
-
-			/*
-			 * Ok, definitely need to allocate a new revmap array page;
-			 * initialize a new page to the initial (empty) array revmap state
-			 * and register it in metapage.
-			 */
-			rmAccess->currArrayBuf = mm_getnewbuffer(rmAccess->idxrel);
-			START_CRIT_SECTION();
-			initialize_rma_page(rmAccess->currArrayBuf);
-			MarkBufferDirty(rmAccess->currArrayBuf);
-			if (RelationNeedsWAL(rmAccess->idxrel))
-			{
-				xl_minmax_init_rmpg	xlrec;
-				XLogRecPtr		recptr;
-				XLogRecData		rdata;
-
-				xlrec.node = rmAccess->idxrel->rd_node;
-				xlrec.blkno = BufferGetBlockNumber(rmAccess->currArrayBuf);
-				xlrec.array = true;
-				xlrec.logblk = InvalidBlockNumber;
-
-				rdata.data = (char *) &xlrec;
-				rdata.len = SizeOfMinmaxInitRmpg;
-				rdata.buffer = InvalidBuffer;	/* FIXME */
-				rdata.buffer_std = false;
-				rdata.next = NULL;
-
-				recptr = XLogInsert(RM_MINMAX_ID, XLOG_MINMAX_INIT_RMPG, &rdata);
-				PageSetLSN(BufferGetPage(rmAccess->currArrayBuf), recptr);
-			}
-			END_CRIT_SECTION();
-			LockBuffer(rmAccess->currArrayBuf, BUFFER_LOCK_UNLOCK);
-			newPgBlkno = BufferGetBlockNumber(rmAccess->currArrayBuf);
-			rmAccess->revmapArrayPages[arrayBlkIdx] = newPgBlkno;
+/*
+ * Extend the revmap by one page.
+ *
+ * If there is an existing minmax page at that block, it is atomically moved
+ * out of the way, and the redirect pointer on the new revmap page is set
+ * to point to its new location.
+ *
+ * If rmAccess->lastRevmapPage is out-of-date, it's updated and nothing else
+ * is done.
+ */
+static void
+rm_extend(mmRevmapAccess *rmAccess)
+{
+	Buffer		buf;
+	Page		page;
+	Page		metapage;
+	MinmaxMetaPageData *metadata;
+	BlockNumber	mapBlk;
+	BlockNumber nblocks;
+	Relation	irel = rmAccess->idxrel;
+	bool		needLock = !RELATION_IS_LOCAL(irel);
 
-			MINMAX_elog(DEBUG2, "allocated block for revmap array page: %u",
-				 BufferGetBlockNumber(rmAccess->currArrayBuf));
+	/*
+	 * Lock the metapage. This locks out concurrent extensions of the revmap,
+	 * but note that we still need to grab the relation extension lock because
+	 * another backend can still extend the index with regular minmax pages.
+	 */
+	LockBuffer(rmAccess->metaBuf, BUFFER_LOCK_EXCLUSIVE);
+	metapage = BufferGetPage(rmAccess->metaBuf);
+	metadata = (MinmaxMetaPageData *) PageGetContents(metapage);
 
-			/* Update the metapage to point to the new array page. */
-			update_minmax_metapg(rmAccess->idxrel, rmAccess->metaBuf, arrayBlkIdx,
-								 newPgBlkno);
-		}
+	/* Check that our cached lastRevmapPage value was up-to-date */
+	if (metadata->lastRevmapPage != rmAccess->lastRevmapPage)
+	{
+		rmAccess->lastRevmapPage = metadata->lastRevmapPage;
 
 		LockBuffer(rmAccess->metaBuf, BUFFER_LOCK_UNLOCK);
-		arrayBlk = rmAccess->revmapArrayPages[arrayBlkIdx];
+		return;
 	}
+	mapBlk = metadata->lastRevmapPage + 1;
 
-	/*
-	 * By here, we know the array page is set in the metapage array.  Read that
-	 * page; except that if we just allocated it, or we already hold pin on it,
-	 * we don't need to read it again.
-	 */
-	Assert(arrayBlk != InvalidBlockNumber);
-
-	if (rmAccess->currArrayBuf == InvalidBuffer ||
-		BufferGetBlockNumber(rmAccess->currArrayBuf) != arrayBlk)
+	nblocks = RelationGetNumberOfBlocks(irel);
+	if (mapBlk < nblocks)
 	{
-		if (rmAccess->currArrayBuf != InvalidBuffer)
-			ReleaseBuffer(rmAccess->currArrayBuf);
+		/* Check that the existing index block is sane. */
+		buf = ReadBuffer(rmAccess->idxrel, mapBlk);
+		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+		page = BufferGetPage(buf);
+	}
+	else
+	{
+		if (needLock)
+			LockRelationForExtension(irel, ExclusiveLock);
+
+		buf = ReadBuffer(irel, P_NEW);
+		Assert(BufferGetBlockNumber(buf) == mapBlk);
+		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+		page = BufferGetPage(buf);
 
-		rmAccess->currArrayBuf =
-			ReadBuffer(rmAccess->idxrel, arrayBlk);
+		if (needLock)
+			UnlockRelationForExtension(irel, ExclusiveLock);
 	}
 
-	LockBuffer(rmAccess->currArrayBuf, BUFFER_LOCK_SHARE);
+	/* Check that it's a regular block (or an empty page) */
+	if (!PageIsNew(page) && !MINMAX_IS_REGULAR_PAGE(page))
+		elog(ERROR, "unexpected minmax page type: 0x%04X",
+			 MINMAX_PAGE_TYPE(page));
 
-	/*
-	 * And now we can inspect its contents; if the target page is set, we can
-	 * just return.  Even if not set, we can also return if caller asked us not
-	 * to extend the revmap.
-	 */
-	contents = (RevmapArrayContents *)
-		PageGetContents(BufferGetPage(rmAccess->currArrayBuf));
-	revmapIdx = MAPBLK_TO_RMARRAY_INDEX(mapBlk);
-	if (!extend || revmapIdx <= contents->rma_nblocks - 1)
+	/* If the page is in use, evacuate it and restart */
+	if (mm_start_evacuating_page(rmAccess->idxrel, buf))
 	{
-		LockBuffer(rmAccess->currArrayBuf, BUFFER_LOCK_UNLOCK);
-
-		return contents->rma_blocks[revmapIdx];
+		LockBuffer(rmAccess->metaBuf, BUFFER_LOCK_UNLOCK);
+		mm_evacuate_page(rmAccess->idxrel, buf);
+		return;
 	}
 
 	/*
-	 * Trade our shared lock in the array page for exclusive, because we now
-	 * need to allocate one more revmap page and modify the array page.
+	 * Ok, we have now locked the metapage and the target block. Re-initialize
+	 * it as a revmap page.
 	 */
-	LockBuffer(rmAccess->currArrayBuf, BUFFER_LOCK_UNLOCK);
-	LockBuffer(rmAccess->currArrayBuf, BUFFER_LOCK_EXCLUSIVE);
-
-	contents = (RevmapArrayContents *)
-		PageGetContents(BufferGetPage(rmAccess->currArrayBuf));
+	START_CRIT_SECTION();
 
-	/*
-	 * If someone else already set the value while we were waiting for the
-	 * exclusive lock, we're done; otherwise, allocate a new block as the
-	 * new revmap page, and update the array page to point to it.
-	 */
-	if (contents->rma_blocks[revmapIdx] != InvalidBlockNumber)
-	{
-		targetblk = contents->rma_blocks[revmapIdx];
-	}
-	else
-	{
-		Buffer		newbuf;
-
-		/* not possible to get here if we weren't asked to extend */
-		Assert(extend);
-		newbuf = mm_getnewbuffer(rmAccess->idxrel);
-		START_CRIT_SECTION();
-		targetblk = initialize_rmr_page(newbuf, mapBlk);
-		MarkBufferDirty(newbuf);
-		if (RelationNeedsWAL(rmAccess->idxrel))
-		{
-			xl_minmax_init_rmpg	xlrec;
-			XLogRecPtr	recptr;
-			XLogRecData	rdata;
-
-			xlrec.node = rmAccess->idxrel->rd_node;
-			xlrec.blkno = BufferGetBlockNumber(newbuf);
-			xlrec.array = false;
-			xlrec.logblk = mapBlk;
-
-			rdata.data = (char *) &xlrec;
-			rdata.len = SizeOfMinmaxInitRmpg;
-			rdata.buffer = InvalidBuffer;
-			rdata.buffer_std = false;
-			rdata.next = NULL;
-
-			recptr = XLogInsert(RM_MINMAX_ID, XLOG_MINMAX_INIT_RMPG, &rdata);
-			PageSetLSN(BufferGetPage(newbuf), recptr);
-		}
-		END_CRIT_SECTION();
+	/* the rmr_tids array is initialized to all invalid by PageInit */
+	mm_page_init(page, MINMAX_PAGETYPE_REVMAP);
+	MarkBufferDirty(buf);
 
-		UnlockReleaseBuffer(newbuf);
+	metadata->lastRevmapPage = mapBlk;
+	MarkBufferDirty(rmAccess->metaBuf);
 
-		/*
-		 * Now make the revmap array page point to the newly allocated page.
-		 * If necessary, also update the total number of items in it.
-		 */
-		START_CRIT_SECTION();
+	if (RelationNeedsWAL(rmAccess->idxrel))
+	{
+		xl_minmax_revmap_extend xlrec;
+		XLogRecPtr	recptr;
+		XLogRecData	rdata;
 
-		contents->rma_blocks[revmapIdx] = targetblk;
-		if (contents->rma_nblocks < revmapIdx + 1)
-			contents->rma_nblocks = revmapIdx + 1;
-		MarkBufferDirty(rmAccess->currArrayBuf);
+		xlrec.node = rmAccess->idxrel->rd_node;
+		xlrec.targetBlk = mapBlk;
 
-		/* XLOG stuff */
-		if (RelationNeedsWAL(rmAccess->idxrel))
-		{
-			xl_minmax_rmarray_set	xlrec;
-			XLogRecPtr		recptr;
-			XLogRecData		rdata[2];
-			uint8			info;
-
-			info = XLOG_MINMAX_RMARRAY_SET;
-
-			xlrec.node = rmAccess->idxrel->rd_node;
-			xlrec.rmarray = BufferGetBlockNumber(rmAccess->currArrayBuf);
-			xlrec.blkidx = revmapIdx;
-			xlrec.newpg = targetblk;
-
-			rdata[0].data = (char *) &xlrec;
-			rdata[0].len = SizeOfMinmaxRmarraySet;
-			rdata[0].buffer = InvalidBuffer;
-			rdata[0].buffer_std = false;
-			rdata[0].next = &rdata[1];
-
-			rdata[1].data = NULL;
-			rdata[1].len = 0;
-			rdata[1].buffer = rmAccess->currArrayBuf;
-			rdata[1].buffer_std = false;
-			rdata[1].next = NULL;
-
-			recptr = XLogInsert(RM_MINMAX_ID, info, rdata);
-			PageSetLSN(BufferGetPage(rmAccess->currArrayBuf), recptr);
-		}
+		rdata.data = (char *) &xlrec;
+		rdata.len = SizeOfMinmaxRevmapExtend;
+		rdata.buffer = InvalidBuffer;
+		rdata.buffer_std = false;
+		rdata.next = NULL;
 
-		END_CRIT_SECTION();
+		recptr = XLogInsert(RM_MINMAX_ID, XLOG_MINMAX_REVMAP_EXTEND, &rdata);
+		PageSetLSN(metapage, recptr);
+		PageSetLSN(page, recptr);
 	}
 
-	LockBuffer(rmAccess->currArrayBuf, BUFFER_LOCK_UNLOCK);
+	END_CRIT_SECTION();
 
-	return targetblk;
+	LockBuffer(rmAccess->metaBuf, BUFFER_LOCK_UNLOCK);
+	UnlockReleaseBuffer(buf);
 }
 
 /*
@@ -604,17 +433,23 @@ mmGetMMTupleForHeapBlock(mmRevmapAccess *rmAccess, BlockNumber heapBlk,
 		}
 		LockBuffer(*buf, mode);
 		page = BufferGetPage(*buf);
-		lp = PageGetItemId(page, *off);
-		if (ItemIdIsUsed(lp))
-		{
-			mmtup = (MMTuple *) PageGetItem(page, lp);
 
-			if (mmtup->mt_blkno == heapBlk)
+		/* If we land on a revmap page, start over */
+		if (MINMAX_IS_REGULAR_PAGE(page))
+		{
+			lp = PageGetItemId(page, *off);
+			if (ItemIdIsUsed(lp))
 			{
-				/* found it! */
-				return mmtup;
+				mmtup = (MMTuple *) PageGetItem(page, lp);
+
+				if (mmtup->mt_blkno == heapBlk)
+				{
+					/* found it! */
+					return mmtup;
+				}
 			}
 		}
+
 		/*
 		 * No luck. Assume that the revmap was updated concurrently.
 		 *
@@ -627,106 +462,3 @@ mmGetMMTupleForHeapBlock(mmRevmapAccess *rmAccess, BlockNumber heapBlk,
 	/* not reached, but keep compiler quiet */
 	return NULL;
 }
-
-/*
- * Initialize the revmap of a new minmax index.
- *
- * NB -- caller is assumed to WAL-log this operation
- */
-void
-mmRevmapCreate(Relation idxrel)
-{
-	Buffer		buf;
-
-	/*
-	 * The first page of the revmap is always stored in block number 1 of the
-	 * main fork.  Because of this, the only thing we need to do is request
-	 * a new page; we assume we are called immediately after the metapage has
-	 * been initialized.
-	 */
-	buf = mm_getnewbuffer(idxrel);
-	Assert(BufferGetBlockNumber(buf) == 1);
-
-	mm_page_init(BufferGetPage(buf), MINMAX_PAGETYPE_REVMAP);
-	MarkBufferDirty(buf);
-
-	UnlockReleaseBuffer(buf);
-}
-
-/*
- * Initialize a new regular revmap page, which stores the given revmap logical
- * page number.  The newly allocated physical block number is returned.
- *
- * Used both by regular code path as well as during xlog replay.
- */
-BlockNumber
-initialize_rmr_page(Buffer newbuf, BlockNumber mapBlk)
-{
-	BlockNumber	blkno;
-	Page		page;
-	RevmapContents *contents;
-
-	page = BufferGetPage(newbuf);
-
-	mm_page_init(page, MINMAX_PAGETYPE_REVMAP);
-	contents = (RevmapContents *) PageGetContents(page);
-	contents->rmr_logblk = mapBlk;
-	/* the rmr_tids array is initialized to all invalid by PageInit */
-
-	blkno = BufferGetBlockNumber(newbuf);
-
-	return blkno;
-}
-
-/*
- * Given a buffer (hopefully containing a blank page), set it up as a revmap
- * array page.
- *
- * Used both by regular code path as well as during xlog replay.
- */
-void
-initialize_rma_page(Buffer buf)
-{
-	Page	arrayPg;
-	RevmapArrayContents *contents;
-
-	arrayPg = BufferGetPage(buf);
-	mm_page_init(arrayPg, MINMAX_PAGETYPE_REVMAP_ARRAY);
-	contents = (RevmapArrayContents *) PageGetContents(arrayPg);
-	contents->rma_nblocks = 0;
-	/* set the whole array to InvalidBlockNumber */
-	memset(contents->rma_blocks, 0xFF,
-		   sizeof(BlockNumber) * ARRAY_REVMAP_PAGE_MAXITEMS);
-}
-
-/*
- * Return an exclusively-locked buffer resulting from extending the relation.
- */
-static Buffer
-mm_getnewbuffer(Relation irel)
-{
-	Buffer	buffer;
-	bool	needLock = !RELATION_IS_LOCAL(irel);
-
-	/*
-	 * XXX As a possible improvement, we could request a blank page to the FSM
-	 * here.  Such pages could get inserted into the FSM if, for instance, two
-	 * processes extend the relation concurrently to add one more page to the
-	 * revmap and the second one discovers it doesn't actually need the page it
-	 * got.
-	 */
-
-	if (needLock)
-		LockRelationForExtension(irel, ExclusiveLock);
-
-	buffer = ReadBuffer(irel, P_NEW);
-	LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
-
-	MINMAX_elog(DEBUG2, "mm_getnewbuffer: extending to page %u",
-				BufferGetBlockNumber(buffer));
-
-	if (needLock)
-		UnlockRelationForExtension(irel, ExclusiveLock);
-
-	return buffer;
-}
diff --git a/src/backend/access/minmax/mmtuple.c b/src/backend/access/minmax/mmtuple.c
index 2e5aac5..b203b3a 100644
--- a/src/backend/access/minmax/mmtuple.c
+++ b/src/backend/access/minmax/mmtuple.c
@@ -256,7 +256,7 @@ minmax_copy_tuple(MMTuple *tuple, Size len)
 }
 
 bool
-minmax_tuples_equal(MMTuple *a, Size alen, MMTuple *b, Size blen)
+minmax_tuples_equal(const MMTuple *a, Size alen, const MMTuple *b, Size blen)
 {
 	if (alen != blen)
 		return false;
diff --git a/src/backend/access/minmax/mmxlog.c b/src/backend/access/minmax/mmxlog.c
index ab3f9fe..5690ceb 100644
--- a/src/backend/access/minmax/mmxlog.c
+++ b/src/backend/access/minmax/mmxlog.c
@@ -246,84 +246,54 @@ minmax_xlog_samepage_update(XLogRecPtr lsn, XLogRecord *record)
 
 
 static void
-minmax_xlog_metapg_set(XLogRecPtr lsn, XLogRecord *record)
+minmax_xlog_revmap_extend(XLogRecPtr lsn, XLogRecord *record)
 {
-	xl_minmax_metapg_set *xlrec = (xl_minmax_metapg_set *) XLogRecGetData(record);
-	Buffer	meta;
+	xl_minmax_revmap_extend *xlrec = (xl_minmax_revmap_extend *) XLogRecGetData(record);
+	Buffer	metabuf;
 	Page	metapg;
 	MinmaxMetaPageData *metadata;
+	Buffer	buf;
+	Page	page;
 
-	/* If we have a full-page image, restore it and we're done */
-	if (record->xl_info & XLR_BKP_BLOCK(0))
-	{
-		(void) RestoreBackupBlock(lsn, record, 0, false, false);
-		return;
-	}
-
-	meta = XLogReadBuffer(xlrec->node, MINMAX_METAPAGE_BLKNO, false);
-	Assert(BufferIsValid(meta));
-
-	metapg = BufferGetPage(meta);
-	metadata = (MinmaxMetaPageData *) PageGetContents(metapg);
-	metadata->revmapArrayPages[xlrec->blkidx] = xlrec->newpg;
-
-	PageSetLSN(metapg, lsn);
-	MarkBufferDirty(meta);
-	UnlockReleaseBuffer(meta);
-}
-
-static void
-minmax_xlog_init_rmpg(XLogRecPtr lsn, XLogRecord *record)
-{
-	xl_minmax_init_rmpg *xlrec = (xl_minmax_init_rmpg *) XLogRecGetData(record);
-	Buffer		buffer;
-
+	/* Update the metapage; a full-page image, if present, replaces it whole */
 	if (record->xl_info & XLR_BKP_BLOCK(0))
 	{
-		(void) RestoreBackupBlock(lsn, record, 0, false, false);
-		return;
+		metabuf = RestoreBackupBlock(lsn, record, 0, false, true);
 	}
-
-	buffer = XLogReadBuffer(xlrec->node, xlrec->blkno, true);
-	Assert(BufferIsValid(buffer));
-
-	if (xlrec->array)
-		initialize_rma_page(buffer);
 	else
-		initialize_rmr_page(buffer, xlrec->logblk);
-
-	PageSetLSN(BufferGetPage(buffer), lsn);
-	MarkBufferDirty(buffer);
-	UnlockReleaseBuffer(buffer);
-}
+	{
+		metabuf = XLogReadBuffer(xlrec->node, MINMAX_METAPAGE_BLKNO, false);
+		if (BufferIsValid(metabuf))
+		{
+			metapg = BufferGetPage(metabuf);
+			if (lsn > PageGetLSN(metapg))
+			{
+				metadata = (MinmaxMetaPageData *) PageGetContents(metapg);
 
-static void
-minmax_xlog_rmarray_set(XLogRecPtr lsn, XLogRecord *record)
-{
-	xl_minmax_rmarray_set *xlrec = (xl_minmax_rmarray_set *) XLogRecGetData(record);
-	Buffer	buffer;
-	Page	page;
-	RevmapArrayContents *contents;
+				Assert(metadata->lastRevmapPage == xlrec->targetBlk - 1);
+				metadata->lastRevmapPage = xlrec->targetBlk;
 
-	/* If we have a full-page image, restore it and we're done */
-	if (record->xl_info & XLR_BKP_BLOCK(0))
-	{
-		(void) RestoreBackupBlock(lsn, record, 0, false, false);
-		return;
+				PageSetLSN(metapg, lsn);
+				MarkBufferDirty(metabuf);
+			}
+		}
 	}
 
-	buffer = XLogReadBuffer(xlrec->node, xlrec->rmarray, false);
-	Assert(BufferIsValid(buffer));
+	/* Re-init the target block as a revmap page */
 
-	page = BufferGetPage(buffer);
-
-	contents = (RevmapArrayContents *) PageGetContents(page);
-	contents->rma_blocks[xlrec->blkidx] = xlrec->newpg;
-	contents->rma_nblocks = xlrec->blkidx + 1;	/* XXX is this okay? */
+	buf = XLogReadBuffer(xlrec->node, xlrec->targetBlk, true);
+	page = (Page) BufferGetPage(buf);
+	mm_page_init(page, MINMAX_PAGETYPE_REVMAP);
 
 	PageSetLSN(page, lsn);
-	MarkBufferDirty(buffer);
-	UnlockReleaseBuffer(buffer);
+	MarkBufferDirty(buf);
+	UnlockReleaseBuffer(buf);
+
+	/*
+	 * The metapage was already handled above; metabuf can be invalid here.
+	 */
+	if (BufferIsValid(metabuf))
+		UnlockReleaseBuffer(metabuf);
 }
 
 void
@@ -345,14 +315,8 @@ minmax_redo(XLogRecPtr lsn, XLogRecord *record)
 		case XLOG_MINMAX_SAMEPAGE_UPDATE:
 			minmax_xlog_samepage_update(lsn, record);
 			break;
-		case XLOG_MINMAX_METAPG_SET:
-			minmax_xlog_metapg_set(lsn, record);
-			break;
-		case XLOG_MINMAX_RMARRAY_SET:
-			minmax_xlog_rmarray_set(lsn, record);
-			break;
-		case XLOG_MINMAX_INIT_RMPG:
-			minmax_xlog_init_rmpg(lsn, record);
+		case XLOG_MINMAX_REVMAP_EXTEND:
+			minmax_xlog_revmap_extend(lsn, record);
 			break;
 		default:
 			elog(PANIC, "minmax_redo: unknown op code %u", info);
diff --git a/src/include/access/minmax_internal.h b/src/include/access/minmax_internal.h
index 47ed279..c206168 100644
--- a/src/include/access/minmax_internal.h
+++ b/src/include/access/minmax_internal.h
@@ -87,5 +87,7 @@ extern void minmax_free_mmdesc(MinmaxDesc *mmdesc);
 extern void mm_page_init(Page page, uint16 type);
 extern void mm_metapage_init(Page page, BlockNumber pagesPerRange,
 				 uint16 version);
+extern bool mm_start_evacuating_page(Relation idxRel, Buffer buf);
+extern void mm_evacuate_page(Relation idxRel, Buffer buf);
 
 #endif   /* MINMAX_INTERNAL_H */
diff --git a/src/include/access/minmax_page.h b/src/include/access/minmax_page.h
index 04f40d8..df7f940 100644
--- a/src/include/access/minmax_page.h
+++ b/src/include/access/minmax_page.h
@@ -19,13 +19,21 @@
 
 /* special space on all minmax pages stores a "type" identifier */
 #define		MINMAX_PAGETYPE_META			0xF091
-#define		MINMAX_PAGETYPE_REVMAP_ARRAY	0xF092
-#define		MINMAX_PAGETYPE_REVMAP			0xF093
-#define		MINMAX_PAGETYPE_REGULAR			0xF094
+#define		MINMAX_PAGETYPE_REVMAP			0xF092
+#define		MINMAX_PAGETYPE_REGULAR			0xF093
+
+#define MINMAX_PAGE_TYPE(page) 	\
+	(((MinmaxSpecialSpace *) PageGetSpecialPointer(page))->type)
+#define MINMAX_IS_REVMAP_PAGE(page) (MINMAX_PAGE_TYPE(page) == MINMAX_PAGETYPE_REVMAP)
+#define MINMAX_IS_REGULAR_PAGE(page) (MINMAX_PAGE_TYPE(page) == MINMAX_PAGETYPE_REGULAR)
+
+/* flags */
+#define		MINMAX_EVACUATE_PAGE			1
 
 typedef struct MinmaxSpecialSpace
 {
-	uint16	type;
+	uint16		flags;
+	uint16		type;
 } MinmaxSpecialSpace;
 
 /* Metapage definitions */
@@ -34,30 +42,18 @@ typedef struct MinmaxMetaPageData
 	uint32	minmaxMagic;
 	uint32	minmaxVersion;
 	BlockNumber	pagesPerRange;
-	BlockNumber revmapArrayPages[1];	/* actually MAX_REVMAP_ARRAYPAGES */
+	BlockNumber lastRevmapPage;
 } MinmaxMetaPageData;
 
-/*
- * Number of array pages listed in metapage.  Need to consider leaving enough
- * space for the page header, the metapage struct, and the minmax special
- * space.
- */
-#define MAX_REVMAP_ARRAYPAGES	\
-	((BLCKSZ - \
-	  MAXALIGN(SizeOfPageHeaderData) - \
-	  offsetof(MinmaxMetaPageData, revmapArrayPages) - \
-	  MAXALIGN(sizeof(MinmaxSpecialSpace)) ) / \
-	 sizeof(BlockNumber))
-
 #define MINMAX_CURRENT_VERSION		1
 #define MINMAX_META_MAGIC			0xA8109CFA
 
-#define MINMAX_METAPAGE_BLKNO	0
+#define MINMAX_METAPAGE_BLKNO		0
+#define MINMAX_REVMAP_FIRST_BLKNO	1
 
 /* Definitions for regular revmap pages */
 typedef struct RevmapContents
 {
-	int32	rmr_logblk;			/* logical blkno of this revmap page */
 	ItemPointerData rmr_tids[1];	/* really REGULAR_REVMAP_PAGE_MAXITEMS */
 } RevmapContents;
 
@@ -69,20 +65,4 @@ typedef struct RevmapContents
 #define REGULAR_REVMAP_PAGE_MAXITEMS \
 	(REGULAR_REVMAP_CONTENT_SIZE / sizeof(ItemPointerData))
 
-/* Definitions for array revmap pages */
-typedef struct RevmapArrayContents
-{
-	int32	rma_nblocks;
-	BlockNumber	rma_blocks[1];	/* really ARRAY_REVMAP_PAGE_MAXITEMS */
-} RevmapArrayContents;
-
-#define REVMAP_ARRAY_CONTENT_SIZE \
-	(BLCKSZ - MAXALIGN(SizeOfPageHeaderData) - \
-	 offsetof(RevmapArrayContents, rma_blocks) - \
-	 MAXALIGN(sizeof(MinmaxSpecialSpace)))
-/* max num of items in the array */
-#define ARRAY_REVMAP_PAGE_MAXITEMS \
-	(REVMAP_ARRAY_CONTENT_SIZE / sizeof(BlockNumber))
-
-
 #endif		/* MINMAX_PAGE_H */
diff --git a/src/include/access/minmax_revmap.h b/src/include/access/minmax_revmap.h
index 68729d8..73c6cd4 100644
--- a/src/include/access/minmax_revmap.h
+++ b/src/include/access/minmax_revmap.h
@@ -33,9 +33,4 @@ extern MMTuple *mmGetMMTupleForHeapBlock(mmRevmapAccess *rmAccess,
 						 BlockNumber heapBlk, Buffer *buf, OffsetNumber *off,
 						 int mode);
 
-/* internal stuff also used by xlog replay */
-extern BlockNumber initialize_rmr_page(Buffer newbuf, BlockNumber mapBlk);
-extern void initialize_rma_page(Buffer buf);
-
-
 #endif   /* MINMAX_REVMAP_H */
diff --git a/src/include/access/minmax_tuple.h b/src/include/access/minmax_tuple.h
index 989a179..eff4d52 100644
--- a/src/include/access/minmax_tuple.h
+++ b/src/include/access/minmax_tuple.h
@@ -77,8 +77,9 @@ typedef struct MMTuple
 extern MMTuple *minmax_form_tuple(MinmaxDesc *mmdesc, BlockNumber blkno,
 				  DeformedMMTuple *tuple, Size *size);
 extern void minmax_free_tuple(MMTuple *tuple);
-MMTuple *minmax_copy_tuple(MMTuple *tuple, Size len);
-extern bool minmax_tuples_equal(MMTuple *a, Size alen, MMTuple *b, Size blen);
+extern MMTuple *minmax_copy_tuple(MMTuple *tuple, Size len);
+extern bool minmax_tuples_equal(const MMTuple *a, Size alen,
+					const MMTuple *b, Size blen);
 
 extern DeformedMMTuple *minmax_new_dtuple(MinmaxDesc *mmdesc);
 extern void minmax_dtuple_initialize(DeformedMMTuple *dtuple,
diff --git a/src/include/access/minmax_xlog.h b/src/include/access/minmax_xlog.h
index 00d3425..01bb065 100644
--- a/src/include/access/minmax_xlog.h
+++ b/src/include/access/minmax_xlog.h
@@ -31,9 +31,8 @@
 #define XLOG_MINMAX_INSERT			0x10
 #define XLOG_MINMAX_UPDATE			0x20
 #define XLOG_MINMAX_SAMEPAGE_UPDATE	0x30
-#define XLOG_MINMAX_METAPG_SET		0x40
-#define XLOG_MINMAX_RMARRAY_SET		0x50
-#define XLOG_MINMAX_INIT_RMPG		0x60
+#define XLOG_MINMAX_REVMAP_EXTEND	0x40
+#define XLOG_MINMAX_REVMAP_VACUUM	0x50
 
 #define XLOG_MINMAX_OPMASK			0x70
 /*
@@ -90,39 +89,14 @@ typedef struct xl_minmax_samepage_update
 
 #define SizeOfMinmaxSamepageUpdate		(offsetof(xl_minmax_samepage_update, tid) + sizeof(ItemPointerData))
 
-/* This is what we need to know about a "metapage set" operation */
-typedef struct xl_minmax_metapg_set
+/* This is what we need to know about a revmap extension */
+typedef struct xl_minmax_revmap_extend
 {
 	RelFileNode		node;
-	uint32			blkidx;
-	BlockNumber		newpg;
-} xl_minmax_metapg_set;
+	BlockNumber		targetBlk;
+} xl_minmax_revmap_extend;
 
-#define SizeOfMinmaxMetapgSet	(offsetof(xl_minmax_metapg_set, newpg) + \
-								 sizeof(BlockNumber))
-
-/* This is what we need to know about a "revmap array set" operation */
-typedef struct xl_minmax_rmarray_set
-{
-	RelFileNode		node;
-	BlockNumber		rmarray;
-	uint32			blkidx;
-	BlockNumber		newpg;
-} xl_minmax_rmarray_set;
-
-#define SizeOfMinmaxRmarraySet	(offsetof(xl_minmax_rmarray_set, newpg) + \
-								 sizeof(BlockNumber))
-
-/* This is what we need to know when we initialize a new revmap page */
-typedef struct xl_minmax_init_rmpg
-{
-	RelFileNode		node;
-	bool			array;	/* array revmap page or regular revmap page */
-	BlockNumber		blkno;
-	BlockNumber		logblk;	/* only used by regular revmap pages */
-} xl_minmax_init_rmpg;
-
-#define SizeOfMinmaxInitRmpg	(offsetof(xl_minmax_init_rmpg, blkno) + \
+#define SizeOfMinmaxRevmapExtend	(offsetof(xl_minmax_revmap_extend, targetBlk) + \
 								 sizeof(BlockNumber))
 
 
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to