Rebased to appease cfbot.

I ran these patches under a branch which shows code coverage in cirrus.  It
looks good to my eyes.
https://api.cirrus-ci.com/v1/artifact/task/5212346552418304/coverage/coverage/00-index.html

Are these patches being considered for v15?
From 30707da3e5eb68d1efbc5594696da47ad7f72bab Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplage...@gmail.com>
Date: Tue, 8 Feb 2022 19:01:18 -0500
Subject: [PATCH 1/4] Add unbuffered IO API

Wrap unbuffered extends and writes in a new API, directmgr.

When writing data outside of shared buffers, the backend must do a
series of steps to ensure the data is both durable and recoverable.

When writing or extending a page of data for a WAL-logged table fork,
the backend must log, checksum (if the page is not empty), and write out
the page before moving on.

Additionally, the backend must fsync the page data itself to ensure it
reaches permanent storage, since the checkpointer is unaware of the
write and could move the Redo pointer past the associated WAL for this
write/extend before the data has been fsync'd.

This API is also used for non-WAL-logged and non-self-fsync'd table
forks, skipping whichever of the above steps do not apply.

This commit introduces no functional change. It replaces all current
callers of smgrimmedsync(), smgrextend(), and smgrwrite() with the
equivalent directmgr functions. Consolidating these steps makes IO
outside of shared buffers less error-prone.
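
To make the intended call pattern concrete, here is a minimal sketch for a
WAL-logged fork whose writer fsyncs the data itself (the wrapper function and
its arguments are illustrative only; the directmgr signatures are the ones
added by this patch):

    static void
    example_write_outside_shared_buffers(SMgrRelation smgrrel, BlockNumber blkno,
                                         Page page, bool use_wal)
    {
        UnBufferedWriteState wstate;

        /* fsync_self = true, request_fsync = false: we will fsync ourselves */
        unbuffered_prep(&wstate, true, false);

        /* checksums, writes, and (if use_wal) WAL-logs the page */
        unbuffered_write(&wstate, use_wal, smgrrel, MAIN_FORKNUM, blkno, page);

        /* fsync the fork, since a checkpoint may have moved the redo pointer */
        unbuffered_finish(&wstate, smgrrel, MAIN_FORKNUM);
    }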
---
 src/backend/access/gist/gistbuild.c       | 36 +++++++----
 src/backend/access/hash/hashpage.c        | 18 +++---
 src/backend/access/heap/heapam_handler.c  | 15 +++--
 src/backend/access/heap/rewriteheap.c     | 53 +++++----------
 src/backend/access/heap/visibilitymap.c   | 10 ++-
 src/backend/access/nbtree/nbtree.c        | 18 ++----
 src/backend/access/nbtree/nbtsort.c       | 56 ++++++----------
 src/backend/access/spgist/spginsert.c     | 39 ++++-------
 src/backend/catalog/storage.c             | 30 +++------
 src/backend/storage/Makefile              |  2 +-
 src/backend/storage/direct/Makefile       | 17 +++++
 src/backend/storage/direct/directmgr.c    | 79 +++++++++++++++++++++++
 src/backend/storage/freespace/freespace.c | 14 ++--
 src/include/catalog/storage.h             |  1 +
 src/include/storage/directmgr.h           | 57 ++++++++++++++++
 15 files changed, 276 insertions(+), 169 deletions(-)
 create mode 100644 src/backend/storage/direct/Makefile
 create mode 100644 src/backend/storage/direct/directmgr.c
 create mode 100644 src/include/storage/directmgr.h

diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c
index e081e6571a4..8fabc2a42d7 100644
--- a/src/backend/access/gist/gistbuild.c
+++ b/src/backend/access/gist/gistbuild.c
@@ -43,6 +43,7 @@
 #include "miscadmin.h"
 #include "optimizer/optimizer.h"
 #include "storage/bufmgr.h"
+#include "storage/directmgr.h"
 #include "storage/smgr.h"
 #include "utils/memutils.h"
 #include "utils/rel.h"
@@ -91,6 +92,7 @@ typedef struct
 
 	int64		indtuples;		/* number of tuples indexed */
 
+	UnBufferedWriteState ub_wstate;
 	/*
 	 * Extra data structures used during a buffering build. 'gfbb' contains
 	 * information related to managing the build buffers. 'parentMap' is a
@@ -409,14 +411,16 @@ gist_indexsortbuild(GISTBuildState *state)
 	state->pages_allocated = 0;
 	state->pages_written = 0;
 	state->ready_num_pages = 0;
+	unbuffered_prep(&state->ub_wstate, false, false);
 
 	/*
 	 * Write an empty page as a placeholder for the root page. It will be
 	 * replaced with the real root page at the end.
 	 */
 	page = palloc0(BLCKSZ);
-	smgrextend(RelationGetSmgr(state->indexrel), MAIN_FORKNUM, GIST_ROOT_BLKNO,
-			   page, true);
+	unbuffered_extend(&state->ub_wstate, false,
+			RelationGetSmgr(state->indexrel), MAIN_FORKNUM, GIST_ROOT_BLKNO,
+			page, true);
 	state->pages_allocated++;
 	state->pages_written++;
 
@@ -458,12 +462,13 @@ gist_indexsortbuild(GISTBuildState *state)
 
 	/* Write out the root */
 	PageSetLSN(levelstate->pages[0], GistBuildLSN);
-	PageSetChecksumInplace(levelstate->pages[0], GIST_ROOT_BLKNO);
-	smgrwrite(RelationGetSmgr(state->indexrel), MAIN_FORKNUM, GIST_ROOT_BLKNO,
-			  levelstate->pages[0], true);
-	if (RelationNeedsWAL(state->indexrel))
-		log_newpage(&state->indexrel->rd_node, MAIN_FORKNUM, GIST_ROOT_BLKNO,
-					levelstate->pages[0], true);
+
+	unbuffered_write(&state->ub_wstate, RelationNeedsWAL(state->indexrel),
+			RelationGetSmgr(state->indexrel), MAIN_FORKNUM, GIST_ROOT_BLKNO,
+			  levelstate->pages[0]);
+
+	unbuffered_finish(&state->ub_wstate, RelationGetSmgr(state->indexrel),
+			MAIN_FORKNUM);
 
 	pfree(levelstate->pages[0]);
 	pfree(levelstate);
@@ -645,6 +650,8 @@ gist_indexsortbuild_flush_ready_pages(GISTBuildState *state)
 	if (state->ready_num_pages == 0)
 		return;
 
+	unbuffered_prep(&state->ub_wstate, false, false);
+
 	for (int i = 0; i < state->ready_num_pages; i++)
 	{
 		Page		page = state->ready_pages[i];
@@ -655,9 +662,13 @@ gist_indexsortbuild_flush_ready_pages(GISTBuildState *state)
 			elog(ERROR, "unexpected block number to flush GiST sorting build");
 
 		PageSetLSN(page, GistBuildLSN);
-		PageSetChecksumInplace(page, blkno);
-		smgrextend(RelationGetSmgr(state->indexrel), MAIN_FORKNUM, blkno, page,
-				   true);
+
+		/*
+		 * These pages will be WAL-logged below.
+		 */
+		unbuffered_extend(&state->ub_wstate, false,
+				RelationGetSmgr(state->indexrel), MAIN_FORKNUM, blkno, page,
+				false);
 
 		state->pages_written++;
 	}
@@ -666,6 +677,9 @@ gist_indexsortbuild_flush_ready_pages(GISTBuildState *state)
 		log_newpages(&state->indexrel->rd_node, MAIN_FORKNUM, state->ready_num_pages,
 					 state->ready_blknos, state->ready_pages, true);
 
+	unbuffered_finish(&state->ub_wstate, RelationGetSmgr(state->indexrel),
+			MAIN_FORKNUM);
+
 	for (int i = 0; i < state->ready_num_pages; i++)
 		pfree(state->ready_pages[i]);
 
diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c
index 28c5297a1dc..6096604438e 100644
--- a/src/backend/access/hash/hashpage.c
+++ b/src/backend/access/hash/hashpage.c
@@ -33,6 +33,7 @@
 #include "access/xloginsert.h"
 #include "miscadmin.h"
 #include "port/pg_bitutils.h"
+#include "storage/directmgr.h"
 #include "storage/lmgr.h"
 #include "storage/predicate.h"
 #include "storage/smgr.h"
@@ -991,6 +992,7 @@ _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks)
 	PGAlignedBlock zerobuf;
 	Page		page;
 	HashPageOpaque ovflopaque;
+	UnBufferedWriteState ub_wstate;
 
 	lastblock = firstblock + nblocks - 1;
 
@@ -1001,6 +1003,8 @@ _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks)
 	if (lastblock < firstblock || lastblock == InvalidBlockNumber)
 		return false;
 
+	unbuffered_prep(&ub_wstate, false, true);
+
 	page = (Page) zerobuf.data;
 
 	/*
@@ -1018,16 +1022,10 @@ _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks)
 	ovflopaque->hasho_flag = LH_UNUSED_PAGE;
 	ovflopaque->hasho_page_id = HASHO_PAGE_ID;
 
-	if (RelationNeedsWAL(rel))
-		log_newpage(&rel->rd_node,
-					MAIN_FORKNUM,
-					lastblock,
-					zerobuf.data,
-					true);
-
-	PageSetChecksumInplace(page, lastblock);
-	smgrextend(RelationGetSmgr(rel), MAIN_FORKNUM, lastblock, zerobuf.data,
-			   false);
+	unbuffered_extend(&ub_wstate, RelationNeedsWAL(rel), RelationGetSmgr(rel),
+			MAIN_FORKNUM, lastblock, zerobuf.data, false);
+
+	unbuffered_finish(&ub_wstate, RelationGetSmgr(rel), MAIN_FORKNUM);
 
 	return true;
 }
diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index 39ef8a0b77d..9fd6a6f4474 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -38,6 +38,7 @@
 #include "pgstat.h"
 #include "storage/bufmgr.h"
 #include "storage/bufpage.h"
+#include "storage/directmgr.h"
 #include "storage/lmgr.h"
 #include "storage/predicate.h"
 #include "storage/procarray.h"
@@ -575,7 +576,9 @@ heapam_relation_set_new_filenode(Relation rel,
 								 MultiXactId *minmulti)
 {
 	SMgrRelation srel;
+	UnBufferedWriteState ub_wstate;
 
+	unbuffered_prep(&ub_wstate, true, false);
 	/*
 	 * Initialize to the minimum XID that could put tuples in the table. We
 	 * know that no xacts older than RecentXmin are still running, so that
@@ -597,12 +600,10 @@ heapam_relation_set_new_filenode(Relation rel,
 
 	/*
 	 * If required, set up an init fork for an unlogged table so that it can
-	 * be correctly reinitialized on restart.  An immediate sync is required
-	 * even if the page has been logged, because the write did not go through
-	 * shared_buffers and therefore a concurrent checkpoint may have moved the
-	 * redo pointer past our xlog record.  Recovery may as well remove it
-	 * while replaying, for example, XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE
-	 * record. Therefore, logging is necessary even if wal_level=minimal.
+	 * be correctly reinitialized on restart.
+	 * Recovery may as well remove our xlog record while replaying, for
+	 * example, XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE record. Therefore,
+	 * logging is necessary even if wal_level=minimal.
 	 */
 	if (persistence == RELPERSISTENCE_UNLOGGED)
 	{
@@ -611,7 +612,7 @@ heapam_relation_set_new_filenode(Relation rel,
 			   rel->rd_rel->relkind == RELKIND_TOASTVALUE);
 		smgrcreate(srel, INIT_FORKNUM, false);
 		log_smgrcreate(newrnode, INIT_FORKNUM);
-		smgrimmedsync(srel, INIT_FORKNUM);
+		unbuffered_finish(&ub_wstate, srel, INIT_FORKNUM);
 	}
 
 	smgrclose(srel);
diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c
index 2a53826736e..12bdd6ff601 100644
--- a/src/backend/access/heap/rewriteheap.c
+++ b/src/backend/access/heap/rewriteheap.c
@@ -119,6 +119,7 @@
 #include "replication/logical.h"
 #include "replication/slot.h"
 #include "storage/bufmgr.h"
+#include "storage/directmgr.h"
 #include "storage/fd.h"
 #include "storage/procarray.h"
 #include "storage/smgr.h"
@@ -152,6 +153,7 @@ typedef struct RewriteStateData
 	HTAB	   *rs_old_new_tid_map; /* unmatched B tuples */
 	HTAB	   *rs_logical_mappings;	/* logical remapping files */
 	uint32		rs_num_rewrite_mappings;	/* # in memory mappings */
+	UnBufferedWriteState rs_unbuffered_wstate;
 }			RewriteStateData;
 
 /*
@@ -265,6 +267,9 @@ begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xm
 	state->rs_cutoff_multi = cutoff_multi;
 	state->rs_cxt = rw_cxt;
 
+	unbuffered_prep(&state->rs_unbuffered_wstate,
+			RelationNeedsWAL(state->rs_new_rel), false);
+
 	/* Initialize hash tables used to track update chains */
 	hash_ctl.keysize = sizeof(TidHashKey);
 	hash_ctl.entrysize = sizeof(UnresolvedTupData);
@@ -317,28 +322,14 @@ end_heap_rewrite(RewriteState state)
 	/* Write the last page, if any */
 	if (state->rs_buffer_valid)
 	{
-		if (RelationNeedsWAL(state->rs_new_rel))
-			log_newpage(&state->rs_new_rel->rd_node,
-						MAIN_FORKNUM,
-						state->rs_blockno,
-						state->rs_buffer,
-						true);
-
-		PageSetChecksumInplace(state->rs_buffer, state->rs_blockno);
-
-		smgrextend(RelationGetSmgr(state->rs_new_rel), MAIN_FORKNUM,
-				   state->rs_blockno, (char *) state->rs_buffer, true);
+		unbuffered_extend(&state->rs_unbuffered_wstate,
+				RelationNeedsWAL(state->rs_new_rel),
+				RelationGetSmgr(state->rs_new_rel), MAIN_FORKNUM,
+				state->rs_blockno, state->rs_buffer, false);
 	}
 
-	/*
-	 * When we WAL-logged rel pages, we must nonetheless fsync them.  The
-	 * reason is the same as in storage.c's RelationCopyStorage(): we're
-	 * writing data that's not in shared buffers, and so a CHECKPOINT
-	 * occurring during the rewriteheap operation won't have fsync'd data we
-	 * wrote before the checkpoint.
-	 */
-	if (RelationNeedsWAL(state->rs_new_rel))
-		smgrimmedsync(RelationGetSmgr(state->rs_new_rel), MAIN_FORKNUM);
+	unbuffered_finish(&state->rs_unbuffered_wstate,
+			RelationGetSmgr(state->rs_new_rel), MAIN_FORKNUM);
 
 	logical_end_heap_rewrite(state);
 
@@ -676,24 +667,10 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
 			 * contains a tuple.  Hence, unlike RelationGetBufferForTuple(),
 			 * enforce saveFreeSpace unconditionally.
 			 */
-
-			/* XLOG stuff */
-			if (RelationNeedsWAL(state->rs_new_rel))
-				log_newpage(&state->rs_new_rel->rd_node,
-							MAIN_FORKNUM,
-							state->rs_blockno,
-							page,
-							true);
-
-			/*
-			 * Now write the page. We say skipFsync = true because there's no
-			 * need for smgr to schedule an fsync for this write; we'll do it
-			 * ourselves in end_heap_rewrite.
-			 */
-			PageSetChecksumInplace(page, state->rs_blockno);
-
-			smgrextend(RelationGetSmgr(state->rs_new_rel), MAIN_FORKNUM,
-					   state->rs_blockno, (char *) page, true);
+			unbuffered_extend(&state->rs_unbuffered_wstate,
+					RelationNeedsWAL(state->rs_new_rel),
+					RelationGetSmgr(state->rs_new_rel), MAIN_FORKNUM,
+					state->rs_blockno, page, false);
 
 			state->rs_blockno++;
 			state->rs_buffer_valid = false;
diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c
index e09f25a684c..897de5ec1fc 100644
--- a/src/backend/access/heap/visibilitymap.c
+++ b/src/backend/access/heap/visibilitymap.c
@@ -93,6 +93,7 @@
 #include "miscadmin.h"
 #include "port/pg_bitutils.h"
 #include "storage/bufmgr.h"
+#include "storage/directmgr.h"
 #include "storage/lmgr.h"
 #include "storage/smgr.h"
 #include "utils/inval.h"
@@ -617,6 +618,7 @@ vm_extend(Relation rel, BlockNumber vm_nblocks)
 	BlockNumber vm_nblocks_now;
 	PGAlignedBlock pg;
 	SMgrRelation reln;
+	UnBufferedWriteState ub_wstate;
 
 	PageInit((Page) pg.data, BLCKSZ, 0);
 
@@ -631,6 +633,7 @@ vm_extend(Relation rel, BlockNumber vm_nblocks)
 	 * by the time we get the lock.
 	 */
 	LockRelationForExtension(rel, ExclusiveLock);
+	unbuffered_prep(&ub_wstate, false, true);
 
 	/*
 	 * Caution: re-using this smgr pointer could fail if the relcache entry
@@ -655,12 +658,13 @@ vm_extend(Relation rel, BlockNumber vm_nblocks)
 	/* Now extend the file */
 	while (vm_nblocks_now < vm_nblocks)
 	{
-		PageSetChecksumInplace((Page) pg.data, vm_nblocks_now);
-
-		smgrextend(reln, VISIBILITYMAP_FORKNUM, vm_nblocks_now, pg.data, false);
+		unbuffered_extend(&ub_wstate, false, reln, VISIBILITYMAP_FORKNUM,
+				vm_nblocks_now, (Page) pg.data, false);
 		vm_nblocks_now++;
 	}
 
+	unbuffered_finish(&ub_wstate, reln, VISIBILITYMAP_FORKNUM);
+
 	/*
 	 * Send a shared-inval message to force other backends to close any smgr
 	 * references they may have for this rel, which we are about to change.
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index c9b4964c1e8..1ec7493ad39 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -30,6 +30,7 @@
 #include "pgstat.h"
 #include "postmaster/autovacuum.h"
 #include "storage/condition_variable.h"
+#include "storage/directmgr.h"
 #include "storage/indexfsm.h"
 #include "storage/ipc.h"
 #include "storage/lmgr.h"
@@ -152,6 +153,9 @@ void
 btbuildempty(Relation index)
 {
 	Page		metapage;
+	UnBufferedWriteState wstate;
+
+	unbuffered_prep(&wstate, true, false);
 
 	/* Construct metapage. */
 	metapage = (Page) palloc(BLCKSZ);
@@ -164,18 +168,10 @@ btbuildempty(Relation index)
 	 * XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE record.  Therefore, we need
 	 * this even when wal_level=minimal.
 	 */
-	PageSetChecksumInplace(metapage, BTREE_METAPAGE);
-	smgrwrite(RelationGetSmgr(index), INIT_FORKNUM, BTREE_METAPAGE,
-			  (char *) metapage, true);
-	log_newpage(&RelationGetSmgr(index)->smgr_rnode.node, INIT_FORKNUM,
-				BTREE_METAPAGE, metapage, true);
+	unbuffered_write(&wstate, true, RelationGetSmgr(index), INIT_FORKNUM,
+			BTREE_METAPAGE, metapage);
 
-	/*
-	 * An immediate sync is required even if we xlog'd the page, because the
-	 * write did not go through shared_buffers and therefore a concurrent
-	 * checkpoint may have moved the redo pointer past our xlog record.
-	 */
-	smgrimmedsync(RelationGetSmgr(index), INIT_FORKNUM);
+	unbuffered_finish(&wstate, RelationGetSmgr(index), INIT_FORKNUM);
 }
 
 /*
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index 8a19de2f66c..c7a65a99727 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -57,6 +57,7 @@
 #include "executor/instrument.h"
 #include "miscadmin.h"
 #include "pgstat.h"
+#include "storage/directmgr.h"
 #include "storage/smgr.h"
 #include "tcop/tcopprot.h"		/* pgrminclude ignore */
 #include "utils/rel.h"
@@ -256,6 +257,7 @@ typedef struct BTWriteState
 	BlockNumber btws_pages_alloced; /* # pages allocated */
 	BlockNumber btws_pages_written; /* # pages written out */
 	Page		btws_zeropage;	/* workspace for filling zeroes */
+	UnBufferedWriteState ub_wstate;
 } BTWriteState;
 
 
@@ -643,13 +645,6 @@ _bt_blnewpage(uint32 level)
 static void
 _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
 {
-	/* XLOG stuff */
-	if (wstate->btws_use_wal)
-	{
-		/* We use the XLOG_FPI record type for this */
-		log_newpage(&wstate->index->rd_node, MAIN_FORKNUM, blkno, page, true);
-	}
-
 	/*
 	 * If we have to write pages nonsequentially, fill in the space with
 	 * zeroes until we come back and overwrite.  This is not logically
@@ -661,32 +656,27 @@ _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
 	{
 		if (!wstate->btws_zeropage)
 			wstate->btws_zeropage = (Page) palloc0(BLCKSZ);
-		/* don't set checksum for all-zero page */
-		smgrextend(RelationGetSmgr(wstate->index), MAIN_FORKNUM,
-				   wstate->btws_pages_written++,
-				   (char *) wstate->btws_zeropage,
-				   true);
+
+		unbuffered_extend(&wstate->ub_wstate, false,
+				RelationGetSmgr(wstate->index), MAIN_FORKNUM,
+				wstate->btws_pages_written++, wstate->btws_zeropage, true);
 	}
 
-	PageSetChecksumInplace(page, blkno);
 
-	/*
-	 * Now write the page.  There's no need for smgr to schedule an fsync for
-	 * this write; we'll do it ourselves before ending the build.
-	 */
+	/* Now write the page. Either we are extending the file... */
 	if (blkno == wstate->btws_pages_written)
 	{
-		/* extending the file... */
-		smgrextend(RelationGetSmgr(wstate->index), MAIN_FORKNUM, blkno,
-				   (char *) page, true);
+		unbuffered_extend(&wstate->ub_wstate, wstate->btws_use_wal,
+				RelationGetSmgr(wstate->index), MAIN_FORKNUM, blkno, page,
+				false);
+
 		wstate->btws_pages_written++;
 	}
+
+	/* or we are overwriting a block we zero-filled before. */
 	else
-	{
-		/* overwriting a block we zero-filled before */
-		smgrwrite(RelationGetSmgr(wstate->index), MAIN_FORKNUM, blkno,
-				  (char *) page, true);
-	}
+		unbuffered_write(&wstate->ub_wstate, wstate->btws_use_wal,
+				RelationGetSmgr(wstate->index), MAIN_FORKNUM, blkno, page);
 
 	pfree(page);
 }
@@ -1195,6 +1185,9 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
 	int64		tuples_done = 0;
 	bool		deduplicate;
 
+
+	unbuffered_prep(&wstate->ub_wstate, wstate->btws_use_wal, false);
+
 	deduplicate = wstate->inskey->allequalimage && !btspool->isunique &&
 		BTGetDeduplicateItems(wstate->index);
 
@@ -1421,17 +1414,8 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
 	/* Close down final pages and write the metapage */
 	_bt_uppershutdown(wstate, state);
 
-	/*
-	 * When we WAL-logged index pages, we must nonetheless fsync index files.
-	 * Since we're building outside shared buffers, a CHECKPOINT occurring
-	 * during the build has no way to flush the previously written data to
-	 * disk (indeed it won't know the index even exists).  A crash later on
-	 * would replay WAL from the checkpoint, therefore it wouldn't replay our
-	 * earlier WAL entries. If we do not fsync those pages here, they might
-	 * still not be on disk when the crash occurs.
-	 */
-	if (wstate->btws_use_wal)
-		smgrimmedsync(RelationGetSmgr(wstate->index), MAIN_FORKNUM);
+	unbuffered_finish(&wstate->ub_wstate, RelationGetSmgr(wstate->index),
+			MAIN_FORKNUM);
 }
 
 /*
diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c
index bfb74049d0c..e232ba4b866 100644
--- a/src/backend/access/spgist/spginsert.c
+++ b/src/backend/access/spgist/spginsert.c
@@ -25,6 +25,7 @@
 #include "catalog/index.h"
 #include "miscadmin.h"
 #include "storage/bufmgr.h"
+#include "storage/directmgr.h"
 #include "storage/smgr.h"
 #include "utils/memutils.h"
 #include "utils/rel.h"
@@ -156,48 +157,30 @@ void
 spgbuildempty(Relation index)
 {
 	Page		page;
+	UnBufferedWriteState wstate;
+
+	unbuffered_prep(&wstate, true, false);
 
 	/* Construct metapage. */
 	page = (Page) palloc(BLCKSZ);
 	SpGistInitMetapage(page);
 
-	/*
-	 * Write the page and log it unconditionally.  This is important
-	 * particularly for indexes created on tablespaces and databases whose
-	 * creation happened after the last redo pointer as recovery removes any
-	 * of their existing content when the corresponding create records are
-	 * replayed.
-	 */
-	PageSetChecksumInplace(page, SPGIST_METAPAGE_BLKNO);
-	smgrwrite(RelationGetSmgr(index), INIT_FORKNUM, SPGIST_METAPAGE_BLKNO,
-			  (char *) page, true);
-	log_newpage(&(RelationGetSmgr(index))->smgr_rnode.node, INIT_FORKNUM,
-				SPGIST_METAPAGE_BLKNO, page, true);
+	unbuffered_write(&wstate, true, RelationGetSmgr(index), INIT_FORKNUM,
+			SPGIST_METAPAGE_BLKNO, page);
 
 	/* Likewise for the root page. */
 	SpGistInitPage(page, SPGIST_LEAF);
 
-	PageSetChecksumInplace(page, SPGIST_ROOT_BLKNO);
-	smgrwrite(RelationGetSmgr(index), INIT_FORKNUM, SPGIST_ROOT_BLKNO,
-			  (char *) page, true);
-	log_newpage(&(RelationGetSmgr(index))->smgr_rnode.node, INIT_FORKNUM,
-				SPGIST_ROOT_BLKNO, page, true);
+	unbuffered_write(&wstate, true, RelationGetSmgr(index), INIT_FORKNUM,
+			SPGIST_ROOT_BLKNO, page);
 
 	/* Likewise for the null-tuples root page. */
 	SpGistInitPage(page, SPGIST_LEAF | SPGIST_NULLS);
 
-	PageSetChecksumInplace(page, SPGIST_NULL_BLKNO);
-	smgrwrite(RelationGetSmgr(index), INIT_FORKNUM, SPGIST_NULL_BLKNO,
-			  (char *) page, true);
-	log_newpage(&(RelationGetSmgr(index))->smgr_rnode.node, INIT_FORKNUM,
-				SPGIST_NULL_BLKNO, page, true);
+	unbuffered_write(&wstate, true, RelationGetSmgr(index), INIT_FORKNUM,
+			SPGIST_NULL_BLKNO, page);
 
-	/*
-	 * An immediate sync is required even if we xlog'd the pages, because the
-	 * writes did not go through shared buffers and therefore a concurrent
-	 * checkpoint may have moved the redo pointer past our xlog record.
-	 */
-	smgrimmedsync(RelationGetSmgr(index), INIT_FORKNUM);
+	unbuffered_finish(&wstate, RelationGetSmgr(index), INIT_FORKNUM);
 }
 
 /*
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index 9b8075536a7..1ec90e00abf 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -28,6 +28,7 @@
 #include "catalog/storage.h"
 #include "catalog/storage_xlog.h"
 #include "miscadmin.h"
+#include "storage/directmgr.h"
 #include "storage/freespace.h"
 #include "storage/smgr.h"
 #include "utils/hsearch.h"
@@ -420,6 +421,8 @@ RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
 	bool		copying_initfork;
 	BlockNumber nblocks;
 	BlockNumber blkno;
+	UnBufferedWriteState wstate;
+
 
 	page = (Page) buf.data;
 
@@ -440,6 +443,8 @@ RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
 	use_wal = XLogIsNeeded() &&
 		(relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork);
 
+	unbuffered_prep(&wstate, (use_wal || copying_initfork), false);
+
 	nblocks = smgrnblocks(src, forkNum);
 
 	for (blkno = 0; blkno < nblocks; blkno++)
@@ -474,30 +479,15 @@ RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
 		 * page this is, so we have to log the full page including any unused
 		 * space.
 		 */
-		if (use_wal)
-			log_newpage(&dst->smgr_rnode.node, forkNum, blkno, page, false);
 
-		PageSetChecksumInplace(page, blkno);
+		// TODO: is it okay to pass the page here to unbuffered_extend so that
+		// it can be WAL-logged as a full page even though smgrextend used to
+		// take just buf.data?
+		unbuffered_extend(&wstate, use_wal, dst, forkNum, blkno, page, false);
 
-		/*
-		 * Now write the page.  We say skipFsync = true because there's no
-		 * need for smgr to schedule an fsync for this write; we'll do it
-		 * ourselves below.
-		 */
-		smgrextend(dst, forkNum, blkno, buf.data, true);
 	}
 
-	/*
-	 * When we WAL-logged rel pages, we must nonetheless fsync them.  The
-	 * reason is that since we're copying outside shared buffers, a CHECKPOINT
-	 * occurring during the copy has no way to flush the previously written
-	 * data to disk (indeed it won't know the new rel even exists).  A crash
-	 * later on would replay WAL from the checkpoint, therefore it wouldn't
-	 * replay our earlier WAL entries. If we do not fsync those pages here,
-	 * they might still not be on disk when the crash occurs.
-	 */
-	if (use_wal || copying_initfork)
-		smgrimmedsync(dst, forkNum);
+	unbuffered_finish(&wstate, dst, forkNum);
 }
 
 /*
diff --git a/src/backend/storage/Makefile b/src/backend/storage/Makefile
index 8376cdfca20..501fae5f9d0 100644
--- a/src/backend/storage/Makefile
+++ b/src/backend/storage/Makefile
@@ -8,6 +8,6 @@ subdir = src/backend/storage
 top_builddir = ../../..
 include $(top_builddir)/src/Makefile.global
 
-SUBDIRS     = buffer file freespace ipc large_object lmgr page smgr sync
+SUBDIRS     = buffer direct file freespace ipc large_object lmgr page smgr sync
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/storage/direct/Makefile b/src/backend/storage/direct/Makefile
new file mode 100644
index 00000000000..d82bbed48c2
--- /dev/null
+++ b/src/backend/storage/direct/Makefile
@@ -0,0 +1,17 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+#    Makefile for storage/direct
+#
+# IDENTIFICATION
+#    src/backend/storage/direct/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/storage/direct
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = directmgr.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/storage/direct/directmgr.c b/src/backend/storage/direct/directmgr.c
new file mode 100644
index 00000000000..371ff5602fe
--- /dev/null
+++ b/src/backend/storage/direct/directmgr.c
@@ -0,0 +1,79 @@
+/*-------------------------------------------------------------------------
+ *
+ * directmgr.c
+ *	  routines for managing unbuffered IO
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/storage/direct/directmgr.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+
+#include "access/xloginsert.h"
+#include "storage/directmgr.h"
+
+void
+unbuffered_prep(UnBufferedWriteState *wstate, bool fsync_self, bool
+		request_fsync)
+{
+	wstate->fsync_self = fsync_self;
+	wstate->request_fsync = request_fsync;
+}
+
+void
+unbuffered_write(UnBufferedWriteState *wstate, bool do_wal, SMgrRelation
+		smgrrel, ForkNumber forknum, BlockNumber blocknum, Page page)
+{
+	PageSetChecksumInplace(page, blocknum);
+
+	smgrwrite(smgrrel, forknum, blocknum, (char *) page,
+			!wstate->request_fsync);
+
+	if (do_wal)
+		log_newpage(&(smgrrel)->smgr_rnode.node, forknum,
+					blocknum, page, true);
+}
+
+void
+unbuffered_extend(UnBufferedWriteState *wstate, bool do_wal, SMgrRelation
+		smgrrel, ForkNumber forknum, BlockNumber blocknum, Page page, bool
+		empty)
+{
+	/*
+	 * Don't checksum empty pages
+	 */
+	if (!empty)
+		PageSetChecksumInplace(page, blocknum);
+
+	smgrextend(smgrrel, forknum, blocknum, (char *) page,
+			!wstate->request_fsync);
+
+	if (do_wal)
+		log_newpage(&(smgrrel)->smgr_rnode.node, forknum,
+					blocknum, page, true);
+
+}
+
+/*
+ * When writing data outside shared buffers, a concurrent CHECKPOINT can move
+ * the redo pointer past our WAL entries and won't flush our data to disk. If
+ * the database crashes before the data makes it to disk, our WAL won't be
+ * replayed and the data will be lost.
+ * Thus, if a CHECKPOINT begins between unbuffered_prep() and
+ * unbuffered_finish(), the backend must fsync the data itself.
+ */
+void
+unbuffered_finish(UnBufferedWriteState *wstate, SMgrRelation smgrrel,
+		ForkNumber forknum)
+{
+	if (!wstate->fsync_self)
+		return;
+
+	smgrimmedsync(smgrrel, forknum);
+}
diff --git a/src/backend/storage/freespace/freespace.c b/src/backend/storage/freespace/freespace.c
index 78c073b7c98..4326ea8f015 100644
--- a/src/backend/storage/freespace/freespace.c
+++ b/src/backend/storage/freespace/freespace.c
@@ -27,6 +27,7 @@
 #include "access/xloginsert.h"
 #include "access/xlogutils.h"
 #include "miscadmin.h"
+#include "storage/directmgr.h"
 #include "storage/freespace.h"
 #include "storage/fsm_internals.h"
 #include "storage/lmgr.h"
@@ -609,9 +610,11 @@ fsm_extend(Relation rel, BlockNumber fsm_nblocks)
 	BlockNumber fsm_nblocks_now;
 	PGAlignedBlock pg;
 	SMgrRelation reln;
+	UnBufferedWriteState ub_wstate;
 
 	PageInit((Page) pg.data, BLCKSZ, 0);
 
+
 	/*
 	 * We use the relation extension lock to lock out other backends trying to
 	 * extend the FSM at the same time. It also locks out extension of the
@@ -624,6 +627,8 @@ fsm_extend(Relation rel, BlockNumber fsm_nblocks)
 	 */
 	LockRelationForExtension(rel, ExclusiveLock);
 
+	unbuffered_prep(&ub_wstate, false, true);
+
 	/*
 	 * Caution: re-using this smgr pointer could fail if the relcache entry
 	 * gets closed.  It's safe as long as we only do smgr-level operations
@@ -648,14 +653,15 @@ fsm_extend(Relation rel, BlockNumber fsm_nblocks)
 	/* Extend as needed. */
 	while (fsm_nblocks_now < fsm_nblocks)
 	{
-		PageSetChecksumInplace((Page) pg.data, fsm_nblocks_now);
-
-		smgrextend(reln, FSM_FORKNUM, fsm_nblocks_now,
-				   pg.data, false);
+		unbuffered_extend(&ub_wstate, false, reln, FSM_FORKNUM,
+				fsm_nblocks_now, (Page) pg.data, false);
 		fsm_nblocks_now++;
 	}
 
+	unbuffered_finish(&ub_wstate, reln, FSM_FORKNUM);
+
 	UnlockRelationForExtension(rel, ExclusiveLock);
+
 }
 
 /*
diff --git a/src/include/catalog/storage.h b/src/include/catalog/storage.h
index 9ffc7419131..b097d09c9fc 100644
--- a/src/include/catalog/storage.h
+++ b/src/include/catalog/storage.h
@@ -15,6 +15,7 @@
 #define STORAGE_H
 
 #include "storage/block.h"
+#include "storage/directmgr.h"
 #include "storage/relfilenode.h"
 #include "storage/smgr.h"
 #include "utils/relcache.h"
diff --git a/src/include/storage/directmgr.h b/src/include/storage/directmgr.h
new file mode 100644
index 00000000000..47653d0d1bb
--- /dev/null
+++ b/src/include/storage/directmgr.h
@@ -0,0 +1,57 @@
+/*-------------------------------------------------------------------------
+ *
+ * directmgr.h
+ *	  POSTGRES unbuffered IO manager definitions.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/directmgr.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef DIRECTMGR_H
+#define DIRECTMGR_H
+
+#include "common/relpath.h"
+#include "storage/block.h"
+#include "storage/bufpage.h"
+#include "storage/smgr.h"
+
+/*
+ * After committing the pg_buffer_stats patch, this will contain a pointer to a
+ * PgBufferAccess struct to count the writes and extends done in this way.
+ */
+typedef struct UnBufferedWriteState
+{
+	/*
+	 * When writing logged table data outside of shared buffers, there is a
+	 * risk of a concurrent CHECKPOINT moving the redo pointer past the data's
+	 * associated WAL entries. To avoid this, callers in this situation must
+	 * fsync the pages they have written themselves.
+	 *
+	 * Callers able to use the checkpointer's sync request queue when writing
+	 * data outside shared buffers (like fsm and vm) can set request_fsync to
+	 * true so that these fsync requests are added to the queue.
+	 */
+	bool fsync_self;
+	bool request_fsync;
+} UnBufferedWriteState;
+/*
+ * prototypes for functions in directmgr.c
+ */
+extern void
+unbuffered_prep(UnBufferedWriteState *wstate, bool fsync_self, bool
+		request_fsync);
+extern void
+unbuffered_write(UnBufferedWriteState *wstate, bool do_wal, SMgrRelation
+		smgrrel, ForkNumber forknum, BlockNumber blocknum, Page page);
+extern void
+unbuffered_extend(UnBufferedWriteState *wstate, bool do_wal, SMgrRelation smgrrel,
+		ForkNumber forknum, BlockNumber blocknum, Page page, bool empty);
+extern void
+unbuffered_finish(UnBufferedWriteState *wstate, SMgrRelation smgrrel,
+		ForkNumber forknum);
+
+#endif							/* DIRECTMGR_H */
-- 
2.17.1

From 8985cd63425e79bd3be641770b61d17e9b9caa00 Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplage...@gmail.com>
Date: Tue, 8 Feb 2022 19:01:27 -0500
Subject: [PATCH 2/4] Avoid immediate fsync for unbuffered IO

Data written to WAL-logged table forks is durable once the WAL entries
are on permanent storage; however, the XLOG Redo pointer must not be
moved past the associated WAL until the page data is safely on permanent
storage. If the Redo pointer were moved past that WAL and a crash
occurred before the data was fsync'd, the WAL would not be replayed
during recovery, and the data would be lost.

This is not a problem with pages written in shared buffers because the
checkpointer will block until FlushBuffer() is complete for all buffers
that were dirtied before it began. Therefore it will not move the Redo
pointer past their associated WAL entries until it has fsync'd the data.

A backend writing data outside of shared buffers must ensure that the
data has reached permanent storage itself or that the Redo pointer has
not moved while it was writing the data.

In the common case, the backend should not have to do this fsync itself
and can instead request the checkpointer do it.

To ensure this is safe, the backend can save the XLOG Redo pointer
location before doing the write or extend. Then it can add an fsync
request for the page to the checkpointer's pending-ops table using the
existing mechanism. After doing the write or extend, if the Redo pointer
has moved (meaning a checkpoint has started since the backend saved it),
then the backend can simply fsync the page itself. Otherwise, the
checkpointer takes care of fsync'ing the page the next time it processes
the pending-ops table.

This commit adds the optimization option to the directmgr API but does
not add any users, so there is no behavior change.
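
Condensed, the check this adds to directmgr looks roughly like the following
(taken from the directmgr.c hunk below; the existing fsync_self test is
omitted):

    /* in unbuffered_prep(): remember the redo pointer when optimizing */
    wstate->redo = do_optimization ? GetRedoRecPtr() : InvalidXLogRecPtr;

    /* in unbuffered_finish(): skip the fsync if no checkpoint started since */
    if (wstate->do_optimization && !RedoRecPtrChanged(wstate->redo))
        return;

    smgrimmedsync(smgrrel, forknum);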
---
 src/backend/access/gist/gistbuild.c       |  4 ++--
 src/backend/access/hash/hashpage.c        |  2 +-
 src/backend/access/heap/heapam_handler.c  |  2 +-
 src/backend/access/heap/rewriteheap.c     |  2 +-
 src/backend/access/heap/visibilitymap.c   |  2 +-
 src/backend/access/nbtree/nbtree.c        |  2 +-
 src/backend/access/nbtree/nbtsort.c       |  5 ++++-
 src/backend/access/spgist/spginsert.c     |  2 +-
 src/backend/access/transam/xlog.c         | 13 +++++++++++++
 src/backend/catalog/storage.c             |  2 +-
 src/backend/storage/direct/directmgr.c    | 18 ++++++++++++++++--
 src/backend/storage/freespace/freespace.c |  2 +-
 src/include/access/xlog.h                 |  1 +
 src/include/storage/directmgr.h           | 13 +++++++++++--
 14 files changed, 55 insertions(+), 15 deletions(-)

diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c
index 8fabc2a42d7..3d23d5b74d6 100644
--- a/src/backend/access/gist/gistbuild.c
+++ b/src/backend/access/gist/gistbuild.c
@@ -411,7 +411,7 @@ gist_indexsortbuild(GISTBuildState *state)
 	state->pages_allocated = 0;
 	state->pages_written = 0;
 	state->ready_num_pages = 0;
-	unbuffered_prep(&state->ub_wstate, false, false);
+	unbuffered_prep(&state->ub_wstate, false, false, false);
 
 	/*
 	 * Write an empty page as a placeholder for the root page. It will be
@@ -650,7 +650,7 @@ gist_indexsortbuild_flush_ready_pages(GISTBuildState *state)
 	if (state->ready_num_pages == 0)
 		return;
 
-	unbuffered_prep(&state->ub_wstate, false, false);
+	unbuffered_prep(&state->ub_wstate, false, false, false);
 
 	for (int i = 0; i < state->ready_num_pages; i++)
 	{
diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c
index 6096604438e..0c5533e632e 100644
--- a/src/backend/access/hash/hashpage.c
+++ b/src/backend/access/hash/hashpage.c
@@ -1003,7 +1003,7 @@ _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks)
 	if (lastblock < firstblock || lastblock == InvalidBlockNumber)
 		return false;
 
-	unbuffered_prep(&ub_wstate, false, true);
+	unbuffered_prep(&ub_wstate, false, false, true);
 
 	page = (Page) zerobuf.data;
 
diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index 9fd6a6f4474..f9f6527507c 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -578,7 +578,7 @@ heapam_relation_set_new_filenode(Relation rel,
 	SMgrRelation srel;
 	UnBufferedWriteState ub_wstate;
 
-	unbuffered_prep(&ub_wstate, true, false);
+	unbuffered_prep(&ub_wstate, false, true, false);
 	/*
 	 * Initialize to the minimum XID that could put tuples in the table. We
 	 * know that no xacts older than RecentXmin are still running, so that
diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c
index 12bdd6ff601..b103a62135e 100644
--- a/src/backend/access/heap/rewriteheap.c
+++ b/src/backend/access/heap/rewriteheap.c
@@ -267,7 +267,7 @@ begin_heap_rewrite(Relation old_heap, Relation new_heap, TransactionId oldest_xm
 	state->rs_cutoff_multi = cutoff_multi;
 	state->rs_cxt = rw_cxt;
 
-	unbuffered_prep(&state->rs_unbuffered_wstate,
+	unbuffered_prep(&state->rs_unbuffered_wstate, false,
 			RelationNeedsWAL(state->rs_new_rel), false);
 
 	/* Initialize hash tables used to track update chains */
diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c
index 897de5ec1fc..d844767abc9 100644
--- a/src/backend/access/heap/visibilitymap.c
+++ b/src/backend/access/heap/visibilitymap.c
@@ -633,7 +633,7 @@ vm_extend(Relation rel, BlockNumber vm_nblocks)
 	 * by the time we get the lock.
 	 */
 	LockRelationForExtension(rel, ExclusiveLock);
-	unbuffered_prep(&ub_wstate, false, true);
+	unbuffered_prep(&ub_wstate, false, false, true);
 
 	/*
 	 * Caution: re-using this smgr pointer could fail if the relcache entry
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 1ec7493ad39..843c9e23625 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -155,7 +155,7 @@ btbuildempty(Relation index)
 	Page		metapage;
 	UnBufferedWriteState wstate;
 
-	unbuffered_prep(&wstate, true, false);
+	unbuffered_prep(&wstate, false, true, false);
 
 	/* Construct metapage. */
 	metapage = (Page) palloc(BLCKSZ);
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index c7a65a99727..a67770f3fd3 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -1186,7 +1186,10 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
 	bool		deduplicate;
 
 
-	unbuffered_prep(&wstate->ub_wstate, wstate->btws_use_wal, false);
+	/*
+	 * Only bother fsync'ing the data to permanent storage if the index is WAL-logged.
+	 */
+	unbuffered_prep(&wstate->ub_wstate, false, wstate->btws_use_wal, false);
 
 	deduplicate = wstate->inskey->allequalimage && !btspool->isunique &&
 		BTGetDeduplicateItems(wstate->index);
diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c
index e232ba4b866..e45f1f5db9f 100644
--- a/src/backend/access/spgist/spginsert.c
+++ b/src/backend/access/spgist/spginsert.c
@@ -159,7 +159,7 @@ spgbuildempty(Relation index)
 	Page		page;
 	UnBufferedWriteState wstate;
 
-	unbuffered_prep(&wstate, true, false);
+	unbuffered_prep(&wstate, false, true, false);
 
 	/* Construct metapage. */
 	page = (Page) palloc(BLCKSZ);
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 0d2bd7a3576..db7b33ec673 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -5971,6 +5971,19 @@ GetLastImportantRecPtr(void)
 	return res;
 }
 
+bool
+RedoRecPtrChanged(XLogRecPtr comparator_ptr)
+{
+	XLogRecPtr ptr;
+	SpinLockAcquire(&XLogCtl->info_lck);
+	ptr = XLogCtl->RedoRecPtr;
+	SpinLockRelease(&XLogCtl->info_lck);
+
+	if (RedoRecPtr < ptr)
+		RedoRecPtr = ptr;
+
+	return RedoRecPtr != comparator_ptr;
+}
+
 /*
  * Get the time and LSN of the last xlog segment switch
  */
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index 1ec90e00abf..307f32ab8cc 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -443,7 +443,7 @@ RelationCopyStorage(SMgrRelation src, SMgrRelation dst,
 	use_wal = XLogIsNeeded() &&
 		(relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork);
 
-	unbuffered_prep(&wstate, (use_wal || copying_initfork), false);
+	unbuffered_prep(&wstate, false, (use_wal || copying_initfork), false);
 
 	nblocks = smgrnblocks(src, forkNum);
 
diff --git a/src/backend/storage/direct/directmgr.c b/src/backend/storage/direct/directmgr.c
index 371ff5602fe..120b7c06a74 100644
--- a/src/backend/storage/direct/directmgr.c
+++ b/src/backend/storage/direct/directmgr.c
@@ -15,15 +15,26 @@
 #include "postgres.h"
 
 
+#include "access/xlog.h"
 #include "access/xloginsert.h"
 #include "storage/directmgr.h"
 
+// TODO: do_optimization can probably be derived from request_fsync and
+// fsync_self, but is that true in all cases, and would it be confusing?
 void
-unbuffered_prep(UnBufferedWriteState *wstate, bool fsync_self, bool
-		request_fsync)
+unbuffered_prep(UnBufferedWriteState *wstate, bool do_optimization, bool
+		fsync_self, bool request_fsync)
 {
+	/*
+	 * No reason to do optimization when not required to fsync self
+	 */
+	Assert(!do_optimization || (do_optimization && fsync_self));
+
+	wstate->do_optimization = do_optimization;
 	wstate->fsync_self = fsync_self;
 	wstate->request_fsync = request_fsync;
+
+	wstate->redo = do_optimization ? GetRedoRecPtr() : InvalidXLogRecPtr;
 }
 
 void
@@ -75,5 +86,8 @@ unbuffered_finish(UnBufferedWriteState *wstate, SMgrRelation smgrrel,
 	if (!wstate->fsync_self)
 		return;
 
+	if (wstate->do_optimization && !RedoRecPtrChanged(wstate->redo))
+		return;
+
 	smgrimmedsync(smgrrel, forknum);
 }
diff --git a/src/backend/storage/freespace/freespace.c b/src/backend/storage/freespace/freespace.c
index 4326ea8f015..7c79983ff97 100644
--- a/src/backend/storage/freespace/freespace.c
+++ b/src/backend/storage/freespace/freespace.c
@@ -627,7 +627,7 @@ fsm_extend(Relation rel, BlockNumber fsm_nblocks)
 	 */
 	LockRelationForExtension(rel, ExclusiveLock);
 
-	unbuffered_prep(&ub_wstate, false, true);
+	unbuffered_prep(&ub_wstate, false, false, true);
 
 	/*
 	 * Caution: re-using this smgr pointer could fail if the relcache entry
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 4b45ac64db8..71fe99a28d9 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -241,6 +241,7 @@ extern XLogRecPtr GetInsertRecPtr(void);
 extern XLogRecPtr GetFlushRecPtr(TimeLineID *insertTLI);
 extern TimeLineID GetWALInsertionTimeLine(void);
 extern XLogRecPtr GetLastImportantRecPtr(void);
+extern bool RedoRecPtrChanged(XLogRecPtr comparator_ptr);
 
 extern void SetWalWriterSleeping(bool sleeping);
 
diff --git a/src/include/storage/directmgr.h b/src/include/storage/directmgr.h
index 47653d0d1bb..1ff0c9b0c83 100644
--- a/src/include/storage/directmgr.h
+++ b/src/include/storage/directmgr.h
@@ -14,6 +14,7 @@
 #ifndef DIRECTMGR_H
 #define DIRECTMGR_H
 
+#include "access/xlogdefs.h"
 #include "common/relpath.h"
 #include "storage/block.h"
 #include "storage/bufpage.h"
@@ -31,19 +32,27 @@ typedef struct UnBufferedWriteState
 	 * associated WAL entries. To avoid this, callers in this situation must
 	 * fsync the pages they have written themselves.
 	 *
+	 * These callers can optionally use the following optimization:
+	 * attempt to use the sync request queue and fall back to fsync'ing the
+	 * pages themselves if the redo pointer moves between the start and finish
+	 * of their write. In order to do this, they must set do_optimization to
+	 * true so that the redo pointer is saved before the write begins.
+	 *
 	 * Callers able to use the checkpointer's sync request queue when writing
 	 * data outside shared buffers (like fsm and vm) can set request_fsync to
 	 * true so that these fsync requests are added to the queue.
 	 */
+	bool do_optimization;
 	bool fsync_self;
 	bool request_fsync;
+	XLogRecPtr redo;
 } UnBufferedWriteState;
 /*
  * prototypes for functions in directmgr.c
  */
 extern void
-unbuffered_prep(UnBufferedWriteState *wstate, bool fsync_self, bool
-		request_fsync);
+unbuffered_prep(UnBufferedWriteState *wstate, bool do_optimization, bool
+		fsync_self, bool request_fsync);
 extern void
 unbuffered_write(UnBufferedWriteState *wstate, bool do_wal, SMgrRelation
 		smgrrel, ForkNumber forknum, BlockNumber blocknum, Page page);
-- 
2.17.1

From a67011c2704bbf6927d6010aa1c2b19f9dcb8266 Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplage...@gmail.com>
Date: Tue, 8 Feb 2022 19:01:32 -0500
Subject: [PATCH 3/4] BTree index use unbuffered IO optimization

While building a btree index, the backend can avoid fsync'ing all of the
pages if it uses the optimization introduced in a prior commit.

This can substantially improve performance when many indexes are being
built during DDL operations.
---
 src/backend/access/nbtree/nbtree.c  | 2 +-
 src/backend/access/nbtree/nbtsort.c | 9 ++++++++-
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 843c9e23625..a1efbe1e6ae 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -155,7 +155,7 @@ btbuildempty(Relation index)
 	Page		metapage;
 	UnBufferedWriteState wstate;
 
-	unbuffered_prep(&wstate, false, true, false);
+	unbuffered_prep(&wstate, true, true, true);
 
 	/* Construct metapage. */
 	metapage = (Page) palloc(BLCKSZ);
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index a67770f3fd3..079832bb783 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -1188,8 +1188,15 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
 
 	/*
 	 * Only bother fsync'ing the data to permanent storage if the index is WAL-logged.
+	 *
+	 * The self-fsync optimization requires that the backend both add an fsync
+	 * request to the checkpointer's pending-ops table and be prepared to
+	 * fsync the page data itself. Because neither is required if the relation
+	 * is not WAL-logged, pass btws_use_wal for all parameters of the prep
+	 * function.
 	 */
-	unbuffered_prep(&wstate->ub_wstate, false, wstate->btws_use_wal, false);
+	unbuffered_prep(&wstate->ub_wstate, wstate->btws_use_wal,
+			wstate->btws_use_wal, wstate->btws_use_wal);
 
 	deduplicate = wstate->inskey->allequalimage && !btspool->isunique &&
 		BTGetDeduplicateItems(wstate->index);
-- 
2.17.1

From a1f5fef8d329ac6580623f38016e85314440da3f Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplage...@gmail.com>
Date: Tue, 8 Feb 2022 19:01:36 -0500
Subject: [PATCH 4/4] Use shared buffers when possible for index build

When there are not too many tuples (fewer than BT_BUILD_SB_THRESHOLD),
building the index in shared buffers makes sense: it lets the buffer
manager decide how best to do the IO.
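
In outline, btbuild() now picks a write strategy up front and the rest of the
build dispatches through function pointers in BTWriteState (condensed from the
hunks below; see wstate_shared and wstate_direct):

    /* small builds go through shared buffers, large ones keep writing directly */
    writestate = reltuples < BT_BUILD_SB_THRESHOLD ? wstate_shared : wstate_direct;

    _bt_leafbuild(&writestate, buildstate.spool, buildstate.spool2);

The shared-buffers callbacks allocate pages with ReadBufferExtended(P_NEW,
RBM_ZERO_AND_LOCK) and emit them with MarkBufferDirty() plus
log_newpage_buffer(), while the direct callbacks keep using the directmgr
path.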
---
 src/backend/access/nbtree/nbtree.c  |  29 +--
 src/backend/access/nbtree/nbtsort.c | 267 ++++++++++++++++++++++------
 2 files changed, 223 insertions(+), 73 deletions(-)

diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index a1efbe1e6ae..4dbf7af9afe 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -152,26 +152,27 @@ bthandler(PG_FUNCTION_ARGS)
 void
 btbuildempty(Relation index)
 {
+	/*
+	 * Since this only writes one page, use shared buffers.
+	 */
 	Page		metapage;
-	UnBufferedWriteState wstate;
-
-	unbuffered_prep(&wstate, true, true, true);
+	Buffer metabuf;
 
-	/* Construct metapage. */
-	metapage = (Page) palloc(BLCKSZ);
+	/*
+	 * Allocate a buffer for metapage and initialize metapage.
+	 */
+	metabuf = ReadBufferExtended(index, INIT_FORKNUM, P_NEW, RBM_ZERO_AND_LOCK, NULL);
+	metapage = BufferGetPage(metabuf);
 	_bt_initmetapage(metapage, P_NONE, 0, _bt_allequalimage(index, false));
 
 	/*
-	 * Write the page and log it.  It might seem that an immediate sync would
-	 * be sufficient to guarantee that the file exists on disk, but recovery
-	 * itself might remove it while replaying, for example, an
-	 * XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE record.  Therefore, we need
-	 * this even when wal_level=minimal.
+	 * Mark metapage buffer as dirty and XLOG it
 	 */
-	unbuffered_write(&wstate, true, RelationGetSmgr(index), INIT_FORKNUM,
-			BTREE_METAPAGE, metapage);
-
-	unbuffered_finish(&wstate, RelationGetSmgr(index), INIT_FORKNUM);
+	START_CRIT_SECTION();
+	MarkBufferDirty(metabuf);
+	log_newpage_buffer(metabuf, true);
+	END_CRIT_SECTION();
+	_bt_relbuf(index, metabuf);
 }
 
 /*
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index 079832bb783..7e1dd93df0e 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -237,6 +237,7 @@ typedef struct BTPageState
 {
 	Page		btps_page;		/* workspace for page building */
 	BlockNumber btps_blkno;		/* block # to write this page at */
+	Buffer btps_buf; /* buffer to write this page to */
 	IndexTuple	btps_lowkey;	/* page's strict lower bound pivot tuple */
 	OffsetNumber btps_lastoff;	/* last item offset loaded */
 	Size		btps_lastextra; /* last item's extra posting list space */
@@ -254,10 +255,26 @@ typedef struct BTWriteState
 	Relation	index;
 	BTScanInsert inskey;		/* generic insertion scankey */
 	bool		btws_use_wal;	/* dump pages to WAL? */
-	BlockNumber btws_pages_alloced; /* # pages allocated */
-	BlockNumber btws_pages_written; /* # pages written out */
+	BlockNumber btws_pages_alloced; /* # pages allocated for index builds outside SB */
+	BlockNumber btws_pages_written; /* # pages written out for index builds outside SB */
 	Page		btws_zeropage;	/* workspace for filling zeroes */
 	UnBufferedWriteState ub_wstate;
+	/*
+	 * Allocate a new btree page. This does not initialize the page.
+	 */
+	Page (*_bt_bl_alloc_page) (struct BTWriteState *wstate, BlockNumber
+			*blockno, Buffer *buf);
+	/*
+	 * Emit a completed btree page, and release the working storage.
+	 */
+	void (*_bt_blwritepage) (struct BTWriteState *wstate, Page page,
+			BlockNumber blkno, Buffer buf);
+
+	void (*_bt_bl_unbuffered_prep) (UnBufferedWriteState *wstate, bool
+			do_optimization, bool fsync_self, bool request_fsync);
+
+	void (*_bt_bl_unbuffered_finish) (UnBufferedWriteState *wstate,
+			SMgrRelation smgrrel, ForkNumber forknum);
 } BTWriteState;
 
 
@@ -266,10 +283,22 @@ static double _bt_spools_heapscan(Relation heap, Relation index,
 static void _bt_spooldestroy(BTSpool *btspool);
 static void _bt_spool(BTSpool *btspool, ItemPointer self,
 					  Datum *values, bool *isnull);
-static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2);
+static void _bt_leafbuild(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2);
 static void _bt_build_callback(Relation index, ItemPointer tid, Datum *values,
 							   bool *isnull, bool tupleIsAlive, void *state);
-static Page _bt_blnewpage(uint32 level);
+
+static Page
+_bt_bl_alloc_page_direct(BTWriteState *wstate, BlockNumber *blockno, Buffer *buf);
+static Page
+_bt_bl_alloc_page_shared(BTWriteState *wstate, BlockNumber *blockno, Buffer *buf);
+
+static Page _bt_blnewpage(uint32 level, Page page);
+
+static void
+_bt_blwritepage_direct(BTWriteState *wstate, Page page, BlockNumber blkno, Buffer buf);
+static void
+_bt_blwritepage_shared(BTWriteState *wstate, Page page, BlockNumber blkno, Buffer buf);
+
 static BTPageState *_bt_pagestate(BTWriteState *wstate, uint32 level);
 static void _bt_slideleft(Page rightmostpage);
 static void _bt_sortaddtup(Page page, Size itemsize,
@@ -280,9 +309,10 @@ static void _bt_buildadd(BTWriteState *wstate, BTPageState *state,
 static void _bt_sort_dedup_finish_pending(BTWriteState *wstate,
 										  BTPageState *state,
 										  BTDedupState dstate);
-static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state);
 static void _bt_load(BTWriteState *wstate,
 					 BTSpool *btspool, BTSpool *btspool2);
+static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state, Buffer
+		metabuf, Page metapage);
 static void _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent,
 							   int request);
 static void _bt_end_parallel(BTLeader *btleader);
@@ -295,6 +325,21 @@ static void _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2,
 									   Sharedsort *sharedsort2, int sortmem,
 									   bool progress);
 
+/* Builds with fewer than this many tuples use shared buffers */
+#define BT_BUILD_SB_THRESHOLD 1024
+
+static const BTWriteState wstate_shared = {
+	._bt_bl_alloc_page = _bt_bl_alloc_page_shared,
+	._bt_blwritepage = _bt_blwritepage_shared,
+	._bt_bl_unbuffered_prep = NULL,
+	._bt_bl_unbuffered_finish = NULL,
+};
+
+static const BTWriteState wstate_direct = {
+	._bt_bl_alloc_page = _bt_bl_alloc_page_direct,
+	._bt_blwritepage = _bt_blwritepage_direct,
+	._bt_bl_unbuffered_prep = unbuffered_prep,
+	._bt_bl_unbuffered_finish = unbuffered_finish,
+};
 
 /*
  *	btbuild() -- build a new btree index.
@@ -304,6 +349,7 @@ btbuild(Relation heap, Relation index, IndexInfo *indexInfo)
 {
 	IndexBuildResult *result;
 	BTBuildState buildstate;
+	BTWriteState writestate;
 	double		reltuples;
 
 #ifdef BTREE_BUILD_STATS
@@ -334,8 +380,12 @@ btbuild(Relation heap, Relation index, IndexInfo *indexInfo)
 	 * Finish the build by (1) completing the sort of the spool file, (2)
 	 * inserting the sorted tuples into btree pages and (3) building the upper
 	 * levels.  Finally, it may also be necessary to end use of parallelism.
+	 *
+	 * Don't use shared buffers if the number of tuples is too large.
 	 */
-	_bt_leafbuild(buildstate.spool, buildstate.spool2);
+	writestate = reltuples < BT_BUILD_SB_THRESHOLD ? wstate_shared : wstate_direct;
+
+	_bt_leafbuild(&writestate, buildstate.spool, buildstate.spool2);
 	_bt_spooldestroy(buildstate.spool);
 	if (buildstate.spool2)
 		_bt_spooldestroy(buildstate.spool2);
@@ -543,10 +593,8 @@ _bt_spool(BTSpool *btspool, ItemPointer self, Datum *values, bool *isnull)
  * create an entire btree.
  */
 static void
-_bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
+_bt_leafbuild(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
 {
-	BTWriteState wstate;
-
 #ifdef BTREE_BUILD_STATS
 	if (log_btree_build_stats)
 	{
@@ -566,21 +614,45 @@ _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2)
 		tuplesort_performsort(btspool2->sortstate);
 	}
 
-	wstate.heap = btspool->heap;
-	wstate.index = btspool->index;
-	wstate.inskey = _bt_mkscankey(wstate.index, NULL);
-	/* _bt_mkscankey() won't set allequalimage without metapage */
-	wstate.inskey->allequalimage = _bt_allequalimage(wstate.index, true);
-	wstate.btws_use_wal = RelationNeedsWAL(wstate.index);
+	wstate->heap = btspool->heap;
+	wstate->index = btspool->index;
+	wstate->inskey = _bt_mkscankey(wstate->index, NULL);
+	/* _bt_mkscankey() won't set allequalimage without metapage */
+	wstate->inskey->allequalimage = _bt_allequalimage(wstate->index, true);
+	wstate->btws_use_wal = RelationNeedsWAL(wstate->index);
 
 	/* reserve the metapage */
-	wstate.btws_pages_alloced = BTREE_METAPAGE + 1;
-	wstate.btws_pages_written = 0;
-	wstate.btws_zeropage = NULL;	/* until needed */
+	wstate->btws_pages_alloced = 0;
+	wstate->btws_pages_written = 0;
+	wstate->btws_zeropage = NULL;
 
 	pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
 								 PROGRESS_BTREE_PHASE_LEAF_LOAD);
-	_bt_load(&wstate, btspool, btspool2);
+
+	/*
+	 * If not using shared buffers, for a WAL-logged relation, save the redo
+	 * pointer location in case a checkpoint begins during the index build.
+	 *
+	 * This optimization requires that the backend both add an fsync request
+	 * to the checkpointer's pending-ops table and be prepared to fsync the
+	 * page data itself. Because neither is required if the relation is not
+	 * WAL-logged, pass btws_use_wal for all parameters of the prep
+	 * function.
+	 */
+	if (wstate->_bt_bl_unbuffered_prep)
+			wstate->_bt_bl_unbuffered_prep(&wstate->ub_wstate,
+					wstate->btws_use_wal, wstate->btws_use_wal,
+					wstate->btws_use_wal);
+
+	_bt_load(wstate, btspool, btspool2);
+
+	/*
+	 * When not using shared buffers, for a WAL-logged relation, check whether
+	 * the backend must fsync the page data itself.
+	 */
+	if (wstate->_bt_bl_unbuffered_finish)
+		wstate->_bt_bl_unbuffered_finish(&wstate->ub_wstate,
+				RelationGetSmgr(wstate->index), MAIN_FORKNUM);
 }
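
The comment above describes the prep/finish pair: prep records the redo pointer for a WAL-logged relation, and finish decides whether the backend has to fsync the page data itself. A rough, stand-alone sketch of that idea, using hypothetical stand-ins (DemoWriteState, get_redo_pointer, and so on) rather than the real directmgr API:

#include <stdio.h>

typedef unsigned long DemoRedoPtr;

typedef struct DemoWriteState
{
	int			wal_logged;
	DemoRedoPtr	redo_at_prep;
} DemoWriteState;

/* trivial stand-ins so the sketch compiles; real code consults WAL state */
static DemoRedoPtr current_redo = 1;

static DemoRedoPtr
get_redo_pointer(void)
{
	return current_redo;
}

static void
register_fsync_request(void)
{
	puts("fsync request handed off to the checkpointer");
}

static void
fsync_relation(void)
{
	puts("backend fsyncs the data itself");
}

static void
demo_prep(DemoWriteState *st, int wal_logged)
{
	st->wal_logged = wal_logged;
	if (wal_logged)
		st->redo_at_prep = get_redo_pointer();
}

static void
demo_finish(DemoWriteState *st)
{
	if (!st->wal_logged)
		return;

	register_fsync_request();

	/*
	 * If the redo pointer moved since demo_prep(), a checkpoint may have
	 * begun during the build and the queued request could be processed too
	 * late, so fsync the data here as well.
	 */
	if (get_redo_pointer() != st->redo_at_prep)
		fsync_relation();
}

int
main(void)
{
	DemoWriteState st;

	demo_prep(&st, 1);
	current_redo = 2;			/* pretend a checkpoint began mid-build */
	demo_finish(&st);
	return 0;
}
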
 
 /*
@@ -613,15 +685,15 @@ _bt_build_callback(Relation index,
 }
 
 /*
- * allocate workspace for a new, clean btree page, not linked to any siblings.
+ * Set up workspace for a new, clean btree page, not linked to any siblings.
+ * The caller must allocate the passed-in page.
  */
 static Page
-_bt_blnewpage(uint32 level)
+_bt_blnewpage(uint32 level, Page page)
 {
-	Page		page;
 	BTPageOpaque opaque;
 
-	page = (Page) palloc(BLCKSZ);
+	Assert(page);
 
 	/* Zero the page and set up standard page header info */
 	_bt_pageinit(page, BLCKSZ);
@@ -639,11 +711,8 @@ _bt_blnewpage(uint32 level)
 	return page;
 }
 
-/*
- * emit a completed btree page, and release the working storage.
- */
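+/*
+ * Emit a completed btree page with an unbuffered write, and release the
+ * working storage.
+ */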
 static void
-_bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
+_bt_blwritepage_direct(BTWriteState *wstate, Page page, BlockNumber blkno, Buffer buf)
 {
 	/*
 	 * If we have to write pages nonsequentially, fill in the space with
@@ -681,6 +750,61 @@ _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
 	pfree(page);
 }
 
+static void
+_bt_blwritepage_shared(BTWriteState *wstate, Page page, BlockNumber blkno, Buffer buf)
+{
+	/*
+	 * Indexes built in shared buffers need only mark the buffer as dirty and
+	 * XLOG it.
+	 */
+	Assert(buf);
+	START_CRIT_SECTION();
+	MarkBufferDirty(buf);
+	if (wstate->btws_use_wal)
+		log_newpage_buffer(buf, true);
+	END_CRIT_SECTION();
+	_bt_relbuf(wstate->index, buf);
+}
+
+static Page
+_bt_bl_alloc_page_direct(BTWriteState *wstate, BlockNumber *blockno, Buffer *buf)
+{
+	/* buf is used only with shared buffers, so set it to InvalidBuffer */
+	*buf = InvalidBuffer;
+
+	/*
+	 * Assign a block number for the page.  It will be used to link to sibling
+	 * page(s) later and, if this is the initial page in the level, saved in
+	 * the BTPageState.
+	 */
+	*blockno = wstate->btws_pages_alloced++;
+
+	/* now allocate and set up the new page */
+	return palloc(BLCKSZ);
+}
+
+static Page
+_bt_bl_alloc_page_shared(BTWriteState *wstate, BlockNumber *blockno, Buffer *buf)
+{
+	/*
+	 * Find a shared buffer for the page. Pass mode RBM_ZERO_AND_LOCK to get an
+	 * exclusive lock on the buffer content. No lock on the relation as a whole
+	 * is needed (as in LockRelationForExtension()) because the initial index
+	 * build is not yet complete.
+	 */
+	*buf = ReadBufferExtended(wstate->index, MAIN_FORKNUM, P_NEW,
+			RBM_ZERO_AND_LOCK, NULL);
+
+	/*
+	 * The buffer manager will assign a block number for the new page.  It
+	 * will be used to link to sibling page(s) later and, if this is the
+	 * initial page in the level, saved in the BTPageState.
+	 */
+	*blockno = BufferGetBlockNumber(*buf);
+
+	return BufferGetPage(*buf);
+}
+
 /*
  * allocate and initialize a new BTPageState.  the returned structure
  * is suitable for immediate use by _bt_buildadd.
@@ -688,13 +812,20 @@ _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno)
 static BTPageState *
 _bt_pagestate(BTWriteState *wstate, uint32 level)
 {
+	Buffer buf;
+	BlockNumber blockno;
+
 	BTPageState *state = (BTPageState *) palloc0(sizeof(BTPageState));
 
-	/* create initial page for level */
-	state->btps_page = _bt_blnewpage(level);
+	/*
+	 * Allocate and initialize the initial page for the level; if using shared
+	 * buffers, this extends the relation and pins a shared buffer for the block.
+	 */
+	state->btps_page = _bt_blnewpage(level, wstate->_bt_bl_alloc_page(wstate,
+				&blockno, &buf));
 
-	/* and assign it a page position */
-	state->btps_blkno = wstate->btws_pages_alloced++;
+	state->btps_blkno = blockno;
+	state->btps_buf = buf;
 
 	state->btps_lowkey = NULL;
 	/* initialize lastoff so first item goes into P_FIRSTKEY */
@@ -829,6 +960,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
 {
 	Page		npage;
 	BlockNumber nblkno;
+	Buffer nbuf;
 	OffsetNumber last_off;
 	Size		last_truncextra;
 	Size		pgspc;
@@ -843,6 +975,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
 
 	npage = state->btps_page;
 	nblkno = state->btps_blkno;
+	nbuf = state->btps_buf;
 	last_off = state->btps_lastoff;
 	last_truncextra = state->btps_lastextra;
 	state->btps_lastextra = truncextra;
@@ -899,15 +1032,14 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
 		 */
 		Page		opage = npage;
 		BlockNumber oblkno = nblkno;
+		Buffer obuf = nbuf;
 		ItemId		ii;
 		ItemId		hii;
 		IndexTuple	oitup;
 
-		/* Create new page of same level */
-		npage = _bt_blnewpage(state->btps_level);
-
-		/* and assign it a page position */
-		nblkno = wstate->btws_pages_alloced++;
+		/* Create and initialize a new page of same level */
+		npage = _bt_blnewpage(state->btps_level,
+				wstate->_bt_bl_alloc_page(wstate, &nblkno, &nbuf));
 
 		/*
 		 * We copy the last item on the page into the new page, and then
@@ -1017,9 +1149,12 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
 
 		/*
 		 * Write out the old page.  We never need to touch it again, so we can
-		 * free the opage workspace too.
+		 * free the opage workspace too. After the call, obuf will have been
+		 * released and is no longer valid.
 		 */
-		_bt_blwritepage(wstate, opage, oblkno);
+		wstate->_bt_blwritepage(wstate, opage, oblkno, obuf);
+		obuf = InvalidBuffer;
+		opage = NULL;
 
 		/*
 		 * Reset last_off to point to new page
@@ -1054,6 +1189,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup,
 
 	state->btps_page = npage;
 	state->btps_blkno = nblkno;
+	state->btps_buf = nbuf;
 	state->btps_lastoff = last_off;
 }
 
@@ -1099,12 +1235,12 @@ _bt_sort_dedup_finish_pending(BTWriteState *wstate, BTPageState *state,
  * Finish writing out the completed btree.
  */
 static void
-_bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
+_bt_uppershutdown(BTWriteState *wstate, BTPageState *state, Buffer metabuf,
+		Page metapage)
 {
 	BTPageState *s;
 	BlockNumber rootblkno = P_NONE;
 	uint32		rootlevel = 0;
-	Page		metapage;
 
 	/*
 	 * Each iteration of this loop completes one more level of the tree.
@@ -1150,20 +1286,24 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
 		 * back one slot.  Then we can dump out the page.
 		 */
 		_bt_slideleft(s->btps_page);
-		_bt_blwritepage(wstate, s->btps_page, s->btps_blkno);
+		wstate->_bt_blwritepage(wstate, s->btps_page, s->btps_blkno, s->btps_buf);
+		s->btps_buf = InvalidBuffer;
 		s->btps_page = NULL;	/* writepage freed the workspace */
 	}
 
 	/*
-	 * As the last step in the process, construct the metapage and make it
+	 * As the last step in the process, initialize the metapage and make it
 	 * point to the new root (unless we had no data at all, in which case it's
 	 * set to point to "P_NONE").  This changes the index to the "valid" state
 	 * by filling in a valid magic number in the metapage.
+	 * After this, metapage will have been freed (or is otherwise invalid) and
+	 * metabuf, if it was ever valid, will have been released.
 	 */
-	metapage = (Page) palloc(BLCKSZ);
 	_bt_initmetapage(metapage, rootblkno, rootlevel,
 					 wstate->inskey->allequalimage);
-	_bt_blwritepage(wstate, metapage, BTREE_METAPAGE);
+	wstate->_bt_blwritepage(wstate, metapage, BTREE_METAPAGE, metabuf);
+	metabuf = InvalidBuffer;
+	metapage = NULL;
 }
 
 /*
@@ -1173,6 +1313,10 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
 static void
 _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
 {
+	Page metapage;
+	BlockNumber metablkno;
+	Buffer metabuf;
+
 	BTPageState *state = NULL;
 	bool		merge = (btspool2 != NULL);
 	IndexTuple	itup,
@@ -1185,21 +1329,29 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
 	int64		tuples_done = 0;
 	bool		deduplicate;
 
+	deduplicate = wstate->inskey->allequalimage && !btspool->isunique &&
+		BTGetDeduplicateItems(wstate->index);
 
 	/*
-	 * Only bother fsync'ing the data to permanent storage if WAL logging
+	 * Reserve block 0 for the metapage up front.
+	 *
+	 * When using the shared buffers API it is easier to allocate the buffer
+	 * for block 0 first instead of trying to skip block 0 and allocate it at
+	 * the end of the index build.
+	 *
+	 * When not using the shared buffers API, there is no harm in allocating
+	 * the metapage first. When block 1 is written, the direct writepage
+	 * function will zero-fill block 0. Writing out the metapage at the end of
+	 * the index build then overwrites block 0.
+	 *
+	 * The metapage will be initialized and written out at the end of the index
+	 * build when all of the information needed to do so is available.
 	 *
-	 * The self-fsync optimization requires that the backend both add an fsync
-	 * request to the checkpointer's pending-ops table as well as be prepared
-	 * to fsync the page data itself. Because none of these are required if the
-	 * relation is not WAL-logged, pass btws_use_wal for all parameters of the
-	 * prep function.
+	 * The block number will always be BTREE_METAPAGE, so the metablkno
+	 * variable is unused and only created to avoid a special case in the
+	 * direct alloc function.
 	 */
-	unbuffered_prep(&wstate->ub_wstate, wstate->btws_use_wal,
-			wstate->btws_use_wal, wstate->btws_use_wal);
-
-	deduplicate = wstate->inskey->allequalimage && !btspool->isunique &&
-		BTGetDeduplicateItems(wstate->index);
+	metapage = wstate->_bt_bl_alloc_page(wstate, &metablkno, &metabuf);
 
 	if (merge)
 	{
@@ -1422,10 +1574,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
 	}
 
 	/* Close down final pages and write the metapage */
-	_bt_uppershutdown(wstate, state);
-
-	unbuffered_finish(&wstate->ub_wstate, RelationGetSmgr(wstate->index),
-			MAIN_FORKNUM);
+	_bt_uppershutdown(wstate, state, metabuf, metapage);
 }
 
 /*
-- 
2.17.1

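The comment added to _bt_load explains why reserving block 0 up front is harmless for the direct path: when a later block is written first, the gap is filled with zeroed blocks, and the real metapage overwrites block 0 at the end of the build. A stand-alone illustration of that on-disk behaviour using plain POSIX calls; the file name and helper below (demo_index, zero_fill_to) are invented for the example and are not PostgreSQL code:

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

#define DEMO_BLCKSZ 8192

/* write zeroed blocks for every block between the last one written and blkno */
static void
zero_fill_to(int fd, long pages_written, long blkno)
{
	char		zeroes[DEMO_BLCKSZ] = {0};

	while (pages_written < blkno)
	{
		pwrite(fd, zeroes, DEMO_BLCKSZ, pages_written * DEMO_BLCKSZ);
		pages_written++;
	}
}

int
main(void)
{
	int			fd = open("demo_index", O_CREAT | O_RDWR | O_TRUNC, 0600);
	char		page[DEMO_BLCKSZ];

	if (fd < 0)
		return 1;
	memset(page, 'x', DEMO_BLCKSZ);

	/* block 0 is reserved for the metapage, so block 1 is written first */
	zero_fill_to(fd, 0, 1);
	pwrite(fd, page, DEMO_BLCKSZ, 1L * DEMO_BLCKSZ);

	/* at the end of the build, the real metapage overwrites the zeroed block 0 */
	pwrite(fd, page, DEMO_BLCKSZ, 0);

	close(fd);
	return 0;
}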