From 50abfee2334abcee69154f49ea5656eca150b8aa Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Sat, 9 Mar 2024 16:04:21 +1300
Subject: [PATCH v6 1/3] Use smgrwritev() for both overwriting and extending.

Since mdwrite() and mdextend() were basically the same and both need
vectored variants, merge them into a single interface to reduce
duplication.  We still want to be able to assert that callers know the
difference between overwriting and extending, and to activate slightly
different behavior during recovery, so a new flags argument is used to
control both.

The traditional single-block smgrwrite() and smgrextend() functions,
which take a skipFsync boolean argument, become inlinable wrappers that
call smgrwritev() with the equivalent flags, for low-overhead backward
compatibility.

Reviewed-by: Heikki Linnakangas <hlinnaka@iki.fi>
Reviewed-by: Andres Freund <andres@anarazel.de>
Discussion: https://postgr.es/m/CA%2BhUKGLx5bLwezZKAYB2O_qHj%3Dov10RpgRVY7e8TSJVE74oVjg%40mail.gmail.com
---
 src/backend/storage/buffer/bufmgr.c |   2 +-
 src/backend/storage/smgr/md.c       | 120 ++++++++--------------------
 src/backend/storage/smgr/smgr.c     | 100 ++++++++++-------------
 src/include/storage/md.h            |   4 +-
 src/include/storage/smgr.h          |  19 ++++-
 5 files changed, 90 insertions(+), 155 deletions(-)

diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index f0f8d4259c..aa7331778e 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -3720,7 +3720,7 @@ DropRelationBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum,
 	 *
 	 * In recovery, we cache the value returned by the first lseek(SEEK_END)
 	 * and the future writes keeps the cached value up-to-date. See
-	 * smgrextend. It is possible that the value of the first lseek is smaller
+	 * smgrwritev. It is possible that the value of the first lseek is smaller
 	 * than the actual number of existing blocks in the file due to buggy
 	 * Linux kernels that might not have accounted for the recent write. But
 	 * that should be fine because there must not be any buffers after that
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index bf0f3ca76d..73d077ca3e 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -447,79 +447,10 @@ mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
 	pfree(path);
 }
 
-/*
- * mdextend() -- Add a block to the specified relation.
- *
- * The semantics are nearly the same as mdwrite(): write at the
- * specified position.  However, this is to be used for the case of
- * extending a relation (i.e., blocknum is at or beyond the current
- * EOF).  Note that we assume writing a block beyond current EOF
- * causes intervening file space to become filled with zeroes.
- */
-void
-mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
-		 const void *buffer, bool skipFsync)
-{
-	off_t		seekpos;
-	int			nbytes;
-	MdfdVec    *v;
-
-	/* If this build supports direct I/O, the buffer must be I/O aligned. */
-	if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
-		Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer));
-
-	/* This assert is too expensive to have on normally ... */
-#ifdef CHECK_WRITE_VS_EXTEND
-	Assert(blocknum >= mdnblocks(reln, forknum));
-#endif
-
-	/*
-	 * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
-	 * more --- we mustn't create a block whose number actually is
-	 * InvalidBlockNumber.  (Note that this failure should be unreachable
-	 * because of upstream checks in bufmgr.c.)
-	 */
-	if (blocknum == InvalidBlockNumber)
-		ereport(ERROR,
-				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
-				 errmsg("cannot extend file \"%s\" beyond %u blocks",
-						relpath(reln->smgr_rlocator, forknum),
-						InvalidBlockNumber)));
-
-	v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE);
-
-	seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
-
-	Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
-
-	if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ)
-	{
-		if (nbytes < 0)
-			ereport(ERROR,
-					(errcode_for_file_access(),
-					 errmsg("could not extend file \"%s\": %m",
-							FilePathName(v->mdfd_vfd)),
-					 errhint("Check free disk space.")));
-		/* short write: complain appropriately */
-		ereport(ERROR,
-				(errcode(ERRCODE_DISK_FULL),
-				 errmsg("could not extend file \"%s\": wrote only %d of %d bytes at block %u",
-						FilePathName(v->mdfd_vfd),
-						nbytes, BLCKSZ, blocknum),
-				 errhint("Check free disk space.")));
-	}
-
-	if (!skipFsync && !SmgrIsTemp(reln))
-		register_dirty_segment(reln, forknum, v);
-
-	Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
-}
-
 /*
  * mdzeroextend() -- Add new zeroed out blocks to the specified relation.
  *
- * Similar to mdextend(), except the relation can be extended by multiple
- * blocks at once and the added blocks will be filled with zeroes.
+ * The added blocks will be filled with zeroes.
  */
 void
 mdzeroextend(SMgrRelation reln, ForkNumber forknum,
@@ -919,20 +850,31 @@ mdreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 
 /*
  * mdwritev() -- Write the supplied blocks at the appropriate location.
- *
- * This is to be used only for updating already-existing blocks of a
- * relation (ie, those before the current EOF).  To extend a relation,
- * use mdextend().
  */
 void
 mdwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
-		 const void **buffers, BlockNumber nblocks, bool skipFsync)
+		 const void **buffers, BlockNumber nblocks, int flags)
 {
 	/* This assert is too expensive to have on normally ... */
 #ifdef CHECK_WRITE_VS_EXTEND
-	Assert(blocknum < mdnblocks(reln, forknum));
+	if (flags & SMGR_WRITE_EXTEND)
+		Assert(blocknum >= mdnblocks(reln, forknum));
+	else
+		Assert(blocknum + nblocks <= mdnblocks(reln, forknum));
 #endif
 
+	/*
+	 * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
+	 * more --- we mustn't create a block whose number actually is
+	 * InvalidBlockNumber or larger.
+	 */
+	if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber)
+		ereport(ERROR,
+				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+				 errmsg("cannot extend file \"%s\" beyond %u blocks",
+						relpath(reln->smgr_rlocator, forknum),
+						InvalidBlockNumber)));
+
 	while (nblocks > 0)
 	{
 		struct iovec iov[PG_IOV_MAX];
@@ -944,7 +886,9 @@ mdwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 		size_t		transferred_this_segment;
 		size_t		size_this_segment;
 
-		v = _mdfd_getseg(reln, forknum, blocknum, skipFsync,
+		v = _mdfd_getseg(reln, forknum, blocknum, flags & SMGR_WRITE_SKIP_FSYNC,
+						 (flags & SMGR_WRITE_EXTEND) ?
+						 EXTENSION_CREATE :
 						 EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);
 
 		seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
@@ -992,7 +936,9 @@ mdwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 
 				ereport(ERROR,
 						(errcode_for_file_access(),
-						 errmsg("could not write blocks %u..%u in file \"%s\": %m",
+						 errmsg((flags & SMGR_WRITE_EXTEND) ?
+								"could not extend blocks %u..%u in file \"%s\": %m" :
+								"could not write blocks %u..%u in file \"%s\": %m",
 								blocknum,
 								blocknum + nblocks_this_segment - 1,
 								FilePathName(v->mdfd_vfd)),
@@ -1010,7 +956,7 @@ mdwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 			iovcnt = compute_remaining_iovec(iov, iov, iovcnt, nbytes);
 		}
 
-		if (!skipFsync && !SmgrIsTemp(reln))
+		if ((flags & SMGR_WRITE_SKIP_FSYNC) == 0 && !SmgrIsTemp(reln))
 			register_dirty_segment(reln, forknum, v);
 
 		nblocks -= nblocks_this_segment;
@@ -1638,11 +1584,11 @@ _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 		{
 			/*
 			 * Normally we will create new segments only if authorized by the
-			 * caller (i.e., we are doing mdextend()).  But when doing WAL
-			 * recovery, create segments anyway; this allows cases such as
-			 * replaying WAL data that has a write into a high-numbered
-			 * segment of a relation that was later deleted. We want to go
-			 * ahead and create the segments so we can finish out the replay.
+			 * caller (i.e., we are extending).  But when doing WAL recovery,
+			 * create segments anyway; this allows cases such as replaying WAL
+			 * data that has a write into a high-numbered segment of a
+			 * relation that was later deleted. We want to go ahead and create
+			 * the segments so we can finish out the replay.
 			 *
 			 * We have to maintain the invariant that segments before the last
 			 * active segment are of size RELSEG_SIZE; therefore, if
@@ -1655,9 +1601,9 @@ _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 				char	   *zerobuf = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE,
 													 MCXT_ALLOC_ZERO);
 
-				mdextend(reln, forknum,
-						 nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
-						 zerobuf, skipFsync);
+				smgrextend(reln, forknum,
+						   nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
+						   zerobuf, skipFsync);
 				pfree(zerobuf);
 			}
 			flags = O_CREAT;
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index 62226d5dca..92628a0339 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -82,8 +82,6 @@ typedef struct f_smgr
 	bool		(*smgr_exists) (SMgrRelation reln, ForkNumber forknum);
 	void		(*smgr_unlink) (RelFileLocatorBackend rlocator, ForkNumber forknum,
 								bool isRedo);
-	void		(*smgr_extend) (SMgrRelation reln, ForkNumber forknum,
-								BlockNumber blocknum, const void *buffer, bool skipFsync);
 	void		(*smgr_zeroextend) (SMgrRelation reln, ForkNumber forknum,
 									BlockNumber blocknum, int nblocks, bool skipFsync);
 	bool		(*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum,
@@ -94,7 +92,7 @@ typedef struct f_smgr
 	void		(*smgr_writev) (SMgrRelation reln, ForkNumber forknum,
 								BlockNumber blocknum,
 								const void **buffers, BlockNumber nblocks,
-								bool skipFsync);
+								int flags);
 	void		(*smgr_writeback) (SMgrRelation reln, ForkNumber forknum,
 								   BlockNumber blocknum, BlockNumber nblocks);
 	BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum);
@@ -114,7 +112,6 @@ static const f_smgr smgrsw[] = {
 		.smgr_create = mdcreate,
 		.smgr_exists = mdexists,
 		.smgr_unlink = mdunlink,
-		.smgr_extend = mdextend,
 		.smgr_zeroextend = mdzeroextend,
 		.smgr_prefetch = mdprefetch,
 		.smgr_readv = mdreadv,
@@ -521,40 +518,11 @@ smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo)
 	pfree(rlocators);
 }
 
-
-/*
- * smgrextend() -- Add a new block to a file.
- *
- * The semantics are nearly the same as smgrwrite(): write at the
- * specified position.  However, this is to be used for the case of
- * extending a relation (i.e., blocknum is at or beyond the current
- * EOF).  Note that we assume writing a block beyond current EOF
- * causes intervening file space to become filled with zeroes.
- */
-void
-smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
-		   const void *buffer, bool skipFsync)
-{
-	smgrsw[reln->smgr_which].smgr_extend(reln, forknum, blocknum,
-										 buffer, skipFsync);
-
-	/*
-	 * Normally we expect this to increase nblocks by one, but if the cached
-	 * value isn't as expected, just invalidate it so the next call asks the
-	 * kernel.
-	 */
-	if (reln->smgr_cached_nblocks[forknum] == blocknum)
-		reln->smgr_cached_nblocks[forknum] = blocknum + 1;
-	else
-		reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber;
-}
-
 /*
  * smgrzeroextend() -- Add new zeroed out blocks to a file.
  *
- * Similar to smgrextend(), except the relation can be extended by
- * multiple blocks at once and the added blocks will be filled with
- * zeroes.
+ * Similar to writing with SMGR_WRITE_EXTEND, except the blocks will be filled
+ * with zeroes.
  */
 void
 smgrzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
@@ -607,9 +575,9 @@ smgrreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 /*
  * smgrwritev() -- Write the supplied buffers out.
  *
- * This is to be used only for updating already-existing blocks of a
- * relation (ie, those before the current EOF).  To extend a relation,
- * use smgrextend().
+ * By default this is to be used only for updating already-existing blocks of
+ * a relation (ie, those before the current EOF).  To extend a relation,
+ * specify SMGR_WRITE_EXTEND in flags.
  *
  * This is not a synchronous write -- the block is not necessarily
  * on disk at return, only dumped out to the kernel.  However,
@@ -623,16 +591,29 @@ smgrreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
  * checkpoint happened; that relies on the fact that no other backend can be
  * concurrently modifying the page.
  *
- * skipFsync indicates that the caller will make other provisions to
- * fsync the relation, so we needn't bother.  Temporary relations also
- * do not require fsync.
+ * SMGR_WRITE_SKIP_FSYNC indicates that the caller will make other provisions
+ * to fsync the relation, so we needn't bother.  Temporary relations also do
+ * not require fsync.
  */
 void
 smgrwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
-		   const void **buffers, BlockNumber nblocks, bool skipFsync)
+		   const void **buffers, BlockNumber nblocks, int flags)
 {
 	smgrsw[reln->smgr_which].smgr_writev(reln, forknum, blocknum,
-										 buffers, nblocks, skipFsync);
+										 buffers, nblocks, flags);
+
+	if (flags & SMGR_WRITE_EXTEND)
+	{
+		/*
+		 * Normally we expect this to increase the fork size by nblocks, but
+		 * if the cached value isn't as expected, just invalidate it so the
+		 * next call asks the smgr implementation.
+		 */
+		if (reln->smgr_cached_nblocks[forknum] == blocknum)
+			reln->smgr_cached_nblocks[forknum] = blocknum + nblocks;
+		else
+			reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber;
+	}
 }
 
 /*
@@ -743,14 +724,14 @@ smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nb
 /*
  * smgrregistersync() -- Request a relation to be sync'd at next checkpoint
  *
- * This can be used after calling smgrwrite() or smgrextend() with skipFsync =
- * true, to register the fsyncs that were skipped earlier.
+ * This can be used after calling smgrwritev() with SMGR_WRITE_SKIP_FSYNC,
+ * to register the fsyncs that were skipped earlier.
  *
  * Note: be mindful that a checkpoint could already have happened between the
- * smgrwrite or smgrextend calls and this!  In that case, the checkpoint
- * already missed fsyncing this relation, and you should use smgrimmedsync
- * instead.  Most callers should use the bulk loading facility in bulk_write.c
- * which handles all that.
+ * smgrwritev calls and this!  In that case, the checkpoint already missed
+ * fsyncing this relation, and you should use smgrimmedsync instead.  Most
+ * callers should use the bulk loading facility in bulk_write.c which handles
+ * all that.
  */
 void
 smgrregistersync(SMgrRelation reln, ForkNumber forknum)
@@ -764,17 +745,16 @@ smgrregistersync(SMgrRelation reln, ForkNumber forknum)
  * Synchronously force all previous writes to the specified relation
  * down to disk.
  *
- * This is useful for building completely new relations (eg, new
- * indexes).  Instead of incrementally WAL-logging the index build
- * steps, we can just write completed index pages to disk with smgrwrite
- * or smgrextend, and then fsync the completed index file before
- * committing the transaction.  (This is sufficient for purposes of
- * crash recovery, since it effectively duplicates forcing a checkpoint
- * for the completed index.  But it is *not* sufficient if one wishes
- * to use the WAL log for PITR or replication purposes: in that case
- * we have to make WAL entries as well.)
- *
- * The preceding writes should specify skipFsync = true to avoid
+ * This is useful for building completely new relations (eg, new indexes).
+ * Instead of incrementally WAL-logging the index build steps, we can just
+ * write completed index pages to disk with smgrwritev, and then fsync the
+ * completed index file before committing the transaction.  (This is
+ * sufficient for purposes of crash recovery, since it effectively duplicates
+ * forcing a checkpoint for the completed index.  But it is *not* sufficient
+ * if one wishes to use the WAL log for PITR or replication purposes: in that
+ * case we have to make WAL entries as well.)
+ *
+ * The preceding writes should specify SMGR_WRITE_SKIP_FSYNC to avoid
  * duplicative fsyncs.
  *
  * Note that you need to do FlushRelationBuffers() first if there is
diff --git a/src/include/storage/md.h b/src/include/storage/md.h
index 620f10abde..5fcd6f47df 100644
--- a/src/include/storage/md.h
+++ b/src/include/storage/md.h
@@ -26,8 +26,6 @@ extern void mdclose(SMgrRelation reln, ForkNumber forknum);
 extern void mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo);
 extern bool mdexists(SMgrRelation reln, ForkNumber forknum);
 extern void mdunlink(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo);
-extern void mdextend(SMgrRelation reln, ForkNumber forknum,
-					 BlockNumber blocknum, const void *buffer, bool skipFsync);
 extern void mdzeroextend(SMgrRelation reln, ForkNumber forknum,
 						 BlockNumber blocknum, int nblocks, bool skipFsync);
 extern bool mdprefetch(SMgrRelation reln, ForkNumber forknum,
@@ -36,7 +34,7 @@ extern void mdreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 					void **buffers, BlockNumber nblocks);
 extern void mdwritev(SMgrRelation reln, ForkNumber forknum,
 					 BlockNumber blocknum,
-					 const void **buffers, BlockNumber nblocks, bool skipFsync);
+					 const void **buffers, BlockNumber nblocks, int flags);
 extern void mdwriteback(SMgrRelation reln, ForkNumber forknum,
 						BlockNumber blocknum, BlockNumber nblocks);
 extern BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum);
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index fc5f883ce1..2b8b72820c 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -73,6 +73,9 @@ typedef SMgrRelationData *SMgrRelation;
 #define SmgrIsTemp(smgr) \
 	RelFileLocatorBackendIsTemp((smgr)->smgr_rlocator)
 
+#define SMGR_WRITE_SKIP_FSYNC 0x01
+#define SMGR_WRITE_EXTEND 0x02
+
 extern void smgrinit(void);
 extern SMgrRelation smgropen(RelFileLocator rlocator, ProcNumber backend);
 extern bool smgrexists(SMgrRelation reln, ForkNumber forknum);
@@ -86,8 +89,6 @@ extern void smgrreleaserellocator(RelFileLocatorBackend rlocator);
 extern void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo);
 extern void smgrdosyncall(SMgrRelation *rels, int nrels);
 extern void smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo);
-extern void smgrextend(SMgrRelation reln, ForkNumber forknum,
-					   BlockNumber blocknum, const void *buffer, bool skipFsync);
 extern void smgrzeroextend(SMgrRelation reln, ForkNumber forknum,
 						   BlockNumber blocknum, int nblocks, bool skipFsync);
 extern bool smgrprefetch(SMgrRelation reln, ForkNumber forknum,
@@ -98,7 +99,7 @@ extern void smgrreadv(SMgrRelation reln, ForkNumber forknum,
 extern void smgrwritev(SMgrRelation reln, ForkNumber forknum,
 					   BlockNumber blocknum,
 					   const void **buffer, BlockNumber nblocks,
-					   bool skipFsync);
+					   int flags);
 extern void smgrwriteback(SMgrRelation reln, ForkNumber forknum,
 						  BlockNumber blocknum, BlockNumber nblocks);
 extern BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum);
@@ -121,7 +122,17 @@ static inline void
 smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 		  const void *buffer, bool skipFsync)
 {
-	smgrwritev(reln, forknum, blocknum, &buffer, 1, skipFsync);
+	smgrwritev(reln, forknum, blocknum, &buffer, 1,
+			   skipFsync ? SMGR_WRITE_SKIP_FSYNC : 0);
+}
+
+static inline void
+smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+		   const void *buffer, bool skipFsync)
+{
+	smgrwritev(reln, forknum, blocknum, &buffer, 1,
+			   SMGR_WRITE_EXTEND |
+			   (skipFsync ? SMGR_WRITE_SKIP_FSYNC : 0));
 }
 
 #endif							/* SMGR_H */
-- 
2.39.3 (Apple Git-146)

