Re: Vectored I/O in bulk_write.c

Thomas Munro Wed, 13 Mar 2024 14:13:27 -0700

Alright, here is a first attempt at merging all three interfaces as
you suggested.  I like it!  I especially like the way it removes lots
of duplication.


I don't understand your argument about the location of the
write-vs-extend assertions.  It seems to me that these are assertions
about what the *public* smgrnblocks() function returns.  In other
words, we assert that the caller is aware of the current relation size
(and has some kind of interlocking scheme for that to be possible),
according to the smgr implementation's public interface.  That's not
an assertion about internal details of the smgr implementation, it's
part of the "contract" for the API.

From 0a57274e29369e61712941e379c24f7db1dec068 Mon Sep 17 00:00:00 2001
From: Thomas Munro <[email protected]>
Date: Sat, 9 Mar 2024 16:04:21 +1300
Subject: [PATCH v4 1/3] Merge smgrzeroextend() and smgrextend() with
 smgrwritev().

Since mdwrite() and mdextend() were basically the same and both need
vectored variants, merge them into a single interface.  We still want
to be able to assert that callers know the difference between
overwriting and extending and activate slightly different behavior
during recovery, so use flags to control that.

Likewise for the zero-extending variant, which has much in common at
the interface level, except it doesn't deal in buffers.

The traditional single-block smgrwrite() and smgrextend() functions with
skipFsync boolean argument are translated to smgrwritev() by inlinable
wrapper functions, for low-overhead backwards-compatibility.

Reviewed-by: Heikki Linnakangas <[email protected]>
Discussion: https://postgr.es/m/CA%2BhUKGLx5bLwezZKAYB2O_qHj%3Dov10RpgRVY7e8TSJVE74oVjg%40mail.gmail.com
---
 src/backend/storage/buffer/bufmgr.c   |   7 +-
 src/backend/storage/buffer/localbuf.c |   3 +-
 src/backend/storage/smgr/md.c         | 119 +++++-------------------
 src/backend/storage/smgr/smgr.c       | 127 +++++++++-----------------
 src/include/storage/md.h              |   7 +-
 src/include/storage/smgr.h            |  22 +++--
 6 files changed, 91 insertions(+), 194 deletions(-)

diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index f0f8d4259c..52bbdff336 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -2064,7 +2064,7 @@ ExtendBufferedRelShared(BufferManagerRelation bmr,
 	io_start = pgstat_prepare_io_time(track_io_timing);
 
 	/*
-	 * Note: if smgrzeroextend fails, we will end up with buffers that are
+	 * Note: if smgrwritev fails, we will end up with buffers that are
 	 * allocated but not marked BM_VALID.  The next relation extension will
 	 * still select the same block number (because the relation didn't get any
 	 * longer on disk) and so future attempts to extend the relation will find
@@ -2073,7 +2073,8 @@ ExtendBufferedRelShared(BufferManagerRelation bmr,
 	 *
 	 * We don't need to set checksum for all-zero pages.
 	 */
-	smgrzeroextend(bmr.smgr, fork, first_block, extend_by, false);
+	smgrwritev(bmr.smgr, fork, first_block, NULL, extend_by,
+			   SMGR_WRITE_EXTEND | SMGR_WRITE_ZERO);
 
 	/*
 	 * Release the file-extension lock; it's now OK for someone else to extend
@@ -3720,7 +3721,7 @@ DropRelationBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum,
 	 *
 	 * In recovery, we cache the value returned by the first lseek(SEEK_END)
 	 * and the future writes keeps the cached value up-to-date. See
-	 * smgrextend. It is possible that the value of the first lseek is smaller
+	 * smgrwritev(). It is possible that the value of the first lseek is smaller
 	 * than the actual number of existing blocks in the file due to buggy
 	 * Linux kernels that might not have accounted for the recent write. But
 	 * that should be fine because there must not be any buffers after that
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c
index fcfac335a5..5b2b0fe9f4 100644
--- a/src/backend/storage/buffer/localbuf.c
+++ b/src/backend/storage/buffer/localbuf.c
@@ -416,7 +416,8 @@ ExtendBufferedRelLocal(BufferManagerRelation bmr,
 	io_start = pgstat_prepare_io_time(track_io_timing);
 
 	/* actually extend relation */
-	smgrzeroextend(bmr.smgr, fork, first_block, extend_by, false);
+	smgrwritev(bmr.smgr, fork, first_block, NULL, extend_by,
+			   SMGR_WRITE_EXTEND | SMGR_WRITE_ZERO);
 
 	pgstat_count_io_op_time(IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL, IOOP_EXTEND,
 							io_start, extend_by);
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index bf0f3ca76d..0d560393e5 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -447,83 +447,14 @@ mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
 	pfree(path);
 }
 
-/*
- * mdextend() -- Add a block to the specified relation.
- *
- * The semantics are nearly the same as mdwrite(): write at the
- * specified position.  However, this is to be used for the case of
- * extending a relation (i.e., blocknum is at or beyond the current
- * EOF).  Note that we assume writing a block beyond current EOF
- * causes intervening file space to become filled with zeroes.
- */
-void
-mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
-		 const void *buffer, bool skipFsync)
-{
-	off_t		seekpos;
-	int			nbytes;
-	MdfdVec    *v;
-
-	/* If this build supports direct I/O, the buffer must be I/O aligned. */
-	if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
-		Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer));
-
-	/* This assert is too expensive to have on normally ... */
-#ifdef CHECK_WRITE_VS_EXTEND
-	Assert(blocknum >= mdnblocks(reln, forknum));
-#endif
-
-	/*
-	 * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
-	 * more --- we mustn't create a block whose number actually is
-	 * InvalidBlockNumber.  (Note that this failure should be unreachable
-	 * because of upstream checks in bufmgr.c.)
-	 */
-	if (blocknum == InvalidBlockNumber)
-		ereport(ERROR,
-				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
-				 errmsg("cannot extend file \"%s\" beyond %u blocks",
-						relpath(reln->smgr_rlocator, forknum),
-						InvalidBlockNumber)));
-
-	v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE);
-
-	seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
-
-	Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
-
-	if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ)
-	{
-		if (nbytes < 0)
-			ereport(ERROR,
-					(errcode_for_file_access(),
-					 errmsg("could not extend file \"%s\": %m",
-							FilePathName(v->mdfd_vfd)),
-					 errhint("Check free disk space.")));
-		/* short write: complain appropriately */
-		ereport(ERROR,
-				(errcode(ERRCODE_DISK_FULL),
-				 errmsg("could not extend file \"%s\": wrote only %d of %d bytes at block %u",
-						FilePathName(v->mdfd_vfd),
-						nbytes, BLCKSZ, blocknum),
-				 errhint("Check free disk space.")));
-	}
-
-	if (!skipFsync && !SmgrIsTemp(reln))
-		register_dirty_segment(reln, forknum, v);
-
-	Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
-}
-
 /*
  * mdzeroextend() -- Add new zeroed out blocks to the specified relation.
  *
- * Similar to mdextend(), except the relation can be extended by multiple
- * blocks at once and the added blocks will be filled with zeroes.
+ * The added blocks will be filled with zeroes.
  */
-void
+static void
 mdzeroextend(SMgrRelation reln, ForkNumber forknum,
-			 BlockNumber blocknum, int nblocks, bool skipFsync)
+			 BlockNumber blocknum, int nblocks, int flags)
 {
 	MdfdVec    *v;
 	BlockNumber curblocknum = blocknum;
@@ -559,7 +490,8 @@ mdzeroextend(SMgrRelation reln, ForkNumber forknum,
 		else
 			numblocks = remblocks;
 
-		v = _mdfd_getseg(reln, forknum, curblocknum, skipFsync, EXTENSION_CREATE);
+		v = _mdfd_getseg(reln, forknum, curblocknum,
+						 flags & SMGR_WRITE_SKIP_FSYNC, EXTENSION_CREATE);
 
 		Assert(segstartblock < RELSEG_SIZE);
 		Assert(segstartblock + numblocks <= RELSEG_SIZE);
@@ -595,13 +527,7 @@ mdzeroextend(SMgrRelation reln, ForkNumber forknum,
 		{
 			int			ret;
 
-			/*
-			 * Even if we don't want to use fallocate, we can still extend a
-			 * bit more efficiently than writing each 8kB block individually.
-			 * pg_pwrite_zeros() (via FileZero()) uses pg_pwritev_with_retry()
-			 * to avoid multiple writes or needing a zeroed buffer for the
-			 * whole length of the extension.
-			 */
+			/* Fall back to writing out zeroes. */
 			ret = FileZero(v->mdfd_vfd,
 						   seekpos, (off_t) BLCKSZ * numblocks,
 						   WAIT_EVENT_DATA_FILE_EXTEND);
@@ -613,7 +539,7 @@ mdzeroextend(SMgrRelation reln, ForkNumber forknum,
 						errhint("Check free disk space."));
 		}
 
-		if (!skipFsync && !SmgrIsTemp(reln))
+		if ((flags & SMGR_WRITE_SKIP_FSYNC) == 0 && !SmgrIsTemp(reln))
 			register_dirty_segment(reln, forknum, v);
 
 		Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
@@ -919,19 +845,14 @@ mdreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 
 /*
  * mdwritev() -- Write the supplied blocks at the appropriate location.
- *
- * This is to be used only for updating already-existing blocks of a
- * relation (ie, those before the current EOF).  To extend a relation,
- * use mdextend().
  */
 void
 mdwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
-		 const void **buffers, BlockNumber nblocks, bool skipFsync)
+		 const void **buffers, BlockNumber nblocks,
+		 int flags)
 {
-	/* This assert is too expensive to have on normally ... */
-#ifdef CHECK_WRITE_VS_EXTEND
-	Assert(blocknum < mdnblocks(reln, forknum));
-#endif
+	if (flags & SMGR_WRITE_ZERO)
+		return mdzeroextend(reln, forknum, blocknum, nblocks, flags);
 
 	while (nblocks > 0)
 	{
@@ -944,7 +865,9 @@ mdwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 		size_t		transferred_this_segment;
 		size_t		size_this_segment;
 
-		v = _mdfd_getseg(reln, forknum, blocknum, skipFsync,
+		v = _mdfd_getseg(reln, forknum, blocknum, flags & SMGR_WRITE_SKIP_FSYNC,
+						 (flags & SMGR_WRITE_EXTEND) ?
+						 EXTENSION_CREATE :
 						 EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);
 
 		seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
@@ -992,7 +915,9 @@ mdwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 
 				ereport(ERROR,
 						(errcode_for_file_access(),
-						 errmsg("could not write blocks %u..%u in file \"%s\": %m",
+						 errmsg((flags & SMGR_WRITE_EXTEND) ?
+								"could not extend blocks %u..%u in file \"%s\": %m" :
+								"could not write blocks %u..%u in file \"%s\": %m",
 								blocknum,
 								blocknum + nblocks_this_segment - 1,
 								FilePathName(v->mdfd_vfd)),
@@ -1010,7 +935,7 @@ mdwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 			iovcnt = compute_remaining_iovec(iov, iov, iovcnt, nbytes);
 		}
 
-		if (!skipFsync && !SmgrIsTemp(reln))
+		if ((flags & SMGR_WRITE_SKIP_FSYNC) == 0 && !SmgrIsTemp(reln))
 			register_dirty_segment(reln, forknum, v);
 
 		nblocks -= nblocks_this_segment;
@@ -1638,7 +1563,7 @@ _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 		{
 			/*
 			 * Normally we will create new segments only if authorized by the
-			 * caller (i.e., we are doing mdextend()).  But when doing WAL
+			 * caller (i.e., we are doing smgrextend()).  But when doing WAL
 			 * recovery, create segments anyway; this allows cases such as
 			 * replaying WAL data that has a write into a high-numbered
 			 * segment of a relation that was later deleted. We want to go
@@ -1655,9 +1580,9 @@ _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 				char	   *zerobuf = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE,
 													 MCXT_ALLOC_ZERO);
 
-				mdextend(reln, forknum,
-						 nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
-						 zerobuf, skipFsync);
+				smgrextend(reln, forknum,
+						   nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
+						   zerobuf, skipFsync);
 				pfree(zerobuf);
 			}
 			flags = O_CREAT;
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index 62226d5dca..6ead87a795 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -82,10 +82,6 @@ typedef struct f_smgr
 	bool		(*smgr_exists) (SMgrRelation reln, ForkNumber forknum);
 	void		(*smgr_unlink) (RelFileLocatorBackend rlocator, ForkNumber forknum,
 								bool isRedo);
-	void		(*smgr_extend) (SMgrRelation reln, ForkNumber forknum,
-								BlockNumber blocknum, const void *buffer, bool skipFsync);
-	void		(*smgr_zeroextend) (SMgrRelation reln, ForkNumber forknum,
-									BlockNumber blocknum, int nblocks, bool skipFsync);
 	bool		(*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum,
 								  BlockNumber blocknum, int nblocks);
 	void		(*smgr_readv) (SMgrRelation reln, ForkNumber forknum,
@@ -94,7 +90,7 @@ typedef struct f_smgr
 	void		(*smgr_writev) (SMgrRelation reln, ForkNumber forknum,
 								BlockNumber blocknum,
 								const void **buffers, BlockNumber nblocks,
-								bool skipFsync);
+								int flags);
 	void		(*smgr_writeback) (SMgrRelation reln, ForkNumber forknum,
 								   BlockNumber blocknum, BlockNumber nblocks);
 	BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum);
@@ -114,8 +110,6 @@ static const f_smgr smgrsw[] = {
 		.smgr_create = mdcreate,
 		.smgr_exists = mdexists,
 		.smgr_unlink = mdunlink,
-		.smgr_extend = mdextend,
-		.smgr_zeroextend = mdzeroextend,
 		.smgr_prefetch = mdprefetch,
 		.smgr_readv = mdreadv,
 		.smgr_writev = mdwritev,
@@ -521,59 +515,6 @@ smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo)
 	pfree(rlocators);
 }
 
-
-/*
- * smgrextend() -- Add a new block to a file.
- *
- * The semantics are nearly the same as smgrwrite(): write at the
- * specified position.  However, this is to be used for the case of
- * extending a relation (i.e., blocknum is at or beyond the current
- * EOF).  Note that we assume writing a block beyond current EOF
- * causes intervening file space to become filled with zeroes.
- */
-void
-smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
-		   const void *buffer, bool skipFsync)
-{
-	smgrsw[reln->smgr_which].smgr_extend(reln, forknum, blocknum,
-										 buffer, skipFsync);
-
-	/*
-	 * Normally we expect this to increase nblocks by one, but if the cached
-	 * value isn't as expected, just invalidate it so the next call asks the
-	 * kernel.
-	 */
-	if (reln->smgr_cached_nblocks[forknum] == blocknum)
-		reln->smgr_cached_nblocks[forknum] = blocknum + 1;
-	else
-		reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber;
-}
-
-/*
- * smgrzeroextend() -- Add new zeroed out blocks to a file.
- *
- * Similar to smgrextend(), except the relation can be extended by
- * multiple blocks at once and the added blocks will be filled with
- * zeroes.
- */
-void
-smgrzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
-			   int nblocks, bool skipFsync)
-{
-	smgrsw[reln->smgr_which].smgr_zeroextend(reln, forknum, blocknum,
-											 nblocks, skipFsync);
-
-	/*
-	 * Normally we expect this to increase the fork size by nblocks, but if
-	 * the cached value isn't as expected, just invalidate it so the next call
-	 * asks the kernel.
-	 */
-	if (reln->smgr_cached_nblocks[forknum] == blocknum)
-		reln->smgr_cached_nblocks[forknum] = blocknum + nblocks;
-	else
-		reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber;
-}
-
 /*
  * smgrprefetch() -- Initiate asynchronous read of the specified block of a relation.
  *
@@ -607,9 +548,9 @@ smgrreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 /*
  * smgrwritev() -- Write the supplied buffers out.
  *
- * This is to be used only for updating already-existing blocks of a
- * relation (ie, those before the current EOF).  To extend a relation,
- * use smgrextend().
+ * By default this is to be used only for updating already-existing blocks of
+ * a relation (ie, those before the current EOF).  To extend a relation,
+ * specify SMGR_WRITE_EXTEND, optionally with SMGR_WRITE_ZERO.
  *
  * This is not a synchronous write -- the block is not necessarily
  * on disk at return, only dumped out to the kernel.  However,
@@ -629,10 +570,33 @@ smgrreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
  */
 void
 smgrwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
-		   const void **buffers, BlockNumber nblocks, bool skipFsync)
+		   const void **buffers, BlockNumber nblocks, int flags)
 {
+	if (flags & SMGR_WRITE_ZERO)
+		Assert(flags & SMGR_WRITE_EXTEND);
+#ifdef CHECK_WRITE_VS_EXTEND
+	/* These asserts are too expensive to have on normally ... */
+	if (flags & SMGR_WRITE_EXTEND)
+		Assert(blocknum >= smgrnblocks(reln, forknum));
+	else
+		Assert(blocknum + nblocks <= smgrnblocks(reln, forknum));
+#endif
+
 	smgrsw[reln->smgr_which].smgr_writev(reln, forknum, blocknum,
-										 buffers, nblocks, skipFsync);
+										 buffers, nblocks, flags);
+
+	if (flags & SMGR_WRITE_EXTEND)
+	{
+		/*
+		 * Normally we expect this to increase the fork size by nblocks, but
+		 * if the cached value isn't as expected, just invalidate it so the
+		 * next call asks the smgr implementation.
+		 */
+		if (reln->smgr_cached_nblocks[forknum] == blocknum)
+			reln->smgr_cached_nblocks[forknum] = blocknum + nblocks;
+		else
+			reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber;
+	}
 }
 
 /*
@@ -743,14 +707,14 @@ smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nb
 /*
  * smgrregistersync() -- Request a relation to be sync'd at next checkpoint
  *
- * This can be used after calling smgrwrite() or smgrextend() with skipFsync =
- * true, to register the fsyncs that were skipped earlier.
+ * This can be used after calling smgrwritev() with SMGR_WRITE_SKIP_FSYNC,
+ * to register the fsyncs that were skipped earlier.
  *
  * Note: be mindful that a checkpoint could already have happened between the
- * smgrwrite or smgrextend calls and this!  In that case, the checkpoint
- * already missed fsyncing this relation, and you should use smgrimmedsync
- * instead.  Most callers should use the bulk loading facility in bulk_write.c
- * which handles all that.
+ * smgrwritev calls and this!  In that case, the checkpoint already missed
+ * fsyncing this relation, and you should use smgrimmedsync instead.  Most
+ * callers should use the bulk loading facility in bulk_write.c which handles
+ * all that.
  */
 void
 smgrregistersync(SMgrRelation reln, ForkNumber forknum)
@@ -764,17 +728,16 @@ smgrregistersync(SMgrRelation reln, ForkNumber forknum)
  * Synchronously force all previous writes to the specified relation
  * down to disk.
  *
- * This is useful for building completely new relations (eg, new
- * indexes).  Instead of incrementally WAL-logging the index build
- * steps, we can just write completed index pages to disk with smgrwrite
- * or smgrextend, and then fsync the completed index file before
- * committing the transaction.  (This is sufficient for purposes of
- * crash recovery, since it effectively duplicates forcing a checkpoint
- * for the completed index.  But it is *not* sufficient if one wishes
- * to use the WAL log for PITR or replication purposes: in that case
- * we have to make WAL entries as well.)
- *
- * The preceding writes should specify skipFsync = true to avoid
+ * This is useful for building completely new relations (eg, new indexes).
+ * Instead of incrementally WAL-logging the index build steps, we can just
+ * write completed index pages to disk with smgrwritev, and then fsync the
+ * completed index file before committing the transaction.  (This is
+ * sufficient for purposes of crash recovery, since it effectively duplicates
+ * forcing a checkpoint for the completed index.  But it is *not* sufficient
+ * if one wishes to use the WAL log for PITR or replication purposes: in that
+ * case we have to make WAL entries as well.)
+ *
+ * The preceding writes should specify SMGR_WRITE_SKIP_FSYNC to avoid
  * duplicative fsyncs.
  *
  * Note that you need to do FlushRelationBuffers() first if there is
diff --git a/src/include/storage/md.h b/src/include/storage/md.h
index 620f10abde..794e3d7b40 100644
--- a/src/include/storage/md.h
+++ b/src/include/storage/md.h
@@ -26,17 +26,14 @@ extern void mdclose(SMgrRelation reln, ForkNumber forknum);
 extern void mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo);
 extern bool mdexists(SMgrRelation reln, ForkNumber forknum);
 extern void mdunlink(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo);
-extern void mdextend(SMgrRelation reln, ForkNumber forknum,
-					 BlockNumber blocknum, const void *buffer, bool skipFsync);
-extern void mdzeroextend(SMgrRelation reln, ForkNumber forknum,
-						 BlockNumber blocknum, int nblocks, bool skipFsync);
 extern bool mdprefetch(SMgrRelation reln, ForkNumber forknum,
 					   BlockNumber blocknum, int nblocks);
 extern void mdreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 					void **buffers, BlockNumber nblocks);
 extern void mdwritev(SMgrRelation reln, ForkNumber forknum,
 					 BlockNumber blocknum,
-					 const void **buffers, BlockNumber nblocks, bool skipFsync);
+					 const void **buffers, BlockNumber nblocks,
+					 int flags);
 extern void mdwriteback(SMgrRelation reln, ForkNumber forknum,
 						BlockNumber blocknum, BlockNumber nblocks);
 extern BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum);
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index fc5f883ce1..8598b89e2e 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -73,6 +73,10 @@ typedef SMgrRelationData *SMgrRelation;
 #define SmgrIsTemp(smgr) \
 	RelFileLocatorBackendIsTemp((smgr)->smgr_rlocator)
 
+#define SMGR_WRITE_SKIP_FSYNC 0x01
+#define SMGR_WRITE_EXTEND 0x02
+#define SMGR_WRITE_ZERO 0x04
+
 extern void smgrinit(void);
 extern SMgrRelation smgropen(RelFileLocator rlocator, ProcNumber backend);
 extern bool smgrexists(SMgrRelation reln, ForkNumber forknum);
@@ -86,10 +90,6 @@ extern void smgrreleaserellocator(RelFileLocatorBackend rlocator);
 extern void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo);
 extern void smgrdosyncall(SMgrRelation *rels, int nrels);
 extern void smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo);
-extern void smgrextend(SMgrRelation reln, ForkNumber forknum,
-					   BlockNumber blocknum, const void *buffer, bool skipFsync);
-extern void smgrzeroextend(SMgrRelation reln, ForkNumber forknum,
-						   BlockNumber blocknum, int nblocks, bool skipFsync);
 extern bool smgrprefetch(SMgrRelation reln, ForkNumber forknum,
 						 BlockNumber blocknum, int nblocks);
 extern void smgrreadv(SMgrRelation reln, ForkNumber forknum,
@@ -98,7 +98,7 @@ extern void smgrreadv(SMgrRelation reln, ForkNumber forknum,
 extern void smgrwritev(SMgrRelation reln, ForkNumber forknum,
 					   BlockNumber blocknum,
 					   const void **buffer, BlockNumber nblocks,
-					   bool skipFsync);
+					   int flags);
 extern void smgrwriteback(SMgrRelation reln, ForkNumber forknum,
 						  BlockNumber blocknum, BlockNumber nblocks);
 extern BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum);
@@ -121,7 +121,17 @@ static inline void
 smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 		  const void *buffer, bool skipFsync)
 {
-	smgrwritev(reln, forknum, blocknum, &buffer, 1, skipFsync);
+	smgrwritev(reln, forknum, blocknum, &buffer, 1,
+			   skipFsync ? SMGR_WRITE_SKIP_FSYNC : 0);
+}
+
+static inline void
+smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+		   const void *buffer, bool skipFsync)
+{
+	smgrwritev(reln, forknum, blocknum, &buffer, 1,
+			   SMGR_WRITE_EXTEND |
+			   (skipFsync ? SMGR_WRITE_SKIP_FSYNC : 0));
 }
 
 #endif							/* SMGR_H */
-- 
2.43.2

From d9446573bade5ead71159f75c39d965c98d9038e Mon Sep 17 00:00:00 2001
From: Thomas Munro <[email protected]>
Date: Sat, 9 Mar 2024 16:54:56 +1300
Subject: [PATCH v4 2/3] Use vectored I/O for bulk writes.

bulk_write.c was originally designed with the goal of being able to
use the new vectored write APIs, but couldn't initially because the
vectored variant of "smgrextend" didn't exist yet.  Now that
smgrwritev() can also handle extension, we can use it here to get wide
write system calls.

Reviewed-by: Heikki Linnakangas <[email protected]>
Discussion: https://postgr.es/m/CA%2BhUKGLx5bLwezZKAYB2O_qHj%3Dov10RpgRVY7e8TSJVE74oVjg%40mail.gmail.com
---
 src/backend/storage/smgr/bulk_write.c | 90 +++++++++++++++++++--------
 1 file changed, 65 insertions(+), 25 deletions(-)

diff --git a/src/backend/storage/smgr/bulk_write.c b/src/backend/storage/smgr/bulk_write.c
index 4a10ece4c3..df1c401e88 100644
--- a/src/backend/storage/smgr/bulk_write.c
+++ b/src/backend/storage/smgr/bulk_write.c
@@ -8,7 +8,7 @@
  * the regular buffer manager and the bulk loading interface!
  *
  * We bypass the buffer manager to avoid the locking overhead, and call
- * smgrextend() directly.  A downside is that the pages will need to be
+ * smgrwritev() directly.  A downside is that the pages will need to be
  * re-read into shared buffers on first use after the build finishes.  That's
  * usually a good tradeoff for large relations, and for small relations, the
  * overhead isn't very significant compared to creating the relation in the
@@ -45,8 +45,6 @@
 
 #define MAX_PENDING_WRITES XLR_MAX_BLOCK_ID
 
-static const PGIOAlignedBlock zero_buffer = {{0}};	/* worth BLCKSZ */
-
 typedef struct PendingWrite
 {
 	BulkWriteBuffer buf;
@@ -225,35 +223,77 @@ smgr_bulk_flush(BulkWriteState *bulkstate)
 
 	for (int i = 0; i < npending; i++)
 	{
-		BlockNumber blkno = pending_writes[i].blkno;
-		Page		page = pending_writes[i].buf->data;
-
+		Page		page;
+		const void *pages[16];
+		BlockNumber blkno;
+		int			nblocks;
+		int			max_nblocks;
+
+		/* Prepare to write the first block. */
+		blkno = pending_writes[i].blkno;
+		page = pending_writes[i].buf->data;
 		PageSetChecksumInplace(page, blkno);
+		pages[0] = page;
+		nblocks = 1;
+
+		/* Zero-extend any missing space before the first block. */
+		if (blkno > bulkstate->pages_written)
+		{
+			int			nzeroblocks;
+
+			nzeroblocks = blkno - bulkstate->pages_written;
+			smgrwritev(bulkstate->smgr, bulkstate->forknum,
+					   bulkstate->pages_written, NULL, nzeroblocks,
+					   SMGR_WRITE_SKIP_FSYNC |
+					   SMGR_WRITE_EXTEND |
+					   SMGR_WRITE_ZERO);
+			bulkstate->pages_written += nzeroblocks;
+		}
 
-		if (blkno >= bulkstate->pages_written)
+		if (blkno < bulkstate->pages_written)
 		{
 			/*
-			 * If we have to write pages nonsequentially, fill in the space
-			 * with zeroes until we come back and overwrite.  This is not
-			 * logically necessary on standard Unix filesystems (unwritten
-			 * space will read as zeroes anyway), but it should help to avoid
-			 * fragmentation.  The dummy pages aren't WAL-logged though.
+			 * We're overwriting.  Clamp at the existing size, because we
+			 * can't mix writing and extending in a single operation.
 			 */
-			while (blkno > bulkstate->pages_written)
-			{
-				/* don't set checksum for all-zero page */
-				smgrextend(bulkstate->smgr, bulkstate->forknum,
-						   bulkstate->pages_written++,
-						   &zero_buffer,
-						   true);
-			}
-
-			smgrextend(bulkstate->smgr, bulkstate->forknum, blkno, page, true);
-			bulkstate->pages_written = pending_writes[i].blkno + 1;
+			max_nblocks = Min(lengthof(pages),
+							  bulkstate->pages_written - blkno);
 		}
 		else
-			smgrwrite(bulkstate->smgr, bulkstate->forknum, blkno, page, true);
-		pfree(page);
+		{
+			/* We're extending. */
+			Assert(blkno == bulkstate->pages_written);
+			max_nblocks = lengthof(pages);
+		}
+
+		/* Find as many consecutive blocks as we can. */
+		while (i + 1 < npending &&
+			   pending_writes[i + 1].blkno == blkno + nblocks &&
+			   nblocks < max_nblocks)
+		{
+			page = pending_writes[++i].buf->data;
+			PageSetChecksumInplace(page, pending_writes[i].blkno);
+			pages[nblocks++] = page;
+		}
+
+		/* Extend or overwrite. */
+		if (blkno == bulkstate->pages_written)
+		{
+			smgrwritev(bulkstate->smgr, bulkstate->forknum, blkno,
+					   pages, nblocks,
+					   SMGR_WRITE_SKIP_FSYNC | SMGR_WRITE_EXTEND);
+			bulkstate->pages_written += nblocks;
+		}
+		else
+		{
+			Assert(blkno + nblocks <= bulkstate->pages_written);
+			smgrwritev(bulkstate->smgr, bulkstate->forknum, blkno,
+					   pages, nblocks,
+					   SMGR_WRITE_SKIP_FSYNC);
+		}
+
+		for (int j = 0; j < nblocks; ++j)
+			pfree(pending_writes[i - j].buf->data);
 	}
 
 	bulkstate->npending = 0;
-- 
2.43.2

From 5246f51acdcfa815ca71522963e1ad897e438957 Mon Sep 17 00:00:00 2001
From: Thomas Munro <[email protected]>
Date: Mon, 11 Mar 2024 11:44:41 +1300
Subject: [PATCH v4 3/3] Improve bulk_write.c memory management.

Instead of allocating buffers one at a time with palloc(), allocate an
array full of them up front, and then manage them in a FIFO freelist.
Aside from avoiding allocator overheads, this means that callers who
write sequential blocks will tend to fill up sequential memory, which
hopefully generates more efficient vectored writes.

Reviewed-by: Heikki Linnakangas <[email protected]>
Discussion: https://postgr.es/m/CA%2BhUKGLx5bLwezZKAYB2O_qHj%3Dov10RpgRVY7e8TSJVE74oVjg%40mail.gmail.com
---
 src/backend/storage/smgr/bulk_write.c | 62 +++++++++++++++++++--------
 1 file changed, 43 insertions(+), 19 deletions(-)

diff --git a/src/backend/storage/smgr/bulk_write.c b/src/backend/storage/smgr/bulk_write.c
index df1c401e88..38e45c3178 100644
--- a/src/backend/storage/smgr/bulk_write.c
+++ b/src/backend/storage/smgr/bulk_write.c
@@ -36,6 +36,7 @@
 
 #include "access/xloginsert.h"
 #include "access/xlogrecord.h"
+#include "lib/ilist.h"
 #include "storage/bufmgr.h"
 #include "storage/bufpage.h"
 #include "storage/bulk_write.h"
@@ -45,9 +46,15 @@
 
 #define MAX_PENDING_WRITES XLR_MAX_BLOCK_ID
 
+typedef union BufferSlot
+{
+	PGIOAlignedBlock buffer;
+	dlist_node	freelist_node;
+}			BufferSlot;
+
 typedef struct PendingWrite
 {
-	BulkWriteBuffer buf;
+	BufferSlot *slot;
 	BlockNumber blkno;
 	bool		page_std;
 } PendingWrite;
@@ -57,6 +64,10 @@ typedef struct PendingWrite
  */
 struct BulkWriteState
 {
+	/* Comes first so we can align it correctly. */
+	BufferSlot	buffer_slots[MAX_PENDING_WRITES + 2];
+	dlist_head	buffer_slots_freelist;
+
 	/* Information about the target relation we're writing */
 	SMgrRelation smgr;
 	ForkNumber	forknum;
@@ -71,8 +82,6 @@ struct BulkWriteState
 
 	/* The RedoRecPtr at the time that the bulk operation started */
 	XLogRecPtr	start_RedoRecPtr;
-
-	MemoryContext memcxt;
 };
 
 static void smgr_bulk_flush(BulkWriteState *bulkstate);
@@ -98,7 +107,7 @@ smgr_bulk_start_smgr(SMgrRelation smgr, ForkNumber forknum, bool use_wal)
 {
 	BulkWriteState *state;
 
-	state = palloc(sizeof(BulkWriteState));
+	state = palloc_aligned(sizeof(BulkWriteState), PG_IO_ALIGN_SIZE, 0);
 	state->smgr = smgr;
 	state->forknum = forknum;
 	state->use_wal = use_wal;
@@ -108,11 +117,11 @@ smgr_bulk_start_smgr(SMgrRelation smgr, ForkNumber forknum, bool use_wal)
 
 	state->start_RedoRecPtr = GetRedoRecPtr();
 
-	/*
-	 * Remember the memory context.  We will use it to allocate all the
-	 * buffers later.
-	 */
-	state->memcxt = CurrentMemoryContext;
+	/* Set up the free-list of buffers. */
+	dlist_init(&state->buffer_slots_freelist);
+	for (int i = 0; i < lengthof(state->buffer_slots); ++i)
+		dlist_push_tail(&state->buffer_slots_freelist,
+						&state->buffer_slots[i].freelist_node);
 
 	return state;
 }
@@ -206,7 +215,7 @@ smgr_bulk_flush(BulkWriteState *bulkstate)
 		for (int i = 0; i < npending; i++)
 		{
 			blknos[i] = pending_writes[i].blkno;
-			pages[i] = pending_writes[i].buf->data;
+			pages[i] = pending_writes[i].slot->buffer.data;
 
 			/*
 			 * If any of the pages use !page_std, we log them all as such.
@@ -231,7 +240,7 @@ smgr_bulk_flush(BulkWriteState *bulkstate)
 
 		/* Prepare to write the first block. */
 		blkno = pending_writes[i].blkno;
-		page = pending_writes[i].buf->data;
+		page = pending_writes[i].slot->buffer.data;
 		PageSetChecksumInplace(page, blkno);
 		pages[0] = page;
 		nblocks = 1;
@@ -271,7 +280,7 @@ smgr_bulk_flush(BulkWriteState *bulkstate)
 			   pending_writes[i + 1].blkno == blkno + nblocks &&
 			   nblocks < max_nblocks)
 		{
-			page = pending_writes[++i].buf->data;
+			page = pending_writes[++i].slot->buffer.data;
 			PageSetChecksumInplace(page, pending_writes[i].blkno);
 			pages[nblocks++] = page;
 		}
@@ -292,8 +301,14 @@ smgr_bulk_flush(BulkWriteState *bulkstate)
 					   SMGR_WRITE_SKIP_FSYNC);
 		}
 
-		for (int j = 0; j < nblocks; ++j)
-			pfree(pending_writes[i - j].buf->data);
+		/*
+		 * Maintain FIFO ordering in the free list, so that users who write
+		 * blocks in sequential order tend to get sequential chunks of buffer
+		 * memory, which may be slightly more efficient for vectored writes.
+		 */
+		for (int j = i - nblocks + 1; j <= i; ++j)
+			dlist_push_tail(&bulkstate->buffer_slots_freelist,
+							&pending_writes[j].slot->freelist_node);
 	}
 
 	bulkstate->npending = 0;
@@ -313,7 +328,7 @@ smgr_bulk_write(BulkWriteState *bulkstate, BlockNumber blocknum, BulkWriteBuffer
 	PendingWrite *w;
 
 	w = &bulkstate->pending_writes[bulkstate->npending++];
-	w->buf = buf;
+	w->slot = (BufferSlot *) buf;
 	w->blkno = blocknum;
 	w->page_std = page_std;
 
@@ -327,12 +342,21 @@ smgr_bulk_write(BulkWriteState *bulkstate, BlockNumber blocknum, BulkWriteBuffer
  * There is no function to free the buffer.  When you pass it to
  * smgr_bulk_write(), it takes ownership and frees it when it's no longer
  * needed.
- *
- * This is currently implemented as a simple palloc, but could be implemented
- * using a ring buffer or larger chunks in the future, so don't rely on it.
  */
 BulkWriteBuffer
 smgr_bulk_get_buf(BulkWriteState *bulkstate)
 {
-	return MemoryContextAllocAligned(bulkstate->memcxt, BLCKSZ, PG_IO_ALIGN_SIZE, 0);
+	BufferSlot *slot;
+
+	if (dlist_is_empty(&bulkstate->buffer_slots_freelist))
+	{
+		smgr_bulk_flush(bulkstate);
+		if (dlist_is_empty(&bulkstate->buffer_slots_freelist))
+			elog(ERROR, "too many bulk write buffers used but not yet written");
+	}
+
+	slot = dlist_head_element(BufferSlot, freelist_node, &bulkstate->buffer_slots_freelist);
+	dlist_pop_head_node(&bulkstate->buffer_slots_freelist);
+
+	return &slot->buffer;
 }
-- 
2.43.2

Re: Vectored I/O in bulk_write.c

Reply via email to