On Wed, Mar 1, 2023 at 12:06 AM Kuntal Ghosh <kuntalghosh.2...@gmail.com> wrote:
>
> On Tue, Feb 28, 2023 at 10:38 AM Bharath Rupireddy
> <bharath.rupireddyforpostg...@gmail.com> wrote:
> >
> +/*
> + * Guts of XLogReadFromBuffers().
> + *
> + * Read 'count' bytes into 'buf', starting at location 'ptr', from WAL
> + * fetched WAL buffers on timeline 'tli' and return the read bytes.
> + */
> s/fetched WAL buffers/fetched from WAL buffers

Modified that comment a bit and moved it to the XLogReadFromBuffers.

> + else if (nread < nbytes)
> + {
> + /*
> + * We read some of the requested bytes. Continue to read remaining
> + * bytes.
> + */
> + ptr += nread;
> + nbytes -= nread;
> + dst += nread;
> + *read_bytes += nread;
> + }
>
> The 'if' condition should always be true. You can replace the same
> with an assertion instead.

Yeah, added an assert and changed that else if (nread < nbytes) to
else only condition.

> s/Continue to read remaining/Continue to read the remaining

Done.

> The good thing about this patch is that it reduces read IO calls
> without impacting the write performance (at least not that
> noticeable). It also takes us one step forward towards the
> enhancements mentioned in the thread.

Right.

> If performance is a concern, we
> can introduce a GUC to enable/disable this feature.

I didn't see any performance issues from my testing so far with 3
different pgbench cases
https://www.postgresql.org/message-id/CALj2ACWXHP6Ha1BfDB14txm%3DXP272wCbOV00mcPg9c6EXbnp5A%40mail.gmail.com.

While adding a GUC to enable/disable a feature sounds useful, IMHO it
isn't good for the user. Because we already have too many GUCs for the
user and we may not want all features to be defensive and add their
own GUCs. If at all, any bugs arise due to some corner-case we missed
to count in, we can surely help fix them. Having said this, I'm open
to suggestions here.

Please find the attached v5 patch set for further review.

--
Bharath Rupireddy
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com
From 121f98633541c340a89dc628cc3f02711abdee02 Mon Sep 17 00:00:00 2001
From: Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com>
Date: Wed, 1 Mar 2023 08:40:30 +0000
Subject: [PATCH v5] Improve WALRead() to suck data directly from WAL buffers

---
 src/backend/access/transam/xlog.c       | 175 ++++++++++++++++++++++++
 src/backend/access/transam/xlogreader.c |  45 +++++-
 src/include/access/xlog.h               |   6 +
 3 files changed, 224 insertions(+), 2 deletions(-)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index f9f0f6db8d..98bd588ea0 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -689,6 +689,10 @@ static bool ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos,
 							  XLogRecPtr *PrevPtr);
 static XLogRecPtr WaitXLogInsertionsToFinish(XLogRecPtr upto);
 static char *GetXLogBuffer(XLogRecPtr ptr, TimeLineID tli);
+static Size XLogReadFromBuffersGuts(XLogRecPtr ptr,
+									TimeLineID tli,
+									Size count,
+									char *page);
 static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos);
 static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos);
 static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr);
@@ -1639,6 +1643,177 @@ GetXLogBuffer(XLogRecPtr ptr, TimeLineID tli)
 	return cachedPos + ptr % XLOG_BLCKSZ;
 }
 
+/*
+ * Guts of XLogReadFromBuffers().
+ */
+static Size
+XLogReadFromBuffersGuts(XLogRecPtr ptr,
+						TimeLineID tli,
+						Size count,
+						char *buf)
+{
+	XLogRecPtr	expectedEndPtr;
+	XLogRecPtr	endptr;
+	int 	idx;
+	Size	nread = 0;
+
+	idx = XLogRecPtrToBufIdx(ptr);
+	expectedEndPtr = ptr;
+	expectedEndPtr += XLOG_BLCKSZ - ptr % XLOG_BLCKSZ;
+
+	/*
+	 * Holding WALBufMappingLock ensures inserters don't overwrite this value
+	 * while we are reading it. We try to acquire it in shared mode so that the
+	 * concurrent WAL readers are also allowed. We try to do as less work as
+	 * possible while holding the lock to not impact concurrent WAL writers
+	 * much. We quickly exit to not cause any contention, if the lock isn't
+	 * immediately available.
+	 */
+	if (!LWLockConditionalAcquire(WALBufMappingLock, LW_SHARED))
+		return nread;
+
+	endptr = XLogCtl->xlblocks[idx];
+
+	if (expectedEndPtr == endptr)
+	{
+		char	*page;
+		char    *data;
+		XLogPageHeader	phdr;
+
+		/*
+		 * We found WAL buffer page containing given XLogRecPtr. Get starting
+		 * address of the page and a pointer to the right location of given
+		 * XLogRecPtr in that page.
+		 */
+		page = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ;
+		data = page + ptr % XLOG_BLCKSZ;
+
+		/* Read what is wanted, not the whole page. */
+		if ((data + count) <= (page + XLOG_BLCKSZ))
+		{
+			/* All the bytes are in one page. */
+			memcpy(buf, data, count);
+			nread = count;
+		}
+		else
+		{
+			Size nremaining;
+
+			/*
+			 * All the bytes are not in one page. Compute remaining bytes on
+			 * the current page, copy them over to output buffer.
+			 */
+			nremaining = XLOG_BLCKSZ - (data - page);
+			memcpy(buf, data, nremaining);
+			nread = nremaining;
+		}
+
+		/*
+		 * Release the lock as early as possible to avoid creating any possible
+		 * contention.
+		 */
+		LWLockRelease(WALBufMappingLock);
+
+		/*
+		 * The fact that we acquire WALBufMappingLock while reading the WAL
+		 * buffer page itself guarantees that no one else initializes it or
+		 * makes it ready for next use in AdvanceXLInsertBuffer().
+		 *
+		 * However, we perform basic page header checks for ensuring that we
+		 * are not reading a page that just got initialized. Callers will
+		 * anyway perform extensive page-level and record-level checks.
+		 */
+		phdr = (XLogPageHeader) page;
+
+		if (!(phdr->xlp_magic == XLOG_PAGE_MAGIC &&
+			  phdr->xlp_pageaddr == (ptr - (ptr % XLOG_BLCKSZ)) &&
+			  phdr->xlp_tli == tli))
+		{
+			/*
+			 * WAL buffer page doesn't look valid, so return as if nothing was
+			 * read.
+			 */
+			nread = 0;
+		}
+	}
+	else
+	{
+		/* Requested WAL isn't available in WAL buffers. */
+		LWLockRelease(WALBufMappingLock);
+	}
+
+	/* We never read more than what the caller has asked for. */
+	Assert(nread <= count);
+
+	return nread;
+}
+
+/*
+ * Read WAL from WAL buffers.
+ *
+ * Read 'count' bytes of WAL from WAL buffers into 'buf', starting at location
+ * 'startptr', on timeline 'tli' and set the read bytes to 'read_bytes'.
+ *
+ * Note that this function reads as much as it can from WAL buffers, meaning,
+ * it may not read all the requested 'count' bytes. The caller must be aware of
+ * this and deal with it.
+ */
+void
+XLogReadFromBuffers(XLogRecPtr startptr,
+					TimeLineID tli,
+					Size count,
+					char *buf,
+					Size *read_bytes)
+{
+	XLogRecPtr	ptr;
+	char    *dst;
+	Size    nbytes;
+
+	Assert(!XLogRecPtrIsInvalid(startptr));
+	Assert(count > 0);
+	Assert(startptr <= GetFlushRecPtr(NULL));
+	Assert(!RecoveryInProgress());
+
+	ptr = startptr;
+	nbytes = count;
+	dst = buf;
+	*read_bytes = 0;
+
+	while (nbytes > 0)
+	{
+		Size	nread;
+
+		nread = XLogReadFromBuffersGuts(ptr, tli, nbytes, dst);
+
+		if (nread <= 0)
+		{
+			/* We read nothing. */
+			break;
+		}
+		else if (nread == nbytes)
+		{
+			/* We read all the requested bytes. */
+			*read_bytes += nread;
+			break;
+		}
+		else
+		{
+			/*
+			 * We read some of the requested bytes. Continue to read the
+			 * remaining bytes.
+			 */
+			Assert(nread < nbytes);
+			ptr += nread;
+			nbytes -= nread;
+			dst += nread;
+			*read_bytes += nread;
+		}
+	}
+
+	elog(DEBUG1, "read %zu bytes out of %zu bytes from WAL buffers for given LSN %X/%X, Timeline ID %u",
+		 *read_bytes, count, LSN_FORMAT_ARGS(startptr), tli);
+}
+
 /*
  * Converts a "usable byte position" to XLogRecPtr. A usable byte position
  * is the position starting from the beginning of WAL, excluding all WAL
diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c
index aa6c929477..723379b7d9 100644
--- a/src/backend/access/transam/xlogreader.c
+++ b/src/backend/access/transam/xlogreader.c
@@ -1485,8 +1485,7 @@ err:
  * Returns true if succeeded, false if an error occurs, in which case
  * 'errinfo' receives error details.
  *
- * XXX probably this should be improved to suck data directly from the
- * WAL buffers when possible.
+ * When possible, this function reads data directly from WAL buffers.
  */
 bool
 WALRead(XLogReaderState *state,
@@ -1497,6 +1496,48 @@ WALRead(XLogReaderState *state,
 	XLogRecPtr	recptr;
 	Size		nbytes;
 
+#ifndef FRONTEND
+	/* Frontend tools have no idea of WAL buffers. */
+	Size        read_bytes;
+
+	/*
+	 * When possible, read WAL from WAL buffers. We skip this step and continue
+	 * the usual way, that is to read from WAL file, either when server is in
+	 * recovery (standby mode, archive or crash recovery), in which case the
+	 * WAL buffers are not used or when the server is inserting in a different
+	 * timeline from that of the timeline that we're trying to read WAL from.
+	 */
+	if (!RecoveryInProgress() &&
+		tli == GetWALInsertionTimeLine())
+	{
+		XLogReadFromBuffers(startptr, tli, count, buf, &read_bytes);
+
+		/*
+		 * Check if we have read fully (hit), partially (partial hit) or
+		 * nothing (miss) from WAL buffers. If we have read either partially or
+		 * nothing, then continue to read the remaining bytes the usual way,
+		 * that is, read from WAL file.
+		 */
+		if (count == read_bytes)
+		{
+			/* Buffer hit, so return. */
+			return true;
+		}
+		else if (read_bytes > 0 && count > read_bytes)
+		{
+			/*
+			 * Buffer partial hit, so reset the state to count the read bytes
+			 * and continue.
+			 */
+			buf += read_bytes;
+			startptr += read_bytes;
+			count -= read_bytes;
+		}
+
+		/* Buffer miss i.e., read_bytes = 0, so continue */
+	}
+#endif	/* FRONTEND */
+
 	p = buf;
 	recptr = startptr;
 	nbytes = count;
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index cfe5409738..c9941aa001 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -247,6 +247,12 @@ extern XLogRecPtr GetLastImportantRecPtr(void);
 
 extern void SetWalWriterSleeping(bool sleeping);
 
+extern void XLogReadFromBuffers(XLogRecPtr startptr,
+								TimeLineID tli,
+								Size count,
+								char *buf,
+								Size *read_bytes);
+
 /*
  * Routines used by xlogrecovery.c to call back into xlog.c during recovery.
  */
-- 
2.34.1

From a889a11b814a902c0b4292f00a360acdab2c7c1f Mon Sep 17 00:00:00 2001
From: Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com>
Date: Wed, 1 Mar 2023 08:40:57 +0000
Subject: [PATCH v5] Add test module for verifying WAL read from WAL buffers

---
 src/test/modules/Makefile                     |  1 +
 src/test/modules/meson.build                  |  1 +
 .../test_wal_read_from_buffers/.gitignore     |  4 ++
 .../test_wal_read_from_buffers/Makefile       | 23 +++++++++
 .../test_wal_read_from_buffers/meson.build    | 36 +++++++++++++
 .../test_wal_read_from_buffers/t/001_basic.pl | 44 ++++++++++++++++
 .../test_wal_read_from_buffers--1.0.sql       | 16 ++++++
 .../test_wal_read_from_buffers.c              | 51 +++++++++++++++++++
 .../test_wal_read_from_buffers.control        |  4 ++
 9 files changed, 180 insertions(+)
 create mode 100644 src/test/modules/test_wal_read_from_buffers/.gitignore
 create mode 100644 src/test/modules/test_wal_read_from_buffers/Makefile
 create mode 100644 src/test/modules/test_wal_read_from_buffers/meson.build
 create mode 100644 src/test/modules/test_wal_read_from_buffers/t/001_basic.pl
 create mode 100644 src/test/modules/test_wal_read_from_buffers/test_wal_read_from_buffers--1.0.sql
 create mode 100644 src/test/modules/test_wal_read_from_buffers/test_wal_read_from_buffers.c
 create mode 100644 src/test/modules/test_wal_read_from_buffers/test_wal_read_from_buffers.control

diff --git a/src/test/modules/Makefile b/src/test/modules/Makefile
index c629cbe383..ea33361f69 100644
--- a/src/test/modules/Makefile
+++ b/src/test/modules/Makefile
@@ -33,6 +33,7 @@ SUBDIRS = \
 		  test_rls_hooks \
 		  test_shm_mq \
 		  test_slru \
+		  test_wal_read_from_buffers \
 		  unsafe_tests \
 		  worker_spi
 
diff --git a/src/test/modules/meson.build b/src/test/modules/meson.build
index 1baa6b558d..e3ffd3538d 100644
--- a/src/test/modules/meson.build
+++ b/src/test/modules/meson.build
@@ -29,5 +29,6 @@ subdir('test_regex')
 subdir('test_rls_hooks')
 subdir('test_shm_mq')
 subdir('test_slru')
+subdir('test_wal_read_from_buffers')
 subdir('unsafe_tests')
 subdir('worker_spi')
diff --git a/src/test/modules/test_wal_read_from_buffers/.gitignore b/src/test/modules/test_wal_read_from_buffers/.gitignore
new file mode 100644
index 0000000000..5dcb3ff972
--- /dev/null
+++ b/src/test/modules/test_wal_read_from_buffers/.gitignore
@@ -0,0 +1,4 @@
+# Generated subdirectories
+/log/
+/results/
+/tmp_check/
diff --git a/src/test/modules/test_wal_read_from_buffers/Makefile b/src/test/modules/test_wal_read_from_buffers/Makefile
new file mode 100644
index 0000000000..7a09533ec7
--- /dev/null
+++ b/src/test/modules/test_wal_read_from_buffers/Makefile
@@ -0,0 +1,23 @@
+# src/test/modules/test_wal_read_from_buffers/Makefile
+
+MODULE_big = test_wal_read_from_buffers
+OBJS = \
+	$(WIN32RES) \
+	test_wal_read_from_buffers.o
+PGFILEDESC = "test_wal_read_from_buffers - test module to verify that WAL can be read from WAL buffers"
+
+EXTENSION = test_wal_read_from_buffers
+DATA = test_wal_read_from_buffers--1.0.sql
+
+TAP_TESTS = 1
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = src/test/modules/test_wal_read_from_buffers
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/src/test/modules/test_wal_read_from_buffers/meson.build b/src/test/modules/test_wal_read_from_buffers/meson.build
new file mode 100644
index 0000000000..40a36edc07
--- /dev/null
+++ b/src/test/modules/test_wal_read_from_buffers/meson.build
@@ -0,0 +1,36 @@
+# Copyright (c) 2022-2023, PostgreSQL Global Development Group
+
+# FIXME: prevent install during main install, but not during test :/
+
+test_wal_read_from_buffers_sources = files(
+  'test_wal_read_from_buffers.c',
+)
+
+if host_system == 'windows'
+  test_wal_read_from_buffers_sources += rc_lib_gen.process(win32ver_rc, extra_args: [
+    '--NAME', 'test_wal_read_from_buffers',
+    '--FILEDESC', 'test_wal_read_from_buffers - test module to verify that WAL can be read from WAL buffers',])
+endif
+
+test_wal_read_from_buffers = shared_module('test_wal_read_from_buffers',
+  test_wal_read_from_buffers_sources,
+  kwargs: pg_mod_args,
+)
+testprep_targets += test_wal_read_from_buffers
+
+install_data(
+  'test_wal_read_from_buffers.control',
+  'test_wal_read_from_buffers--1.0.sql',
+  kwargs: contrib_data_args,
+)
+
+tests += {
+  'name': 'test_wal_read_from_buffers',
+  'sd': meson.current_source_dir(),
+  'bd': meson.current_build_dir(),
+  'tap': {
+    'tests': [
+      't/001_basic.pl',
+    ],
+  },
+}
diff --git a/src/test/modules/test_wal_read_from_buffers/t/001_basic.pl b/src/test/modules/test_wal_read_from_buffers/t/001_basic.pl
new file mode 100644
index 0000000000..3448e0bed6
--- /dev/null
+++ b/src/test/modules/test_wal_read_from_buffers/t/001_basic.pl
@@ -0,0 +1,44 @@
+# Copyright (c) 2021-2023, PostgreSQL Global Development Group
+
+use strict;
+use warnings;
+
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+my $node = PostgreSQL::Test::Cluster->new('main');
+
+$node->init;
+
+# Ensure nobody interferes with us so that the WAL in WAL buffers don't get
+# overwritten while running tests.
+$node->append_conf(
+	'postgresql.conf', qq(
+autovacuum = off
+checkpoint_timeout = 1h
+wal_writer_delay = 10000ms
+wal_writer_flush_after = 1GB
+));
+$node->start;
+
+# Setup.
+$node->safe_psql('postgres', 'CREATE EXTENSION test_wal_read_from_buffers');
+
+# Get current insert LSN. After this, we generate some WAL which is guranteed
+# to be in WAL buffers as there is no other WAL generating activity is
+# happening on the server. We then verify if we can read the WAL from WAL
+# buffers using this LSN.
+my $lsn =
+  $node->safe_psql('postgres', 'SELECT pg_current_wal_insert_lsn();');
+
+# Generate minimal WAL so that WAL buffers don't get overwritten.
+$node->safe_psql('postgres',
+	"CREATE TABLE t (c int); INSERT INTO t VALUES (1);");
+
+# Check if WAL is successfully read from WAL buffers.
+my $result = $node->safe_psql('postgres',
+	qq{SELECT test_wal_read_from_buffers('$lsn')});
+is($result, 't', "WAL is successfully read from WAL buffers");
+
+done_testing();
diff --git a/src/test/modules/test_wal_read_from_buffers/test_wal_read_from_buffers--1.0.sql b/src/test/modules/test_wal_read_from_buffers/test_wal_read_from_buffers--1.0.sql
new file mode 100644
index 0000000000..8e89910133
--- /dev/null
+++ b/src/test/modules/test_wal_read_from_buffers/test_wal_read_from_buffers--1.0.sql
@@ -0,0 +1,16 @@
+/* src/test/modules/test_wal_read_from_buffers/test_wal_read_from_buffers--1.0.sql */
+
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "CREATE EXTENSION test_wal_read_from_buffers" to load this file. \quit
+
+--
+-- test_wal_read_from_buffers()
+--
+-- Returns true if WAL data at a given LSN can be read from WAL buffers.
+-- Otherwise returns false.
+--
+CREATE FUNCTION test_wal_read_from_buffers(IN lsn pg_lsn,
+    OUT read_from_buffers bool
+)
+AS 'MODULE_PATHNAME', 'test_wal_read_from_buffers'
+LANGUAGE C STRICT PARALLEL UNSAFE;
diff --git a/src/test/modules/test_wal_read_from_buffers/test_wal_read_from_buffers.c b/src/test/modules/test_wal_read_from_buffers/test_wal_read_from_buffers.c
new file mode 100644
index 0000000000..ca8645101a
--- /dev/null
+++ b/src/test/modules/test_wal_read_from_buffers/test_wal_read_from_buffers.c
@@ -0,0 +1,51 @@
+/*--------------------------------------------------------------------------
+ *
+ * test_wal_read_from_buffers.c
+ *		Test code for veryfing WAL read from WAL buffers.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *	src/test/modules/test_wal_read_from_buffers/test_wal_read_from_buffers.c
+ * -------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/xlog.h"
+#include "fmgr.h"
+#include "utils/pg_lsn.h"
+
+PG_MODULE_MAGIC;
+
+/*
+ * SQL function for verifying that WAL data at a given LSN can be read from WAL
+ * buffers. Returns true if read from WAL buffers, otherwise false.
+ */
+PG_FUNCTION_INFO_V1(test_wal_read_from_buffers);
+Datum
+test_wal_read_from_buffers(PG_FUNCTION_ARGS)
+{
+	XLogRecPtr	lsn;
+	Size	read_bytes;
+	TimeLineID	tli;
+	char	data[XLOG_BLCKSZ] = {0};
+
+	lsn = PG_GETARG_LSN(0);
+
+	if (XLogRecPtrIsInvalid(lsn))
+		PG_RETURN_BOOL(false);
+
+	if (RecoveryInProgress())
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("recovery is in progress"),
+				 errhint("WAL control functions cannot be executed during recovery.")));
+
+	tli = GetWALInsertionTimeLine();
+
+	XLogReadFromBuffers(lsn, tli, XLOG_BLCKSZ, data, &read_bytes);
+
+	PG_RETURN_LSN(read_bytes > 0);
+}
diff --git a/src/test/modules/test_wal_read_from_buffers/test_wal_read_from_buffers.control b/src/test/modules/test_wal_read_from_buffers/test_wal_read_from_buffers.control
new file mode 100644
index 0000000000..7852b3e331
--- /dev/null
+++ b/src/test/modules/test_wal_read_from_buffers/test_wal_read_from_buffers.control
@@ -0,0 +1,4 @@
+comment = 'Test code for veryfing WAL read from WAL buffers'
+default_version = '1.0'
+module_pathname = '$libdir/test_wal_read_from_buffers'
+relocatable = true
-- 
2.34.1

Reply via email to