On Thu, Apr 8, 2021 at 12:13 AM Andrey Borodin <x4...@yandex-team.ru> wrote:
> > 7 апр. 2021 г., в 14:44, Andrey Borodin <x4...@yandex-team.ru> написал(а):
> > Maybe instead of fully associative cache with random replacement we could 
> > use 1-associative cache?
> > i.e. each page can reside only in one spcific buffer slot. If there's 
> > something else - evict it.
> > I think this would be as efficient as RR cache. And it's soooo fast.
>
> I thought a bit more and understood that RR is protected from two competing 
> pages in working set, while 1-associative cache is not. So, discard that idea.

It's an interesting idea.  I know that at least one proprietary fork
just puts the whole CLOG in memory for direct indexing, which is what
we'd have here if we said "oh, your xact_buffers setting is so large
I'm just going to use slotno = pageno & mask".

Here's another approach that is a little less exciting than
"tournament RR" (or whatever that should be called; I couldn't find an
established name for it).  This version is just our traditional linear
search, except that it stops at 128, and remembers where to start from
next time (like a sort of Fisher-Price GCLOCK hand).  This feels more
committable to me.  You can argue that all buffers above 128 are bonus
buffers that PostgreSQL 13 didn't have, so the fact that we can no
longer find the globally least recently used page when you set
xact_buffers > 128 doesn't seem too bad to me, as an incremental step
(but to be clear, of course we can do better than this with more work
in later releases).
From 72ebd5052851aa4b9aa281df30b9bf42c0ad5de4 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.mu...@gmail.com>
Date: Thu, 25 Mar 2021 10:11:31 +1300
Subject: [PATCH v16 1/3] Add a buffer mapping table for SLRUs.

Instead of doing a linear search for the buffer holding a given page
number, use a hash table.  This will allow us to increase the size of
these caches.

Reviewed-by: Andrey M. Borodin <x4...@yandex-team.ru>
Discussion: https://postgr.es/m/2BEC2B3F-9B61-4C1D-9FB5-5FAB0F05EF86%40yandex-team.ru
---
 src/backend/access/transam/slru.c | 121 +++++++++++++++++++++++++-----
 src/include/access/slru.h         |   2 +
 2 files changed, 103 insertions(+), 20 deletions(-)

diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c
index 82149ad782..82c61c475b 100644
--- a/src/backend/access/transam/slru.c
+++ b/src/backend/access/transam/slru.c
@@ -58,6 +58,7 @@
 #include "pgstat.h"
 #include "storage/fd.h"
 #include "storage/shmem.h"
+#include "utils/hsearch.h"
 
 #define SlruFileName(ctl, path, seg) \
 	snprintf(path, MAXPGPATH, "%s/%04X", (ctl)->Dir, seg)
@@ -79,6 +80,12 @@ typedef struct SlruWriteAllData
 
 typedef struct SlruWriteAllData *SlruWriteAll;
 
+typedef struct SlruMappingTableEntry
+{
+	int			pageno;
+	int			slotno;
+} SlruMappingTableEntry;
+
 /*
  * Populate a file tag describing a segment file.  We only use the segment
  * number, since we can derive everything else we need by having separate
@@ -146,13 +153,16 @@ static int	SlruSelectLRUPage(SlruCtl ctl, int pageno);
 static bool SlruScanDirCbDeleteCutoff(SlruCtl ctl, char *filename,
 									  int segpage, void *data);
 static void SlruInternalDeleteSegment(SlruCtl ctl, int segno);
+static void	SlruMappingAdd(SlruCtl ctl, int pageno, int slotno);
+static void	SlruMappingRemove(SlruCtl ctl, int pageno);
+static int	SlruMappingFind(SlruCtl ctl, int pageno);
 
 /*
  * Initialization of shared memory
  */
 
-Size
-SimpleLruShmemSize(int nslots, int nlsns)
+static Size
+SimpleLruStructSize(int nslots, int nlsns)
 {
 	Size		sz;
 
@@ -167,10 +177,16 @@ SimpleLruShmemSize(int nslots, int nlsns)
 
 	if (nlsns > 0)
 		sz += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr));	/* group_lsn[] */
-
 	return BUFFERALIGN(sz) + BLCKSZ * nslots;
 }
 
+Size
+SimpleLruShmemSize(int nslots, int nlsns)
+{
+	return SimpleLruStructSize(nslots, nlsns) +
+		hash_estimate_size(nslots, sizeof(SlruMappingTableEntry));
+}
+
 /*
  * Initialize, or attach to, a simple LRU cache in shared memory.
  *
@@ -187,11 +203,14 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
 			  LWLock *ctllock, const char *subdir, int tranche_id,
 			  SyncRequestHandler sync_handler)
 {
+	char		mapping_table_name[SHMEM_INDEX_KEYSIZE];
+	HASHCTL		mapping_table_info;
+	HTAB	   *mapping_table;
 	SlruShared	shared;
 	bool		found;
 
 	shared = (SlruShared) ShmemInitStruct(name,
-										  SimpleLruShmemSize(nslots, nlsns),
+										  SimpleLruStructSize(nslots, nlsns),
 										  &found);
 
 	if (!IsUnderPostmaster)
@@ -258,11 +277,21 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
 	else
 		Assert(found);
 
+	/* Create or find the buffer mapping table. */
+	memset(&mapping_table_info, 0, sizeof(mapping_table_info));
+	mapping_table_info.keysize = sizeof(int);
+	mapping_table_info.entrysize = sizeof(SlruMappingTableEntry);
+	snprintf(mapping_table_name, sizeof(mapping_table_name),
+			 "%s Lookup Table", name);
+	mapping_table = ShmemInitHash(mapping_table_name, nslots, nslots,
+								  &mapping_table_info, HASH_ELEM | HASH_BLOBS);
+
 	/*
 	 * Initialize the unshared control struct, including directory path. We
 	 * assume caller set PagePrecedes.
 	 */
 	ctl->shared = shared;
+	ctl->mapping_table = mapping_table;
 	ctl->sync_handler = sync_handler;
 	strlcpy(ctl->Dir, subdir, sizeof(ctl->Dir));
 }
@@ -289,6 +318,9 @@ SimpleLruZeroPage(SlruCtl ctl, int pageno)
 		   shared->page_number[slotno] == pageno);
 
 	/* Mark the slot as containing this page */
+	if (shared->page_status[slotno] != SLRU_PAGE_EMPTY)
+		SlruMappingRemove(ctl, shared->page_number[slotno]);
+	SlruMappingAdd(ctl, pageno, slotno);
 	shared->page_number[slotno] = pageno;
 	shared->page_status[slotno] = SLRU_PAGE_VALID;
 	shared->page_dirty[slotno] = true;
@@ -362,7 +394,10 @@ SimpleLruWaitIO(SlruCtl ctl, int slotno)
 		{
 			/* indeed, the I/O must have failed */
 			if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS)
+			{
+				SlruMappingRemove(ctl, shared->page_number[slotno]);
 				shared->page_status[slotno] = SLRU_PAGE_EMPTY;
+			}
 			else				/* write_in_progress */
 			{
 				shared->page_status[slotno] = SLRU_PAGE_VALID;
@@ -436,6 +471,9 @@ SimpleLruReadPage(SlruCtl ctl, int pageno, bool write_ok,
 				!shared->page_dirty[slotno]));
 
 		/* Mark the slot read-busy */
+		if (shared->page_status[slotno] != SLRU_PAGE_EMPTY)
+			SlruMappingRemove(ctl, shared->page_number[slotno]);
+		SlruMappingAdd(ctl, pageno, slotno);
 		shared->page_number[slotno] = pageno;
 		shared->page_status[slotno] = SLRU_PAGE_READ_IN_PROGRESS;
 		shared->page_dirty[slotno] = false;
@@ -459,7 +497,13 @@ SimpleLruReadPage(SlruCtl ctl, int pageno, bool write_ok,
 			   shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS &&
 			   !shared->page_dirty[slotno]);
 
-		shared->page_status[slotno] = ok ? SLRU_PAGE_VALID : SLRU_PAGE_EMPTY;
+		if (ok)
+			shared->page_status[slotno] = SLRU_PAGE_VALID;
+		else
+		{
+			SlruMappingRemove(ctl, pageno);
+			shared->page_status[slotno] =  SLRU_PAGE_EMPTY;
+		}
 
 		LWLockRelease(&shared->buffer_locks[slotno].lock);
 
@@ -500,20 +544,20 @@ SimpleLruReadPage_ReadOnly(SlruCtl ctl, int pageno, TransactionId xid)
 	LWLockAcquire(shared->ControlLock, LW_SHARED);
 
 	/* See if page is already in a buffer */
-	for (slotno = 0; slotno < shared->num_slots; slotno++)
+	slotno = SlruMappingFind(ctl, pageno);
+	if (slotno >= 0 &&
+		shared->page_status[slotno] != SLRU_PAGE_READ_IN_PROGRESS)
 	{
-		if (shared->page_number[slotno] == pageno &&
-			shared->page_status[slotno] != SLRU_PAGE_EMPTY &&
-			shared->page_status[slotno] != SLRU_PAGE_READ_IN_PROGRESS)
-		{
-			/* See comments for SlruRecentlyUsed macro */
-			SlruRecentlyUsed(shared, slotno);
+		Assert(shared->page_status[slotno] != SLRU_PAGE_EMPTY);
+		Assert(shared->page_number[slotno] == pageno);
 
-			/* update the stats counter of pages found in the SLRU */
-			pgstat_count_slru_page_hit(shared->slru_stats_idx);
+		/* See comments for SlruRecentlyUsed macro */
+		SlruRecentlyUsed(shared, slotno);
 
-			return slotno;
-		}
+		/* update the stats counter of pages found in the SLRU */
+		pgstat_count_slru_page_hit(shared->slru_stats_idx);
+
+		return slotno;
 	}
 
 	/* No luck, so switch to normal exclusive lock and do regular read */
@@ -1029,11 +1073,12 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno)
 		int			best_invalid_page_number = 0;	/* keep compiler quiet */
 
 		/* See if page already has a buffer assigned */
-		for (slotno = 0; slotno < shared->num_slots; slotno++)
+		slotno = SlruMappingFind(ctl, pageno);
+		if (slotno >= 0)
 		{
-			if (shared->page_number[slotno] == pageno &&
-				shared->page_status[slotno] != SLRU_PAGE_EMPTY)
-				return slotno;
+			Assert(shared->page_number[slotno] == pageno);
+			Assert(shared->page_status[slotno] != SLRU_PAGE_EMPTY);
+			return slotno;
 		}
 
 		/*
@@ -1266,6 +1311,7 @@ restart:;
 		if (shared->page_status[slotno] == SLRU_PAGE_VALID &&
 			!shared->page_dirty[slotno])
 		{
+			SlruMappingRemove(ctl, shared->page_number[slotno]);
 			shared->page_status[slotno] = SLRU_PAGE_EMPTY;
 			continue;
 		}
@@ -1348,6 +1394,7 @@ restart:
 		if (shared->page_status[slotno] == SLRU_PAGE_VALID &&
 			!shared->page_dirty[slotno])
 		{
+			SlruMappingRemove(ctl, shared->page_number[slotno]);
 			shared->page_status[slotno] = SLRU_PAGE_EMPTY;
 			continue;
 		}
@@ -1609,3 +1656,37 @@ SlruSyncFileTag(SlruCtl ctl, const FileTag *ftag, char *path)
 	errno = save_errno;
 	return result;
 }
+
+static int
+SlruMappingFind(SlruCtl ctl, int pageno)
+{
+	SlruMappingTableEntry *mapping;
+
+	mapping = hash_search(ctl->mapping_table, &pageno, HASH_FIND, NULL);
+	if (mapping)
+		return mapping->slotno;
+
+	return -1;
+}
+
+static void
+SlruMappingAdd(SlruCtl ctl, int pageno, int slotno)
+{
+	SlruMappingTableEntry *mapping;
+	bool		found PG_USED_FOR_ASSERTS_ONLY;
+
+	mapping = hash_search(ctl->mapping_table, &pageno, HASH_ENTER, &found);
+	mapping->slotno = slotno;
+
+	Assert(!found);
+}
+
+static void
+SlruMappingRemove(SlruCtl ctl, int pageno)
+{
+	bool		found PG_USED_FOR_ASSERTS_ONLY;
+
+	hash_search(ctl->mapping_table, &pageno, HASH_REMOVE, &found);
+
+	Assert(found);
+}
diff --git a/src/include/access/slru.h b/src/include/access/slru.h
index dd52e8cec7..8aa3efc0ee 100644
--- a/src/include/access/slru.h
+++ b/src/include/access/slru.h
@@ -16,6 +16,7 @@
 #include "access/xlogdefs.h"
 #include "storage/lwlock.h"
 #include "storage/sync.h"
+#include "utils/hsearch.h"
 
 
 /*
@@ -110,6 +111,7 @@ typedef SlruSharedData *SlruShared;
 typedef struct SlruCtlData
 {
 	SlruShared	shared;
+	HTAB	   *mapping_table;
 
 	/*
 	 * Which sync handler function to use when handing sync requests over to
-- 
2.30.1

From 97c5078f661a41617450826173dd252fc4d0c856 Mon Sep 17 00:00:00 2001
From: Andrey Borodin <amboro...@acm.org>
Date: Mon, 15 Feb 2021 21:51:56 +0500
Subject: [PATCH v16 2/3] Make all SLRU buffer sizes configurable.

Provide new GUCs to set the number of buffers, instead of using hard
coded defaults.

Remove the limits on xact_buffers and commit_ts_buffers.  The default
sizes for those caches are ~0.2% and ~0.1% of shared_buffers, as before,
but now there is no cap at 128 and 16 buffers respectively (unless
track_commit_timestamp is disabled, in the latter case, then we might as
well keep it tiny).  Sizes much larger than the old limits have been
shown to be useful on modern systems, and an earlier commit replaced a
linear search with a hash table to avoid problems with extreme cases.

Author: Andrey M. Borodin <x4...@yandex-team.ru>
Reviewed-by: Anastasia Lubennikova <a.lubennik...@postgrespro.ru>
Reviewed-by: Tomas Vondra <tomas.von...@2ndquadrant.com>
Reviewed-by: Alexander Korotkov <aekorot...@gmail.com>
Reviewed-by: Gilles Darold <gil...@darold.net>
Reviewed-by: Thomas Munro <thomas.mu...@gmail.com>
Discussion: https://postgr.es/m/2BEC2B3F-9B61-4C1D-9FB5-5FAB0F05EF86%40yandex-team.ru
---
 doc/src/sgml/config.sgml                      | 135 ++++++++++++++++++
 src/backend/access/transam/clog.c             |  23 ++-
 src/backend/access/transam/commit_ts.c        |  10 +-
 src/backend/access/transam/multixact.c        |   8 +-
 src/backend/access/transam/subtrans.c         |   5 +-
 src/backend/commands/async.c                  |   8 +-
 src/backend/storage/lmgr/predicate.c          |   4 +-
 src/backend/utils/init/globals.c              |   8 ++
 src/backend/utils/misc/guc.c                  |  99 +++++++++++++
 src/backend/utils/misc/postgresql.conf.sample |   9 ++
 src/include/access/clog.h                     |  10 ++
 src/include/access/commit_ts.h                |   1 -
 src/include/access/multixact.h                |   4 -
 src/include/access/slru.h                     |   5 +
 src/include/access/subtrans.h                 |   2 -
 src/include/commands/async.h                  |   5 -
 src/include/miscadmin.h                       |   7 +
 src/include/storage/predicate.h               |   4 -
 18 files changed, 301 insertions(+), 46 deletions(-)

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 963824d050..58c46edf62 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1924,6 +1924,141 @@ include_dir 'conf.d'
        </para>
       </listitem>
      </varlistentry>
+     
+    <varlistentry id="guc-multixact-offsets-buffers" xreflabel="multixact_offsets_buffers">
+      <term><varname>multixact_offsets_buffers</varname> (<type>integer</type>)
+      <indexterm>
+       <primary><varname>multixact_offsets_buffers</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        Specifies the amount of shared memory to use to cache the contents
+        of <literal>pg_multixact/offsets</literal> (see
+        <xref linkend="pgdata-contents-table"/>).
+        If this value is specified without units, it is taken as blocks,
+        that is <symbol>BLCKSZ</symbol> bytes, typically 8kB.
+        The default value is <literal>8</literal>.
+        This parameter can only be set at server start.
+       </para>
+      </listitem>
+     </varlistentry>
+
+    <varlistentry id="guc-multixact-members-buffers" xreflabel="multixact_members_buffers">
+      <term><varname>multixact_members_buffers</varname> (<type>integer</type>)
+      <indexterm>
+       <primary><varname>multixact_members_buffers</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        Specifies the amount of shared memory to use to cache the contents
+        of <literal>pg_multixact/members</literal> (see
+        <xref linkend="pgdata-contents-table"/>).
+        If this value is specified without units, it is taken as blocks,
+        that is <symbol>BLCKSZ</symbol> bytes, typically 8kB.
+        The default value is <literal>16</literal>.
+        This parameter can only be set at server start.
+       </para>
+      </listitem>
+     </varlistentry>
+     
+    <varlistentry id="guc-subtrans-buffers" xreflabel="subtrans_buffers">
+      <term><varname>subtrans_buffers</varname> (<type>integer</type>)
+      <indexterm>
+       <primary><varname>subtrans_buffers</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        Specifies the amount of shared memory to use to cache the contents
+        of <literal>pg_subtrans</literal> (see
+        <xref linkend="pgdata-contents-table"/>).
+        If this value is specified without units, it is taken as blocks,
+        that is <symbol>BLCKSZ</symbol> bytes, typically 8kB.
+        The default value is <literal>32</literal>.
+        This parameter can only be set at server start.
+       </para>
+      </listitem>
+     </varlistentry>
+     
+    <varlistentry id="guc-notify-buffers" xreflabel="notify_buffers">
+      <term><varname>notify_buffers</varname> (<type>integer</type>)
+      <indexterm>
+       <primary><varname>notify_buffers</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        Specifies the amount of shared memory to use to cache the contents
+        of <literal>pg_notify</literal> (see
+        <xref linkend="pgdata-contents-table"/>).
+        If this value is specified without units, it is taken as blocks,
+        that is <symbol>BLCKSZ</symbol> bytes, typically 8kB.
+        The default value is <literal>8</literal>.
+        This parameter can only be set at server start.
+       </para>
+      </listitem>
+     </varlistentry>
+     
+    <varlistentry id="guc-serial-buffers" xreflabel="serial_buffers">
+      <term><varname>serial_buffers</varname> (<type>integer</type>)
+      <indexterm>
+       <primary><varname>serial_buffers</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        Specifies the amount of shared memory to use to cache the contents
+        of <literal>pg_serial</literal> (see
+        <xref linkend="pgdata-contents-table"/>).
+        If this value is specified without units, it is taken as blocks,
+        that is <symbol>BLCKSZ</symbol> bytes, typically 8kB.
+        The default value is <literal>16</literal>.
+        This parameter can only be set at server start.
+       </para>
+      </listitem>
+     </varlistentry>
+     
+    <varlistentry id="guc-xact-buffers" xreflabel="xact_buffers">
+      <term><varname>xact_buffers</varname> (<type>integer</type>)
+      <indexterm>
+       <primary><varname>xact_buffers</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        Specifies the amount of shared memory to use to cache the contents
+        of <literal>pg_xact</literal> (see
+        <xref linkend="pgdata-contents-table"/>).
+        If this value is specified without units, it is taken as blocks,
+        that is <symbol>BLCKSZ</symbol> bytes, typically 8kB.
+        The default value is <literal>0</literal>, which requests
+        <varname>shared_buffers</varname> / 512, but not fewer than 4 blocks.
+        This parameter can only be set at server start.
+       </para>
+      </listitem>
+     </varlistentry>
+     
+    <varlistentry id="guc-commit-ts-buffers" xreflabel="commit_ts_buffers">
+      <term><varname>commit_ts_buffers</varname> (<type>integer</type>)
+      <indexterm>
+       <primary><varname>commit_ts_buffers</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        Specifies the amount of shared memory to use to cache the contents of
+        <literal>pg_commit_ts</literal> (see
+        <xref linkend="pgdata-contents-table"/>).
+        If this value is specified without units, it is taken as blocks,
+        that is <symbol>BLCKSZ</symbol> bytes, typically 8kB.
+        The default value is <literal>0</literal>, which requests
+        <varname>shared_buffers</varname> / 1024, but not fewer than 4 blocks.
+        This parameter can only be set at server start.
+       </para>
+      </listitem>
+     </varlistentry>
 
      <varlistentry id="guc-max-stack-depth" xreflabel="max_stack_depth">
       <term><varname>max_stack_depth</varname> (<type>integer</type>)
diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c
index 6fa4713fb4..dd2d7a5184 100644
--- a/src/backend/access/transam/clog.c
+++ b/src/backend/access/transam/clog.c
@@ -58,8 +58,8 @@
 
 /* We need two bits per xact, so four xacts fit in a byte */
 #define CLOG_BITS_PER_XACT	2
-#define CLOG_XACTS_PER_BYTE 4
-#define CLOG_XACTS_PER_PAGE (BLCKSZ * CLOG_XACTS_PER_BYTE)
+StaticAssertDecl((CLOG_BITS_PER_XACT * CLOG_XACTS_PER_BYTE) == BITS_PER_BYTE,
+				 "CLOG_BITS_PER_XACT and CLOG_XACTS_PER_BYTE are inconsistent");
 #define CLOG_XACT_BITMASK	((1 << CLOG_BITS_PER_XACT) - 1)
 
 #define TransactionIdToPage(xid)	((xid) / (TransactionId) CLOG_XACTS_PER_PAGE)
@@ -659,23 +659,16 @@ TransactionIdGetStatus(TransactionId xid, XLogRecPtr *lsn)
 /*
  * Number of shared CLOG buffers.
  *
- * On larger multi-processor systems, it is possible to have many CLOG page
- * requests in flight at one time which could lead to disk access for CLOG
- * page if the required page is not found in memory.  Testing revealed that we
- * can get the best performance by having 128 CLOG buffers, more than that it
- * doesn't improve performance.
- *
- * Unconditionally keeping the number of CLOG buffers to 128 did not seem like
- * a good idea, because it would increase the minimum amount of shared memory
- * required to start, which could be a problem for people running very small
- * configurations.  The following formula seems to represent a reasonable
- * compromise: people with very low values for shared_buffers will get fewer
- * CLOG buffers as well, and everyone else will get 128.
+ * By default, we'll use 2MB for every 1GB of shared buffers, up to the
+ * theoretical maximum useful value, but always at least 4 buffers.
  */
 Size
 CLOGShmemBuffers(void)
 {
-	return Min(128, Max(4, NBuffers / 512));
+	/* Use configured value if provided. */
+	if (xact_buffers > 0)
+		return Max(4, xact_buffers);
+	return Min(CLOG_MAX_ALLOWED_BUFFERS, Max(4, NBuffers / 512));
 }
 
 /*
diff --git a/src/backend/access/transam/commit_ts.c b/src/backend/access/transam/commit_ts.c
index 268bdba339..729a4b9212 100644
--- a/src/backend/access/transam/commit_ts.c
+++ b/src/backend/access/transam/commit_ts.c
@@ -524,13 +524,17 @@ pg_xact_commit_timestamp_origin(PG_FUNCTION_ARGS)
 /*
  * Number of shared CommitTS buffers.
  *
- * We use a very similar logic as for the number of CLOG buffers; see comments
- * in CLOGShmemBuffers.
+ * By default, we'll use 1MB for every 1GB of shared buffers, up to the
+ * maximum value that slru.c will allow, but always at least 4 buffers.
  */
 Size
 CommitTsShmemBuffers(void)
 {
-	return Min(16, Max(4, NBuffers / 1024));
+	/* Use configured value if provided. */
+	if (commit_ts_buffers > 0)
+		return Max(4, commit_ts_buffers);
+	return Min(track_commit_timestamp ? SLRU_MAX_ALLOWED_BUFFERS : 16,
+			   Max(4, NBuffers / 1024));
 }
 
 /*
diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index 1f9f1a1fa1..21787765e2 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -1831,8 +1831,8 @@ MultiXactShmemSize(void)
 			 mul_size(sizeof(MultiXactId) * 2, MaxOldestSlot))
 
 	size = SHARED_MULTIXACT_STATE_SIZE;
-	size = add_size(size, SimpleLruShmemSize(NUM_MULTIXACTOFFSET_BUFFERS, 0));
-	size = add_size(size, SimpleLruShmemSize(NUM_MULTIXACTMEMBER_BUFFERS, 0));
+	size = add_size(size, SimpleLruShmemSize(multixact_offsets_buffers, 0));
+	size = add_size(size, SimpleLruShmemSize(multixact_members_buffers, 0));
 
 	return size;
 }
@@ -1848,13 +1848,13 @@ MultiXactShmemInit(void)
 	MultiXactMemberCtl->PagePrecedes = MultiXactMemberPagePrecedes;
 
 	SimpleLruInit(MultiXactOffsetCtl,
-				  "MultiXactOffset", NUM_MULTIXACTOFFSET_BUFFERS, 0,
+				  "MultiXactOffset", multixact_offsets_buffers, 0,
 				  MultiXactOffsetSLRULock, "pg_multixact/offsets",
 				  LWTRANCHE_MULTIXACTOFFSET_BUFFER,
 				  SYNC_HANDLER_MULTIXACT_OFFSET);
 	SlruPagePrecedesUnitTests(MultiXactOffsetCtl, MULTIXACT_OFFSETS_PER_PAGE);
 	SimpleLruInit(MultiXactMemberCtl,
-				  "MultiXactMember", NUM_MULTIXACTMEMBER_BUFFERS, 0,
+				  "MultiXactMember", multixact_members_buffers, 0,
 				  MultiXactMemberSLRULock, "pg_multixact/members",
 				  LWTRANCHE_MULTIXACTMEMBER_BUFFER,
 				  SYNC_HANDLER_MULTIXACT_MEMBER);
diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c
index 6a8e521f89..785f2520fd 100644
--- a/src/backend/access/transam/subtrans.c
+++ b/src/backend/access/transam/subtrans.c
@@ -31,6 +31,7 @@
 #include "access/slru.h"
 #include "access/subtrans.h"
 #include "access/transam.h"
+#include "miscadmin.h"
 #include "pg_trace.h"
 #include "utils/snapmgr.h"
 
@@ -184,14 +185,14 @@ SubTransGetTopmostTransaction(TransactionId xid)
 Size
 SUBTRANSShmemSize(void)
 {
-	return SimpleLruShmemSize(NUM_SUBTRANS_BUFFERS, 0);
+	return SimpleLruShmemSize(subtrans_buffers, 0);
 }
 
 void
 SUBTRANSShmemInit(void)
 {
 	SubTransCtl->PagePrecedes = SubTransPagePrecedes;
-	SimpleLruInit(SubTransCtl, "Subtrans", NUM_SUBTRANS_BUFFERS, 0,
+	SimpleLruInit(SubTransCtl, "Subtrans", subtrans_buffers, 0,
 				  SubtransSLRULock, "pg_subtrans",
 				  LWTRANCHE_SUBTRANS_BUFFER, SYNC_HANDLER_NONE);
 	SlruPagePrecedesUnitTests(SubTransCtl, SUBTRANS_XACTS_PER_PAGE);
diff --git a/src/backend/commands/async.c b/src/backend/commands/async.c
index 4b16fb5682..de17f52cd7 100644
--- a/src/backend/commands/async.c
+++ b/src/backend/commands/async.c
@@ -107,7 +107,7 @@
  * frontend during startup.)  The above design guarantees that notifies from
  * other backends will never be missed by ignoring self-notifies.
  *
- * The amount of shared memory used for notify management (NUM_NOTIFY_BUFFERS)
+ * The amount of shared memory used for notify management (notify_buffers)
  * can be varied without affecting anything but performance.  The maximum
  * amount of notification data that can be queued at one time is determined
  * by slru.c's wraparound limit; see QUEUE_MAX_PAGE below.
@@ -225,7 +225,7 @@ typedef struct QueuePosition
  *
  * Resist the temptation to make this really large.  While that would save
  * work in some places, it would add cost in others.  In particular, this
- * should likely be less than NUM_NOTIFY_BUFFERS, to ensure that backends
+ * should likely be less than notify_buffers, to ensure that backends
  * catch up before the pages they'll need to read fall out of SLRU cache.
  */
 #define QUEUE_CLEANUP_DELAY 4
@@ -514,7 +514,7 @@ AsyncShmemSize(void)
 	size = mul_size(MaxBackends + 1, sizeof(QueueBackendStatus));
 	size = add_size(size, offsetof(AsyncQueueControl, backend));
 
-	size = add_size(size, SimpleLruShmemSize(NUM_NOTIFY_BUFFERS, 0));
+	size = add_size(size, SimpleLruShmemSize(notify_buffers, 0));
 
 	return size;
 }
@@ -562,7 +562,7 @@ AsyncShmemInit(void)
 	 * Set up SLRU management of the pg_notify data.
 	 */
 	NotifyCtl->PagePrecedes = asyncQueuePagePrecedes;
-	SimpleLruInit(NotifyCtl, "Notify", NUM_NOTIFY_BUFFERS, 0,
+	SimpleLruInit(NotifyCtl, "Notify", notify_buffers, 0,
 				  NotifySLRULock, "pg_notify", LWTRANCHE_NOTIFY_BUFFER,
 				  SYNC_HANDLER_NONE);
 
diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c
index d493aeef0f..b1f4f1651d 100644
--- a/src/backend/storage/lmgr/predicate.c
+++ b/src/backend/storage/lmgr/predicate.c
@@ -872,7 +872,7 @@ SerialInit(void)
 	 */
 	SerialSlruCtl->PagePrecedes = SerialPagePrecedesLogically;
 	SimpleLruInit(SerialSlruCtl, "Serial",
-				  NUM_SERIAL_BUFFERS, 0, SerialSLRULock, "pg_serial",
+				  serial_buffers, 0, SerialSLRULock, "pg_serial",
 				  LWTRANCHE_SERIAL_BUFFER, SYNC_HANDLER_NONE);
 #ifdef USE_ASSERT_CHECKING
 	SerialPagePrecedesLogicallyUnitTests();
@@ -1395,7 +1395,7 @@ PredicateLockShmemSize(void)
 
 	/* Shared memory structures for SLRU tracking of old committed xids. */
 	size = add_size(size, sizeof(SerialControlData));
-	size = add_size(size, SimpleLruShmemSize(NUM_SERIAL_BUFFERS, 0));
+	size = add_size(size, SimpleLruShmemSize(serial_buffers, 0));
 
 	return size;
 }
diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c
index 381d9e548d..c83151d5ab 100644
--- a/src/backend/utils/init/globals.c
+++ b/src/backend/utils/init/globals.c
@@ -150,3 +150,11 @@ int64		VacuumPageDirty = 0;
 
 int			VacuumCostBalance = 0;	/* working state for vacuum */
 bool		VacuumCostActive = false;
+
+int			multixact_offsets_buffers = 8;
+int			multixact_members_buffers = 16;
+int			subtrans_buffers = 32;
+int			notify_buffers = 8;
+int			serial_buffers = 16;
+int			xact_buffers = 0;
+int			commit_ts_buffers = 0;
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index bee976bae8..250cef80bd 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -32,9 +32,11 @@
 #endif
 #include <unistd.h>
 
+#include "access/clog.h"
 #include "access/commit_ts.h"
 #include "access/gin.h"
 #include "access/rmgr.h"
+#include "access/slru.h"
 #include "access/tableam.h"
 #include "access/toast_compression.h"
 #include "access/transam.h"
@@ -200,6 +202,8 @@ static const char *show_tcp_keepalives_idle(void);
 static const char *show_tcp_keepalives_interval(void);
 static const char *show_tcp_keepalives_count(void);
 static const char *show_tcp_user_timeout(void);
+static const char *show_xact_buffers(void);
+static const char *show_commit_ts_buffers(void);
 static bool check_maxconnections(int *newval, void **extra, GucSource source);
 static bool check_max_worker_processes(int *newval, void **extra, GucSource source);
 static bool check_autovacuum_max_workers(int *newval, void **extra, GucSource source);
@@ -2340,6 +2344,83 @@ static struct config_int ConfigureNamesInt[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"multixact_offsets_buffers", PGC_POSTMASTER, RESOURCES_MEM,
+			gettext_noop("Sets the number of shared memory buffers used for the MultiXact offset SLRU cache."),
+			NULL,
+			GUC_UNIT_BLOCKS
+		},
+		&multixact_offsets_buffers,
+		8, 2, SLRU_MAX_ALLOWED_BUFFERS,
+		NULL, NULL, NULL
+	},
+
+	{
+		{"multixact_members_buffers", PGC_POSTMASTER, RESOURCES_MEM,
+			gettext_noop("Sets the number of shared memory buffers used for the MultiXact member SLRU cache."),
+			NULL,
+			GUC_UNIT_BLOCKS
+		},
+		&multixact_members_buffers,
+		16, 2, SLRU_MAX_ALLOWED_BUFFERS,
+		NULL, NULL, NULL
+	},
+
+	{
+		{"subtrans_buffers", PGC_POSTMASTER, RESOURCES_MEM,
+			gettext_noop("Sets the number of shared memory buffers used for the sub-transaction SLRU cache."),
+			NULL,
+			GUC_UNIT_BLOCKS
+		},
+		&subtrans_buffers,
+		32, 2, SLRU_MAX_ALLOWED_BUFFERS,
+		NULL, NULL, NULL
+	},
+
+	{
+		{"notify_buffers", PGC_POSTMASTER, RESOURCES_MEM,
+			gettext_noop("Sets the number of shared memory buffers used for the NOTIFY message SLRU cache."),
+			NULL,
+			GUC_UNIT_BLOCKS
+		},
+		&notify_buffers,
+		8, 2, SLRU_MAX_ALLOWED_BUFFERS,
+		NULL, NULL, NULL
+	},
+
+	{
+		{"serial_buffers", PGC_POSTMASTER, RESOURCES_MEM,
+			gettext_noop("Sets the number of shared memory buffers used for the serializable transaction SLRU cache."),
+			NULL,
+			GUC_UNIT_BLOCKS
+		},
+		&serial_buffers,
+		16, 2, SLRU_MAX_ALLOWED_BUFFERS,
+		NULL, NULL, NULL
+	},
+
+	{
+		{"xact_buffers", PGC_POSTMASTER, RESOURCES_MEM,
+			gettext_noop("Sets the number of shared memory buffers used for the transaction status SLRU cache."),
+			NULL,
+			GUC_UNIT_BLOCKS
+		},
+		&xact_buffers,
+		0, 0, CLOG_MAX_ALLOWED_BUFFERS,
+		NULL, NULL, show_xact_buffers
+	},
+
+	{
+		{"commit_ts_buffers", PGC_POSTMASTER, RESOURCES_MEM,
+			gettext_noop("Sets the size of the dedicated buffer pool used for the commit timestamp SLRU cache."),
+			NULL,
+			GUC_UNIT_BLOCKS
+		},
+		&commit_ts_buffers,
+		0, 0, SLRU_MAX_ALLOWED_BUFFERS,
+		NULL, NULL, show_commit_ts_buffers
+	},
+
 	{
 		{"temp_buffers", PGC_USERSET, RESOURCES_MEM,
 			gettext_noop("Sets the maximum number of temporary buffers used by each session."),
@@ -11959,6 +12040,24 @@ show_tcp_user_timeout(void)
 	return nbuf;
 }
 
+static const char *
+show_xact_buffers(void)
+{
+	static char nbuf[16];
+
+	snprintf(nbuf, sizeof(nbuf), "%zu", CLOGShmemBuffers());
+	return nbuf;
+}
+
+static const char *
+show_commit_ts_buffers(void)
+{
+	static char nbuf[16];
+
+	snprintf(nbuf, sizeof(nbuf), "%zu", CommitTsShmemBuffers());
+	return nbuf;
+}
+
 static bool
 check_maxconnections(int *newval, void **extra, GucSource source)
 {
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index ff9fa006fe..7e14df3b51 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -190,6 +190,15 @@
 					# (change requires restart)
 #backend_flush_after = 0		# measured in pages, 0 disables
 
+# - SLRU Buffers (change requires restart) -
+
+#xact_buffers = 0			# memory for pg_xact (0 = auto)
+#subtrans_buffers = 32			# memory for pg_subtrans
+#multixact_offsets_buffers = 8		# memory for pg_multixact/offsets
+#multixact_members_buffers = 16		# memory for pg_multixact/members
+#notify_buffers = 8			# memory for pg_notify
+#serial_buffers = 16			# memory for pg_serial
+#commit_ts_buffers = 0			# memory for pg_commit_ts (0 = auto)
 
 #------------------------------------------------------------------------------
 # WRITE-AHEAD LOG
diff --git a/src/include/access/clog.h b/src/include/access/clog.h
index 39b8e4afa8..739a292f7f 100644
--- a/src/include/access/clog.h
+++ b/src/include/access/clog.h
@@ -15,6 +15,16 @@
 #include "storage/sync.h"
 #include "lib/stringinfo.h"
 
+/*
+ * Don't allow xact_buffers to be set higher than could possibly be useful or
+ * than SLRU would allow.
+ */
+#define CLOG_XACTS_PER_BYTE 4
+#define CLOG_XACTS_PER_PAGE (BLCKSZ * CLOG_XACTS_PER_BYTE)
+#define CLOG_MAX_ALLOWED_BUFFERS \
+	Min(SLRU_MAX_ALLOWED_BUFFERS, \
+		(((MaxTransactionId / 2) + (CLOG_XACTS_PER_PAGE - 1)) / CLOG_XACTS_PER_PAGE))
+
 /*
  * Possible transaction statuses --- note that all-zeroes is the initial
  * state.
diff --git a/src/include/access/commit_ts.h b/src/include/access/commit_ts.h
index 750369104a..e4cf988609 100644
--- a/src/include/access/commit_ts.h
+++ b/src/include/access/commit_ts.h
@@ -17,7 +17,6 @@
 #include "storage/sync.h"
 #include "utils/guc.h"
 
-
 extern PGDLLIMPORT bool track_commit_timestamp;
 
 extern bool check_track_commit_timestamp(bool *newval, void **extra,
diff --git a/src/include/access/multixact.h b/src/include/access/multixact.h
index 4bbb035eae..97c0a46376 100644
--- a/src/include/access/multixact.h
+++ b/src/include/access/multixact.h
@@ -29,10 +29,6 @@
 
 #define MaxMultiXactOffset	((MultiXactOffset) 0xFFFFFFFF)
 
-/* Number of SLRU buffers to use for multixact */
-#define NUM_MULTIXACTOFFSET_BUFFERS		8
-#define NUM_MULTIXACTMEMBER_BUFFERS		16
-
 /*
  * Possible multixact lock modes ("status").  The first four modes are for
  * tuple locks (FOR KEY SHARE, FOR SHARE, FOR NO KEY UPDATE, FOR UPDATE); the
diff --git a/src/include/access/slru.h b/src/include/access/slru.h
index 8aa3efc0ee..97ea837646 100644
--- a/src/include/access/slru.h
+++ b/src/include/access/slru.h
@@ -18,6 +18,11 @@
 #include "storage/sync.h"
 #include "utils/hsearch.h"
 
+/*
+ * To avoid overflowing internal arithmetic and the size_t data type, the
+ * number of buffers should not exceed this number.
+ */
+#define SLRU_MAX_ALLOWED_BUFFERS ((1024 * 1024 * 1024) / BLCKSZ)
 
 /*
  * Define SLRU segment size.  A page is the same BLCKSZ as is used everywhere
diff --git a/src/include/access/subtrans.h b/src/include/access/subtrans.h
index d0ab44ae82..64fa86938e 100644
--- a/src/include/access/subtrans.h
+++ b/src/include/access/subtrans.h
@@ -11,8 +11,6 @@
 #ifndef SUBTRANS_H
 #define SUBTRANS_H
 
-/* Number of SLRU buffers to use for subtrans */
-#define NUM_SUBTRANS_BUFFERS	32
 
 extern void SubTransSetParent(TransactionId xid, TransactionId parent);
 extern TransactionId SubTransGetParent(TransactionId xid);
diff --git a/src/include/commands/async.h b/src/include/commands/async.h
index 9217f66b91..fa831e3721 100644
--- a/src/include/commands/async.h
+++ b/src/include/commands/async.h
@@ -15,11 +15,6 @@
 
 #include <signal.h>
 
-/*
- * The number of SLRU page buffers we use for the notification queue.
- */
-#define NUM_NOTIFY_BUFFERS	8
-
 extern bool Trace_notify;
 extern volatile sig_atomic_t notifyInterruptPending;
 
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index 95202d37af..495c1bf901 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -164,6 +164,13 @@ extern PGDLLIMPORT int MaxBackends;
 extern PGDLLIMPORT int MaxConnections;
 extern PGDLLIMPORT int max_worker_processes;
 extern PGDLLIMPORT int max_parallel_workers;
+extern PGDLLIMPORT int multixact_offsets_buffers;
+extern PGDLLIMPORT int multixact_members_buffers;
+extern PGDLLIMPORT int subtrans_buffers;
+extern PGDLLIMPORT int notify_buffers;
+extern PGDLLIMPORT int serial_buffers;
+extern PGDLLIMPORT int xact_buffers;
+extern PGDLLIMPORT int commit_ts_buffers;
 
 extern PGDLLIMPORT int MyProcPid;
 extern PGDLLIMPORT pg_time_t MyStartTime;
diff --git a/src/include/storage/predicate.h b/src/include/storage/predicate.h
index 152b698611..c72779bd88 100644
--- a/src/include/storage/predicate.h
+++ b/src/include/storage/predicate.h
@@ -26,10 +26,6 @@ extern int	max_predicate_locks_per_xact;
 extern int	max_predicate_locks_per_relation;
 extern int	max_predicate_locks_per_page;
 
-
-/* Number of SLRU buffers to use for Serial SLRU */
-#define NUM_SERIAL_BUFFERS		16
-
 /*
  * A handle used for sharing SERIALIZABLEXACT objects between the participants
  * in a parallel query.
-- 
2.30.1

From 79c15f73a21cfb67cb6157fc5b6822bcdc696383 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.mu...@gmail.com>
Date: Thu, 8 Apr 2021 12:18:25 +1200
Subject: [PATCH v16 3/3] Limit SLRU buffer replacement search.

Now that users can configure large SLRU caches, slru.c's simple buffer
replacement algorithm needs some adjustment.  For now, limit its linear
search for the least recently accessed buffer to an arbitrary cap.  This
means it won't find the globally least recently used buffer, just the
least recently used in a given range of buffers.  The cap is initially set
as large as the previous hard-coded search size.

Discussion: https://postgr.es/m/2BEC2B3F-9B61-4C1D-9FB5-5FAB0F05EF86%40yandex-team.ru
---
 src/backend/access/transam/slru.c | 15 ++++++++++++++-
 src/include/access/slru.h         |  3 +++
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c
index 82c61c475b..131f7a48d7 100644
--- a/src/backend/access/transam/slru.c
+++ b/src/backend/access/transam/slru.c
@@ -71,6 +71,12 @@
  */
 #define MAX_WRITEALL_BUFFERS	16
 
+/*
+ * When searching for a buffer to replace, examine at most this many slots per
+ * pass, to avoid holding an exclusive lock for too long.
+ */
+#define MAX_REPLACEMENT_SEARCH	128
+
 typedef struct SlruWriteAllData
 {
 	int			num_files;		/* # files actually open */
@@ -1060,6 +1066,7 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno)
 {
 	SlruShared	shared = ctl->shared;
 
+
 	/* Outer loop handles restart after I/O */
 	for (;;)
 	{
@@ -1071,6 +1078,7 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno)
 		int			bestinvalidslot = 0;	/* keep compiler quiet */
 		int			best_invalid_delta = -1;
 		int			best_invalid_page_number = 0;	/* keep compiler quiet */
+		int			max_search;
 
 		/* See if page already has a buffer assigned */
 		slotno = SlruMappingFind(ctl, pageno);
@@ -1108,12 +1116,17 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno)
 		 * That gets us back on the path to having good data when there are
 		 * multiple pages with the same lru_count.
 		 */
+		max_search = Min(shared->num_slots, MAX_REPLACEMENT_SEARCH);
 		cur_count = (shared->cur_lru_count)++;
-		for (slotno = 0; slotno < shared->num_slots; slotno++)
+		for (int i = 0; i < max_search; ++i)
 		{
 			int			this_delta;
 			int			this_page_number;
 
+			slotno = shared->search_slotno++;
+			if (shared->search_slotno == shared->num_slots)
+				shared->search_slotno = 0;
+
 			if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
 				return slotno;
 			this_delta = cur_count - shared->page_lru_count[slotno];
diff --git a/src/include/access/slru.h b/src/include/access/slru.h
index 97ea837646..fb8a03972d 100644
--- a/src/include/access/slru.h
+++ b/src/include/access/slru.h
@@ -63,6 +63,9 @@ typedef struct SlruSharedData
 	/* Number of buffers managed by this SLRU structure */
 	int			num_slots;
 
+	/* Where to start buffer replacement search. */
+	int			search_slotno;
+
 	/*
 	 * Arrays holding info for each buffer slot.  Page number is undefined
 	 * when status is EMPTY, as is page_lru_count.
-- 
2.30.1

Reply via email to