>> 8 апр. 2021 г., в 15:22, Thomas Munro <thomas.mu...@gmail.com> написал(а):
>>
> I have one more idea inspired by CPU caches.
> Let's make SLRU n-associative, where n ~ 8.
> We can divide buffers into "banks", number of banks must be power of 2.
> All banks are of equal size. We choose bank size to approximately satisfy 
> user's configured buffer size.
> Each page can live only within one bank. We use same search and eviction 
> algorithms as we used in SLRU, but we only need to search\evict over 8 
> elements.
> All SLRU data of a single bank will be colocated within at most 2 cache lines.
> 
> I have not come up with a way to avoid the multiplication bank_number * 
> bank_size when the user configures 31337 buffers (or any number that is 
> radically not a power of 2).

We can avoid this multiplication by using gapped memory under the SLRU 
page_statuses, but from my POV the added complexity is not worth the possible 
performance gain.

PFA a rebase of the patchset. I've also added a patch that combines page_number, 
page_status, and page_dirty together so that fewer cache lines are touched.

Best regards, Andrey Borodin.

From ea59d2ebde818ddc2a9111858b3d956cbcc7bff2 Mon Sep 17 00:00:00 2001
From: Andrey Borodin <amboro...@acm.org>
Date: Mon, 15 Feb 2021 21:51:56 +0500
Subject: [PATCH v=18 1/3] Make all SLRU buffer sizes configurable.

Provide new GUCs to set the number of buffers, instead of using hard
coded defaults.

Remove the limits on xact_buffers and commit_ts_buffers.  The default
sizes for those caches are ~0.2% and ~0.1% of shared_buffers, as before,
but now there is no cap at 128 and 16 buffers respectively (unless
track_commit_timestamp is disabled, in the latter case, then we might as
well keep it tiny).  Sizes much larger than the old limits have been
shown to be useful on modern systems, and an earlier commit replaced a
linear search with a hash table to avoid problems with extreme cases.

Author: Andrey M. Borodin <x4...@yandex-team.ru>
Reviewed-by: Anastasia Lubennikova <a.lubennik...@postgrespro.ru>
Reviewed-by: Tomas Vondra <tomas.von...@2ndquadrant.com>
Reviewed-by: Alexander Korotkov <aekorot...@gmail.com>
Reviewed-by: Gilles Darold <gil...@darold.net>
Reviewed-by: Thomas Munro <thomas.mu...@gmail.com>
Discussion: https://postgr.es/m/2BEC2B3F-9B61-4C1D-9FB5-5FAB0F05EF86%40yandex-team.ru
---
 doc/src/sgml/config.sgml                      | 135 ++++++++++++++++++
 src/backend/access/transam/clog.c             |  23 ++-
 src/backend/access/transam/commit_ts.c        |   5 +
 src/backend/access/transam/multixact.c        |   8 +-
 src/backend/access/transam/subtrans.c         |   5 +-
 src/backend/commands/async.c                  |   8 +-
 src/backend/storage/lmgr/predicate.c          |   4 +-
 src/backend/utils/init/globals.c              |   8 ++
 src/backend/utils/misc/guc.c                  |  99 +++++++++++++
 src/backend/utils/misc/postgresql.conf.sample |   9 ++
 src/include/access/clog.h                     |  10 ++
 src/include/access/commit_ts.h                |   1 -
 src/include/access/multixact.h                |   4 -
 src/include/access/slru.h                     |   5 +
 src/include/access/subtrans.h                 |   2 -
 src/include/commands/async.h                  |   5 -
 src/include/miscadmin.h                       |   7 +
 src/include/storage/predicate.h               |   4 -
 18 files changed, 299 insertions(+), 43 deletions(-)

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index afbb6c35e30..57d9696abe8 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1952,6 +1952,141 @@ include_dir 'conf.d'
        </para>
       </listitem>
      </varlistentry>
+     
+    <varlistentry id="guc-multixact-offsets-buffers" xreflabel="multixact_offsets_buffers">
+      <term><varname>multixact_offsets_buffers</varname> (<type>integer</type>)
+      <indexterm>
+       <primary><varname>multixact_offsets_buffers</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        Specifies the amount of shared memory to use to cache the contents
+        of <literal>pg_multixact/offsets</literal> (see
+        <xref linkend="pgdata-contents-table"/>).
+        If this value is specified without units, it is taken as blocks,
+        that is <symbol>BLCKSZ</symbol> bytes, typically 8kB.
+        The default value is <literal>8</literal>.
+        This parameter can only be set at server start.
+       </para>
+      </listitem>
+     </varlistentry>
+
+    <varlistentry id="guc-multixact-members-buffers" xreflabel="multixact_members_buffers">
+      <term><varname>multixact_members_buffers</varname> (<type>integer</type>)
+      <indexterm>
+       <primary><varname>multixact_members_buffers</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        Specifies the amount of shared memory to use to cache the contents
+        of <literal>pg_multixact/members</literal> (see
+        <xref linkend="pgdata-contents-table"/>).
+        If this value is specified without units, it is taken as blocks,
+        that is <symbol>BLCKSZ</symbol> bytes, typically 8kB.
+        The default value is <literal>16</literal>.
+        This parameter can only be set at server start.
+       </para>
+      </listitem>
+     </varlistentry>
+     
+    <varlistentry id="guc-subtrans-buffers" xreflabel="subtrans_buffers">
+      <term><varname>subtrans_buffers</varname> (<type>integer</type>)
+      <indexterm>
+       <primary><varname>subtrans_buffers</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        Specifies the amount of shared memory to use to cache the contents
+        of <literal>pg_subtrans</literal> (see
+        <xref linkend="pgdata-contents-table"/>).
+        If this value is specified without units, it is taken as blocks,
+        that is <symbol>BLCKSZ</symbol> bytes, typically 8kB.
+        The default value is <literal>32</literal>.
+        This parameter can only be set at server start.
+       </para>
+      </listitem>
+     </varlistentry>
+     
+    <varlistentry id="guc-notify-buffers" xreflabel="notify_buffers">
+      <term><varname>notify_buffers</varname> (<type>integer</type>)
+      <indexterm>
+       <primary><varname>notify_buffers</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        Specifies the amount of shared memory to use to cache the contents
+        of <literal>pg_notify</literal> (see
+        <xref linkend="pgdata-contents-table"/>).
+        If this value is specified without units, it is taken as blocks,
+        that is <symbol>BLCKSZ</symbol> bytes, typically 8kB.
+        The default value is <literal>8</literal>.
+        This parameter can only be set at server start.
+       </para>
+      </listitem>
+     </varlistentry>
+     
+    <varlistentry id="guc-serial-buffers" xreflabel="serial_buffers">
+      <term><varname>serial_buffers</varname> (<type>integer</type>)
+      <indexterm>
+       <primary><varname>serial_buffers</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        Specifies the amount of shared memory to use to cache the contents
+        of <literal>pg_serial</literal> (see
+        <xref linkend="pgdata-contents-table"/>).
+        If this value is specified without units, it is taken as blocks,
+        that is <symbol>BLCKSZ</symbol> bytes, typically 8kB.
+        The default value is <literal>16</literal>.
+        This parameter can only be set at server start.
+       </para>
+      </listitem>
+     </varlistentry>
+     
+    <varlistentry id="guc-xact-buffers" xreflabel="xact_buffers">
+      <term><varname>xact_buffers</varname> (<type>integer</type>)
+      <indexterm>
+       <primary><varname>xact_buffers</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        Specifies the amount of shared memory to use to cache the contents
+        of <literal>pg_xact</literal> (see
+        <xref linkend="pgdata-contents-table"/>).
+        If this value is specified without units, it is taken as blocks,
+        that is <symbol>BLCKSZ</symbol> bytes, typically 8kB.
+        The default value is <literal>0</literal>, which requests
+        <varname>shared_buffers</varname> / 512, but not fewer than 4 blocks.
+        This parameter can only be set at server start.
+       </para>
+      </listitem>
+     </varlistentry>
+     
+    <varlistentry id="guc-commit-ts-buffers" xreflabel="commit_ts_buffers">
+      <term><varname>commit_ts_buffers</varname> (<type>integer</type>)
+      <indexterm>
+       <primary><varname>commit_ts_buffers</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        Specifies the amount of memory to use to cache the contents of
+        <literal>pg_commit_ts</literal> (see
+        <xref linkend="pgdata-contents-table"/>).
+        If this value is specified without units, it is taken as blocks,
+        that is <symbol>BLCKSZ</symbol> bytes, typically 8kB.
+        The default value is <literal>0</literal>, which requests
+        <varname>shared_buffers</varname> / 1024, but not fewer than 4 blocks.
+        This parameter can only be set at server start.
+       </para>
+      </listitem>
+     </varlistentry>
 
      <varlistentry id="guc-max-stack-depth" xreflabel="max_stack_depth">
       <term><varname>max_stack_depth</varname> (<type>integer</type>)
diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c
index 3ea16a270a8..ca28ada75fa 100644
--- a/src/backend/access/transam/clog.c
+++ b/src/backend/access/transam/clog.c
@@ -58,8 +58,8 @@
 
 /* We need two bits per xact, so four xacts fit in a byte */
 #define CLOG_BITS_PER_XACT	2
-#define CLOG_XACTS_PER_BYTE 4
-#define CLOG_XACTS_PER_PAGE (BLCKSZ * CLOG_XACTS_PER_BYTE)
+StaticAssertDecl((CLOG_BITS_PER_XACT * CLOG_XACTS_PER_BYTE) == BITS_PER_BYTE,
+				 "CLOG_BITS_PER_XACT and CLOG_XACTS_PER_BYTE are inconsistent");
 #define CLOG_XACT_BITMASK	((1 << CLOG_BITS_PER_XACT) - 1)
 
 #define TransactionIdToPage(xid)	((xid) / (TransactionId) CLOG_XACTS_PER_PAGE)
@@ -664,23 +664,16 @@ TransactionIdGetStatus(TransactionId xid, XLogRecPtr *lsn)
 /*
  * Number of shared CLOG buffers.
  *
- * On larger multi-processor systems, it is possible to have many CLOG page
- * requests in flight at one time which could lead to disk access for CLOG
- * page if the required page is not found in memory.  Testing revealed that we
- * can get the best performance by having 128 CLOG buffers, more than that it
- * doesn't improve performance.
- *
- * Unconditionally keeping the number of CLOG buffers to 128 did not seem like
- * a good idea, because it would increase the minimum amount of shared memory
- * required to start, which could be a problem for people running very small
- * configurations.  The following formula seems to represent a reasonable
- * compromise: people with very low values for shared_buffers will get fewer
- * CLOG buffers as well, and everyone else will get 128.
+ * By default, we'll use 2MB for every 1GB of shared buffers, up to the
+ * theoretical maximum useful value, but always at least 4 buffers.
  */
 Size
 CLOGShmemBuffers(void)
 {
-	return Min(128, Max(4, NBuffers / 512));
+	/* Use configured value if provided. */
+	if (xact_buffers > 0)
+		return Max(4, xact_buffers);
+	return Min(CLOG_MAX_ALLOWED_BUFFERS, Max(4, NBuffers / 512));
 }
 
 /*
diff --git a/src/backend/access/transam/commit_ts.c b/src/backend/access/transam/commit_ts.c
index cbbe19fea83..cb2e0ceb1c3 100644
--- a/src/backend/access/transam/commit_ts.c
+++ b/src/backend/access/transam/commit_ts.c
@@ -511,10 +511,15 @@ pg_xact_commit_timestamp_origin(PG_FUNCTION_ARGS)
  * We use a very similar logic as for the number of CLOG buffers (except we
  * scale up twice as fast with shared buffers, and the maximum is twice as
  * high); see comments in CLOGShmemBuffers.
+ * By default, we'll use 1MB for every 1GB of shared buffers, up to the
+ * maximum value that slru.c will allow, but always at least 4 buffers.
  */
 Size
 CommitTsShmemBuffers(void)
 {
+	/* Use configured value if provided. */
+	if (commit_ts_buffers > 0)
+		return Max(4, commit_ts_buffers);
 	return Min(256, Max(4, NBuffers / 256));
 }
 
diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index e6c70ed0bc2..a29ab4769dc 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -1834,8 +1834,8 @@ MultiXactShmemSize(void)
 			 mul_size(sizeof(MultiXactId) * 2, MaxOldestSlot))
 
 	size = SHARED_MULTIXACT_STATE_SIZE;
-	size = add_size(size, SimpleLruShmemSize(NUM_MULTIXACTOFFSET_BUFFERS, 0));
-	size = add_size(size, SimpleLruShmemSize(NUM_MULTIXACTMEMBER_BUFFERS, 0));
+	size = add_size(size, SimpleLruShmemSize(multixact_offsets_buffers, 0));
+	size = add_size(size, SimpleLruShmemSize(multixact_members_buffers, 0));
 
 	return size;
 }
@@ -1851,13 +1851,13 @@ MultiXactShmemInit(void)
 	MultiXactMemberCtl->PagePrecedes = MultiXactMemberPagePrecedes;
 
 	SimpleLruInit(MultiXactOffsetCtl,
-				  "MultiXactOffset", NUM_MULTIXACTOFFSET_BUFFERS, 0,
+				  "MultiXactOffset", multixact_offsets_buffers, 0,
 				  MultiXactOffsetSLRULock, "pg_multixact/offsets",
 				  LWTRANCHE_MULTIXACTOFFSET_BUFFER,
 				  SYNC_HANDLER_MULTIXACT_OFFSET);
 	SlruPagePrecedesUnitTests(MultiXactOffsetCtl, MULTIXACT_OFFSETS_PER_PAGE);
 	SimpleLruInit(MultiXactMemberCtl,
-				  "MultiXactMember", NUM_MULTIXACTMEMBER_BUFFERS, 0,
+				  "MultiXactMember", multixact_members_buffers, 0,
 				  MultiXactMemberSLRULock, "pg_multixact/members",
 				  LWTRANCHE_MULTIXACTMEMBER_BUFFER,
 				  SYNC_HANDLER_MULTIXACT_MEMBER);
diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c
index 6a8e521f894..785f2520fde 100644
--- a/src/backend/access/transam/subtrans.c
+++ b/src/backend/access/transam/subtrans.c
@@ -31,6 +31,7 @@
 #include "access/slru.h"
 #include "access/subtrans.h"
 #include "access/transam.h"
+#include "miscadmin.h"
 #include "pg_trace.h"
 #include "utils/snapmgr.h"
 
@@ -184,14 +185,14 @@ SubTransGetTopmostTransaction(TransactionId xid)
 Size
 SUBTRANSShmemSize(void)
 {
-	return SimpleLruShmemSize(NUM_SUBTRANS_BUFFERS, 0);
+	return SimpleLruShmemSize(subtrans_buffers, 0);
 }
 
 void
 SUBTRANSShmemInit(void)
 {
 	SubTransCtl->PagePrecedes = SubTransPagePrecedes;
-	SimpleLruInit(SubTransCtl, "Subtrans", NUM_SUBTRANS_BUFFERS, 0,
+	SimpleLruInit(SubTransCtl, "Subtrans", subtrans_buffers, 0,
 				  SubtransSLRULock, "pg_subtrans",
 				  LWTRANCHE_SUBTRANS_BUFFER, SYNC_HANDLER_NONE);
 	SlruPagePrecedesUnitTests(SubTransCtl, SUBTRANS_XACTS_PER_PAGE);
diff --git a/src/backend/commands/async.c b/src/backend/commands/async.c
index 85570085450..7f2b7598449 100644
--- a/src/backend/commands/async.c
+++ b/src/backend/commands/async.c
@@ -117,7 +117,7 @@
  * frontend during startup.)  The above design guarantees that notifies from
  * other backends will never be missed by ignoring self-notifies.
  *
- * The amount of shared memory used for notify management (NUM_NOTIFY_BUFFERS)
+ * The amount of shared memory used for notify management (notify_buffers)
  * can be varied without affecting anything but performance.  The maximum
  * amount of notification data that can be queued at one time is determined
  * by slru.c's wraparound limit; see QUEUE_MAX_PAGE below.
@@ -235,7 +235,7 @@ typedef struct QueuePosition
  *
  * Resist the temptation to make this really large.  While that would save
  * work in some places, it would add cost in others.  In particular, this
- * should likely be less than NUM_NOTIFY_BUFFERS, to ensure that backends
+ * should likely be less than notify_buffers, to ensure that backends
  * catch up before the pages they'll need to read fall out of SLRU cache.
  */
 #define QUEUE_CLEANUP_DELAY 4
@@ -521,7 +521,7 @@ AsyncShmemSize(void)
 	size = mul_size(MaxBackends + 1, sizeof(QueueBackendStatus));
 	size = add_size(size, offsetof(AsyncQueueControl, backend));
 
-	size = add_size(size, SimpleLruShmemSize(NUM_NOTIFY_BUFFERS, 0));
+	size = add_size(size, SimpleLruShmemSize(notify_buffers, 0));
 
 	return size;
 }
@@ -569,7 +569,7 @@ AsyncShmemInit(void)
 	 * Set up SLRU management of the pg_notify data.
 	 */
 	NotifyCtl->PagePrecedes = asyncQueuePagePrecedes;
-	SimpleLruInit(NotifyCtl, "Notify", NUM_NOTIFY_BUFFERS, 0,
+	SimpleLruInit(NotifyCtl, "Notify", notify_buffers, 0,
 				  NotifySLRULock, "pg_notify", LWTRANCHE_NOTIFY_BUFFER,
 				  SYNC_HANDLER_NONE);
 
diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c
index 4f4d5b0d20f..c5e66757643 100644
--- a/src/backend/storage/lmgr/predicate.c
+++ b/src/backend/storage/lmgr/predicate.c
@@ -872,7 +872,7 @@ SerialInit(void)
 	 */
 	SerialSlruCtl->PagePrecedes = SerialPagePrecedesLogically;
 	SimpleLruInit(SerialSlruCtl, "Serial",
-				  NUM_SERIAL_BUFFERS, 0, SerialSLRULock, "pg_serial",
+				  serial_buffers, 0, SerialSLRULock, "pg_serial",
 				  LWTRANCHE_SERIAL_BUFFER, SYNC_HANDLER_NONE);
 #ifdef USE_ASSERT_CHECKING
 	SerialPagePrecedesLogicallyUnitTests();
@@ -1396,7 +1396,7 @@ PredicateLockShmemSize(void)
 
 	/* Shared memory structures for SLRU tracking of old committed xids. */
 	size = add_size(size, sizeof(SerialControlData));
-	size = add_size(size, SimpleLruShmemSize(NUM_SERIAL_BUFFERS, 0));
+	size = add_size(size, SimpleLruShmemSize(serial_buffers, 0));
 
 	return size;
 }
diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c
index 381d9e548d1..c83151d5ab5 100644
--- a/src/backend/utils/init/globals.c
+++ b/src/backend/utils/init/globals.c
@@ -150,3 +150,11 @@ int64		VacuumPageDirty = 0;
 
 int			VacuumCostBalance = 0;	/* working state for vacuum */
 bool		VacuumCostActive = false;
+
+int			multixact_offsets_buffers = 8;
+int			multixact_members_buffers = 16;
+int			subtrans_buffers = 32;
+int			notify_buffers = 8;
+int			serial_buffers = 16;
+int			xact_buffers = 0;
+int			commit_ts_buffers = 0;
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index bff949a40bc..eb6bf4c0a04 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -32,9 +32,11 @@
 #endif
 #include <unistd.h>
 
+#include "access/clog.h"
 #include "access/commit_ts.h"
 #include "access/gin.h"
 #include "access/rmgr.h"
+#include "access/slru.h"
 #include "access/tableam.h"
 #include "access/toast_compression.h"
 #include "access/transam.h"
@@ -202,6 +204,8 @@ static const char *show_tcp_keepalives_idle(void);
 static const char *show_tcp_keepalives_interval(void);
 static const char *show_tcp_keepalives_count(void);
 static const char *show_tcp_user_timeout(void);
+static const char *show_xact_buffers(void);
+static const char *show_commit_ts_buffers(void);
 static bool check_maxconnections(int *newval, void **extra, GucSource source);
 static bool check_max_worker_processes(int *newval, void **extra, GucSource source);
 static bool check_autovacuum_max_workers(int *newval, void **extra, GucSource source);
@@ -2366,6 +2370,83 @@ static struct config_int ConfigureNamesInt[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"multixact_offsets_buffers", PGC_POSTMASTER, RESOURCES_MEM,
+			gettext_noop("Sets the number of shared memory buffers used for the MultiXact offset SLRU cache."),
+			NULL,
+			GUC_UNIT_BLOCKS
+		},
+		&multixact_offsets_buffers,
+		8, 2, SLRU_MAX_ALLOWED_BUFFERS,
+		NULL, NULL, NULL
+	},
+
+	{
+		{"multixact_members_buffers", PGC_POSTMASTER, RESOURCES_MEM,
+			gettext_noop("Sets the number of shared memory buffers used for the MultiXact member SLRU cache."),
+			NULL,
+			GUC_UNIT_BLOCKS
+		},
+		&multixact_members_buffers,
+		16, 2, SLRU_MAX_ALLOWED_BUFFERS,
+		NULL, NULL, NULL
+	},
+
+	{
+		{"subtrans_buffers", PGC_POSTMASTER, RESOURCES_MEM,
+			gettext_noop("Sets the number of shared memory buffers used for the sub-transaction SLRU cache."),
+			NULL,
+			GUC_UNIT_BLOCKS
+		},
+		&subtrans_buffers,
+		32, 2, SLRU_MAX_ALLOWED_BUFFERS,
+		NULL, NULL, NULL
+	},
+
+	{
+		{"notify_buffers", PGC_POSTMASTER, RESOURCES_MEM,
+			gettext_noop("Sets the number of shared memory buffers used for the NOTIFY message SLRU cache."),
+			NULL,
+			GUC_UNIT_BLOCKS
+		},
+		&notify_buffers,
+		8, 2, SLRU_MAX_ALLOWED_BUFFERS,
+		NULL, NULL, NULL
+	},
+
+	{
+		{"serial_buffers", PGC_POSTMASTER, RESOURCES_MEM,
+			gettext_noop("Sets the number of shared memory buffers used for the serializable transaction SLRU cache."),
+			NULL,
+			GUC_UNIT_BLOCKS
+		},
+		&serial_buffers,
+		16, 2, SLRU_MAX_ALLOWED_BUFFERS,
+		NULL, NULL, NULL
+	},
+
+	{
+		{"xact_buffers", PGC_POSTMASTER, RESOURCES_MEM,
+			gettext_noop("Sets the number of shared memory buffers used for the transaction status SLRU cache."),
+			NULL,
+			GUC_UNIT_BLOCKS
+		},
+		&xact_buffers,
+		0, 0, CLOG_MAX_ALLOWED_BUFFERS,
+		NULL, NULL, show_xact_buffers
+	},
+
+	{
+		{"commit_ts_buffers", PGC_POSTMASTER, RESOURCES_MEM,
+			gettext_noop("Sets the size of the dedicated buffer pool used for the commit timestamp SLRU cache."),
+			NULL,
+			GUC_UNIT_BLOCKS
+		},
+		&commit_ts_buffers,
+		0, 0, SLRU_MAX_ALLOWED_BUFFERS,
+		NULL, NULL, show_commit_ts_buffers
+	},
+
 	{
 		{"temp_buffers", PGC_USERSET, RESOURCES_MEM,
 			gettext_noop("Sets the maximum number of temporary buffers used by each session."),
@@ -12074,6 +12155,24 @@ show_tcp_user_timeout(void)
 	return nbuf;
 }
 
+static const char *
+show_xact_buffers(void)
+{
+	static char nbuf[16];
+
+	snprintf(nbuf, sizeof(nbuf), "%zu", CLOGShmemBuffers());
+	return nbuf;
+}
+
+static const char *
+show_commit_ts_buffers(void)
+{
+	static char nbuf[16];
+
+	snprintf(nbuf, sizeof(nbuf), "%zu", CommitTsShmemBuffers());
+	return nbuf;
+}
+
 static bool
 check_maxconnections(int *newval, void **extra, GucSource source)
 {
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index a1acd46b611..22bda4383ce 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -195,6 +195,15 @@
 #old_snapshot_threshold = -1		# 1min-60d; -1 disables; 0 is immediate
 					# (change requires restart)
 
+# - SLRU Buffers (change requires restart) -
+
+#xact_buffers = 0			# memory for pg_xact (0 = auto)
+#subtrans_buffers = 32			# memory for pg_subtrans
+#multixact_offsets_buffers = 8		# memory for pg_multixact/offsets
+#multixact_members_buffers = 16		# memory for pg_multixact/members
+#notify_buffers = 8			# memory for pg_notify
+#serial_buffers = 16			# memory for pg_serial
+#commit_ts_buffers = 0			# memory for pg_commit_ts (0 = auto)
 
 #------------------------------------------------------------------------------
 # WRITE-AHEAD LOG
diff --git a/src/include/access/clog.h b/src/include/access/clog.h
index 39b8e4afa8a..739a292f7f3 100644
--- a/src/include/access/clog.h
+++ b/src/include/access/clog.h
@@ -15,6 +15,16 @@
 #include "storage/sync.h"
 #include "lib/stringinfo.h"
 
+/*
+ * Don't allow xact_buffers to be set higher than could possibly be useful or
+ * SLRU would allow.
+ */
+#define CLOG_XACTS_PER_BYTE 4
+#define CLOG_XACTS_PER_PAGE (BLCKSZ * CLOG_XACTS_PER_BYTE)
+#define CLOG_MAX_ALLOWED_BUFFERS \
+	Min(SLRU_MAX_ALLOWED_BUFFERS, \
+		(((MaxTransactionId / 2) + (CLOG_XACTS_PER_PAGE - 1)) / CLOG_XACTS_PER_PAGE))
+
 /*
  * Possible transaction statuses --- note that all-zeroes is the initial
  * state.
diff --git a/src/include/access/commit_ts.h b/src/include/access/commit_ts.h
index a1538978c62..f86760f7240 100644
--- a/src/include/access/commit_ts.h
+++ b/src/include/access/commit_ts.h
@@ -16,7 +16,6 @@
 #include "replication/origin.h"
 #include "storage/sync.h"
 
-
 extern PGDLLIMPORT bool track_commit_timestamp;
 
 extern void TransactionTreeSetCommitTsData(TransactionId xid, int nsubxids,
diff --git a/src/include/access/multixact.h b/src/include/access/multixact.h
index 4bbb035eaea..97c0a463768 100644
--- a/src/include/access/multixact.h
+++ b/src/include/access/multixact.h
@@ -29,10 +29,6 @@
 
 #define MaxMultiXactOffset	((MultiXactOffset) 0xFFFFFFFF)
 
-/* Number of SLRU buffers to use for multixact */
-#define NUM_MULTIXACTOFFSET_BUFFERS		8
-#define NUM_MULTIXACTMEMBER_BUFFERS		16
-
 /*
  * Possible multixact lock modes ("status").  The first four modes are for
  * tuple locks (FOR KEY SHARE, FOR SHARE, FOR NO KEY UPDATE, FOR UPDATE); the
diff --git a/src/include/access/slru.h b/src/include/access/slru.h
index dd52e8cec7e..793c045f160 100644
--- a/src/include/access/slru.h
+++ b/src/include/access/slru.h
@@ -17,6 +17,11 @@
 #include "storage/lwlock.h"
 #include "storage/sync.h"
 
+/*
+ * To avoid overflowing internal arithmetic and the size_t data type, the
+ * number of buffers should not exceed this number.
+ */
+#define SLRU_MAX_ALLOWED_BUFFERS ((1024 * 1024 * 1024) / BLCKSZ)
 
 /*
  * Define SLRU segment size.  A page is the same BLCKSZ as is used everywhere
diff --git a/src/include/access/subtrans.h b/src/include/access/subtrans.h
index d0ab44ae828..64fa86938e2 100644
--- a/src/include/access/subtrans.h
+++ b/src/include/access/subtrans.h
@@ -11,8 +11,6 @@
 #ifndef SUBTRANS_H
 #define SUBTRANS_H
 
-/* Number of SLRU buffers to use for subtrans */
-#define NUM_SUBTRANS_BUFFERS	32
 
 extern void SubTransSetParent(TransactionId xid, TransactionId parent);
 extern TransactionId SubTransGetParent(TransactionId xid);
diff --git a/src/include/commands/async.h b/src/include/commands/async.h
index f371ac896b9..99575974982 100644
--- a/src/include/commands/async.h
+++ b/src/include/commands/async.h
@@ -15,11 +15,6 @@
 
 #include <signal.h>
 
-/*
- * The number of SLRU page buffers we use for the notification queue.
- */
-#define NUM_NOTIFY_BUFFERS	8
-
 extern bool Trace_notify;
 extern volatile sig_atomic_t notifyInterruptPending;
 
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index 90a30160657..22d31546ad5 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -176,6 +176,13 @@ extern PGDLLIMPORT int MaxBackends;
 extern PGDLLIMPORT int MaxConnections;
 extern PGDLLIMPORT int max_worker_processes;
 extern PGDLLIMPORT int max_parallel_workers;
+extern PGDLLIMPORT int multixact_offsets_buffers;
+extern PGDLLIMPORT int multixact_members_buffers;
+extern PGDLLIMPORT int subtrans_buffers;
+extern PGDLLIMPORT int notify_buffers;
+extern PGDLLIMPORT int serial_buffers;
+extern PGDLLIMPORT int xact_buffers;
+extern PGDLLIMPORT int commit_ts_buffers;
 
 extern PGDLLIMPORT int MyProcPid;
 extern PGDLLIMPORT pg_time_t MyStartTime;
diff --git a/src/include/storage/predicate.h b/src/include/storage/predicate.h
index 152b6986114..c72779bd88d 100644
--- a/src/include/storage/predicate.h
+++ b/src/include/storage/predicate.h
@@ -26,10 +26,6 @@ extern int	max_predicate_locks_per_xact;
 extern int	max_predicate_locks_per_relation;
 extern int	max_predicate_locks_per_page;
 
-
-/* Number of SLRU buffers to use for Serial SLRU */
-#define NUM_SERIAL_BUFFERS		16
-
 /*
  * A handle used for sharing SERIALIZABLEXACT objects between the participants
  * in a parallel query.
-- 
2.33.1

From 5c1aace8b1f5a2b852bf1476dffbd4a43f196ef3 Mon Sep 17 00:00:00 2001
From: Andrey Borodin <x4m@flight.local>
Date: Sun, 26 Dec 2021 15:03:30 +0500
Subject: [PATCH v=18 3/3] Pack SLRU page_number, page_status and page_dirty
 together

This allows testing only one cache line during a successful
SlruSelectLRUPage().
---
 src/backend/access/transam/clog.c      |  12 +--
 src/backend/access/transam/commit_ts.c |   6 +-
 src/backend/access/transam/multixact.c |  16 +--
 src/backend/access/transam/slru.c      | 143 ++++++++++++-------------
 src/backend/access/transam/subtrans.c  |   4 +-
 src/backend/commands/async.c           |   2 +-
 src/backend/storage/lmgr/predicate.c   |   2 +-
 src/include/access/slru.h              |  13 ++-
 8 files changed, 99 insertions(+), 99 deletions(-)

diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c
index ca28ada75fa..7d3a0286a5f 100644
--- a/src/backend/access/transam/clog.c
+++ b/src/backend/access/transam/clog.c
@@ -375,7 +375,7 @@ TransactionIdSetPageStatusInternal(TransactionId xid, int nsubxids,
 		{
 			for (i = 0; i < nsubxids; i++)
 			{
-				Assert(XactCtl->shared->page_number[slotno] == TransactionIdToPage(subxids[i]));
+				Assert(XactCtl->shared->page_entries[slotno].page_number == TransactionIdToPage(subxids[i]));
 				TransactionIdSetStatusBit(subxids[i],
 										  TRANSACTION_STATUS_SUB_COMMITTED,
 										  lsn, slotno);
@@ -389,11 +389,11 @@ TransactionIdSetPageStatusInternal(TransactionId xid, int nsubxids,
 	/* Set the subtransactions */
 	for (i = 0; i < nsubxids; i++)
 	{
-		Assert(XactCtl->shared->page_number[slotno] == TransactionIdToPage(subxids[i]));
+		Assert(XactCtl->shared->page_entries[slotno].page_number == TransactionIdToPage(subxids[i]));
 		TransactionIdSetStatusBit(subxids[i], status, lsn, slotno);
 	}
 
-	XactCtl->shared->page_dirty[slotno] = true;
+	XactCtl->shared->page_entries[slotno].page_dirty = true;
 }
 
 /*
@@ -713,7 +713,7 @@ BootStrapCLOG(void)
 
 	/* Make sure it's written out */
 	SimpleLruWritePage(XactCtl, slotno);
-	Assert(!XactCtl->shared->page_dirty[slotno]);
+	Assert(!XactCtl->shared->page_entries[slotno].page_dirty);
 
 	LWLockRelease(XactSLRULock);
 }
@@ -798,7 +798,7 @@ TrimCLOG(void)
 		/* Zero the rest of the page */
 		MemSet(byteptr + 1, 0, BLCKSZ - byteno - 1);
 
-		XactCtl->shared->page_dirty[slotno] = true;
+		XactCtl->shared->page_entries[slotno].page_dirty = true;
 	}
 
 	LWLockRelease(XactSLRULock);
@@ -994,7 +994,7 @@ clog_redo(XLogReaderState *record)
 
 		slotno = ZeroCLOGPage(pageno, false);
 		SimpleLruWritePage(XactCtl, slotno);
-		Assert(!XactCtl->shared->page_dirty[slotno]);
+		Assert(!XactCtl->shared->page_entries[slotno].page_dirty);
 
 		LWLockRelease(XactSLRULock);
 	}
diff --git a/src/backend/access/transam/commit_ts.c b/src/backend/access/transam/commit_ts.c
index cb2e0ceb1c3..6879ef3ec71 100644
--- a/src/backend/access/transam/commit_ts.c
+++ b/src/backend/access/transam/commit_ts.c
@@ -227,7 +227,7 @@ SetXidCommitTsInPage(TransactionId xid, int nsubxids,
 	for (i = 0; i < nsubxids; i++)
 		TransactionIdSetCommitTs(subxids[i], ts, nodeid, slotno);
 
-	CommitTsCtl->shared->page_dirty[slotno] = true;
+	CommitTsCtl->shared->page_entries[slotno].page_dirty = true;
 
 	LWLockRelease(CommitTsSLRULock);
 }
@@ -735,7 +735,7 @@ ActivateCommitTs(void)
 		LWLockAcquire(CommitTsSLRULock, LW_EXCLUSIVE);
 		slotno = ZeroCommitTsPage(pageno, false);
 		SimpleLruWritePage(CommitTsCtl, slotno);
-		Assert(!CommitTsCtl->shared->page_dirty[slotno]);
+		Assert(!CommitTsCtl->shared->page_entries[slotno].page_dirty);
 		LWLockRelease(CommitTsSLRULock);
 	}
 
@@ -1005,7 +1005,7 @@ commit_ts_redo(XLogReaderState *record)
 
 		slotno = ZeroCommitTsPage(pageno, false);
 		SimpleLruWritePage(CommitTsCtl, slotno);
-		Assert(!CommitTsCtl->shared->page_dirty[slotno]);
+		Assert(!CommitTsCtl->shared->page_entries[slotno].page_dirty);
 
 		LWLockRelease(CommitTsSLRULock);
 	}
diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index a29ab4769dc..eb32821b717 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -887,7 +887,7 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
 
 	*offptr = offset;
 
-	MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
+	MultiXactOffsetCtl->shared->page_entries[slotno].page_dirty = true;
 
 	/* Exchange our lock */
 	LWLockRelease(MultiXactOffsetSLRULock);
@@ -931,7 +931,7 @@ RecordNewMultiXact(MultiXactId multi, MultiXactOffset offset,
 		flagsval |= (members[i].status << bshift);
 		*flagsptr = flagsval;
 
-		MultiXactMemberCtl->shared->page_dirty[slotno] = true;
+		MultiXactMemberCtl->shared->page_entries[slotno].page_dirty = true;
 	}
 
 	LWLockRelease(MultiXactMemberSLRULock);
@@ -1902,7 +1902,7 @@ BootStrapMultiXact(void)
 
 	/* Make sure it's written out */
 	SimpleLruWritePage(MultiXactOffsetCtl, slotno);
-	Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]);
+	Assert(!MultiXactOffsetCtl->shared->page_entries[slotno].page_dirty);
 
 	LWLockRelease(MultiXactOffsetSLRULock);
 
@@ -1913,7 +1913,7 @@ BootStrapMultiXact(void)
 
 	/* Make sure it's written out */
 	SimpleLruWritePage(MultiXactMemberCtl, slotno);
-	Assert(!MultiXactMemberCtl->shared->page_dirty[slotno]);
+	Assert(!MultiXactMemberCtl->shared->page_entries[slotno].page_dirty);
 
 	LWLockRelease(MultiXactMemberSLRULock);
 }
@@ -2074,7 +2074,7 @@ TrimMultiXact(void)
 
 		MemSet(offptr, 0, BLCKSZ - (entryno * sizeof(MultiXactOffset)));
 
-		MultiXactOffsetCtl->shared->page_dirty[slotno] = true;
+		MultiXactOffsetCtl->shared->page_entries[slotno].page_dirty = true;
 	}
 
 	LWLockRelease(MultiXactOffsetSLRULock);
@@ -2112,7 +2112,7 @@ TrimMultiXact(void)
 		 * writing.
 		 */
 
-		MultiXactMemberCtl->shared->page_dirty[slotno] = true;
+		MultiXactMemberCtl->shared->page_entries[slotno].page_dirty = true;
 	}
 
 	LWLockRelease(MultiXactMemberSLRULock);
@@ -3251,7 +3251,7 @@ multixact_redo(XLogReaderState *record)
 
 		slotno = ZeroMultiXactOffsetPage(pageno, false);
 		SimpleLruWritePage(MultiXactOffsetCtl, slotno);
-		Assert(!MultiXactOffsetCtl->shared->page_dirty[slotno]);
+		Assert(!MultiXactOffsetCtl->shared->page_entries[slotno].page_dirty);
 
 		LWLockRelease(MultiXactOffsetSLRULock);
 	}
@@ -3266,7 +3266,7 @@ multixact_redo(XLogReaderState *record)
 
 		slotno = ZeroMultiXactMemberPage(pageno, false);
 		SimpleLruWritePage(MultiXactMemberCtl, slotno);
-		Assert(!MultiXactMemberCtl->shared->page_dirty[slotno]);
+		Assert(!MultiXactMemberCtl->shared->page_entries[slotno].page_dirty);
 
 		LWLockRelease(MultiXactMemberSLRULock);
 	}
diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c
index 33857bffb79..1f87e50f05c 100644
--- a/src/backend/access/transam/slru.c
+++ b/src/backend/access/transam/slru.c
@@ -186,9 +186,7 @@ SimpleLruShmemSize(int nslots, int nlsns)
 	/* we assume nslots isn't so large as to risk overflow */
 	sz = MAXALIGN(sizeof(SlruSharedData));
 	sz += MAXALIGN(nslots * sizeof(char *));	/* page_buffer[] */
-	sz += MAXALIGN(nslots * sizeof(SlruPageStatus));	/* page_status[] */
-	sz += MAXALIGN(nslots * sizeof(bool));	/* page_dirty[] */
-	sz += MAXALIGN(nslots * sizeof(int));	/* page_number[] */
+	sz += MAXALIGN(nslots * sizeof(SlruPageEntry));	/* page_entries[] */
 	sz += MAXALIGN(nslots * sizeof(int));	/* page_lru_count[] */
 	sz += MAXALIGN(nslots * sizeof(LWLockPadded));	/* buffer_locks[] */
 
@@ -248,16 +246,13 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
 
 		shared->slru_stats_idx = pgstat_slru_index(name);
 
+		Assert(sizeof(SlruPageEntry) == 8);
 		ptr = (char *) shared;
 		offset = MAXALIGN(sizeof(SlruSharedData));
 		shared->page_buffer = (char **) (ptr + offset);
 		offset += MAXALIGN(nslots * sizeof(char *));
-		shared->page_status = (SlruPageStatus *) (ptr + offset);
-		offset += MAXALIGN(nslots * sizeof(SlruPageStatus));
-		shared->page_dirty = (bool *) (ptr + offset);
-		offset += MAXALIGN(nslots * sizeof(bool));
-		shared->page_number = (int *) (ptr + offset);
-		offset += MAXALIGN(nslots * sizeof(int));
+		shared->page_entries = (SlruPageEntry *) (ptr + offset);
+		offset += MAXALIGN(nslots * sizeof(SlruPageEntry));
 		shared->page_lru_count = (int *) (ptr + offset);
 		offset += MAXALIGN(nslots * sizeof(int));
 
@@ -278,8 +273,8 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
 							 tranche_id);
 
 			shared->page_buffer[slotno] = ptr;
-			shared->page_status[slotno] = SLRU_PAGE_EMPTY;
-			shared->page_dirty[slotno] = false;
+			shared->page_entries[slotno].page_status = SLRU_PAGE_EMPTY;
+			shared->page_entries[slotno].page_dirty = false;
 			shared->page_lru_count[slotno] = 0;
 			ptr += BLCKSZ;
 		}
@@ -315,15 +310,15 @@ SimpleLruZeroPage(SlruCtl ctl, int pageno)
 
 	/* Find a suitable buffer slot for the page */
 	slotno = SlruSelectLRUPage(ctl, pageno);
-	Assert(shared->page_status[slotno] == SLRU_PAGE_EMPTY ||
-		   (shared->page_status[slotno] == SLRU_PAGE_VALID &&
-			!shared->page_dirty[slotno]) ||
-		   shared->page_number[slotno] == pageno);
+	Assert(shared->page_entries[slotno].page_status == SLRU_PAGE_EMPTY ||
+		   (shared->page_entries[slotno].page_status == SLRU_PAGE_VALID &&
+			!shared->page_entries[slotno].page_dirty) ||
+		   shared->page_entries[slotno].page_number == pageno);
 
 	/* Mark the slot as containing this page */
-	shared->page_number[slotno] = pageno;
-	shared->page_status[slotno] = SLRU_PAGE_VALID;
-	shared->page_dirty[slotno] = true;
+	shared->page_entries[slotno].page_number = pageno;
+	shared->page_entries[slotno].page_status = SLRU_PAGE_VALID;
+	shared->page_entries[slotno].page_dirty = true;
 	SlruRecentlyUsed(shared, slotno);
 
 	/* Set the buffer to zeroes */
@@ -387,18 +382,18 @@ SimpleLruWaitIO(SlruCtl ctl, int slotno)
 	 * cheaply test for failure by seeing if the buffer lock is still held (we
 	 * assume that transaction abort would release the lock).
 	 */
-	if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS ||
-		shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS)
+	if (shared->page_entries[slotno].page_status == SLRU_PAGE_READ_IN_PROGRESS ||
+		shared->page_entries[slotno].page_status == SLRU_PAGE_WRITE_IN_PROGRESS)
 	{
 		if (LWLockConditionalAcquire(&shared->buffer_locks[slotno].lock, LW_SHARED))
 		{
 			/* indeed, the I/O must have failed */
-			if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS)
-				shared->page_status[slotno] = SLRU_PAGE_EMPTY;
+			if (shared->page_entries[slotno].page_status == SLRU_PAGE_READ_IN_PROGRESS)
+				shared->page_entries[slotno].page_status = SLRU_PAGE_EMPTY;
 			else				/* write_in_progress */
 			{
-				shared->page_status[slotno] = SLRU_PAGE_VALID;
-				shared->page_dirty[slotno] = true;
+				shared->page_entries[slotno].page_status = SLRU_PAGE_VALID;
+				shared->page_entries[slotno].page_dirty = true;
 			}
 			LWLockRelease(&shared->buffer_locks[slotno].lock);
 		}
@@ -438,15 +433,15 @@ SimpleLruReadPage(SlruCtl ctl, int pageno, bool write_ok,
 		slotno = SlruSelectLRUPage(ctl, pageno);
 
 		/* Did we find the page in memory? */
-		if (shared->page_number[slotno] == pageno &&
-			shared->page_status[slotno] != SLRU_PAGE_EMPTY)
+		if (shared->page_entries[slotno].page_number == pageno &&
+			shared->page_entries[slotno].page_status != SLRU_PAGE_EMPTY)
 		{
 			/*
 			 * If page is still being read in, we must wait for I/O.  Likewise
 			 * if the page is being written and the caller said that's not OK.
 			 */
-			if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS ||
-				(shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS &&
+			if (shared->page_entries[slotno].page_status == SLRU_PAGE_READ_IN_PROGRESS ||
+				(shared->page_entries[slotno].page_status == SLRU_PAGE_WRITE_IN_PROGRESS &&
 				 !write_ok))
 			{
 				SimpleLruWaitIO(ctl, slotno);
@@ -463,14 +458,14 @@ SimpleLruReadPage(SlruCtl ctl, int pageno, bool write_ok,
 		}
 
 		/* We found no match; assert we selected a freeable slot */
-		Assert(shared->page_status[slotno] == SLRU_PAGE_EMPTY ||
-			   (shared->page_status[slotno] == SLRU_PAGE_VALID &&
-				!shared->page_dirty[slotno]));
+		Assert(shared->page_entries[slotno].page_status == SLRU_PAGE_EMPTY ||
+			   (shared->page_entries[slotno].page_status == SLRU_PAGE_VALID &&
+				!shared->page_entries[slotno].page_dirty));
 
 		/* Mark the slot read-busy */
-		shared->page_number[slotno] = pageno;
-		shared->page_status[slotno] = SLRU_PAGE_READ_IN_PROGRESS;
-		shared->page_dirty[slotno] = false;
+		shared->page_entries[slotno].page_number = pageno;
+		shared->page_entries[slotno].page_status = SLRU_PAGE_READ_IN_PROGRESS;
+		shared->page_entries[slotno].page_dirty = false;
 
 		/* Acquire per-buffer lock (cannot deadlock, see notes at top) */
 		LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_EXCLUSIVE);
@@ -487,11 +482,11 @@ SimpleLruReadPage(SlruCtl ctl, int pageno, bool write_ok,
 		/* Re-acquire control lock and update page state */
 		LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
 
-		Assert(shared->page_number[slotno] == pageno &&
-			   shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS &&
-			   !shared->page_dirty[slotno]);
+		Assert(shared->page_entries[slotno].page_number == pageno &&
+			   shared->page_entries[slotno].page_status == SLRU_PAGE_READ_IN_PROGRESS &&
+			   !shared->page_entries[slotno].page_dirty);
 
-		shared->page_status[slotno] = ok ? SLRU_PAGE_VALID : SLRU_PAGE_EMPTY;
+		shared->page_entries[slotno].page_status = ok ? SLRU_PAGE_VALID : SLRU_PAGE_EMPTY;
 
 		LWLockRelease(&shared->buffer_locks[slotno].lock);
 
@@ -536,9 +531,9 @@ SimpleLruReadPage_ReadOnly(SlruCtl ctl, int pageno, TransactionId xid)
 	int bankend = bankstart + shared->bank_size;
 	for (slotno = bankstart; slotno < bankend; slotno++)
 	{
-		if (shared->page_number[slotno] == pageno &&
-			shared->page_status[slotno] != SLRU_PAGE_EMPTY &&
-			shared->page_status[slotno] != SLRU_PAGE_READ_IN_PROGRESS)
+		if (shared->page_entries[slotno].page_number == pageno &&
+			shared->page_entries[slotno].page_status != SLRU_PAGE_EMPTY &&
+			shared->page_entries[slotno].page_status != SLRU_PAGE_READ_IN_PROGRESS)
 		{
 			/* See comments for SlruRecentlyUsed macro */
 			SlruRecentlyUsed(shared, slotno);
@@ -572,12 +567,12 @@ static void
 SlruInternalWritePage(SlruCtl ctl, int slotno, SlruWriteAll fdata)
 {
 	SlruShared	shared = ctl->shared;
-	int			pageno = shared->page_number[slotno];
+	int			pageno = shared->page_entries[slotno].page_number;
 	bool		ok;
 
 	/* If a write is in progress, wait for it to finish */
-	while (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS &&
-		   shared->page_number[slotno] == pageno)
+	while (shared->page_entries[slotno].page_status == SLRU_PAGE_WRITE_IN_PROGRESS &&
+		   shared->page_entries[slotno].page_number == pageno)
 	{
 		SimpleLruWaitIO(ctl, slotno);
 	}
@@ -586,17 +581,17 @@ SlruInternalWritePage(SlruCtl ctl, int slotno, SlruWriteAll fdata)
 	 * Do nothing if page is not dirty, or if buffer no longer contains the
 	 * same page we were called for.
 	 */
-	if (!shared->page_dirty[slotno] ||
-		shared->page_status[slotno] != SLRU_PAGE_VALID ||
-		shared->page_number[slotno] != pageno)
+	if (!shared->page_entries[slotno].page_dirty ||
+		shared->page_entries[slotno].page_status != SLRU_PAGE_VALID ||
+		shared->page_entries[slotno].page_number != pageno)
 		return;
 
 	/*
 	 * Mark the slot write-busy, and clear the dirtybit.  After this point, a
 	 * transaction status update on this page will mark it dirty again.
 	 */
-	shared->page_status[slotno] = SLRU_PAGE_WRITE_IN_PROGRESS;
-	shared->page_dirty[slotno] = false;
+	shared->page_entries[slotno].page_status = SLRU_PAGE_WRITE_IN_PROGRESS;
+	shared->page_entries[slotno].page_dirty = false;
 
 	/* Acquire per-buffer lock (cannot deadlock, see notes at top) */
 	LWLockAcquire(&shared->buffer_locks[slotno].lock, LW_EXCLUSIVE);
@@ -619,14 +614,14 @@ SlruInternalWritePage(SlruCtl ctl, int slotno, SlruWriteAll fdata)
 	/* Re-acquire control lock and update page state */
 	LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
 
-	Assert(shared->page_number[slotno] == pageno &&
-		   shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS);
+	Assert(shared->page_entries[slotno].page_number == pageno &&
+		   shared->page_entries[slotno].page_status == SLRU_PAGE_WRITE_IN_PROGRESS);
 
 	/* If we failed to write, mark the page dirty again */
 	if (!ok)
-		shared->page_dirty[slotno] = true;
+		shared->page_entries[slotno].page_dirty = true;
 
-	shared->page_status[slotno] = SLRU_PAGE_VALID;
+	shared->page_entries[slotno].page_status = SLRU_PAGE_VALID;
 
 	LWLockRelease(&shared->buffer_locks[slotno].lock);
 
@@ -1067,8 +1062,8 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno)
 		int bankend = bankstart + shared->bank_size;
 		for (slotno = bankstart; slotno < bankend; slotno++)
 		{
-			if (shared->page_number[slotno] == pageno &&
-				shared->page_status[slotno] != SLRU_PAGE_EMPTY)
+			if (shared->page_entries[slotno].page_number == pageno &&
+				shared->page_entries[slotno].page_status != SLRU_PAGE_EMPTY)
 				return slotno;
 		}
 
@@ -1105,7 +1100,7 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno)
 			int			this_delta;
 			int			this_page_number;
 
-			if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
+			if (shared->page_entries[slotno].page_status == SLRU_PAGE_EMPTY)
 				return slotno;
 			this_delta = cur_count - shared->page_lru_count[slotno];
 			if (this_delta < 0)
@@ -1120,10 +1115,10 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno)
 				shared->page_lru_count[slotno] = cur_count;
 				this_delta = 0;
 			}
-			this_page_number = shared->page_number[slotno];
+			this_page_number = shared->page_entries[slotno].page_number;
 			if (this_page_number == shared->latest_page_number)
 				continue;
-			if (shared->page_status[slotno] == SLRU_PAGE_VALID)
+			if (shared->page_entries[slotno].page_status == SLRU_PAGE_VALID)
 			{
 				if (this_delta > best_valid_delta ||
 					(this_delta == best_valid_delta &&
@@ -1165,7 +1160,7 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno)
 		/*
 		 * If the selected page is clean, we're set.
 		 */
-		if (!shared->page_dirty[bestvalidslot])
+		if (!shared->page_entries[bestvalidslot].page_dirty)
 			return bestvalidslot;
 
 		/*
@@ -1217,9 +1212,9 @@ SimpleLruWriteAll(SlruCtl ctl, bool allow_redirtied)
 		 * already.  That's okay.
 		 */
 		Assert(allow_redirtied ||
-			   shared->page_status[slotno] == SLRU_PAGE_EMPTY ||
-			   (shared->page_status[slotno] == SLRU_PAGE_VALID &&
-				!shared->page_dirty[slotno]));
+			   shared->page_entries[slotno].page_status == SLRU_PAGE_EMPTY ||
+			   (shared->page_entries[slotno].page_status == SLRU_PAGE_VALID &&
+				!shared->page_entries[slotno].page_dirty));
 	}
 
 	LWLockRelease(shared->ControlLock);
@@ -1291,18 +1286,18 @@ restart:;
 
 	for (slotno = 0; slotno < shared->num_slots; slotno++)
 	{
-		if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
+		if (shared->page_entries[slotno].page_status == SLRU_PAGE_EMPTY)
 			continue;
-		if (!ctl->PagePrecedes(shared->page_number[slotno], cutoffPage))
+		if (!ctl->PagePrecedes(shared->page_entries[slotno].page_number, cutoffPage))
 			continue;
 
 		/*
 		 * If page is clean, just change state to EMPTY (expected case).
 		 */
-		if (shared->page_status[slotno] == SLRU_PAGE_VALID &&
-			!shared->page_dirty[slotno])
+		if (shared->page_entries[slotno].page_status == SLRU_PAGE_VALID &&
+			!shared->page_entries[slotno].page_dirty)
 		{
-			shared->page_status[slotno] = SLRU_PAGE_EMPTY;
+			shared->page_entries[slotno].page_status = SLRU_PAGE_EMPTY;
 			continue;
 		}
 
@@ -1316,7 +1311,7 @@ restart:;
 		 * won't have cause to read its data again.  For now, keep the logic
 		 * the same as it was.)
 		 */
-		if (shared->page_status[slotno] == SLRU_PAGE_VALID)
+		if (shared->page_entries[slotno].page_status == SLRU_PAGE_VALID)
 			SlruInternalWritePage(ctl, slotno, NULL);
 		else
 			SimpleLruWaitIO(ctl, slotno);
@@ -1371,9 +1366,9 @@ restart:
 	did_write = false;
 	for (slotno = 0; slotno < shared->num_slots; slotno++)
 	{
-		int			pagesegno = shared->page_number[slotno] / SLRU_PAGES_PER_SEGMENT;
+		int			pagesegno = shared->page_entries[slotno].page_number / SLRU_PAGES_PER_SEGMENT;
 
-		if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
+		if (shared->page_entries[slotno].page_status == SLRU_PAGE_EMPTY)
 			continue;
 
 		/* not the segment we're looking for */
@@ -1381,15 +1376,15 @@ restart:
 			continue;
 
 		/* If page is clean, just change state to EMPTY (expected case). */
-		if (shared->page_status[slotno] == SLRU_PAGE_VALID &&
-			!shared->page_dirty[slotno])
+		if (shared->page_entries[slotno].page_status == SLRU_PAGE_VALID &&
+			!shared->page_entries[slotno].page_dirty)
 		{
-			shared->page_status[slotno] = SLRU_PAGE_EMPTY;
+			shared->page_entries[slotno].page_status = SLRU_PAGE_EMPTY;
 			continue;
 		}
 
 		/* Same logic as SimpleLruTruncate() */
-		if (shared->page_status[slotno] == SLRU_PAGE_VALID)
+		if (shared->page_entries[slotno].page_status == SLRU_PAGE_VALID)
 			SlruInternalWritePage(ctl, slotno, NULL);
 		else
 			SimpleLruWaitIO(ctl, slotno);
diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c
index 785f2520fde..11754fdbac6 100644
--- a/src/backend/access/transam/subtrans.c
+++ b/src/backend/access/transam/subtrans.c
@@ -97,7 +97,7 @@ SubTransSetParent(TransactionId xid, TransactionId parent)
 	{
 		Assert(*ptr == InvalidTransactionId);
 		*ptr = parent;
-		SubTransCtl->shared->page_dirty[slotno] = true;
+		SubTransCtl->shared->page_entries[slotno].page_dirty = true;
 	}
 
 	LWLockRelease(SubtransSLRULock);
@@ -220,7 +220,7 @@ BootStrapSUBTRANS(void)
 
 	/* Make sure it's written out */
 	SimpleLruWritePage(SubTransCtl, slotno);
-	Assert(!SubTransCtl->shared->page_dirty[slotno]);
+	Assert(!SubTransCtl->shared->page_entries[slotno].page_dirty);
 
 	LWLockRelease(SubtransSLRULock);
 }
diff --git a/src/backend/commands/async.c b/src/backend/commands/async.c
index 7f2b7598449..86a5bc2430c 100644
--- a/src/backend/commands/async.c
+++ b/src/backend/commands/async.c
@@ -1445,7 +1445,7 @@ asyncQueueAddEntries(ListCell *nextNotify)
 								   InvalidTransactionId);
 
 	/* Note we mark the page dirty before writing in it */
-	NotifyCtl->shared->page_dirty[slotno] = true;
+	NotifyCtl->shared->page_entries[slotno].page_dirty = true;
 
 	while (nextNotify != NULL)
 	{
diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c
index c5e66757643..53bd7d957ce 100644
--- a/src/backend/storage/lmgr/predicate.c
+++ b/src/backend/storage/lmgr/predicate.c
@@ -963,7 +963,7 @@ SerialAdd(TransactionId xid, SerCommitSeqNo minConflictCommitSeqNo)
 		slotno = SimpleLruReadPage(SerialSlruCtl, targetPage, true, xid);
 
 	SerialValue(slotno, xid) = minConflictCommitSeqNo;
-	SerialSlruCtl->shared->page_dirty[slotno] = true;
+	SerialSlruCtl->shared->page_entries[slotno].page_dirty = true;
 
 	LWLockRelease(SerialSLRULock);
 }
diff --git a/src/include/access/slru.h b/src/include/access/slru.h
index f4df54d3c12..0a4fae91d7f 100644
--- a/src/include/access/slru.h
+++ b/src/include/access/slru.h
@@ -44,7 +44,7 @@
  * in the latter case it implies that the page has been re-dirtied since
  * the write started.
  */
-typedef enum
+typedef enum SlruPageStatus:int16_t
 {
 	SLRU_PAGE_EMPTY,			/* buffer is not in use */
 	SLRU_PAGE_READ_IN_PROGRESS, /* page is being read in */
@@ -52,6 +52,13 @@ typedef enum
 	SLRU_PAGE_WRITE_IN_PROGRESS /* page is being written out */
 } SlruPageStatus;
 
+typedef struct SlruPageEntry	/* state of one buffer slot, indexed by slotno */
+{
+	int				page_number;	/* page held in the slot */
+	SlruPageStatus	page_status;	/* SLRU_PAGE_* state of the slot */
+	bool			page_dirty;		/* set on modification, cleared when a write starts */
+} SlruPageEntry;
+
 /*
  * Shared-memory state
  */
@@ -69,9 +76,7 @@ typedef struct SlruSharedData
 	 * when status is EMPTY, as is page_lru_count.
 	 */
 	char	  **page_buffer;
-	SlruPageStatus *page_status;
-	bool	   *page_dirty;
-	int		   *page_number;
+	SlruPageEntry *page_entries;
 	int		   *page_lru_count;
 	LWLockPadded *buffer_locks;
 
-- 
2.33.1

From 12a1c31240e338c0e8c3c7211fbdd7b2e9666564 Mon Sep 17 00:00:00 2001
From: Andrey Borodin <amboro...@acm.org>
Date: Sun, 11 Apr 2021 21:18:10 +0300
Subject: [PATCH v=18 2/3] Divide SLRU buffers into 8-associative banks

We want to avoid a linear search over all SLRU buffers.
To do so, we divide the SLRU buffers into banks. Each bank holds
approximately 8 buffers. Each SLRU pageno may reside in only one bank.
Adjacent pagenos reside in different banks.
---
 src/backend/access/transam/slru.c | 43 ++++++++++++++++++++++++++++---
 src/include/access/slru.h         |  2 ++
 2 files changed, 41 insertions(+), 4 deletions(-)

diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c
index 7585ae24ce9..33857bffb79 100644
--- a/src/backend/access/transam/slru.c
+++ b/src/backend/access/transam/slru.c
@@ -134,7 +134,7 @@ typedef enum
 static SlruErrorCause slru_errcause;
 static int	slru_errno;
 
-
+static void SlruAdjustNSlots(int* nslots, int* banksize, int* bankoffset);
 static void SimpleLruZeroLSNs(SlruCtl ctl, int slotno);
 static void SimpleLruWaitIO(SlruCtl ctl, int slotno);
 static void SlruInternalWritePage(SlruCtl ctl, int slotno, SlruWriteAll fdata);
@@ -148,6 +148,30 @@ static bool SlruScanDirCbDeleteCutoff(SlruCtl ctl, char *filename,
 									  int segpage, void *data);
 static void SlruInternalDeleteSegment(SlruCtl ctl, int segno);
 
+/*
+ * Pick a bank size for N-associative SLRU buffers.  The bank number is taken
+ * from the lowest bits of the requested pageno, so the number of banks must
+ * be a power of 2.  We halve the slot count (rounding up) until a bank has at
+ * most 15 slots, aiming at banks of about 8 so that the page numbers and
+ * statuses of one bank can be packed on one cacheline; *nslots is rounded up.
+ */
+static void SlruAdjustNSlots(int* nslots, int* banksize, int* bankoffset)
+{
+	*banksize = *nslots;	/* start with one bank holding every slot */
+	int nbanks = 1;
+	*bankoffset = 0;	/* log2(nbanks): number of pageno bits selecting a bank */
+	while (*banksize > 15)
+	{
+		if ((*banksize & 1) != 0)	/* odd size: round up before halving */
+			*banksize +=1;
+		*banksize /= 2;
+		nbanks *= 2;
+		*bankoffset += 1;
+	}
+	elog(DEBUG5, "nslots %d banksize %d nbanks %d ", *nslots, *banksize, nbanks);
+	*nslots = *banksize * nbanks;	/* whole number of equal-sized banks */
+}
+
 /*
  * Initialization of shared memory
  */
@@ -156,6 +180,8 @@ Size
 SimpleLruShmemSize(int nslots, int nlsns)
 {
 	Size		sz;
+	int bankoffset, banksize;
+	SlruAdjustNSlots(&nslots, &banksize, &bankoffset);
 
 	/* we assume nslots isn't so large as to risk overflow */
 	sz = MAXALIGN(sizeof(SlruSharedData));
@@ -190,6 +216,8 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
 {
 	SlruShared	shared;
 	bool		found;
+	int bankoffset, banksize;
+	SlruAdjustNSlots(&nslots, &banksize, &bankoffset);
 
 	shared = (SlruShared) ShmemInitStruct(name,
 										  SimpleLruShmemSize(nslots, nlsns),
@@ -209,6 +237,9 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
 		shared->ControlLock = ctllock;
 
 		shared->num_slots = nslots;
+		shared->bank_mask =  (1 << bankoffset) - 1;
+		shared->bank_size = banksize;
+		
 		shared->lsn_groups_per_page = nlsns;
 
 		shared->cur_lru_count = 0;
@@ -501,7 +532,9 @@ SimpleLruReadPage_ReadOnly(SlruCtl ctl, int pageno, TransactionId xid)
 	LWLockAcquire(shared->ControlLock, LW_SHARED);
 
 	/* See if page is already in a buffer */
-	for (slotno = 0; slotno < shared->num_slots; slotno++)
+	int bankstart = (pageno & shared->bank_mask) * shared->bank_size;
+	int bankend = bankstart + shared->bank_size;
+	for (slotno = bankstart; slotno < bankend; slotno++)
 	{
 		if (shared->page_number[slotno] == pageno &&
 			shared->page_status[slotno] != SLRU_PAGE_EMPTY &&
@@ -1030,7 +1063,9 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno)
 		int			best_invalid_page_number = 0;	/* keep compiler quiet */
 
 		/* See if page already has a buffer assigned */
-		for (slotno = 0; slotno < shared->num_slots; slotno++)
+		int bankstart = (pageno & shared->bank_mask) * shared->bank_size;
+		int bankend = bankstart + shared->bank_size;
+		for (slotno = bankstart; slotno < bankend; slotno++)
 		{
 			if (shared->page_number[slotno] == pageno &&
 				shared->page_status[slotno] != SLRU_PAGE_EMPTY)
@@ -1065,7 +1100,7 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno)
 		 * multiple pages with the same lru_count.
 		 */
 		cur_count = (shared->cur_lru_count)++;
-		for (slotno = 0; slotno < shared->num_slots; slotno++)
+		for (slotno = bankstart; slotno < bankend; slotno++)
 		{
 			int			this_delta;
 			int			this_page_number;
diff --git a/src/include/access/slru.h b/src/include/access/slru.h
index 793c045f160..f4df54d3c12 100644
--- a/src/include/access/slru.h
+++ b/src/include/access/slru.h
@@ -61,6 +61,8 @@ typedef struct SlruSharedData
 
 	/* Number of buffers managed by this SLRU structure */
 	int			num_slots;
+	int			bank_size;
+	int			bank_mask;
 
 	/*
 	 * Arrays holding info for each buffer slot.  Page number is undefined
-- 
2.33.1

Reply via email to