From f7295a1c4cd07c393ced70ad0c8622efdb6ab26d Mon Sep 17 00:00:00 2001
From: Jakub Wartak <jakub.wartak@enterprisedb.com>
Date: Wed, 16 Apr 2025 10:23:31 +0200
Subject: [PATCH v1] Add capability to interleave shared memory across multiple
 NUMA nodes

Introduce a new GUC, numa = off (default) / on / auto, that can be used to
enable interleaving of shared memory across NUMA nodes. Until now, imbalanced
shared memory allocations on NUMA systems could cause non-deterministic
performance due to differences in latency and bandwidth across interconnects
("remote" access). This is only supported on Linux with libnuma.

Author: Jakub Wartak <jakub.wartak@enterprisedb.com>
Inspired-by: Andres Freund <andres@anarazel.de>
Reviewed-by:
Discussion:
---
 doc/src/sgml/config.sgml                      | 16 ++++++++++++++++
 src/backend/port/sysv_shmem.c                 |  7 +++++++
 src/backend/utils/misc/guc_tables.c           | 18 ++++++++++++++++++
 src/backend/utils/misc/postgresql.conf.sample |  2 ++
 src/include/port/pg_numa.h                    |  1 +
 src/include/storage/pg_shmem.h                | 10 ++++++++++
 src/port/pg_numa.c                            | 17 +++++++++++++++++
 7 files changed, 71 insertions(+)

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index c1674c22cb2..15397df71d6 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -2313,6 +2313,22 @@ include_dir 'conf.d'
       </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-numa" xreflabel="numa">
+      <term><varname>numa</varname> (<type>enum</type>)
+      <indexterm>
+       <primary><varname>numa</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        Specifies whether to use NUMA interleaving policy for the shared memory
+        segment. Possible values are <literal>off</literal>, <literal>on</literal>
+        and <literal>auto</literal>. This parameter is only effective on Linux.
+        The default value is <literal>off</literal>. This parameter can only be set
+        at server start.
+       </para>
+      </listitem>
+     </varlistentry>
      </variablelist>
      </sect2>
 
diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c
index 197926d44f6..510b0e53638 100644
--- a/src/backend/port/sysv_shmem.c
+++ b/src/backend/port/sysv_shmem.c
@@ -29,6 +29,7 @@
 
 #include "miscadmin.h"
 #include "port/pg_bitutils.h"
+#include "port/pg_numa.h"
 #include "portability/mem.h"
 #include "storage/dsm.h"
 #include "storage/fd.h"
@@ -663,6 +664,12 @@ CreateAnonymousSegment(Size *size)
 						 allocsize) : 0));
 	}
 
+	if (numa == NUMA_ON || (numa == NUMA_AUTO && pg_numa_get_max_node() >= 1))
+	{							/* max node number is zero-based: >= 1 means 2+ nodes */
+		elog(DEBUG1, "enabling NUMA shm interleaving");
+		pg_numa_interleave_memptr(ptr, allocsize);
+	}
+
 	*size = allocsize;
 	return ptr;
 }
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index 60b12446a1c..e4c9491df78 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -491,6 +491,13 @@ static const struct config_enum_entry file_copy_method_options[] = {
 	{NULL, 0, false}
 };
 
+static const struct config_enum_entry numa_options[] = {	/* values of the "numa" GUC */
+	{"off", NUMA_OFF, false},
+	{"on", NUMA_ON, false},
+	{"auto", NUMA_AUTO, false},
+	{NULL, 0, false}			/* end-of-list marker */
+};
+
 /*
  * Options for enum values stored in other modules
  */
@@ -579,6 +586,7 @@ static int	ssl_renegotiation_limit;
 int			huge_pages = HUGE_PAGES_TRY;
 int			huge_page_size;
 int			huge_pages_status = HUGE_PAGES_UNKNOWN;
+int			numa = DEFAULT_NUMA;
 
 /*
  * These variables are all dummies that don't do anything, except in some
@@ -5418,6 +5426,16 @@ struct config_enum ConfigureNamesEnum[] =
 		NULL, assign_io_method, NULL
 	},
 
+	{
+		{"numa", PGC_POSTMASTER, RESOURCES_MEM,
+			gettext_noop("Whether to use NUMA interleaving for shared memory."),
+			NULL
+		},
+		&numa,
+		DEFAULT_NUMA, numa_options,
+		NULL, NULL, NULL
+	},
+
 	/* End-of-list marker */
 	{
 		{NULL, 0, 0, NULL, NULL}, NULL, 0, NULL, NULL, NULL, NULL
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 34826d01380..f46b2d8de4d 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -137,6 +137,8 @@
 					# (change requires restart)
 #huge_page_size = 0			# zero for system default
 					# (change requires restart)
+#numa = off				# off, on, auto
+					# (change requires restart)
 #temp_buffers = 8MB			# min 800kB
 #max_prepared_transactions = 0		# zero disables the feature
 					# (change requires restart)
diff --git a/src/include/port/pg_numa.h b/src/include/port/pg_numa.h
index 40f1d324dcf..129663de2e8 100644
--- a/src/include/port/pg_numa.h
+++ b/src/include/port/pg_numa.h
@@ -17,6 +17,7 @@
 extern PGDLLIMPORT int pg_numa_init(void);
 extern PGDLLIMPORT int pg_numa_query_pages(int pid, unsigned long count, void **pages, int *status);
 extern PGDLLIMPORT int pg_numa_get_max_node(void);
+extern PGDLLIMPORT int pg_numa_interleave_memptr(void *ptr, size_t sz);
 
 #ifdef USE_LIBNUMA
 
diff --git a/src/include/storage/pg_shmem.h b/src/include/storage/pg_shmem.h
index 5f7d4b83a60..1b09f7ec390 100644
--- a/src/include/storage/pg_shmem.h
+++ b/src/include/storage/pg_shmem.h
@@ -46,6 +46,7 @@ extern PGDLLIMPORT int shared_memory_type;
 extern PGDLLIMPORT int huge_pages;
 extern PGDLLIMPORT int huge_page_size;
 extern PGDLLIMPORT int huge_pages_status;
+extern PGDLLIMPORT int numa;
 
 /* Possible values for huge_pages and huge_pages_status */
 typedef enum
@@ -64,6 +65,15 @@ typedef enum
 	SHMEM_TYPE_MMAP,
 }			PGShmemType;
 
+typedef enum
+{
+	NUMA_OFF,					/* never interleave shared memory */
+	NUMA_ON,					/* always request interleaving */
+	NUMA_AUTO,					/* interleave based on pg_numa_get_max_node() */
+}			NumaType;
+
+#define DEFAULT_NUMA NUMA_OFF
+
 #ifndef WIN32
 extern PGDLLIMPORT unsigned long UsedShmemSegID;
 #else
diff --git a/src/port/pg_numa.c b/src/port/pg_numa.c
index 4b487a2a4e8..6ed0a5d2949 100644
--- a/src/port/pg_numa.c
+++ b/src/port/pg_numa.c
@@ -55,6 +55,17 @@ pg_numa_get_max_node(void)
 	return numa_max_node();
 }
 
+/* Interleave sz bytes at ptr across allowed NUMA nodes; -1 if NUMA unusable */
+int
+pg_numa_interleave_memptr(void *ptr, size_t sz)
+{
+	/* other libnuma calls are undefined unless numa_available() succeeds */
+	if (numa_available() == -1)
+		return -1;
+	numa_interleave_memory(ptr, sz, numa_all_nodes_ptr);
+	return 0;
+}
+
 #else
 
 /* Empty wrappers */
@@ -77,4 +88,10 @@ pg_numa_get_max_node(void)
 	return 0;
 }
 
+int
+pg_numa_interleave_memptr(void *ptr, size_t sz)
+{
+	return 0;					/* empty wrapper: does nothing, returns 0 */
+}
+
 #endif
-- 
2.39.5

