From da7fa0b8af9b75108bd4f0b50b25bdf1a2167473 Mon Sep 17 00:00:00 2001
From: Jakub Wartak <jakub.wartak@enterprisedb.com>
Date: Tue, 24 Jun 2025 11:23:36 +0200
Subject: [PATCH v4] Add capability to interleave shared memory across multiple
 NUMA nodes.

Introduce a new GUC, numa=off(default)/auto/all/../=../@.., that can be used to
enable interleaving of shared memory. Until now, imbalanced shared memory
allocations on NUMA systems could cause non-deterministic performance
due to differences in latencies and bandwidths across interconnects ("remote"
access).

When given a list of nodes, the default is to interleave memory on the
preferred NUMA nodes. Stricter modes, pinning memory or pinning both memory
and CPUs to specific NUMA node(s), are selected with the special '=' and '@'
prefixes.

This is only supported on Linux with libnuma.

Author: Jakub Wartak <jakub.wartak@enterprisedb.com>
Inspired-by: Andres Freund <andres@anarazel.de>
Reviewed-by:
Discussion: https://postgr.es/m/CAKZiRmw6i1W1AwXxa-Asrn8wrVcVH3TO715g_MCoowTS9rkGyw%40mail.gmail.com
---
 doc/src/sgml/config.sgml                      |  26 ++++
 src/backend/port/sysv_shmem.c                 |  22 ++++
 src/backend/postmaster/postmaster.c           |  17 +++
 src/backend/storage/ipc/dsm_impl.c            |  13 ++
 src/backend/storage/ipc/shmem.c               |  76 ++++++++++++
 src/backend/utils/misc/guc_tables.c           |  14 +++
 src/backend/utils/misc/postgresql.conf.sample |   2 +
 src/include/port/pg_numa.h                    |  13 ++
 src/include/storage/pg_shmem.h                |  19 +++
 src/include/utils/guc_hooks.h                 |   2 +
 src/port/pg_numa.c                            | 114 +++++++++++++++++-
 11 files changed, 317 insertions(+), 1 deletion(-)

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index b265cc89c9d..0ab5c519624 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -2329,6 +2329,32 @@ include_dir 'conf.d'
       </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-numa" xreflabel="numa">
+      <term><varname>numa</varname> (<type>string</type>)
+      <indexterm>
+       <primary><varname>numa</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        Specifies whether to use a NUMA interleaving policy for the shared memory
+        segment. Possible values are <literal>off</literal>,
+        <literal>all</literal> (interleaves shared memory across all available NUMA nodes),
+        <literal>auto</literal> (like <literal>all</literal>, but only if there are 2 or more available NUMA nodes),
+        or <literal>[=@]comma-separated list of node numbers or node ranges</literal>.
+
+        If the comma-separated list of NUMA nodes is prefixed with <literal>=</literal>, the memory allocations
+        are made strict to avoid spilling to other NUMA nodes.
+        If the comma-separated list of NUMA nodes is prefixed with <literal>@</literal>, the memory allocations
+        are made strict and the available CPUs are also limited to those of the listed NUMA nodes.
+
+        This parameter is only effective on Linux. Parallel Query interleaving is
+        only supported with <literal>dynamic_shared_memory_type</literal>=<literal>posix</literal>.
+        The default value is <literal>off</literal>. This parameter can only be
+        set at server start.
+       </para>
+      </listitem>
+     </varlistentry>
      </variablelist>
      </sect2>
 
diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c
index 197926d44f6..77af7c56ecd 100644
--- a/src/backend/port/sysv_shmem.c
+++ b/src/backend/port/sysv_shmem.c
@@ -29,6 +29,7 @@
 
 #include "miscadmin.h"
 #include "port/pg_bitutils.h"
+#include "port/pg_numa.h"
 #include "portability/mem.h"
 #include "storage/dsm.h"
 #include "storage/fd.h"
@@ -663,6 +664,27 @@ CreateAnonymousSegment(Size *size)
 						 allocsize) : 0));
 	}
 
+	if (numa->setting > NUMA_OFF)
+	{
+		/* Strict modes ('=' and '@') must not spill memory to other NUMA nodes. */
+		int			mem_bind_policy = numa->setting >= NUMA_STRICT_ONLY ? 1 : 0;
+
+		/*
+		 * pg_numa_get_max_node() is the highest node number, not a node count,
+		 * so a single-node machine reports 0 and "auto" must skip only then.
+		 */
+		if (numa->setting == NUMA_AUTO && pg_numa_get_max_node() < 1)
+			elog(DEBUG1, "no NUMA nodes found");
+		else
+		{
+			elog(LOG, "enabling NUMA shm interleaving");
+			pg_numa_interleave_memptr(ptr, allocsize, numa->nodes);
+			pg_numa_set_bind_policy(mem_bind_policy);
+			if (numa->setting >= NUMA_STRICT_ONLY_AND_CPU_TOO)
+				pg_numa_bind(numa->nodes);	/* '@': also restrict CPUs */
+		}
+	}
+
 	*size = allocsize;
 	return ptr;
 }
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index 490f7ce3664..bc9e3da8fa7 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -100,6 +100,7 @@
 #include "pg_getopt.h"
 #include "pgstat.h"
 #include "port/pg_bswap.h"
+#include "port/pg_numa.h"
 #include "postmaster/autovacuum.h"
 #include "postmaster/bgworker_internals.h"
 #include "postmaster/pgarch.h"
@@ -113,6 +114,7 @@
 #include "storage/fd.h"
 #include "storage/io_worker.h"
 #include "storage/ipc.h"
+#include "storage/pg_shmem.h"
 #include "storage/pmsignal.h"
 #include "storage/proc.h"
 #include "tcop/backend_startup.h"
@@ -453,6 +455,7 @@ static void StartSysLogger(void);
 static void StartAutovacuumWorker(void);
 static bool StartBackgroundWorker(RegisteredBgWorker *rw);
 static void InitPostmasterDeathWatchHandle(void);
+static void InitNuma(void);
 
 #ifdef WIN32
 #define WNOHANG 0				/* ignored, so any integer value will do */
@@ -993,6 +996,9 @@ PostmasterMain(int argc, char *argv[])
 		ExitPostmaster(0);
 	}
 
+	/* Initialize libnuma if necessary */
+	InitNuma();
+
 	/*
 	 * Set up shared memory and semaphores.
 	 *
@@ -4616,3 +4622,14 @@ InitPostmasterDeathWatchHandle(void)
 								 GetLastError())));
 #endif							/* WIN32 */
 }
+
+
+/* Initialize libnuma, but only when the "numa" setting is not off. */
+static void
+InitNuma(void)
+{
+	bool		numa_requested = (numa->setting > NUMA_OFF);
+
+	if (numa_requested && pg_numa_init() == -1)
+		elog(ERROR, "libnuma initialization failed or NUMA is not supported on this platform");
+}
diff --git a/src/backend/storage/ipc/dsm_impl.c b/src/backend/storage/ipc/dsm_impl.c
index 6bf8ab5bb5b..46dcef48394 100644
--- a/src/backend/storage/ipc/dsm_impl.c
+++ b/src/backend/storage/ipc/dsm_impl.c
@@ -64,8 +64,10 @@
 #include "pgstat.h"
 #include "portability/mem.h"
 #include "postmaster/postmaster.h"
+#include "port/pg_numa.h"
 #include "storage/dsm_impl.h"
 #include "storage/fd.h"
+#include "storage/pg_shmem.h"
 #include "utils/guc.h"
 #include "utils/memutils.h"
 
@@ -334,6 +336,13 @@ dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
 	}
 	*mapped_address = address;
 	*mapped_size = request_size;
+
+	/* We interleave memory only at creation time. */
+	if (op == DSM_OP_CREATE && numa->setting > NUMA_OFF) {
+		elog(DEBUG1, "interleaving shm mem @ %p size=%zu", *mapped_address, *mapped_size);
+		pg_numa_interleave_memptr(*mapped_address, *mapped_size, numa->nodes);
+	}
+
 	close(fd);
 	ReleaseExternalFD();
 
@@ -588,6 +597,8 @@ dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size,
 	*mapped_address = address;
 	*mapped_size = request_size;
 
+	/* As dynamic_shared_memory_type=sysv is a bit legacy, we do not perform NUMA interleave here */
+
 	return true;
 }
 #endif
@@ -937,6 +948,8 @@ dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size,
 	*mapped_address = address;
 	*mapped_size = request_size;
 
+	/* As dynamic_shared_memory_type=mmap is a bit legacy, we do not perform NUMA interleave here */
+
 	if (CloseTransientFile(fd) != 0)
 	{
 		ereport(elevel,
diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c
index c9ae3b45b76..bac84492e79 100644
--- a/src/backend/storage/ipc/shmem.c
+++ b/src/backend/storage/ipc/shmem.c
@@ -74,6 +74,10 @@
 #include "storage/shmem.h"
 #include "storage/spin.h"
 #include "utils/builtins.h"
+#include "utils/guc.h"
+#include "utils/guc_hooks.h"
+#include <ctype.h>
+#include <numa.h>
 
 static void *ShmemAllocRaw(Size size, Size *allocated_size);
 
@@ -765,3 +769,75 @@ pg_numa_available(PG_FUNCTION_ARGS)
 {
 	PG_RETURN_BOOL(pg_numa_init() != -1);
 }
+
+/*
+ * GUC check hook for "numa": accepts "", "off", "all", "auto", or a node list
+ * optionally prefixed with '=' or '@'; the parsed result goes out via *extra.
+ */
+bool
+check_numa(char **newval, void **extra, GucSource source)
+{
+	NumaConfigData *n = (NumaConfigData *) guc_malloc(LOG, sizeof(NumaConfigData));
+	char	   *rawstring = *newval;
+
+	if (n == NULL)				/* guc_malloc(LOG) signals OOM by returning NULL */
+		return false;
+
+	n->setting = NUMA_OFF;		/* start from a fully defined "off" state */
+	n->nodes = NULL;
+
+#ifndef USE_LIBNUMA
+	if (!(strcmp(rawstring, "") == 0 || pg_strcasecmp(rawstring, "off") == 0))
+	{
+		GUC_check_errdetail("\"%s\" is not supported on this platform.", "numa");
+		guc_free(n);			/* don't leak the struct on rejection */
+		return false;
+	}
+#else
+	if (strcmp(rawstring, "") == 0)
+		n->setting = DEFAULT_NUMA;
+	else if (pg_strcasecmp(rawstring, "off") == 0)
+		n->setting = NUMA_OFF;
+	else if (pg_strcasecmp(rawstring, "all") == 0)
+		n->setting = NUMA_ALL;
+	else if (pg_strcasecmp(rawstring, "auto") == 0)
+		n->setting = NUMA_AUTO;
+	else if (isdigit((unsigned char) rawstring[0]))
+		n->setting = NUMA_PREFERRED;	/* bare list means preferred nodes */
+	else if (rawstring[0] == '=')
+		n->setting = NUMA_STRICT_ONLY;
+	else if (rawstring[0] == '@')
+		n->setting = NUMA_STRICT_ONLY_AND_CPU_TOO;
+	else
+	{
+		GUC_check_errdetail("Invalid option \"%s\".", rawstring);
+		guc_free(n);
+		return false;
+	}
+
+	if (n->setting == NUMA_ALL || n->setting == NUMA_AUTO)
+		n->nodes = numa_all_nodes_ptr;
+	else if (n->setting >= NUMA_PREFERRED)
+	{
+		/* Skip the one-character '=' or '@' prefix of the strict modes. */
+		const char *s = (n->setting >= NUMA_STRICT_ONLY) ? rawstring + 1 : rawstring;
+
+		n->nodes = pg_numa_parse_nodestring(s);
+		if (n->nodes == NULL)
+		{
+			GUC_check_errdetail("Invalid list syntax in parameter \"%s\".", "numa");
+			guc_free(n);
+			return false;
+		}
+	}
+#endif
+
+	*extra = n;
+	return true;
+}
+
+void
+assign_numa(const char *newval, void *extra)
+{
+	numa = (NumaConfigData *) extra;	/* publish the struct check_numa() stored in *extra */
+}
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index f04bfedb2fd..65b7ab7b5b0 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -491,6 +491,7 @@ static const struct config_enum_entry file_copy_method_options[] = {
 	{NULL, 0, false}
 };
 
+
 /*
  * Options for enum values stored in other modules
  */
@@ -580,6 +581,8 @@ int			huge_pages = HUGE_PAGES_TRY;
 int			huge_page_size;
 int			huge_pages_status = HUGE_PAGES_UNKNOWN;
 
+NumaConfigData *numa;
+
 /*
  * These variables are all dummies that don't do anything, except in some
  * cases provide the value for SHOW to display.  The real state is elsewhere
@@ -594,6 +597,7 @@ static char *server_version_string;
 static int	server_version_num;
 static char *debug_io_direct_string;
 static char *restrict_nonsystem_relation_kind_string;
+static char *numa_string;
 
 #ifdef HAVE_SYSLOG
 #define	DEFAULT_SYSLOG_FACILITY LOG_LOCAL0
@@ -4984,6 +4988,16 @@ struct config_string ConfigureNamesString[] =
 		check_log_connections, assign_log_connections, NULL
 	},
 
+	{
+		{"numa", PGC_POSTMASTER, RESOURCES_MEM,
+			gettext_noop("Whether to enable NUMA optimizations."),
+			NULL
+		},
+		&numa_string,
+		"",
+		check_numa, assign_numa, NULL
+	},
+
 
 	/* End-of-list marker */
 	{
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 341f88adc87..d9e0c165a94 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -135,6 +135,8 @@
 					# (change requires restart)
 #huge_page_size = 0			# zero for system default
 					# (change requires restart)
+#numa = off				# off, all, auto, or comma list of NUMA nodes
+					# (change requires restart)
 #temp_buffers = 8MB			# min 800kB
 #max_prepared_transactions = 0		# zero disables the feature
 					# (change requires restart)
diff --git a/src/include/port/pg_numa.h b/src/include/port/pg_numa.h
index 40f1d324dcf..567cef3c505 100644
--- a/src/include/port/pg_numa.h
+++ b/src/include/port/pg_numa.h
@@ -14,9 +14,19 @@
 #ifndef PG_NUMA_H
 #define PG_NUMA_H
 
+#ifdef USE_LIBNUMA				/* libnuma headers exist only on such builds */
+#include <numa.h>
+#include <numaif.h>
+#endif
+/* libnuma's node set; just an opaque forward declaration without libnuma */
+typedef struct bitmask pg_numa_bitmask_t;
 extern PGDLLIMPORT int pg_numa_init(void);
 extern PGDLLIMPORT int pg_numa_query_pages(int pid, unsigned long count, void **pages, int *status);
 extern PGDLLIMPORT int pg_numa_get_max_node(void);
+extern PGDLLIMPORT int pg_numa_interleave_memptr(void *ptr, size_t sz, pg_numa_bitmask_t *mask);
+extern PGDLLIMPORT pg_numa_bitmask_t *pg_numa_parse_nodestring(const char *string);
+extern PGDLLIMPORT void pg_numa_set_bind_policy(int strict);
+extern PGDLLIMPORT void pg_numa_bind(pg_numa_bitmask_t *nodemask);
 
 #ifdef USE_LIBNUMA
 
@@ -27,6 +37,9 @@ extern PGDLLIMPORT int pg_numa_get_max_node(void);
 #define pg_numa_touch_mem_if_required(ro_volatile_var, ptr) \
 	ro_volatile_var = *(volatile uint64 *) ptr
 
+extern void numa_warn(int num, char *fmt,...) pg_attribute_printf(2, 3);
+extern void numa_error(char *where);
+
 #else
 
 #define pg_numa_touch_mem_if_required(ro_volatile_var, ptr) \
diff --git a/src/include/storage/pg_shmem.h b/src/include/storage/pg_shmem.h
index 5f7d4b83a60..0c95fc4cdd0 100644
--- a/src/include/storage/pg_shmem.h
+++ b/src/include/storage/pg_shmem.h
@@ -25,6 +25,7 @@
 #define PG_SHMEM_H
 
 #include "storage/dsm_impl.h"
+#include "port/pg_numa.h"
 
 typedef struct PGShmemHeader	/* standard header for all Postgres shmem */
 {
@@ -41,11 +42,17 @@ typedef struct PGShmemHeader	/* standard header for all Postgres shmem */
 #endif
 } PGShmemHeader;
 
+typedef struct NumaConfigData {
+	int				  setting;	/* one of the NumaType values, chosen by check_numa() */
+	pg_numa_bitmask_t *nodes;	/* node set for the all/auto/node-list settings */
+} NumaConfigData;
+
 /* GUC variables */
 extern PGDLLIMPORT int shared_memory_type;
 extern PGDLLIMPORT int huge_pages;
 extern PGDLLIMPORT int huge_page_size;
 extern PGDLLIMPORT int huge_pages_status;
+extern PGDLLIMPORT NumaConfigData *numa;
 
 /* Possible values for huge_pages and huge_pages_status */
 typedef enum
@@ -64,6 +71,18 @@ typedef enum
 	SHMEM_TYPE_MMAP,
 }			PGShmemType;
 
+typedef enum
+{
+	NUMA_OFF,					/* "off" or empty string: no NUMA handling */
+	NUMA_ALL,					/* "all": use all available nodes */
+	NUMA_AUTO,					/* "auto": as "all", but may be skipped at startup */
+	NUMA_PREFERRED,				/* bare node list: preferred nodes */
+	NUMA_STRICT_ONLY,			/* "=" prefix: strict memory placement */
+	NUMA_STRICT_ONLY_AND_CPU_TOO,	/* "@" prefix: strict memory, CPUs bound too */
+}			NumaType;			/* order matters: callers compare with > and >= */
+
+#define DEFAULT_NUMA NUMA_OFF
+
 #ifndef WIN32
 extern PGDLLIMPORT unsigned long UsedShmemSegID;
 #else
diff --git a/src/include/utils/guc_hooks.h b/src/include/utils/guc_hooks.h
index 799fa7ace68..854a7dd02b4 100644
--- a/src/include/utils/guc_hooks.h
+++ b/src/include/utils/guc_hooks.h
@@ -94,6 +94,8 @@ extern bool check_multixact_member_buffers(int *newval, void **extra,
 extern bool check_multixact_offset_buffers(int *newval, void **extra,
 										   GucSource source);
 extern bool check_notify_buffers(int *newval, void **extra, GucSource source);
+extern bool check_numa(char **newval, void **extra, GucSource source);
+extern void assign_numa(const char *newval, void *extra);
 extern bool check_primary_slot_name(char **newval, void **extra,
 									GucSource source);
 extern bool check_random_seed(double *newval, void **extra, GucSource source);
diff --git a/src/port/pg_numa.c b/src/port/pg_numa.c
index 4b487a2a4e8..6956f33ef44 100644
--- a/src/port/pg_numa.c
+++ b/src/port/pg_numa.c
@@ -13,10 +13,17 @@
  *-------------------------------------------------------------------------
  */
 
-#include "c.h"
+//JW:is this legal to replace "c.h" with below:
+#ifndef FRONTEND
+#include "postgres.h"
+#else
+#include "postgres_fe.h"
+#endif
+
 #include <unistd.h>
 
 #include "port/pg_numa.h"
+#include "common/string.h"
 
 /*
  * At this point we provide support only for Linux thanks to libnuma, but in
@@ -55,6 +62,87 @@ pg_numa_get_max_node(void)
 	return numa_max_node();
 }
 
+int
+pg_numa_interleave_memptr(void *ptr, size_t sz, pg_numa_bitmask_t *mask)
+{
+	numa_interleave_memory(ptr, sz, mask);	/* result-less libnuma call */
+	return 0;					/* hence this wrapper always returns 0 */
+}
+
+pg_numa_bitmask_t *
+pg_numa_parse_nodestring(const char *string)
+{
+	return numa_parse_nodestring(string);	/* check_numa() treats NULL as bad syntax */
+}
+
+void
+pg_numa_set_bind_policy(int strict)
+{
+	numa_set_bind_policy(strict);	/* caller passes 1 for the strict modes, else 0 */
+}
+
+void
+pg_numa_bind(pg_numa_bitmask_t *nodemask)
+{
+	numa_bind(nodemask);		/* called for the '@' setting, which also limits CPUs */
+}
+
+#ifndef FRONTEND
+/*
+ * libnuma reports problems through numa_warn() and numa_error(); these
+ * definitions replace the library's built-in ones, which can be seen at
+ * https://github.com/numactl/numactl/blob/master/libnuma.c
+ *
+ * numa_warn() formats its printf-style arguments and emits them as a
+ * WARNING.  "num" is unused, and errno is left as the caller had it.
+ */
+void
+numa_warn(int num, char *fmt,...)
+{
+	int			save_errno = errno;
+	StringInfoData buf;
+
+	initStringInfo(&buf);
+
+	/* Retry until the text fits; appendStringInfoVA returns 0 on success. */
+	for (;;)
+	{
+		va_list		args;
+		int			needed;
+
+		va_start(args, fmt);
+		needed = appendStringInfoVA(&buf, fmt, args);
+		va_end(args);
+		if (needed == 0)
+			break;
+		enlargeStringInfo(&buf, needed);
+	}
+
+	pg_strip_crlf(buf.data);	/* chomp last newline character */
+	ereport(WARNING,
+			(errcode(ERRCODE_EXTERNAL_ROUTINE_EXCEPTION),
+			 errmsg_internal("libnuma: %s", buf.data)));
+	pfree(buf.data);
+	errno = save_errno;
+}
+
+/* libnuma error hook: report "where" as a WARNING, leaving errno intact. */
+void
+numa_error(char *where)
+{
+	int			save_errno = errno;
+
+	pg_strip_crlf(where);		/* chomp last newline character */
+
+	/*
+	 * XXX: for now we issue just WARNING, but long-term that might depend on
+	 * numa_set_strict() here.
+	 */
+	elog(WARNING, "libnuma: %s", where);
+	errno = save_errno;
+}
+#endif							/* FRONTEND */
+
 #else
 
 /* Empty wrappers */
@@ -77,4 +165,28 @@ pg_numa_get_max_node(void)
 	return 0;
 }
 
+int
+pg_numa_interleave_memptr(void *ptr, size_t sz, pg_numa_bitmask_t *mask)
+{
+	return 0;					/* empty wrapper: arguments are ignored */
+}
+
+pg_numa_bitmask_t *
+pg_numa_parse_nodestring(const char *string)
+{
+	return NULL;				/* empty wrapper: never yields a node set */
+}
+
+void
+pg_numa_set_bind_policy(int strict)
+{
+	return;						/* empty wrapper: nothing to configure */
+}
+
+void
+pg_numa_bind(pg_numa_bitmask_t *nodemask)
+{
+	return;						/* empty wrapper: nothing to bind */
+}
+
 #endif
-- 
2.39.5

