From 995011841cde76e530e2ff12452f54e8b8da5923 Mon Sep 17 00:00:00 2001
From: Jakub Wartak <jakub.wartak@enterprisedb.com>
Date: Fri, 21 Feb 2025 14:20:18 +0100
Subject: [PATCH v3 3/3] Add pg_shmem_numa_allocations

---
 src/backend/catalog/system_views.sql |   8 ++
 src/backend/storage/ipc/shmem.c      | 120 +++++++++++++++++++++++++++
 src/include/catalog/pg_proc.dat      |   8 ++
 3 files changed, 136 insertions(+)

diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index eff0990957e..c808fb82d75 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -658,6 +658,14 @@ GRANT SELECT ON pg_shmem_allocations TO pg_read_all_stats;
 REVOKE EXECUTE ON FUNCTION pg_get_shmem_allocations() FROM PUBLIC;
 GRANT EXECUTE ON FUNCTION pg_get_shmem_allocations() TO pg_read_all_stats;
 
+CREATE VIEW pg_shmem_numa_allocations AS
+    SELECT * FROM pg_get_shmem_numa_allocations();
+-- Restricted to pg_read_all_stats, same as pg_shmem_allocations above.
+REVOKE ALL ON pg_shmem_numa_allocations FROM PUBLIC;
+GRANT SELECT ON pg_shmem_numa_allocations TO pg_read_all_stats;
+REVOKE EXECUTE ON FUNCTION pg_get_shmem_numa_allocations() FROM PUBLIC;
+GRANT EXECUTE ON FUNCTION pg_get_shmem_numa_allocations() TO pg_read_all_stats;
+
 CREATE VIEW pg_backend_memory_contexts AS
     SELECT * FROM pg_get_backend_memory_contexts();
 
diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c
index 895a43fb39e..c8881d98e05 100644
--- a/src/backend/storage/ipc/shmem.c
+++ b/src/backend/storage/ipc/shmem.c
@@ -73,6 +73,11 @@
 #include "storage/shmem.h"
 #include "storage/spin.h"
 #include "utils/builtins.h"
+#ifdef USE_LIBNUMA
+#include <numa.h>
+#include <numaif.h>
+#include <unistd.h>
+#endif
 
 static void *ShmemAllocRaw(Size size, Size *allocated_size);
 
@@ -568,3 +573,142 @@ pg_get_shmem_allocations(PG_FUNCTION_ARGS)
 
 	return (Datum) 0;
 }
+
+/*
+ * SQL SRF showing how shared memory allocations are spread over NUMA zones.
+ *
+ * For every entry of the shmem index, emits one (name, numa_zone_id,
+ * numa_size) row per NUMA zone, numa_size being the number of bytes of OS
+ * pages overlapping the entry that reside on that zone.  Every such page is
+ * read once on the way, because the inquiry cannot report a zone for a page
+ * that was never touched.
+ *
+ * Returns an empty set (plus a NOTICE or WARNING) if libnuma cannot be
+ * initialized or was not compiled in; raises ERROR if the page inquiry fails.
+ */
+Datum
+pg_get_shmem_numa_allocations(PG_FUNCTION_ARGS)
+{
+#define PG_GET_SHMEM_NUMA_SIZES_COLS 3
+#ifdef USE_LIBNUMA
+	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+	HASH_SEQ_STATUS hstat;
+	ShmemIndexEnt *ent;
+	Datum		values[PG_GET_SHMEM_NUMA_SIZES_COLS];
+	bool		nulls[PG_GET_SHMEM_NUMA_SIZES_COLS];
+	Size		os_page_size;
+	Size	   *zones;
+	int			max_node;
+
+	/* Set up the result first, so that every exit below returns a valid set. */
+	InitMaterializedSRF(fcinfo, 0);
+
+	/* According to numa(3) the library must be initialized before any use. */
+	if (numa_available() == -1)
+	{
+		elog(NOTICE, "libnuma initialization failed, some NUMA data might be unavailable.");
+		return (Datum) 0;
+	}
+
+	/*
+	 * The inquiry works on OS memory pages, so we need the size of the pages
+	 * backing the segment: the huge page size when huge pages are in use.
+	 */
+	os_page_size = sysconf(_SC_PAGESIZE);
+	if (huge_pages_status == HUGE_PAGES_ON)
+		GetHugePageSize(&os_page_size, NULL);
+
+	/* Size the per-zone counters from the system instead of a fixed guess. */
+	max_node = numa_max_node();
+	zones = palloc(sizeof(Size) * (max_node + 1));
+
+	LWLockAcquire(ShmemIndexLock, LW_SHARED);
+
+	hash_seq_init(&hstat, ShmemIndex);
+
+	/* output all allocated entries */
+	memset(nulls, 0, sizeof(nulls));
+	while ((ent = (ShmemIndexEnt *) hash_seq_search(&hstat)) != NULL)
+	{
+		int			i;
+		char	   *startptr;
+		char	   *endptr;
+		void	  **os_page_ptrs;
+		int		   *os_pages_status;
+		int			os_page_count;
+
+		/*
+		 * Entries are neither page-aligned nor page-sized, so cover every OS
+		 * page they overlap; rounding down would drop small entries entirely.
+		 */
+		startptr = (char *) TYPEALIGN_DOWN(os_page_size, ent->location);
+		endptr = (char *) TYPEALIGN(os_page_size,
+									(char *) ent->location + ent->allocated_size);
+		os_page_count = (endptr - startptr) / os_page_size;
+
+		os_page_ptrs = palloc(sizeof(void *) * os_page_count);
+		os_pages_status = palloc(sizeof(int) * os_page_count);
+		/* Prefill with -1 so a page the kernel skipped is never counted. */
+		memset(os_pages_status, 0xff, sizeof(int) * os_page_count);
+
+		for (i = 0; i < os_page_count; i++)
+		{
+			/*
+			 * Touch the page, otherwise the inquiry returns -2 instead of a
+			 * NUMA zone for pages that were never faulted in.
+			 */
+			volatile uint64 touch pg_attribute_unused();
+
+			os_page_ptrs[i] = startptr + (i * os_page_size);
+			touch = *(volatile uint64 *) os_page_ptrs[i];
+		}
+
+		/* One call per entry amortizes the cost of the inquiry. */
+		if (numa_move_pages(0, os_page_count, os_page_ptrs, NULL,
+							os_pages_status, 0) == -1)
+			elog(ERROR, "failed NUMA pages inquiry status");
+
+		/* Count the pages of this entry found on each NUMA zone. */
+		memset(zones, 0, sizeof(Size) * (max_node + 1));
+		for (i = 0; i < os_page_count; i++)
+		{
+			int			s = os_pages_status[i];
+
+			if (s >= 0 && s <= max_node)
+				zones[s]++;
+		}
+
+		pfree(os_page_ptrs);
+		pfree(os_pages_status);
+
+		for (i = 0; i <= max_node; i++)
+		{
+			values[0] = CStringGetTextDatum(ent->key);
+			/* numa_zone_id is declared int8, so build a proper int8 Datum */
+			values[1] = Int64GetDatum(i);
+			values[2] = Int64GetDatum(zones[i] * os_page_size);
+
+			tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
+								 values, nulls);
+		}
+	}
+
+	/*
+	 * XXX: unlike pg_get_shmem_allocations(), shared memory not counted via the
+	 * shmem index and as-of-yet unused shared memory are not reported here.
+	 */
+
+	LWLockRelease(ShmemIndexLock);
+	pfree(zones);
+#else
+	/* Still hand back a valid (empty) set rather than a bare (Datum) 0. */
+	InitMaterializedSRF(fcinfo, 0);
+	ereport(WARNING,
+			(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+			 errmsg("NUMA support is not available"),
+			 errdetail("NUMA zone information is not available on this platform due to lack of libnuma"),
+			 errhint("It looks like you need to re-compile with libnuma packages available")));
+#endif
+
+	return (Datum) 0;
+}
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 9e803d610d7..1efa342b725 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -8463,6 +8463,14 @@
   proargnames => '{name,off,size,allocated_size}',
   prosrc => 'pg_get_shmem_allocations' },
 
+# per-NUMA-zone breakdown of shared memory allocations (volatile SRF)
+{ oid => '5101', descr => 'NUMA mappings for the main shared memory segment',
+  proname => 'pg_get_shmem_numa_allocations', prorows => '50', proretset => 't',
+  provolatile => 'v', prorettype => 'record', proargtypes => '',
+  proallargtypes => '{text,int8,int8}', proargmodes => '{o,o,o}',
+  proargnames => '{name,numa_zone_id,numa_size}',
+  prosrc => 'pg_get_shmem_numa_allocations' },
+
 # memory context of local backend
 { oid => '2282',
   descr => 'information about all memory contexts of local backend',
-- 
2.39.5

