From 7ad8be9e522a9abc95c81c51da332dfb3edc47fc Mon Sep 17 00:00:00 2001
From: Jakub Wartak <jakub.wartak@enterprisedb.com>
Date: Fri, 21 Feb 2025 11:17:28 +0100
Subject: [PATCH v6 2/3] Extend pg_buffercache with new view
 pg_buffercache_numa to show NUMA zone for individual buffers.

Author: Jakub Wartak <jakub.wartak@enterprisedb.com>
Co-authored-by: Bertrand Drouvot <bertranddrouvot.pg@gmail.com>
Reviewed-by: Andres Freund <andres@anarazel.de>
Discussion: https://postgr.es/m/CAKZiRmxh6KWo0aqRqvmcoaX2jUxZYb4kGp3N%3Dq1w%2BDiH-696Xw%40mail.gmail.com
---
 contrib/pg_buffercache/Makefile               |   3 +-
 .../expected/pg_buffercache.out               |  30 +++-
 contrib/pg_buffercache/meson.build            |   1 +
 .../pg_buffercache--1.5--1.6.sql              |  35 ++++
 contrib/pg_buffercache/pg_buffercache.control |   2 +-
 contrib/pg_buffercache/pg_buffercache_pages.c | 154 +++++++++++++++++-
 contrib/pg_buffercache/sql/pg_buffercache.sql |  19 ++-
 src/backend/utils/misc/guc_tables.c           |   2 +-
 src/include/storage/pg_shmem.h                |   1 +
 9 files changed, 234 insertions(+), 13 deletions(-)
 create mode 100644 contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql

diff --git a/contrib/pg_buffercache/Makefile b/contrib/pg_buffercache/Makefile
index eae65ead9e5..2a33602537e 100644
--- a/contrib/pg_buffercache/Makefile
+++ b/contrib/pg_buffercache/Makefile
@@ -8,7 +8,8 @@ OBJS = \
 EXTENSION = pg_buffercache
 DATA = pg_buffercache--1.2.sql pg_buffercache--1.2--1.3.sql \
 	pg_buffercache--1.1--1.2.sql pg_buffercache--1.0--1.1.sql \
-	pg_buffercache--1.3--1.4.sql pg_buffercache--1.4--1.5.sql
+	pg_buffercache--1.3--1.4.sql pg_buffercache--1.4--1.5.sql \
+	pg_buffercache--1.5--1.6.sql
 PGFILEDESC = "pg_buffercache - monitoring of shared buffer cache in real-time"
 
 REGRESS = pg_buffercache
diff --git a/contrib/pg_buffercache/expected/pg_buffercache.out b/contrib/pg_buffercache/expected/pg_buffercache.out
index b745dc69eae..f34f137075e 100644
--- a/contrib/pg_buffercache/expected/pg_buffercache.out
+++ b/contrib/pg_buffercache/expected/pg_buffercache.out
@@ -8,6 +8,18 @@ from pg_buffercache;
  t
 (1 row)
 
+-- to ignore potential NOTICE:  libnuma initialization failed..
+SET client_min_messages TO warning ;
+select count(*) = (select setting::bigint
+                   from pg_settings
+                   where name = 'shared_buffers')
+from pg_buffercache_numa;
+ ?column? 
+----------
+ t
+(1 row)
+
+RESET client_min_messages;
 select buffers_used + buffers_unused > 0,
         buffers_dirty <= buffers_used,
         buffers_pinned <= buffers_used
@@ -28,12 +40,19 @@ SELECT count(*) > 0 FROM pg_buffercache_usage_counts() WHERE buffers >= 0;
 SET ROLE pg_database_owner;
 SELECT * FROM pg_buffercache;
 ERROR:  permission denied for view pg_buffercache
-SELECT * FROM pg_buffercache_pages() AS p (wrong int);
+SELECT * FROM pg_buffercache_pages(false) AS p (wrong int);
+ERROR:  permission denied for function pg_buffercache_pages
+SELECT * FROM pg_buffercache_pages(true) AS p (wrong int);
 ERROR:  permission denied for function pg_buffercache_pages
 SELECT * FROM pg_buffercache_summary();
 ERROR:  permission denied for function pg_buffercache_summary
 SELECT * FROM pg_buffercache_usage_counts();
 ERROR:  permission denied for function pg_buffercache_usage_counts
+-- to ignore potential NOTICE:  libnuma initialization failed..
+SET client_min_messages TO warning ;
+SELECT * FROM pg_buffercache_numa;
+ERROR:  permission denied for view pg_buffercache_numa
+RESET client_min_messages;
 RESET role;
 -- Check that pg_monitor is allowed to query view / function
 SET ROLE pg_monitor;
@@ -55,3 +74,12 @@ SELECT count(*) > 0 FROM pg_buffercache_usage_counts();
  t
 (1 row)
 
+-- to ignore potential NOTICE:  libnuma initialization failed..
+SET client_min_messages TO warning ;
+SELECT count(*) > 0 FROM pg_buffercache_numa;
+ ?column? 
+----------
+ t
+(1 row)
+
+RESET client_min_messages;
diff --git a/contrib/pg_buffercache/meson.build b/contrib/pg_buffercache/meson.build
index 12d1fe48717..9b2e9393410 100644
--- a/contrib/pg_buffercache/meson.build
+++ b/contrib/pg_buffercache/meson.build
@@ -23,6 +23,7 @@ install_data(
   'pg_buffercache--1.2.sql',
   'pg_buffercache--1.3--1.4.sql',
   'pg_buffercache--1.4--1.5.sql',
+  'pg_buffercache--1.5--1.6.sql',
   'pg_buffercache.control',
   kwargs: contrib_data_args,
 )
diff --git a/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql b/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql
new file mode 100644
index 00000000000..448d08196f3
--- /dev/null
+++ b/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql
@@ -0,0 +1,35 @@
+/* contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql */
+
+-- complain if script is sourced in psql, rather than via ALTER EXTENSION
+\echo Use "ALTER EXTENSION pg_buffercache UPDATE TO '1.6'" to load this file. \quit
+
+-- Drop the old zero-argument function; its dependent view must go first.
+DROP VIEW pg_buffercache;
+DROP FUNCTION pg_buffercache_pages();
+
+CREATE FUNCTION pg_buffercache_pages(boolean)
+RETURNS SETOF RECORD
+AS 'MODULE_PATHNAME', 'pg_buffercache_pages'
+LANGUAGE C PARALLEL SAFE;
+
+-- Recreate the plain view; it passes false to skip the NUMA inquiry.
+CREATE VIEW pg_buffercache AS
+	SELECT P.* FROM pg_buffercache_pages(false) AS P
+	(bufferid integer, relfilenode oid, reltablespace oid, reldatabase oid,
+	 relforknumber int2, relblocknumber int8, isdirty bool, usagecount int2,
+	 pinning_backends int4);
+
+CREATE VIEW pg_buffercache_numa AS
+	SELECT P.* FROM pg_buffercache_pages(true) AS P
+	(bufferid integer, relfilenode oid, reltablespace oid, reldatabase oid,
+	 relforknumber int2, relblocknumber int8, isdirty bool, usagecount int2,
+	 pinning_backends int4, numa_zone_id int4);
+
+-- Don't want these to be available to public.
+REVOKE ALL ON FUNCTION pg_buffercache_pages(boolean) FROM PUBLIC;
+REVOKE ALL ON pg_buffercache FROM PUBLIC;
+REVOKE ALL ON pg_buffercache_numa FROM PUBLIC;
+
+GRANT EXECUTE ON FUNCTION pg_buffercache_pages(boolean) TO pg_monitor;
+GRANT SELECT ON pg_buffercache TO pg_monitor;
+GRANT SELECT ON pg_buffercache_numa TO pg_monitor;
diff --git a/contrib/pg_buffercache/pg_buffercache.control b/contrib/pg_buffercache/pg_buffercache.control
index 5ee875f77dd..b030ba3a6fa 100644
--- a/contrib/pg_buffercache/pg_buffercache.control
+++ b/contrib/pg_buffercache/pg_buffercache.control
@@ -1,5 +1,5 @@
 # pg_buffercache extension
 comment = 'examine the shared buffer cache'
-default_version = '1.5'
+default_version = '1.6'
 module_pathname = '$libdir/pg_buffercache'
 relocatable = true
diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
index 3ae0a018e10..dfe53eb8471 100644
--- a/contrib/pg_buffercache/pg_buffercache_pages.c
+++ b/contrib/pg_buffercache/pg_buffercache_pages.c
@@ -6,6 +6,7 @@
  *	  contrib/pg_buffercache/pg_buffercache_pages.c
  *-------------------------------------------------------------------------
  */
+#include "pg_config.h"
 #include "postgres.h"
 
 #include "access/htup_details.h"
@@ -13,10 +14,12 @@
 #include "funcapi.h"
 #include "storage/buf_internals.h"
 #include "storage/bufmgr.h"
+#include "port/pg_numa.h"
+#include "storage/pg_shmem.h"
 
 
 #define NUM_BUFFERCACHE_PAGES_MIN_ELEM	8
-#define NUM_BUFFERCACHE_PAGES_ELEM	9
+#define NUM_BUFFERCACHE_PAGES_ELEM	10
 #define NUM_BUFFERCACHE_SUMMARY_ELEM 5
 #define NUM_BUFFERCACHE_USAGE_COUNTS_ELEM 4
 
@@ -43,6 +46,7 @@ typedef struct
 	 * because of bufmgr.c's PrivateRefCount infrastructure.
 	 */
 	int32		pinning_backends;
+	int32		numa_zone_id;
 } BufferCachePagesRec;
 
 
@@ -65,6 +69,17 @@ PG_FUNCTION_INFO_V1(pg_buffercache_summary);
 PG_FUNCTION_INFO_V1(pg_buffercache_usage_counts);
 PG_FUNCTION_INFO_V1(pg_buffercache_evict);
 
+/*
+ * Mark the NUMA zone of the first n records in fctx as invalid by storing
+ * -1 in their numa_zone_id field.
+ */
+static void
+pg_buffercache_mark_numa_invalid(BufferCachePagesContext *fctx, int n)
+{
+	for (int idx = 0; idx < n; idx++)
+		fctx->record[idx].numa_zone_id = -1;
+}
+
 Datum
 pg_buffercache_pages(PG_FUNCTION_ARGS)
 {
@@ -75,14 +90,33 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
 	TupleDesc	tupledesc;
 	TupleDesc	expected_tupledesc;
 	HeapTuple	tuple;
+	Buffer		query_numa = PG_GETARG_BOOL(0);
 
 	if (SRF_IS_FIRSTCALL())
 	{
-		int			i;
+		int			i,
+					blk2page,
+					j;
+		Size		os_page_size;
+		void	  **os_page_ptrs;
+		int		   *os_pages_status;
+		int			os_page_count;
+		float		pages_per_blk;
 
 		funcctx = SRF_FIRSTCALL_INIT();
 
-		/* Switch context when allocating stuff to be used in later calls */
+		if (query_numa)
+		{
+			if (pg_numa_init() == -1)
+			{
+				elog(NOTICE, "libnuma initialization failed or NUMA is not supported on this platform, some NUMA data might be unavailable.");
+				query_numa = false;
+			}
+		}
+
+		/*
+		 * Switch context when allocating stuff to be used in later calls
+		 */
 		oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
 
 		/* Create a user function context for cross-call persistence */
@@ -122,10 +156,14 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
 		TupleDescInitEntry(tupledesc, (AttrNumber) 8, "usage_count",
 						   INT2OID, -1, 0);
 
-		if (expected_tupledesc->natts == NUM_BUFFERCACHE_PAGES_ELEM)
+		if (expected_tupledesc->natts >= NUM_BUFFERCACHE_PAGES_ELEM - 1)
 			TupleDescInitEntry(tupledesc, (AttrNumber) 9, "pinning_backends",
 							   INT4OID, -1, 0);
 
+		if (expected_tupledesc->natts == NUM_BUFFERCACHE_PAGES_ELEM)
+			TupleDescInitEntry(tupledesc, (AttrNumber) 10, "numa_zone_id",
+							   INT4OID, -1, 0);
+
 		fctx->tupdesc = BlessTupleDesc(tupledesc);
 
 		/* Allocate NBuffers worth of BufferCachePagesRec records. */
@@ -137,9 +175,35 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
 		funcctx->max_calls = NBuffers;
 		funcctx->user_fctx = fctx;
 
-		/* Return to original context when allocating transient memory */
+		/*
+		 * Return to original context when allocating transient memory
+		 */
 		MemoryContextSwitchTo(oldcontext);
 
+		/*
+		 * This is for gathering some NUMA statistics. We might be using
+		 * various DB block sizes (4kB, 8kB, ..., 32kB) that end up being
+		 * allocated in various different OS memory page sizes, so first we
+		 * need to understand the OS memory page size before calling
+		 * move_pages().
+		 */
+		os_page_size = pg_numa_get_pagesize();
+		os_page_count = ((uint64)NBuffers * BLCKSZ) / os_page_size;
+		pages_per_blk = (float) BLCKSZ / os_page_size;
+
+		elog(DEBUG1, "NUMA os_page_count=%d os_page_size=%ld pages_per_blk=%f",
+			 os_page_count, os_page_size, pages_per_blk);
+
+		os_page_ptrs = palloc(sizeof(void *) * os_page_count);
+		os_pages_status = palloc(sizeof(int) * os_page_count);
+		memset(os_page_ptrs, 0, sizeof(void *) * os_page_count);
+
+		/*
+		 * If we ever get 0xff back from kernel inquiry, then we probably have
+		 * a bug in our buffer to OS page mapping code here
+		 */
+		memset(os_pages_status, 0xff, sizeof(int) * os_page_count);
+
 		/*
 		 * Scan through all the buffers, saving the relevant fields in the
 		 * fctx->record structure.
@@ -171,14 +235,79 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
 			else
 				fctx->record[i].isdirty = false;
 
-			/* Note if the buffer is valid, and has storage created */
+			/*
+			 * Note if the buffer is valid, and has storage created
+			 */
 			if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID))
 				fctx->record[i].isvalid = true;
 			else
 				fctx->record[i].isvalid = false;
 
+			if (query_numa)
+			{
+				blk2page = (int) i * pages_per_blk;
+				j = 0;
+				do
+				{
+					/*
+					 * Many buffers can point to the same page (in case of
+					 * BLCKSZ < 4kB), but we want to query only the first
+					 * address of each page.
+					 *
+					 * In order to get reliable results we also need to touch
+					 * memory pages, so that inquiry about NUMA zone doesn't
+					 * return -2.
+					 */
+					if (os_page_ptrs[blk2page + j] == 0)
+					{
+						volatile uint64 touch pg_attribute_unused();
+
+						/*
+						 * Buffer numbers really start from 1, hence i + 1
+						 */
+						os_page_ptrs[blk2page + j] = (char *) BufferGetBlock(i + 1) + (os_page_size * j);
+						pg_numa_touch_mem_if_required(touch, os_page_ptrs[blk2page + j]);
+
+						/*
+						 * Meant to check for interrupts once per 1GB scanned.
+						 * XXX: ONE_GIGABYTE lacks parentheses, so '%' misbinds.
+						 */
+#define ONE_GIGABYTE 1024*1024*1024
+						if ((i * os_page_size) % ONE_GIGABYTE == 0)
+							CHECK_FOR_INTERRUPTS();
+					}
+					j++;
+				} while (j < (int) pages_per_blk);
+			}
+
 			UnlockBufHdr(bufHdr, buf_state);
 		}
+
+
+		if (query_numa)
+		{
+			if (pg_numa_query_pages(0, os_page_count, os_page_ptrs, os_pages_status) == -1)
+				elog(ERROR, "failed NUMA pages inquiry: %m");
+
+			for (i = 0; i < NBuffers; i++)
+			{
+				blk2page = (int) i * pages_per_blk;
+
+				/*
+				 * Technically we can get errors too here and pass that to
+				 * user
+				 *
+				 * XXX:: also we could somehow report single DB block spanning
+				 * more than 2 NUMA zones, but it should be rare (?)
+				 */
+				fctx->record[i].numa_zone_id = os_pages_status[blk2page];
+			}
+		}
+		else
+			pg_buffercache_mark_numa_invalid(fctx, NBuffers);
+
+		pfree(os_page_ptrs);
+		pfree(os_pages_status);
 	}
 
 	funcctx = SRF_PERCALL_SETUP();
@@ -209,8 +338,12 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
 			nulls[5] = true;
 			nulls[6] = true;
 			nulls[7] = true;
-			/* unused for v1.0 callers, but the array is always long enough */
+
+			/*
+			 * unused for v1.0 callers, but the array is always long enough
+			 */
 			nulls[8] = true;
+			nulls[9] = true;
 		}
 		else
 		{
@@ -228,9 +361,14 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
 			nulls[6] = false;
 			values[7] = Int16GetDatum(fctx->record[i].usagecount);
 			nulls[7] = false;
-			/* unused for v1.0 callers, but the array is always long enough */
+
+			/*
+			 * unused for v1.0 callers, but the array is always long enough
+			 */
 			values[8] = Int32GetDatum(fctx->record[i].pinning_backends);
 			nulls[8] = false;
+			values[9] = Int32GetDatum(fctx->record[i].numa_zone_id);
+			nulls[9] = false;
 		}
 
 		/* Build and return the tuple. */
diff --git a/contrib/pg_buffercache/sql/pg_buffercache.sql b/contrib/pg_buffercache/sql/pg_buffercache.sql
index 944fbb1beae..7f2ce683e6c 100644
--- a/contrib/pg_buffercache/sql/pg_buffercache.sql
+++ b/contrib/pg_buffercache/sql/pg_buffercache.sql
@@ -5,6 +5,14 @@ select count(*) = (select setting::bigint
                    where name = 'shared_buffers')
 from pg_buffercache;
 
+-- to ignore potential NOTICE:  libnuma initialization failed..
+SET client_min_messages TO warning ;
+select count(*) = (select setting::bigint
+                   from pg_settings
+                   where name = 'shared_buffers')
+from pg_buffercache_numa;
+RESET client_min_messages;
+
 select buffers_used + buffers_unused > 0,
         buffers_dirty <= buffers_used,
         buffers_pinned <= buffers_used
@@ -16,9 +24,14 @@ SELECT count(*) > 0 FROM pg_buffercache_usage_counts() WHERE buffers >= 0;
 -- having to create a dedicated user, use the pg_database_owner pseudo-role.
 SET ROLE pg_database_owner;
 SELECT * FROM pg_buffercache;
-SELECT * FROM pg_buffercache_pages() AS p (wrong int);
+SELECT * FROM pg_buffercache_pages(false) AS p (wrong int);
+SELECT * FROM pg_buffercache_pages(true) AS p (wrong int);
 SELECT * FROM pg_buffercache_summary();
 SELECT * FROM pg_buffercache_usage_counts();
+-- to ignore potential NOTICE:  libnuma initialization failed..
+SET client_min_messages TO warning ;
+SELECT * FROM pg_buffercache_numa;
+RESET client_min_messages;
 RESET role;
 
 -- Check that pg_monitor is allowed to query view / function
@@ -26,3 +39,7 @@ SET ROLE pg_monitor;
 SELECT count(*) > 0 FROM pg_buffercache;
 SELECT buffers_used + buffers_unused > 0 FROM pg_buffercache_summary();
 SELECT count(*) > 0 FROM pg_buffercache_usage_counts();
+-- to ignore potential NOTICE:  libnuma initialization failed..
+SET client_min_messages TO warning ;
+SELECT count(*) > 0 FROM pg_buffercache_numa;
+RESET client_min_messages;
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index 03a6dd49154..172309d389a 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -562,7 +562,7 @@ static int	ssl_renegotiation_limit;
  */
 int			huge_pages = HUGE_PAGES_TRY;
 int			huge_page_size;
-static int	huge_pages_status = HUGE_PAGES_UNKNOWN;
+int			huge_pages_status = HUGE_PAGES_UNKNOWN;
 
 /*
  * These variables are all dummies that don't do anything, except in some
diff --git a/src/include/storage/pg_shmem.h b/src/include/storage/pg_shmem.h
index b99ebc9e86f..5f7d4b83a60 100644
--- a/src/include/storage/pg_shmem.h
+++ b/src/include/storage/pg_shmem.h
@@ -45,6 +45,7 @@ typedef struct PGShmemHeader	/* standard header for all Postgres shmem */
 extern PGDLLIMPORT int shared_memory_type;
 extern PGDLLIMPORT int huge_pages;
 extern PGDLLIMPORT int huge_page_size;
+extern PGDLLIMPORT int huge_pages_status;
 
 /* Possible values for huge_pages and huge_pages_status */
 typedef enum
-- 
2.39.5

