From a920535a78df30661e10936e4750ad4c3bfc1818 Mon Sep 17 00:00:00 2001
From: Jakub Wartak <jakub.wartak@enterprisedb.com>
Date: Wed, 19 Mar 2025 09:34:56 +0100
Subject: [PATCH v17 3/4] Extend pg_buffercache with new view
 pg_buffercache_numa to show NUMA memory node for individual buffer.

Author: Jakub Wartak <jakub.wartak@enterprisedb.com>
Reviewed-by: Andres Freund <andres@anarazel.de>
Reviewed-by: Bertrand Drouvot <bertranddrouvot.pg@gmail.com>
Discussion: https://postgr.es/m/CAKZiRmxh6KWo0aqRqvmcoaX2jUxZYb4kGp3N%3Dq1w%2BDiH-696Xw%40mail.gmail.com
---
 contrib/pg_buffercache/Makefile               |   3 +-
 .../expected/pg_buffercache_numa.out          |  28 +++
 .../expected/pg_buffercache_numa_1.out        |   3 +
 contrib/pg_buffercache/meson.build            |   2 +
 .../pg_buffercache--1.5--1.6.sql              |  39 ++++
 contrib/pg_buffercache/pg_buffercache.control |   2 +-
 contrib/pg_buffercache/pg_buffercache_pages.c | 170 +++++++++++++++++-
 .../sql/pg_buffercache_numa.sql               |  20 +++
 doc/src/sgml/pgbuffercache.sgml               |  61 ++++++-
 9 files changed, 324 insertions(+), 4 deletions(-)
 create mode 100644 contrib/pg_buffercache/expected/pg_buffercache_numa.out
 create mode 100644 contrib/pg_buffercache/expected/pg_buffercache_numa_1.out
 create mode 100644 contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql
 create mode 100644 contrib/pg_buffercache/sql/pg_buffercache_numa.sql

diff --git a/contrib/pg_buffercache/Makefile b/contrib/pg_buffercache/Makefile
index eae65ead9e5..2a33602537e 100644
--- a/contrib/pg_buffercache/Makefile
+++ b/contrib/pg_buffercache/Makefile
@@ -8,7 +8,8 @@ OBJS = \
 EXTENSION = pg_buffercache
 DATA = pg_buffercache--1.2.sql pg_buffercache--1.2--1.3.sql \
 	pg_buffercache--1.1--1.2.sql pg_buffercache--1.0--1.1.sql \
-	pg_buffercache--1.3--1.4.sql pg_buffercache--1.4--1.5.sql
+	pg_buffercache--1.3--1.4.sql pg_buffercache--1.4--1.5.sql \
+	pg_buffercache--1.5--1.6.sql
 PGFILEDESC = "pg_buffercache - monitoring of shared buffer cache in real-time"
 
 REGRESS = pg_buffercache
diff --git a/contrib/pg_buffercache/expected/pg_buffercache_numa.out b/contrib/pg_buffercache/expected/pg_buffercache_numa.out
new file mode 100644
index 00000000000..d4de5ea52fc
--- /dev/null
+++ b/contrib/pg_buffercache/expected/pg_buffercache_numa.out
@@ -0,0 +1,28 @@
+SELECT NOT(pg_numa_available()) AS skip_test \gset
+\if :skip_test
+\quit
+\endif
+select count(*) = (select setting::bigint
+                   from pg_settings
+                   where name = 'shared_buffers')
+from pg_buffercache_numa;
+ ?column? 
+----------
+ t
+(1 row)
+
+-- Check that the functions / views can't be accessed by default. To avoid
+-- having to create a dedicated user, use the pg_database_owner pseudo-role.
+SET ROLE pg_database_owner;
+SELECT count(*) > 0 FROM pg_buffercache_numa;
+ERROR:  permission denied for view pg_buffercache_numa
+RESET role;
+-- Check that pg_monitor is allowed to query view / function
+SET ROLE pg_monitor;
+SELECT count(*) > 0 FROM pg_buffercache_numa;
+ ?column? 
+----------
+ t
+(1 row)
+
+RESET role;
diff --git a/contrib/pg_buffercache/expected/pg_buffercache_numa_1.out b/contrib/pg_buffercache/expected/pg_buffercache_numa_1.out
new file mode 100644
index 00000000000..6dd6824b4e4
--- /dev/null
+++ b/contrib/pg_buffercache/expected/pg_buffercache_numa_1.out
@@ -0,0 +1,3 @@
+SELECT NOT(pg_numa_available()) AS skip_test \gset
+\if :skip_test
+\quit
diff --git a/contrib/pg_buffercache/meson.build b/contrib/pg_buffercache/meson.build
index 12d1fe48717..7cd039a1df9 100644
--- a/contrib/pg_buffercache/meson.build
+++ b/contrib/pg_buffercache/meson.build
@@ -23,6 +23,7 @@ install_data(
   'pg_buffercache--1.2.sql',
   'pg_buffercache--1.3--1.4.sql',
   'pg_buffercache--1.4--1.5.sql',
+  'pg_buffercache--1.5--1.6.sql',
   'pg_buffercache.control',
   kwargs: contrib_data_args,
 )
@@ -34,6 +35,7 @@ tests += {
   'regress': {
     'sql': [
       'pg_buffercache',
+      'pg_buffercache_numa',
     ],
   },
 }
diff --git a/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql b/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql
new file mode 100644
index 00000000000..720dc84b2c9
--- /dev/null
+++ b/contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql
@@ -0,0 +1,39 @@
+/* contrib/pg_buffercache/pg_buffercache--1.5--1.6.sql */
+
+-- complain if script is sourced in psql, rather than via ALTER EXTENSION
+\echo Use "ALTER EXTENSION pg_buffercache UPDATE TO '1.6'" to load this file. \quit
+
+-- Re-declare pg_buffercache_pages() and register pg_buffercache_numa_pages().
+CREATE OR REPLACE FUNCTION pg_buffercache_pages()
+RETURNS SETOF RECORD
+AS 'MODULE_PATHNAME', 'pg_buffercache_pages'
+LANGUAGE C PARALLEL SAFE;
+
+CREATE OR REPLACE FUNCTION pg_buffercache_numa_pages()
+RETURNS SETOF RECORD
+AS 'MODULE_PATHNAME', 'pg_buffercache_numa_pages'
+LANGUAGE C PARALLEL SAFE;
+
+-- Views for convenient access; pg_buffercache_numa adds the node_id column.
+CREATE OR REPLACE VIEW pg_buffercache AS
+	SELECT P.* FROM pg_buffercache_pages() AS P
+	(bufferid integer, relfilenode oid, reltablespace oid, reldatabase oid,
+	 relforknumber int2, relblocknumber int8, isdirty bool, usagecount int2,
+	 pinning_backends int4);
+
+CREATE OR REPLACE VIEW pg_buffercache_numa AS
+	SELECT P.* FROM pg_buffercache_numa_pages() AS P
+	(bufferid integer, relfilenode oid, reltablespace oid, reldatabase oid,
+	 relforknumber int2, relblocknumber int8, isdirty bool, usagecount int2,
+	 pinning_backends int4, node_id int4);
+
+-- Don't want these to be available to public; pg_monitor is granted below.
+REVOKE ALL ON FUNCTION pg_buffercache_pages() FROM PUBLIC;
+REVOKE ALL ON FUNCTION pg_buffercache_numa_pages() FROM PUBLIC;
+REVOKE ALL ON pg_buffercache FROM PUBLIC;
+REVOKE ALL ON pg_buffercache_numa FROM PUBLIC;
+
+GRANT EXECUTE ON FUNCTION pg_buffercache_pages() TO pg_monitor;
+GRANT EXECUTE ON FUNCTION pg_buffercache_numa_pages() TO pg_monitor;
+GRANT SELECT ON pg_buffercache TO pg_monitor;
+GRANT SELECT ON pg_buffercache_numa TO pg_monitor;
diff --git a/contrib/pg_buffercache/pg_buffercache.control b/contrib/pg_buffercache/pg_buffercache.control
index 5ee875f77dd..b030ba3a6fa 100644
--- a/contrib/pg_buffercache/pg_buffercache.control
+++ b/contrib/pg_buffercache/pg_buffercache.control
@@ -1,5 +1,5 @@
 # pg_buffercache extension
 comment = 'examine the shared buffer cache'
-default_version = '1.5'
+default_version = '1.6'
 module_pathname = '$libdir/pg_buffercache'
 relocatable = true
diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
index cad7429a21b..1ec9ac25d58 100644
--- a/contrib/pg_buffercache/pg_buffercache_pages.c
+++ b/contrib/pg_buffercache/pg_buffercache_pages.c
@@ -11,12 +11,13 @@
 #include "access/htup_details.h"
 #include "catalog/pg_type.h"
 #include "funcapi.h"
+#include "port/pg_numa.h"
 #include "storage/buf_internals.h"
 #include "storage/bufmgr.h"
 
 
 #define NUM_BUFFERCACHE_PAGES_MIN_ELEM	8
-#define NUM_BUFFERCACHE_PAGES_ELEM	9
+#define NUM_BUFFERCACHE_PAGES_ELEM	10
 #define NUM_BUFFERCACHE_SUMMARY_ELEM 5
 #define NUM_BUFFERCACHE_USAGE_COUNTS_ELEM 4
 
@@ -46,6 +47,7 @@ typedef struct
 	 * because of bufmgr.c's PrivateRefCount infrastructure.
 	 */
 	int32		pinning_backends;
+	int32		numa_node_id;
 } BufferCachePagesRec;
 
 
@@ -64,10 +66,56 @@ typedef struct
  * relation node/tablespace/database/blocknum and dirty indicator.
  */
 PG_FUNCTION_INFO_V1(pg_buffercache_pages);
+PG_FUNCTION_INFO_V1(pg_buffercache_numa_pages);
 PG_FUNCTION_INFO_V1(pg_buffercache_summary);
 PG_FUNCTION_INFO_V1(pg_buffercache_usage_counts);
 PG_FUNCTION_INFO_V1(pg_buffercache_evict);
 
+/* Only need to touch memory once per backend process lifetime */
+static bool firstNumaTouch = true;
+
+/*
+ * Helper routine to map Buffers into addresses that are used by
+ * pg_numa_query_pages().
+ *
+ * When database block size (BLCKSZ) is smaller than the OS page size (4kB),
+ * multiple database buffers will map to the same OS memory page. In this case,
+ * we only need to query the NUMA node for the first memory address of each
+ * unique OS page rather than for every buffer.
+ *
+ * In order to get reliable results we also need to touch memory pages, so that
+ * inquiry about NUMA memory node doesn't return -2 (which indicates
+ * unmapped/unallocated pages).
+ */
+static inline void
+pg_buffercache_numa_prepare_ptrs(int buffer_id, float pages_per_blk,
+								 Size os_page_size,
+								 void **os_page_ptrs)
+{
+	/* Use double: float cannot represent buffer ids above 2^24 exactly */
+	size_t		blk2page = (size_t) ((double) buffer_id * pages_per_blk);
+
+	for (size_t j = 0; j < pages_per_blk; j++)
+	{
+		size_t		blk2pageoff = blk2page + j;
+
+		if (os_page_ptrs[blk2pageoff] == NULL)
+		{
+			volatile uint64 touch pg_attribute_unused();
+
+			/* Buffer ids start from 1, while buffer_id is 0-based */
+			os_page_ptrs[blk2pageoff] = (char *) BufferGetBlock(buffer_id + 1) +
+				(os_page_size * j);
+
+			/* Only need to touch memory once per backend process lifetime */
+			if (firstNumaTouch)
+				pg_numa_touch_mem_if_required(touch, os_page_ptrs[blk2pageoff]);
+		}
+
+		CHECK_FOR_INTERRUPTS();
+	}
+}
+
 /*
  * Helper routine for pg_buffercache_pages() and pg_buffercache_numa_pages().
  */
@@ -122,6 +170,9 @@ pg_buffercache_init_entries(FuncCallContext *funcctx, FunctionCallInfo fcinfo)
 	if (expected_tupledesc->natts >= NUM_BUFFERCACHE_PAGES_ELEM - 1)
 		TupleDescInitEntry(tupledesc, (AttrNumber) 9, "pinning_backends",
 						   INT4OID, -1, 0);
+	if (expected_tupledesc->natts == NUM_BUFFERCACHE_PAGES_ELEM)
+		TupleDescInitEntry(tupledesc, (AttrNumber) 10, "node_id",
+						   INT4OID, -1, 0);
 
 	fctx->tupdesc = BlessTupleDesc(tupledesc);
 
@@ -175,6 +226,8 @@ pg_buffercache_save_tuple(int record_id, BufferCachePagesContext *fctx)
 	else
 		bufRecord->isvalid = false;
 
+	bufRecord->numa_node_id = -1;
+
 	UnlockBufHdr(bufHdr, buf_state);
 }
 
@@ -220,6 +273,7 @@ get_buffercache_tuple(int record_id, BufferCachePagesContext *fctx)
 		 * unused for v1.0 callers, but the array is always long enough
 		 */
 		values[8] = Int32GetDatum(bufRecord->pinning_backends);
+		values[9] = Int32GetDatum(bufRecord->numa_node_id);
 	}
 
 	/* Build and return the tuple. */
@@ -271,6 +325,120 @@ pg_buffercache_pages(PG_FUNCTION_ARGS)
 	}
 }
 
+/*
+ * This is almost identical to the above, but performs
+ * NUMA inquiry about memory mappings.
+ */
+Datum
+pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
+{
+	FuncCallContext *funcctx;
+	BufferCachePagesContext *fctx;	/* User function context. */
+
+	if (SRF_IS_FIRSTCALL())
+	{
+		int			i;
+		Size		os_page_size = 0;
+		void	  **os_page_ptrs = NULL;
+		int		   *os_pages_status = NULL;
+		uint64		os_page_count = 0;
+		float		pages_per_blk = 0;
+
+		funcctx = SRF_FIRSTCALL_INIT();
+
+		if (pg_numa_init() == -1)
+			elog(ERROR, "libnuma initialization failed or NUMA is not supported on this platform");
+
+		fctx = pg_buffercache_init_entries(funcctx, fcinfo);
+
+		/*
+		 * Different database block sizes (4kB, 8kB, ..., 32kB) can be used,
+		 * while the OS may have different memory page sizes.
+		 *
+		 * To correctly map between them, we need to: 1. Determine the OS
+		 * memory page size 2. Calculate how many OS pages are used by all
+		 * buffer blocks (rounded up, so a trailing partial page is counted)
+		 * 3. Calculate how many OS pages are contained within each block.
+		 *
+		 * This information is needed before calling move_pages() for NUMA
+		 * node id inquiry.
+		 */
+		os_page_size = pg_numa_get_pagesize();
+		os_page_count = ((uint64) NBuffers * BLCKSZ + os_page_size - 1) / os_page_size;
+		pages_per_blk = (float) BLCKSZ / os_page_size;
+
+		elog(DEBUG1, "NUMA: os_page_count=%lu os_page_size=%zu pages_per_blk=%.2f",
+			 (unsigned long) os_page_count, os_page_size, pages_per_blk);
+
+		os_page_ptrs = palloc0(sizeof(void *) * os_page_count);
+		os_pages_status = palloc(sizeof(int) * os_page_count);
+
+		/*
+		 * If we ever get 0xff back from kernel inquiry, then we probably have
+		 * a bug in our buffers to OS page mapping code here. The status array
+		 * holds one int per OS page, hence sizeof(int) in palloc and memset.
+		 */
+		memset(os_pages_status, 0xff, sizeof(int) * os_page_count);
+
+		if (firstNumaTouch)
+			elog(DEBUG1, "NUMA: page-faulting the buffercache for proper NUMA readouts");
+
+		/*
+		 * Scan through all the buffers, saving the relevant fields in the
+		 * fctx->record structure.
+		 *
+		 * We don't hold the partition locks, so we don't get a consistent
+		 * snapshot across all buffers, but we do grab the buffer header
+		 * locks, so the information of each buffer is self-consistent.
+		 */
+		for (i = 0; i < NBuffers; i++)
+		{
+			pg_buffercache_save_tuple(i, fctx);
+			pg_buffercache_numa_prepare_ptrs(i, pages_per_blk, os_page_size,
+											 os_page_ptrs);
+		}
+
+		/* Every page has been touched by this backend; don't redo it later */
+		firstNumaTouch = false;
+
+		if (pg_numa_query_pages(0, os_page_count, os_page_ptrs, os_pages_status) == -1)
+			elog(ERROR, "failed NUMA pages inquiry: %m");
+
+		for (i = 0; i < NBuffers; i++)
+		{
+			size_t		blk2page = (size_t) ((double) i * pages_per_blk);
+
+			/*
+			 * Set the NUMA node id for this buffer based on the first OS page
+			 * it maps to. The index is computed in double precision, matching
+			 * pg_buffercache_numa_prepare_ptrs().
+			 *
+			 * Note: We could check for errors in os_pages_status and report
+			 * them. Also, a single DB block might span multiple NUMA nodes if
+			 * it crosses OS pages on node boundaries, but we only record the
+			 * node of the first page, which should suffice for most analyses.
+			 */
+			fctx->record[i].numa_node_id = os_pages_status[blk2page];
+		}
+	}
+
+	funcctx = SRF_PERCALL_SETUP();
+
+	/* Get the saved state */
+	fctx = funcctx->user_fctx;
+
+	if (funcctx->call_cntr < funcctx->max_calls)
+	{
+		Datum		result;
+		uint32		i = funcctx->call_cntr;
+
+		result = get_buffercache_tuple(i, fctx);
+		SRF_RETURN_NEXT(funcctx, result);
+	}
+	else
+		SRF_RETURN_DONE(funcctx);
+}
+
 Datum
 pg_buffercache_summary(PG_FUNCTION_ARGS)
 {
diff --git a/contrib/pg_buffercache/sql/pg_buffercache_numa.sql b/contrib/pg_buffercache/sql/pg_buffercache_numa.sql
new file mode 100644
index 00000000000..2225b879f58
--- /dev/null
+++ b/contrib/pg_buffercache/sql/pg_buffercache_numa.sql
@@ -0,0 +1,20 @@
+SELECT NOT(pg_numa_available()) AS skip_test \gset
+\if :skip_test
+\quit
+\endif
+
+select count(*) = (select setting::bigint
+                   from pg_settings
+                   where name = 'shared_buffers')
+from pg_buffercache_numa;
+
+-- Check that the functions / views can't be accessed by default. To avoid
+-- having to create a dedicated user, use the pg_database_owner pseudo-role.
+SET ROLE pg_database_owner;
+SELECT count(*) > 0 FROM pg_buffercache_numa;
+RESET role;
+
+-- Check that pg_monitor is allowed to query view / function
+SET ROLE pg_monitor;
+SELECT count(*) > 0 FROM pg_buffercache_numa;
+RESET role;
diff --git a/doc/src/sgml/pgbuffercache.sgml b/doc/src/sgml/pgbuffercache.sgml
index 802a5112d77..086e0062a17 100644
--- a/doc/src/sgml/pgbuffercache.sgml
+++ b/doc/src/sgml/pgbuffercache.sgml
@@ -30,7 +30,9 @@
  <para>
   This module provides the <function>pg_buffercache_pages()</function>
   function (wrapped in the <structname>pg_buffercache</structname> view),
-  the <function>pg_buffercache_summary()</function> function, the
+  the <function>pg_buffercache_numa_pages()</function> function (wrapped in the
+  <structname>pg_buffercache_numa</structname> view), the
+  <function>pg_buffercache_summary()</function> function, the
   <function>pg_buffercache_usage_counts()</function> function and
   the <function>pg_buffercache_evict()</function> function.
  </para>
@@ -42,6 +44,14 @@
   convenient use.
  </para>
 
+ <para>
+  The <function>pg_buffercache_numa_pages()</function> function provides the same
+  information as <function>pg_buffercache_pages()</function>, but is slower because it also
+  provides the <acronym>NUMA</acronym> node ID per shared buffer entry.
+  The <structname>pg_buffercache_numa</structname> view wraps the function for
+  convenient use.
+ </para>
+
  <para>
   The <function>pg_buffercache_summary()</function> function returns a single
   row summarizing the state of the shared buffer cache.
@@ -200,6 +210,55 @@
   </para>
  </sect2>
 
+ <sect2 id="pgbuffercache-pg-buffercache_numa">
+  <title>The <structname>pg_buffercache_numa</structname> View</title>
+
+  <para>
+   The definitions of the columns exposed are identical to the
+   <structname>pg_buffercache</structname> view, except that this one includes
+   one additional <structfield>node_id</structfield> column as defined in
+   <xref linkend="pgbuffercache-numa-columns"/>.
+  </para>
+
+  <table id="pgbuffercache-numa-columns">
+   <title><structname>pg_buffercache_numa</structname> Extra column</title>
+   <tgroup cols="1">
+    <thead>
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       Column Type
+      </para>
+      <para>
+       Description
+      </para></entry>
+     </row>
+    </thead>
+
+    <tbody>
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>node_id</structfield> <type>integer</type>
+      </para>
+      <para>
+       <acronym>NUMA</acronym> node ID. NULL if the shared buffer
+       has not been used yet. On systems without <acronym>NUMA</acronym> support
+       this returns 0.
+      </para></entry>
+     </row>
+
+    </tbody>
+   </tgroup>
+  </table>
+
+  <para>
+   As <acronym>NUMA</acronym> node ID inquiry for each page requires memory pages
+   to be paged in, the first execution of this function can take a noticeable
+   amount of time. In all cases (first execution or not), retrieving this
+   information is costly, and querying the view at a high frequency is not recommended.
+  </para>
+
+ </sect2>
+
  <sect2 id="pgbuffercache-summary">
   <title>The <function>pg_buffercache_summary()</function> Function</title>
 
-- 
2.39.5

