Hi, On Fri, Nov 21, 2025 at 11:53:52AM +0000, Bertrand Drouvot wrote: > Hi, > > On Thu, Nov 20, 2025 at 04:59:07PM +0000, Bertrand Drouvot wrote: > > On Wed, Nov 19, 2025 at 10:49:49PM +0900, Michael Paquier wrote: > > > > > > Hmm. I can think about an option 3 here: pg_buffercache outlines the > > > view pg_buffercache_numa as the primary choice over > > > pg_buffercache_numa_pages(). So I would suggest a more drastic > > > strategy, that should not break monitoring queries with the views > > > being the primary source for the results: > > > - Rename of pg_buffercache_numa_pages() to pg_buffercache_os_pages(), > > > that takes in input a boolean argument to decide if numa should be > > > executed or not. > > > - Creation of a second view for the OS pages that calls > > > pg_buffercache_os_pages() without the numa code activated, for the two > > > attributes that matter. > > > - Switch the existing view pg_buffercache_numa to call > > > pg_buffercache_os_pages() with the numa code activated. If NUMA > > > cannot be set up, elog(ERROR). > > > > Love the idea: the new view would not suffer from the numa availability > > overhead > > and the current behavior is kept. Will look at it, thanks! > > Here they are:
Attached a rebase due to 7d9043aee80. Also 0003 has a minor change (as compared to v8-0004) to avoid this error when creating the 1.6 version with the new code: postgres=# create extension pg_buffercache version '1.6'; CREATE EXTENSION postgres=# select count(*) from pg_buffercache_numa; ERROR: set-valued function called in context that cannot accept a set Regards, -- Bertrand Drouvot PostgreSQL Contributors Team RDS Open Source Databases Amazon Web Services: https://aws.amazon.com
>From e5edf33f0052c1576090e601b9337dea122438cd Mon Sep 17 00:00:00 2001 From: Bertrand Drouvot <[email protected]> Date: Fri, 21 Nov 2025 04:52:08 +0000 Subject: [PATCH v9 1/3] Introduce GET_MAX_BUFFER_ENTRIES and get_buffer_page_boundaries Those new macro and function are extracted from pg_buffercache_numa_pages() and help for code readability. --- contrib/pg_buffercache/pg_buffercache_pages.c | 51 ++++++++++++++----- 1 file changed, 38 insertions(+), 13 deletions(-) 100.0% contrib/pg_buffercache/ diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c index 7c9ff24fa83..5736e0726bd 100644 --- a/contrib/pg_buffercache/pg_buffercache_pages.c +++ b/contrib/pg_buffercache/pg_buffercache_pages.c @@ -28,6 +28,12 @@ #define NUM_BUFFERCACHE_NUMA_ELEM 3 +/* + * Get the maximum buffer cache entries needed. + */ +#define GET_MAX_BUFFER_ENTRIES(nbuffers, os_page_size) \ + ((nbuffers) * (Max(1, BLCKSZ / (os_page_size)) + 1)) + PG_MODULE_MAGIC_EXT( .name = "pg_buffercache", .version = PG_VERSION @@ -102,6 +108,34 @@ PG_FUNCTION_INFO_V1(pg_buffercache_evict_all); /* Only need to touch memory once per backend process lifetime */ static bool firstNumaTouch = true; +/* + * Helper function to get buffer page boundaries. + * + * Given a buffer pointer and OS page size, calculates the start/end + * pointers and first page number. + */ +static void +get_buffer_page_boundaries(char *buffptr, Size os_page_size, char *startptr, + char **startptr_buff, char **endptr_buff, + int32 *page_num) +{ + char *start_ptr; + char *end_ptr; + + /* start of the first page of this buffer */ + start_ptr = (char *) TYPEALIGN_DOWN(os_page_size, buffptr); + + /* end of the buffer (no need to align to memory page) */ + end_ptr = buffptr + BLCKSZ; + + Assert(start_ptr < end_ptr); + + /* calculate ID of the first page for this buffer */ + *page_num = (start_ptr - startptr) / os_page_size; + *startptr_buff = start_ptr; + *endptr_buff = end_ptr; +} + Datum pg_buffercache_pages(PG_FUNCTION_ARGS) @@ -317,7 +351,6 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS) void **os_page_ptrs; int *os_page_status; uint64 os_page_count; - int pages_per_buffer; int max_entries; char *startptr, *endptr; @@ -430,8 +463,7 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS) * number as we're walking buffers. That way we can do it in one pass, * without reallocating memory. */ - pages_per_buffer = Max(1, BLCKSZ / os_page_size) + 1; - max_entries = NBuffers * pages_per_buffer; + max_entries = GET_MAX_BUFFER_ENTRIES(NBuffers, os_page_size); /* Allocate entries for BufferCachePagesRec records. */ fctx->record = (BufferCacheNumaRec *) @@ -472,16 +504,9 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS) bufferid = BufferDescriptorGetBuffer(bufHdr); UnlockBufHdr(bufHdr); - /* start of the first page of this buffer */ - startptr_buff = (char *) TYPEALIGN_DOWN(os_page_size, buffptr); - - /* end of the buffer (no need to align to memory page) */ - endptr_buff = buffptr + BLCKSZ; - - Assert(startptr_buff < endptr_buff); - - /* calculate ID of the first page for this buffer */ - page_num = (startptr_buff - startptr) / os_page_size; + /* Get page boundaries for this buffer. */ + get_buffer_page_boundaries(buffptr, os_page_size, startptr, + &startptr_buff, &endptr_buff, &page_num); /* Add an entry for each OS page overlapping with this buffer. */ for (char *ptr = startptr_buff; ptr < endptr_buff; ptr += os_page_size) -- 2.34.1
>From 52ccced8b916179211ce8a80a7f63160da100ab7 Mon Sep 17 00:00:00 2001 From: Bertrand Drouvot <[email protected]> Date: Fri, 21 Nov 2025 09:10:07 +0000 Subject: [PATCH v9 2/3] Adding pg_buffercache_numa_pages_internal() This new function makes NUMA data collection conditional. It extracts the core current pg_buffercache_numa_pages() logic into an internal function that accepts a boolean parameter. It's currently only called with the boolean set to true to serve the pg_buffercache_numa view needs. It will also be called with false in a following commit to serve a new pg_buffercache_os_pages view needs. --- contrib/pg_buffercache/pg_buffercache_pages.c | 150 +++++++++++------- 1 file changed, 90 insertions(+), 60 deletions(-) 100.0% contrib/pg_buffercache/ diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c index 5736e0726bd..c8aeb01797c 100644 --- a/contrib/pg_buffercache/pg_buffercache_pages.c +++ b/contrib/pg_buffercache/pg_buffercache_pages.c @@ -88,6 +88,7 @@ typedef struct typedef struct { TupleDesc tupdesc; + bool include_numa; BufferCacheNumaRec *record; } BufferCacheNumaContext; @@ -318,22 +319,26 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) } /* - * Inquire about NUMA memory mappings for shared buffers. + * Internal function to inquire about OS pages mappings for shared buffers, + * with optional NUMA information. * - * Returns NUMA node ID for each memory page used by the buffer. Buffers may - * be smaller or larger than OS memory pages. For each buffer we return one - * entry for each memory page used by the buffer (if the buffer is smaller, - * it only uses a part of one memory page). + * When 'include_numa' is: + * - false: Returns buffer to OS page mappings quickly, with numa_node as NULL. + * - true: Initializes NUMA and returns numa_node values. + * + * Buffers may be smaller or larger than OS memory pages. For each buffer we + * return one entry for each memory page used by the buffer (if the buffer is + * smaller, it only uses a part of one memory page). * * We expect both sizes (for buffers and memory pages) to be a power-of-2, so * one is always a multiple of the other. * - * In order to get reliable results we also need to touch memory pages, so - * that the inquiry about NUMA memory node doesn't return -2 (which indicates - * unmapped/unallocated pages). + * When 'include_numa' is true, in order to get reliable results we also need + * to touch memory pages, so that the inquiry about NUMA memory node doesn't + * return -2 (which indicates unmapped/unallocated pages). */ -Datum -pg_buffercache_numa_pages(PG_FUNCTION_ARGS) +static Datum +pg_buffercache_numa_pages_internal(PG_FUNCTION_ARGS, bool include_numa) { FuncCallContext *funcctx; MemoryContext oldcontext; @@ -348,14 +353,14 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS) int i, idx; Size os_page_size; - void **os_page_ptrs; - int *os_page_status; - uint64 os_page_count; + int *os_page_status = NULL; + uint64 os_page_count = 0; int max_entries; char *startptr, *endptr; - if (pg_numa_init() == -1) + /* If NUMA information is requested, initialize NUMA support. */ + if (include_numa && pg_numa_init() == -1) elog(ERROR, "libnuma initialization failed or NUMA is not supported on this platform"); /* @@ -383,51 +388,56 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS) */ Assert((os_page_size % BLCKSZ == 0) || (BLCKSZ % os_page_size == 0)); - /* - * How many addresses we are going to query? Simply get the page for - * the first buffer, and first page after the last buffer, and count - * the pages from that. - */ - startptr = (char *) TYPEALIGN_DOWN(os_page_size, - BufferGetBlock(1)); - endptr = (char *) TYPEALIGN(os_page_size, - (char *) BufferGetBlock(NBuffers) + BLCKSZ); - os_page_count = (endptr - startptr) / os_page_size; - - /* Used to determine the NUMA node for all OS pages at once */ - os_page_ptrs = palloc0(sizeof(void *) * os_page_count); - os_page_status = palloc(sizeof(uint64) * os_page_count); - - /* - * Fill pointers for all the memory pages. This loop stores and - * touches (if needed) addresses into os_page_ptrs[] as input to one - * big move_pages(2) inquiry system call, as done in - * pg_numa_query_pages(). - */ - idx = 0; - for (char *ptr = startptr; ptr < endptr; ptr += os_page_size) + if (include_numa) { - os_page_ptrs[idx++] = ptr; + void **os_page_ptrs = NULL; + + /* + * How many addresses we are going to query? Simply get the page + * for the first buffer, and first page after the last buffer, and + * count the pages from that. + */ + startptr = (char *) TYPEALIGN_DOWN(os_page_size, + BufferGetBlock(1)); + endptr = (char *) TYPEALIGN(os_page_size, + (char *) BufferGetBlock(NBuffers) + BLCKSZ); + os_page_count = (endptr - startptr) / os_page_size; + + /* Used to determine the NUMA node for all OS pages at once */ + os_page_ptrs = palloc0(sizeof(void *) * os_page_count); + os_page_status = palloc(sizeof(uint64) * os_page_count); + + /* + * Fill pointers for all the memory pages. This loop stores and + * touches (if needed) addresses into os_page_ptrs[] as input to + * one big move_pages(2) inquiry system call, as done in + * pg_numa_query_pages(). + */ + idx = 0; + for (char *ptr = startptr; ptr < endptr; ptr += os_page_size) + { + os_page_ptrs[idx++] = ptr; - /* Only need to touch memory once per backend process lifetime */ - if (firstNumaTouch) - pg_numa_touch_mem_if_required(ptr); - } + /* Only need to touch memory once per backend process lifetime */ + if (firstNumaTouch) + pg_numa_touch_mem_if_required(ptr); + } - Assert(idx == os_page_count); + Assert(idx == os_page_count); - elog(DEBUG1, "NUMA: NBuffers=%d os_page_count=" UINT64_FORMAT " " - "os_page_size=%zu", NBuffers, os_page_count, os_page_size); + elog(DEBUG1, "NUMA: NBuffers=%d os_page_count=" UINT64_FORMAT " " + "os_page_size=%zu", NBuffers, os_page_count, os_page_size); - /* - * If we ever get 0xff back from kernel inquiry, then we probably have - * bug in our buffers to OS page mapping code here. - */ - memset(os_page_status, 0xff, sizeof(int) * os_page_count); + /* + * If we ever get 0xff back from kernel inquiry, then we probably + * have bug in our buffers to OS page mapping code here. + */ + memset(os_page_status, 0xff, sizeof(int) * os_page_count); - /* Query NUMA status for all the pointers */ - if (pg_numa_query_pages(0, os_page_count, os_page_ptrs, os_page_status) == -1) - elog(ERROR, "failed NUMA pages inquiry: %m"); + /* Query NUMA status for all the pointers */ + if (pg_numa_query_pages(0, os_page_count, os_page_ptrs, os_page_status) == -1) + elog(ERROR, "failed NUMA pages inquiry: %m"); + } /* Initialize the multi-call context, load entries about buffers */ @@ -455,6 +465,7 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS) INT4OID, -1, 0); fctx->tupdesc = BlessTupleDesc(tupledesc); + fctx->include_numa = include_numa; /* * Each buffer needs at least one entry, but it might be offset in @@ -473,7 +484,7 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS) /* Return to original context when allocating transient memory */ MemoryContextSwitchTo(oldcontext); - if (firstNumaTouch) + if (include_numa && firstNumaTouch) elog(DEBUG1, "NUMA: page-faulting the buffercache for proper NUMA readouts"); /* @@ -513,7 +524,7 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS) { fctx->record[idx].bufferid = bufferid; fctx->record[idx].page_num = page_num; - fctx->record[idx].numa_node = os_page_status[page_num]; + fctx->record[idx].numa_node = include_numa ? os_page_status[page_num] : -1; /* advance to the next entry/page */ ++idx; @@ -521,14 +532,18 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS) } } - Assert((idx >= os_page_count) && (idx <= max_entries)); + Assert(idx <= max_entries); + + if (include_numa) + Assert(idx >= os_page_count); /* Set max calls and remember the user function context. */ funcctx->max_calls = idx; funcctx->user_fctx = fctx; - /* Remember this backend touched the pages */ - firstNumaTouch = false; + /* Remember this backend touched the pages (only relevant for NUMA) */ + if (include_numa) + firstNumaTouch = false; } funcctx = SRF_PERCALL_SETUP(); @@ -548,8 +563,16 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS) values[1] = Int64GetDatum(fctx->record[i].page_num); nulls[1] = false; - values[2] = Int32GetDatum(fctx->record[i].numa_node); - nulls[2] = false; + if (fctx->include_numa) + { + values[2] = Int32GetDatum(fctx->record[i].numa_node); + nulls[2] = false; + } + else + { + values[2] = (Datum) 0; + nulls[2] = true; + } /* Build and return the tuple. */ tuple = heap_form_tuple(fctx->tupdesc, values, nulls); @@ -561,6 +584,13 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS) SRF_RETURN_DONE(funcctx); } +/* Entry point for extension. */ +Datum +pg_buffercache_numa_pages(PG_FUNCTION_ARGS) +{ + return pg_buffercache_numa_pages_internal(fcinfo, true); +} + Datum pg_buffercache_summary(PG_FUNCTION_ARGS) { -- 2.34.1
>From 8ce991d8628910eac136afb631cb82807aacbe68 Mon Sep 17 00:00:00 2001 From: Bertrand Drouvot <[email protected]> Date: Fri, 21 Nov 2025 05:17:20 +0000 Subject: [PATCH v9 3/3] Add pg_buffercache_os_pages function and view ba2a3c2302f added a way to check if a buffer is spread across multiple pages. Adding those new function and view so that one does not need NUMA support enabled to get this information. To do so this commit: - renames pg_buffercache_numa_pages_internal() to pg_buffercache_os_pages_internal() - keep pg_buffercache_numa_pages() as a backward compatibility wrapper - re-create the pg_buffercache_numa view on top of pg_buffercache_os_pages using true as argument That way we avoid code duplication and the pg_buffercache_os_pages view does not get performance penalties when NUMA is available. --- contrib/pg_buffercache/Makefile | 2 +- .../expected/pg_buffercache.out | 18 +++ contrib/pg_buffercache/meson.build | 1 + .../pg_buffercache--1.6--1.7.sql | 34 +++++ contrib/pg_buffercache/pg_buffercache.control | 2 +- contrib/pg_buffercache/pg_buffercache_pages.c | 51 +++++--- contrib/pg_buffercache/sql/pg_buffercache.sql | 8 ++ doc/src/sgml/pgbuffercache.sgml | 120 ++++++++++++++++-- src/tools/pgindent/typedefs.list | 2 + 9 files changed, 210 insertions(+), 28 deletions(-) 4.5% contrib/pg_buffercache/expected/ 3.2% contrib/pg_buffercache/sql/ 37.5% contrib/pg_buffercache/ 54.1% doc/src/sgml/ diff --git a/contrib/pg_buffercache/Makefile b/contrib/pg_buffercache/Makefile index 5f748543e2e..0e618f66aec 100644 --- a/contrib/pg_buffercache/Makefile +++ b/contrib/pg_buffercache/Makefile @@ -9,7 +9,7 @@ EXTENSION = pg_buffercache DATA = pg_buffercache--1.2.sql pg_buffercache--1.2--1.3.sql \ pg_buffercache--1.1--1.2.sql pg_buffercache--1.0--1.1.sql \ pg_buffercache--1.3--1.4.sql pg_buffercache--1.4--1.5.sql \ - pg_buffercache--1.5--1.6.sql + pg_buffercache--1.5--1.6.sql pg_buffercache--1.6--1.7.sql PGFILEDESC = "pg_buffercache - monitoring of shared buffer cache in real-time" REGRESS = pg_buffercache pg_buffercache_numa diff --git a/contrib/pg_buffercache/expected/pg_buffercache.out b/contrib/pg_buffercache/expected/pg_buffercache.out index 9a9216dc7b1..26c2d5f5710 100644 --- a/contrib/pg_buffercache/expected/pg_buffercache.out +++ b/contrib/pg_buffercache/expected/pg_buffercache.out @@ -8,6 +8,16 @@ from pg_buffercache; t (1 row) +-- For pg_buffercache_os_pages, we expect at least one entry for each buffer +select count(*) >= (select setting::bigint + from pg_settings + where name = 'shared_buffers') +from pg_buffercache_os_pages; + ?column? +---------- + t +(1 row) + select buffers_used + buffers_unused > 0, buffers_dirty <= buffers_used, buffers_pinned <= buffers_used @@ -28,6 +38,8 @@ SELECT count(*) > 0 FROM pg_buffercache_usage_counts() WHERE buffers >= 0; SET ROLE pg_database_owner; SELECT * FROM pg_buffercache; ERROR: permission denied for view pg_buffercache +SELECT * FROM pg_buffercache_os_pages; +ERROR: permission denied for view pg_buffercache_os_pages SELECT * FROM pg_buffercache_pages() AS p (wrong int); ERROR: permission denied for function pg_buffercache_pages SELECT * FROM pg_buffercache_summary(); @@ -43,6 +55,12 @@ SELECT count(*) > 0 FROM pg_buffercache; t (1 row) +SELECT count(*) > 0 FROM pg_buffercache_os_pages; + ?column? +---------- + t +(1 row) + SELECT buffers_used + buffers_unused > 0 FROM pg_buffercache_summary(); ?column? ---------- diff --git a/contrib/pg_buffercache/meson.build b/contrib/pg_buffercache/meson.build index 7cd039a1df9..7c31141881f 100644 --- a/contrib/pg_buffercache/meson.build +++ b/contrib/pg_buffercache/meson.build @@ -24,6 +24,7 @@ install_data( 'pg_buffercache--1.3--1.4.sql', 'pg_buffercache--1.4--1.5.sql', 'pg_buffercache--1.5--1.6.sql', + 'pg_buffercache--1.6--1.7.sql', 'pg_buffercache.control', kwargs: contrib_data_args, ) diff --git a/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql b/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql new file mode 100644 index 00000000000..3cfe723c0c2 --- /dev/null +++ b/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql @@ -0,0 +1,34 @@ +/* contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql */ + +-- complain if script is sourced in psql, rather than via ALTER EXTENSION +\echo Use "ALTER EXTENSION pg_buffercache UPDATE TO '1.7'" to load this file. \quit + +-- Register the new function with boolean parameter +-- This function is the core implementation for both OS pages and NUMA queries +CREATE FUNCTION pg_buffercache_os_pages(IN include_numa boolean, + OUT bufferid integer, + OUT os_page_num bigint, + OUT numa_node integer) +RETURNS SETOF record +AS 'MODULE_PATHNAME', 'pg_buffercache_os_pages' +LANGUAGE C PARALLEL SAFE; + +-- Create a view for convenient access. +CREATE VIEW pg_buffercache_os_pages AS + SELECT bufferid, os_page_num + FROM pg_buffercache_os_pages(false); + +DROP VIEW pg_buffercache_numa; + +-- Create a view for convenient access. +CREATE VIEW pg_buffercache_numa AS + SELECT bufferid, os_page_num, numa_node + FROM pg_buffercache_os_pages(true); + +REVOKE ALL ON FUNCTION pg_buffercache_os_pages(boolean) FROM PUBLIC; +REVOKE ALL ON pg_buffercache_os_pages FROM PUBLIC; +REVOKE ALL ON pg_buffercache_numa FROM PUBLIC; + +GRANT EXECUTE ON FUNCTION pg_buffercache_os_pages(boolean) TO pg_monitor; +GRANT SELECT ON pg_buffercache_os_pages TO pg_monitor; +GRANT SELECT ON pg_buffercache_numa TO pg_monitor; diff --git a/contrib/pg_buffercache/pg_buffercache.control b/contrib/pg_buffercache/pg_buffercache.control index b030ba3a6fa..11499550945 100644 --- a/contrib/pg_buffercache/pg_buffercache.control +++ b/contrib/pg_buffercache/pg_buffercache.control @@ -1,5 +1,5 @@ # pg_buffercache extension comment = 'examine the shared buffer cache' -default_version = '1.6' +default_version = '1.7' module_pathname = '$libdir/pg_buffercache' relocatable = true diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c index c8aeb01797c..ae6949a3e0b 100644 --- a/contrib/pg_buffercache/pg_buffercache_pages.c +++ b/contrib/pg_buffercache/pg_buffercache_pages.c @@ -26,7 +26,7 @@ #define NUM_BUFFERCACHE_EVICT_RELATION_ELEM 3 #define NUM_BUFFERCACHE_EVICT_ALL_ELEM 3 -#define NUM_BUFFERCACHE_NUMA_ELEM 3 +#define NUM_BUFFERCACHE_OS_PAGES_ELEM 3 /* * Get the maximum buffer cache entries needed. @@ -73,14 +73,16 @@ typedef struct } BufferCachePagesContext; /* - * Record structure holding the to be exposed cache data. + * Record structure holding the to be exposed cache data for OS pages. + * This structure is used by pg_buffercache_os_pages() which takes a + * boolean parameter to control whether NUMA information is included. */ typedef struct { uint32 bufferid; int64 page_num; int32 numa_node; -} BufferCacheNumaRec; +} BufferCacheOsPagesRec; /* * Function context for data persisting over repeated calls. @@ -89,8 +91,8 @@ typedef struct { TupleDesc tupdesc; bool include_numa; - BufferCacheNumaRec *record; -} BufferCacheNumaContext; + BufferCacheOsPagesRec *record; +} BufferCacheOsPagesContext; /* @@ -98,6 +100,7 @@ typedef struct * relation node/tablespace/database/blocknum and dirty indicator. */ PG_FUNCTION_INFO_V1(pg_buffercache_pages); +PG_FUNCTION_INFO_V1(pg_buffercache_os_pages); PG_FUNCTION_INFO_V1(pg_buffercache_numa_pages); PG_FUNCTION_INFO_V1(pg_buffercache_summary); PG_FUNCTION_INFO_V1(pg_buffercache_usage_counts); @@ -319,8 +322,8 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) } /* - * Internal function to inquire about OS pages mappings for shared buffers, - * with optional NUMA information. + * Inquire about OS pages mappings for shared buffers, with optional NUMA + * information. * * When 'include_numa' is: * - false: Returns buffer to OS page mappings quickly, with numa_node as NULL. @@ -338,11 +341,11 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) * return -2 (which indicates unmapped/unallocated pages). */ static Datum -pg_buffercache_numa_pages_internal(PG_FUNCTION_ARGS, bool include_numa) +pg_buffercache_os_pages_internal(FunctionCallInfo fcinfo, bool include_numa) { FuncCallContext *funcctx; MemoryContext oldcontext; - BufferCacheNumaContext *fctx; /* User function context. */ + BufferCacheOsPagesContext *fctx; /* User function context. */ TupleDesc tupledesc; TupleDesc expected_tupledesc; HeapTuple tuple; @@ -447,12 +450,12 @@ pg_buffercache_numa_pages_internal(PG_FUNCTION_ARGS, bool include_numa) oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); /* Create a user function context for cross-call persistence */ - fctx = (BufferCacheNumaContext *) palloc(sizeof(BufferCacheNumaContext)); + fctx = (BufferCacheOsPagesContext *) palloc(sizeof(BufferCacheOsPagesContext)); if (get_call_result_type(fcinfo, NULL, &expected_tupledesc) != TYPEFUNC_COMPOSITE) elog(ERROR, "return type must be a row type"); - if (expected_tupledesc->natts != NUM_BUFFERCACHE_NUMA_ELEM) + if (expected_tupledesc->natts != NUM_BUFFERCACHE_OS_PAGES_ELEM) elog(ERROR, "incorrect number of output arguments"); /* Construct a tuple descriptor for the result rows. */ @@ -476,10 +479,10 @@ pg_buffercache_numa_pages_internal(PG_FUNCTION_ARGS, bool include_numa) */ max_entries = GET_MAX_BUFFER_ENTRIES(NBuffers, os_page_size); - /* Allocate entries for BufferCachePagesRec records. */ - fctx->record = (BufferCacheNumaRec *) + /* Allocate entries for BufferCacheOsPagesRec records. */ + fctx->record = (BufferCacheOsPagesRec *) MemoryContextAllocHuge(CurrentMemoryContext, - sizeof(BufferCacheNumaRec) * max_entries); + sizeof(BufferCacheOsPagesRec) * max_entries); /* Return to original context when allocating transient memory */ MemoryContextSwitchTo(oldcontext); @@ -554,8 +557,8 @@ pg_buffercache_numa_pages_internal(PG_FUNCTION_ARGS, bool include_numa) if (funcctx->call_cntr < funcctx->max_calls) { uint32 i = funcctx->call_cntr; - Datum values[NUM_BUFFERCACHE_NUMA_ELEM]; - bool nulls[NUM_BUFFERCACHE_NUMA_ELEM]; + Datum values[NUM_BUFFERCACHE_OS_PAGES_ELEM]; + bool nulls[NUM_BUFFERCACHE_OS_PAGES_ELEM]; values[0] = Int32GetDatum(fctx->record[i].bufferid); nulls[0] = false; @@ -584,11 +587,23 @@ pg_buffercache_numa_pages_internal(PG_FUNCTION_ARGS, bool include_numa) SRF_RETURN_DONE(funcctx); } -/* Entry point for extension. */ +Datum +pg_buffercache_os_pages(PG_FUNCTION_ARGS) +{ + bool include_numa; + + /* Get the boolean parameter that controls NUMA behavior */ + include_numa = PG_GETARG_BOOL(0); + + return pg_buffercache_os_pages_internal(fcinfo, include_numa); +} + +/* Backward compatibility wrapper. */ Datum pg_buffercache_numa_pages(PG_FUNCTION_ARGS) { - return pg_buffercache_numa_pages_internal(fcinfo, true); + /* Call internal function with include_numa=true */ + return pg_buffercache_os_pages_internal(fcinfo, true); } Datum diff --git a/contrib/pg_buffercache/sql/pg_buffercache.sql b/contrib/pg_buffercache/sql/pg_buffercache.sql index 47cca1907c7..3c70ee9ef4a 100644 --- a/contrib/pg_buffercache/sql/pg_buffercache.sql +++ b/contrib/pg_buffercache/sql/pg_buffercache.sql @@ -5,6 +5,12 @@ select count(*) = (select setting::bigint where name = 'shared_buffers') from pg_buffercache; +-- For pg_buffercache_os_pages, we expect at least one entry for each buffer +select count(*) >= (select setting::bigint + from pg_settings + where name = 'shared_buffers') +from pg_buffercache_os_pages; + select buffers_used + buffers_unused > 0, buffers_dirty <= buffers_used, buffers_pinned <= buffers_used @@ -16,6 +22,7 @@ SELECT count(*) > 0 FROM pg_buffercache_usage_counts() WHERE buffers >= 0; -- having to create a dedicated user, use the pg_database_owner pseudo-role. SET ROLE pg_database_owner; SELECT * FROM pg_buffercache; +SELECT * FROM pg_buffercache_os_pages; SELECT * FROM pg_buffercache_pages() AS p (wrong int); SELECT * FROM pg_buffercache_summary(); SELECT * FROM pg_buffercache_usage_counts(); @@ -24,6 +31,7 @@ RESET role; -- Check that pg_monitor is allowed to query view / function SET ROLE pg_monitor; SELECT count(*) > 0 FROM pg_buffercache; +SELECT count(*) > 0 FROM pg_buffercache_os_pages; SELECT buffers_used + buffers_unused > 0 FROM pg_buffercache_summary(); SELECT count(*) > 0 FROM pg_buffercache_usage_counts(); RESET role; diff --git a/doc/src/sgml/pgbuffercache.sgml b/doc/src/sgml/pgbuffercache.sgml index 99ad2e68785..67f96e289b2 100644 --- a/doc/src/sgml/pgbuffercache.sgml +++ b/doc/src/sgml/pgbuffercache.sgml @@ -46,8 +46,9 @@ <para> This module provides the <function>pg_buffercache_pages()</function> function (wrapped in the <structname>pg_buffercache</structname> view), the - <function>pg_buffercache_numa_pages()</function> function (wrapped in the - <structname>pg_buffercache_numa</structname> view), the + <function>pg_buffercache_os_pages()</function> function (wrapped in the + <structname>pg_buffercache_os_pages</structname> and + <structname>pg_buffercache_numa</structname> views), the <function>pg_buffercache_summary()</function> function, the <function>pg_buffercache_usage_counts()</function> function, the <function>pg_buffercache_evict()</function> function, the @@ -63,12 +64,14 @@ </para> <para> - The <function>pg_buffercache_numa_pages()</function> function provides - <acronym>NUMA</acronym> node mappings for shared buffer entries. This - information is not part of <function>pg_buffercache_pages()</function> - itself, as it is much slower to retrieve. - The <structname>pg_buffercache_numa</structname> view wraps the function for - convenient use. + The <function>pg_buffercache_os_pages()</function> function provides OS + pages mappings for shared buffer entries. When its argument is <literal>true</literal>, + it also provides <acronym>NUMA</acronym> node mappings for shared buffer entries ( + this information is not part of <function>pg_buffercache_pages()</function> + itself, as it is much slower to retrieve). + The <structname>pg_buffercache_os_pages</structname> and <structname>pg_buffercache_numa</structname> + views wrap the function for convenient use (with its argument set to <literal>false</literal> + and <literal>true</literal> respectively). </para> <para> @@ -242,6 +245,52 @@ </para> </sect2> + <sect2 id="pgbuffercache-pg-buffercache-os-pages"> + <title>The <structname>pg_buffercache_os_pages</structname> View</title> + + <para> + The definitions of the columns exposed by the view are shown in <xref linkend="pgbuffercache-os-pages-columns"/>. + </para> + + <table id="pgbuffercache-os-pages-columns"> + <title><structname>pg_buffercache_os_pages</structname> Columns</title> + <tgroup cols="1"> + <thead> + <row> + <entry role="catalog_table_entry"><para role="column_definition"> + Column Type + </para> + <para> + Description + </para></entry> + </row> + </thead> + + <tbody> + <row> + <entry role="catalog_table_entry"><para role="column_definition"> + <structfield>bufferid</structfield> <type>integer</type> + </para> + <para> + ID, in the range 1..<varname>shared_buffers</varname> + </para></entry> + </row> + + <row> + <entry role="catalog_table_entry"><para role="column_definition"> + <structfield>os_page_num</structfield> <type>bigint</type> + </para> + <para> + Number of OS memory page for this buffer + </para></entry> + </row> + + </tbody> + </tgroup> + </table> + + </sect2> + <sect2 id="pgbuffercache-pg-buffercache-numa"> <title>The <structname>pg_buffercache_numa</structname> View</title> @@ -558,6 +607,61 @@ regression=# SELECT n.nspname, c.relname, count(*) AS buffers public | spgist_text_tbl | 182 (10 rows) +regression=# SELECT pages_per_buffer, COUNT(*) as buffer_count + FROM ( + SELECT bufferid, COUNT(*) as pages_per_buffer + FROM pg_buffercache_os_pages + GROUP BY bufferid + ) + GROUP BY pages_per_buffer + ORDER BY pages_per_buffer; + + pages_per_buffer | buffer_count +------------------+-------------- + 1 | 261120 + 2 | 1024 +(2 rows) + +regression=# SELECT n.nspname, c.relname, count(*) AS buffers_on_multiple_pages + FROM pg_buffercache b JOIN pg_class c + ON b.relfilenode = pg_relation_filenode(c.oid) AND + b.reldatabase IN (0, (SELECT oid FROM pg_database + WHERE datname = current_database())) + JOIN pg_namespace n ON n.oid = c.relnamespace + JOIN (SELECT bufferid FROM pg_buffercache_os_pages + GROUP BY bufferid HAVING count(*) > 1) m on m.bufferid = b.bufferid + GROUP BY n.nspname, c.relname + ORDER BY 3 DESC + LIMIT 10; + + nspname | relname | buffers_on_multiple_pages +------------+------------------------------+--------------------------- + public | delete_test_table | 3 + public | gin_test_idx | 2 + pg_catalog | pg_depend | 2 + public | quad_poly_tbl | 2 + pg_catalog | pg_depend_reference_index | 1 + pg_catalog | pg_index_indexrelid_index | 1 + pg_catalog | pg_constraint_contypid_index | 1 + pg_catalog | pg_statistic | 1 + pg_catalog | pg_depend_depender_index | 1 + pg_catalog | pg_operator | 1 +(10 rows) + + + nspname | relname | buffers_on_multiple_pages +------------+---------------------------------+--------------------------- + public | gin_test_tbl | 4 + public | delete_test_table | 4 + public | tenk1 | 4 + pg_catalog | pg_attribute_relid_attnum_index | 4 + pg_catalog | pg_class | 2 + pg_catalog | pg_depend_depender_index | 2 + pg_catalog | pg_attribute | 2 + pg_catalog | pg_opfamily | 2 + pg_catalog | pg_opclass_oid_index | 2 + pg_catalog | pg_description | 2 +(10 rows) regression=# SELECT * FROM pg_buffercache_summary(); buffers_used | buffers_unused | buffers_dirty | buffers_pinned | usagecount_avg diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 0d1ea4ec63d..f05872d6860 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -340,6 +340,8 @@ BufferAccessStrategy BufferAccessStrategyType BufferCacheNumaContext BufferCacheNumaRec +BufferCacheOsPagesContext +BufferCacheOsPagesRec BufferCachePagesContext BufferCachePagesRec BufferDesc -- 2.34.1
