Hi, On Thu, Apr 10, 2025 at 03:05:29PM +0000, Bertrand Drouvot wrote: > Hi, > > On Thu, Apr 10, 2025 at 09:58:18AM -0500, Nathan Bossart wrote: > > On Thu, Apr 10, 2025 at 03:35:24PM +0200, Tomas Vondra wrote: > > > This seems like a good idea in principle, but at this point it has to > > > wait for PG19. Please add it to the July commitfest. > > > > +1. From a glance, this seems to fall in the "new feature" bucket and > > should likely wait for v19. > > Thank you both for providing your thoughts that confirm my initial doubt. > Let's > come back to that one later then. >
Here we are. Please find attached a rebased version and while at it, v2 adds a new macro and a function to avoid some code duplication between pg_buffercache_pages() and pg_buffercache_numa_pages(). So, PFA: 0001 - Introduce GET_MAX_BUFFER_ENTRIES and get_buffer_page_boundaries Those new macro and function are extracted from pg_buffercache_numa_pages() and pg_buffercache_numa_pages() makes use of them. 0002 - Add os_page_num to pg_buffercache Making use of the new macro and function from 0001. As it's for v19, also bumping pg_buffercache's version to 1.7. Regards, -- Bertrand Drouvot PostgreSQL Contributors Team RDS Open Source Databases Amazon Web Services: https://aws.amazon.com
>From 2520b83d0d4ca9dae9853174ac98473a4c63eb83 Mon Sep 17 00:00:00 2001 From: Bertrand Drouvot <bertranddrouvot...@gmail.com> Date: Tue, 1 Jul 2025 11:38:37 +0000 Subject: [PATCH v2 1/2] Introduce GET_MAX_BUFFER_ENTRIES and get_buffer_page_boundaries Those new macro and function are extracted from pg_buffercache_numa_pages(). Currently, this is used by pg_buffercache_numa_pages() only but will be used by pg_buffercache_pages() in a following commit. --- contrib/pg_buffercache/pg_buffercache_pages.c | 50 ++++++++++++++----- 1 file changed, 37 insertions(+), 13 deletions(-) diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c index ae0291e6e96..8ef13d74186 100644 --- a/contrib/pg_buffercache/pg_buffercache_pages.c +++ b/contrib/pg_buffercache/pg_buffercache_pages.c @@ -28,6 +28,12 @@ #define NUM_BUFFERCACHE_NUMA_ELEM 3 +/* + * Get the maximum buffer cache entries needed. + */ +#define GET_MAX_BUFFER_ENTRIES(nbuffers, os_page_size) \ + ((nbuffers) * (Max(1, BLCKSZ / (os_page_size)) + 1)) + PG_MODULE_MAGIC_EXT( .name = "pg_buffercache", .version = PG_VERSION @@ -105,6 +111,33 @@ PG_FUNCTION_INFO_V1(pg_buffercache_evict_all); /* Only need to touch memory once per backend process lifetime */ static bool firstNumaTouch = true; +/* + * Helper function to get buffer page boundaries. + * + * Given a buffer pointer and OS page size, calculates the start/end + * pointers and first page number. + */ +static void +get_buffer_page_boundaries(char *buffptr, Size os_page_size, char *startptr, + char **startptr_buff, char **endptr_buff, + int32 *page_num) +{ + char *start_ptr; + char *end_ptr; + + /* start of the first page of this buffer */ + start_ptr = (char *) TYPEALIGN_DOWN(os_page_size, buffptr); + + /* end of the buffer (no need to align to memory page) */ + end_ptr = buffptr + BLCKSZ; + + Assert(start_ptr < end_ptr); + + /* calculate ID of the first page for this buffer */ + *page_num = (start_ptr - startptr) / os_page_size; + *startptr_buff = start_ptr; + *endptr_buff = end_ptr; +} Datum pg_buffercache_pages(PG_FUNCTION_ARGS) @@ -318,7 +351,6 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS) void **os_page_ptrs; int *os_page_status; uint64 os_page_count; - int pages_per_buffer; int max_entries; char *startptr, *endptr; @@ -426,8 +458,7 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS) * number as we're walking buffers. That way we can do it in one pass, * without reallocating memory. */ - pages_per_buffer = Max(1, BLCKSZ / os_page_size) + 1; - max_entries = NBuffers * pages_per_buffer; + max_entries = GET_MAX_BUFFER_ENTRIES(NBuffers, os_page_size); /* Allocate entries for BufferCachePagesRec records. */ fctx->record = (BufferCacheNumaRec *) @@ -473,16 +504,9 @@ pg_buffercache_numa_pages(PG_FUNCTION_ARGS) bufferid = BufferDescriptorGetBuffer(bufHdr); UnlockBufHdr(bufHdr, buf_state); - /* start of the first page of this buffer */ - startptr_buff = (char *) TYPEALIGN_DOWN(os_page_size, buffptr); - - /* end of the buffer (no need to align to memory page) */ - endptr_buff = buffptr + BLCKSZ; - - Assert(startptr_buff < endptr_buff); - - /* calculate ID of the first page for this buffer */ - page_num = (startptr_buff - startptr) / os_page_size; + /* Get page boundaries for this buffer. */ + get_buffer_page_boundaries(buffptr, os_page_size, startptr, + &startptr_buff, &endptr_buff, &page_num); /* Add an entry for each OS page overlapping with this buffer. */ for (char *ptr = startptr_buff; ptr < endptr_buff; ptr += os_page_size) -- 2.34.1
>From 4c5b1aad892240e72d8e18786e7cc409fb5e8f85 Mon Sep 17 00:00:00 2001 From: Bertrand Drouvot <bertranddrouvot...@gmail.com> Date: Tue, 1 Jul 2025 12:03:55 +0000 Subject: [PATCH v2 2/2] Add os_page_num to pg_buffercache ba2a3c2302f added a way to check if a buffer is spread across multiple pages. Adding the same information in pg_buffercache so that one does not need NUMA support enabled to get this information. --- contrib/pg_buffercache/Makefile | 2 +- .../expected/pg_buffercache.out | 2 +- contrib/pg_buffercache/meson.build | 1 + .../pg_buffercache--1.6--1.7.sql | 11 ++ contrib/pg_buffercache/pg_buffercache.control | 2 +- contrib/pg_buffercache/pg_buffercache_pages.c | 108 ++++++++++++++---- contrib/pg_buffercache/sql/pg_buffercache.sql | 2 +- doc/src/sgml/pgbuffercache.sgml | 34 ++++++ 8 files changed, 133 insertions(+), 29 deletions(-) create mode 100644 contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql diff --git a/contrib/pg_buffercache/Makefile b/contrib/pg_buffercache/Makefile index 5f748543e2e..0e618f66aec 100644 --- a/contrib/pg_buffercache/Makefile +++ b/contrib/pg_buffercache/Makefile @@ -9,7 +9,7 @@ EXTENSION = pg_buffercache DATA = pg_buffercache--1.2.sql pg_buffercache--1.2--1.3.sql \ pg_buffercache--1.1--1.2.sql pg_buffercache--1.0--1.1.sql \ pg_buffercache--1.3--1.4.sql pg_buffercache--1.4--1.5.sql \ - pg_buffercache--1.5--1.6.sql + pg_buffercache--1.5--1.6.sql pg_buffercache--1.6--1.7.sql PGFILEDESC = "pg_buffercache - monitoring of shared buffer cache in real-time" REGRESS = pg_buffercache pg_buffercache_numa diff --git a/contrib/pg_buffercache/expected/pg_buffercache.out b/contrib/pg_buffercache/expected/pg_buffercache.out index 9a9216dc7b1..52e38dcc027 100644 --- a/contrib/pg_buffercache/expected/pg_buffercache.out +++ b/contrib/pg_buffercache/expected/pg_buffercache.out @@ -1,5 +1,5 @@ CREATE EXTENSION pg_buffercache; -select count(*) = (select setting::bigint +select count(*) >= (select setting::bigint from pg_settings where name = 'shared_buffers') from pg_buffercache; diff --git a/contrib/pg_buffercache/meson.build b/contrib/pg_buffercache/meson.build index 7cd039a1df9..7c31141881f 100644 --- a/contrib/pg_buffercache/meson.build +++ b/contrib/pg_buffercache/meson.build @@ -24,6 +24,7 @@ install_data( 'pg_buffercache--1.3--1.4.sql', 'pg_buffercache--1.4--1.5.sql', 'pg_buffercache--1.5--1.6.sql', + 'pg_buffercache--1.6--1.7.sql', 'pg_buffercache.control', kwargs: contrib_data_args, ) diff --git a/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql b/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql new file mode 100644 index 00000000000..f2ac5d79eb8 --- /dev/null +++ b/contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql @@ -0,0 +1,11 @@ +/* contrib/pg_buffercache/pg_buffercache--1.6--1.7.sql */ + +-- complain if script is sourced in psql, rather than via ALTER EXTENSION +\echo Use "ALTER EXTENSION pg_buffercache UPDATE TO '1.7'" to load this file. \quit + +-- Upgrade view to 1.7. format +CREATE OR REPLACE VIEW pg_buffercache AS + SELECT P.* FROM pg_buffercache_pages() AS P + (bufferid integer, relfilenode oid, reltablespace oid, reldatabase oid, + relforknumber int2, relblocknumber int8, isdirty bool, usagecount int2, + pinning_backends int4, os_page_num bigint); diff --git a/contrib/pg_buffercache/pg_buffercache.control b/contrib/pg_buffercache/pg_buffercache.control index b030ba3a6fa..11499550945 100644 --- a/contrib/pg_buffercache/pg_buffercache.control +++ b/contrib/pg_buffercache/pg_buffercache.control @@ -1,5 +1,5 @@ # pg_buffercache extension comment = 'examine the shared buffer cache' -default_version = '1.6' +default_version = '1.7' module_pathname = '$libdir/pg_buffercache' relocatable = true diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c index 8ef13d74186..f46de190975 100644 --- a/contrib/pg_buffercache/pg_buffercache_pages.c +++ b/contrib/pg_buffercache/pg_buffercache_pages.c @@ -19,7 +19,7 @@ #define NUM_BUFFERCACHE_PAGES_MIN_ELEM 8 -#define NUM_BUFFERCACHE_PAGES_ELEM 9 +#define NUM_BUFFERCACHE_PAGES_ELEM 10 #define NUM_BUFFERCACHE_SUMMARY_ELEM 5 #define NUM_BUFFERCACHE_USAGE_COUNTS_ELEM 4 #define NUM_BUFFERCACHE_EVICT_ELEM 2 @@ -60,6 +60,7 @@ typedef struct * because of bufmgr.c's PrivateRefCount infrastructure. */ int32 pinning_backends; + int64 page_num; } BufferCachePagesRec; @@ -69,6 +70,9 @@ typedef struct typedef struct { TupleDesc tupdesc; + int buffers_per_page; + int pages_per_buffer; + int os_page_size; BufferCachePagesRec *record; } BufferCachePagesContext; @@ -152,8 +156,24 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) if (SRF_IS_FIRSTCALL()) { - int i; + int i, + idx; + Size os_page_size; + char *startptr; + int max_entries; + /* + * Different database block sizes (4kB, 8kB, ..., 32kB) can be used, + * while the OS may have different memory page sizes. + * + * To correctly map between them, we need to: 1. Determine the OS + * memory page size 2. Calculate how many OS pages are used by all + * buffer blocks 3. Calculate how many OS pages are contained within + * each database block. + */ + os_page_size = pg_get_shmem_pagesize(); + + /* Initialize the multi-call context, load entries about buffers */ funcctx = SRF_FIRSTCALL_INIT(); /* Switch context when allocating stuff to be used in later calls */ @@ -196,24 +216,36 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) TupleDescInitEntry(tupledesc, (AttrNumber) 8, "usage_count", INT2OID, -1, 0); - if (expected_tupledesc->natts == NUM_BUFFERCACHE_PAGES_ELEM) + if (expected_tupledesc->natts >= (NUM_BUFFERCACHE_PAGES_ELEM - 1)) TupleDescInitEntry(tupledesc, (AttrNumber) 9, "pinning_backends", INT4OID, -1, 0); + if (expected_tupledesc->natts == NUM_BUFFERCACHE_PAGES_ELEM) + TupleDescInitEntry(tupledesc, (AttrNumber) 10, "os_page_num", + INT8OID, -1, 0); + fctx->tupdesc = BlessTupleDesc(tupledesc); + /* + * Each buffer needs at least one entry, but it might be offset in + * some way, and use one extra entry. So we allocate space for the + * maximum number of entries we might need, and then count the exact + * number as we're walking buffers. That way we can do it in one pass, + * without reallocating memory. + */ + max_entries = GET_MAX_BUFFER_ENTRIES(NBuffers, os_page_size); + /* Allocate NBuffers worth of BufferCachePagesRec records. */ fctx->record = (BufferCachePagesRec *) MemoryContextAllocHuge(CurrentMemoryContext, - sizeof(BufferCachePagesRec) * NBuffers); - - /* Set max calls and remember the user function context. */ - funcctx->max_calls = NBuffers; - funcctx->user_fctx = fctx; + sizeof(BufferCachePagesRec) * max_entries); /* Return to original context when allocating transient memory */ MemoryContextSwitchTo(oldcontext); + startptr = (char *) TYPEALIGN_DOWN(os_page_size, (char *) BufferGetBlock(1)); + idx = 0; + /* * Scan through all the buffers, saving the relevant fields in the * fctx->record structure. @@ -224,35 +256,58 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) */ for (i = 0; i < NBuffers; i++) { + char *buffptr = (char *) BufferGetBlock(i + 1); BufferDesc *bufHdr; uint32 buf_state; + int32 page_num; + char *startptr_buff, + *endptr_buff; bufHdr = GetBufferDescriptor(i); /* Lock each buffer header before inspecting. */ buf_state = LockBufHdr(bufHdr); - fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr); - fctx->record[i].relfilenumber = BufTagGetRelNumber(&bufHdr->tag); - fctx->record[i].reltablespace = bufHdr->tag.spcOid; - fctx->record[i].reldatabase = bufHdr->tag.dbOid; - fctx->record[i].forknum = BufTagGetForkNum(&bufHdr->tag); - fctx->record[i].blocknum = bufHdr->tag.blockNum; - fctx->record[i].usagecount = BUF_STATE_GET_USAGECOUNT(buf_state); - fctx->record[i].pinning_backends = BUF_STATE_GET_REFCOUNT(buf_state); + /* Get page boundaries for this buffer. */ + get_buffer_page_boundaries(buffptr, os_page_size, startptr, + &startptr_buff, &endptr_buff, &page_num); - if (buf_state & BM_DIRTY) - fctx->record[i].isdirty = true; - else - fctx->record[i].isdirty = false; + /* Add an entry for each OS page overlapping with this buffer. */ + for (char *ptr = startptr_buff; ptr < endptr_buff; ptr += os_page_size) + { + fctx->record[idx].bufferid = BufferDescriptorGetBuffer(bufHdr); + fctx->record[idx].relfilenumber = BufTagGetRelNumber(&bufHdr->tag); + fctx->record[idx].reltablespace = bufHdr->tag.spcOid; + fctx->record[idx].reldatabase = bufHdr->tag.dbOid; + fctx->record[idx].forknum = BufTagGetForkNum(&bufHdr->tag); + fctx->record[idx].blocknum = bufHdr->tag.blockNum; + fctx->record[idx].usagecount = BUF_STATE_GET_USAGECOUNT(buf_state); + fctx->record[idx].pinning_backends = BUF_STATE_GET_REFCOUNT(buf_state); + + if (buf_state & BM_DIRTY) + fctx->record[idx].isdirty = true; + else + fctx->record[idx].isdirty = false; + + /* Note if the buffer is valid, and has storage created */ + if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID)) + fctx->record[idx].isvalid = true; + else + fctx->record[idx].isvalid = false; - /* Note if the buffer is valid, and has storage created */ - if ((buf_state & BM_VALID) && (buf_state & BM_TAG_VALID)) - fctx->record[i].isvalid = true; - else - fctx->record[i].isvalid = false; + fctx->record[idx].page_num = page_num; + /* advance to the next entry/page */ + ++idx; + ++page_num; + } UnlockBufHdr(bufHdr, buf_state); } + + Assert(idx <= max_entries); + + /* Set max calls and remember the user function context. */ + funcctx->max_calls = idx; + funcctx->user_fctx = fctx; } funcctx = SRF_PERCALL_SETUP(); @@ -285,6 +340,7 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) nulls[7] = true; /* unused for v1.0 callers, but the array is always long enough */ nulls[8] = true; + nulls[9] = true; } else { @@ -305,6 +361,8 @@ pg_buffercache_pages(PG_FUNCTION_ARGS) /* unused for v1.0 callers, but the array is always long enough */ values[8] = Int32GetDatum(fctx->record[i].pinning_backends); nulls[8] = false; + values[9] = Int64GetDatum(fctx->record[i].page_num); + nulls[9] = false; } /* Build and return the tuple. */ diff --git a/contrib/pg_buffercache/sql/pg_buffercache.sql b/contrib/pg_buffercache/sql/pg_buffercache.sql index 47cca1907c7..78ec7a88ea0 100644 --- a/contrib/pg_buffercache/sql/pg_buffercache.sql +++ b/contrib/pg_buffercache/sql/pg_buffercache.sql @@ -1,6 +1,6 @@ CREATE EXTENSION pg_buffercache; -select count(*) = (select setting::bigint +select count(*) >= (select setting::bigint from pg_settings where name = 'shared_buffers') from pg_buffercache; diff --git a/doc/src/sgml/pgbuffercache.sgml b/doc/src/sgml/pgbuffercache.sgml index 537d6014942..6183adf73db 100644 --- a/doc/src/sgml/pgbuffercache.sgml +++ b/doc/src/sgml/pgbuffercache.sgml @@ -205,6 +205,15 @@ Number of backends pinning this buffer </para></entry> </row> + + <row> + <entry role="catalog_table_entry"><para role="column_definition"> + <structfield>os_page_num</structfield> <type>bigint</type> + </para> + <para> + number of OS memory page for this buffer + </para></entry> + </row> </tbody> </tgroup> </table> @@ -550,6 +559,31 @@ regression=# SELECT n.nspname, c.relname, count(*) AS buffers public | spgist_text_tbl | 182 (10 rows) +regression=# SELECT n.nspname, c.relname, count(*) AS buffers_on_multiple_pages + FROM pg_buffercache b JOIN pg_class c + ON b.relfilenode = pg_relation_filenode(c.oid) AND + b.reldatabase IN (0, (SELECT oid FROM pg_database + WHERE datname = current_database())) + JOIN pg_namespace n ON n.oid = c.relnamespace + JOIN (SELECT bufferid FROM pg_buffercache + GROUP BY bufferid HAVING count(*) > 1) m on m.bufferid = b.bufferid + GROUP BY n.nspname, c.relname + ORDER BY 3 DESC + LIMIT 10; + + nspname | relname | buffers_on_multiple_pages +------------+---------------------------------+--------------------------- + public | gin_test_tbl | 4 + public | delete_test_table | 4 + public | tenk1 | 4 + pg_catalog | pg_attribute_relid_attnum_index | 4 + pg_catalog | pg_class | 2 + pg_catalog | pg_depend_depender_index | 2 + pg_catalog | pg_attribute | 2 + pg_catalog | pg_opfamily | 2 + pg_catalog | pg_opclass_oid_index | 2 + pg_catalog | pg_description | 2 +(10 rows) regression=# SELECT * FROM pg_buffercache_summary(); buffers_used | buffers_unused | buffers_dirty | buffers_pinned | usagecount_avg -- 2.34.1