Hi, On Tue, Dec 16, 2025 at 10:22:06AM +0000, Bertrand Drouvot wrote: > In the attached
PFA a mandatory rebase due to f4e797171ea. Regards, -- Bertrand Drouvot PostgreSQL Contributors Team RDS Open Source Databases Amazon Web Services: https://aws.amazon.com
>From bd285932cbe23ef9e70916269b6be9a5aacbaac4 Mon Sep 17 00:00:00 2001 From: Bertrand Drouvot <[email protected]> Date: Wed, 1 Oct 2025 09:45:26 +0000 Subject: [PATCH v10 1/2] Key PGSTAT_KIND_RELATION by relfile locator This patch changes the key used for the PGSTAT_KIND_RELATION statistic kind. Instead of the relation oid, it now relies on: - dboid (linked to RelFileLocator's dbOid) - objoid which is the result of a new macro (namely RelFileLocatorToPgStatObjid()) that computes an objoid based on the RelFileLocator's spcOid and the RelFileLocator's relNumber. This is possible as, since b14e9ce7d55c, the objoid is now uint64 and spcOid and relNumber are 32 bits. That will allow us to add new stats (add writes counters) and ensure that some counters (n_dead_tup and friends) are replicated. The patch introduces pgstat_reloid_to_relfilelocator() to 1) avoid calling RelationIdGetRelation() to get the relfilelocator based on the relation oid and 2) handle the partitioned table case. Please note that: - when running pg_stat_have_stats('relation',...) we now need to be connected to the database that hosts the relation. As pg_stat_have_stats() is not documented publicly, then the changes done in 029_stats_restart.pl look enough. - this patch does not handle rewrites so some tests are failing. It's only intent is to ease the review and should not be pushed without being merged with the following patch that handles the rewrites. - it can be used to test that stats are incremented correctly and that we're able to retrieve them as long as rewrites are not involved. --- src/backend/postmaster/autovacuum.c | 17 +- src/backend/utils/activity/pgstat_relation.c | 236 ++++++++++++++++--- src/backend/utils/adt/pgstatfuncs.c | 22 +- src/include/catalog/pg_tablespace.dat | 4 + src/include/catalog/pg_tablespace.h | 8 + src/include/pgstat.h | 15 +- src/include/utils/pgstat_internal.h | 1 + src/test/recovery/t/029_stats_restart.pl | 40 ++-- 8 files changed, 271 insertions(+), 72 deletions(-) 6.1% src/backend/postmaster/ 61.6% src/backend/utils/activity/ 5.1% src/backend/utils/adt/ 3.2% src/include/catalog/ 5.6% src/include/ 18.1% src/test/recovery/t/ diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c index 1bd3924e35e..a11174b25ad 100644 --- a/src/backend/postmaster/autovacuum.c +++ b/src/backend/postmaster/autovacuum.c @@ -2014,12 +2014,16 @@ do_autovacuum(void) bool dovacuum; bool doanalyze; bool wraparound; + RelFileLocator locator; if (classForm->relkind != RELKIND_RELATION && classForm->relkind != RELKIND_MATVIEW) continue; relid = classForm->oid; + locator.dbOid = classForm->relisshared ? InvalidOid : MyDatabaseId; + locator.spcOid = classForm->reltablespace; + locator.relNumber = classForm->relfilenode; /* * Check if it is a temp table (presumably, of some other backend's). @@ -2048,8 +2052,7 @@ do_autovacuum(void) /* Fetch reloptions and the pgstat entry for this table */ relopts = extract_autovac_opts(tuple, pg_class_desc); - tabentry = pgstat_fetch_stat_tabentry_ext(classForm->relisshared, - relid); + tabentry = pgstat_fetch_stat_tabentry_by_locator(locator); /* Check if it needs vacuum or analyze */ relation_needs_vacanalyze(relid, relopts, classForm, tabentry, @@ -2114,6 +2117,7 @@ do_autovacuum(void) bool dovacuum; bool doanalyze; bool wraparound; + RelFileLocator locator; /* * We cannot safely process other backends' temp tables, so skip 'em. @@ -2122,6 +2126,9 @@ do_autovacuum(void) continue; relid = classForm->oid; + locator.dbOid = classForm->relisshared ? InvalidOid : MyDatabaseId; + locator.spcOid = classForm->reltablespace; + locator.relNumber = classForm->relfilenode; /* * fetch reloptions -- if this toast table does not have them, try the @@ -2141,8 +2148,7 @@ do_autovacuum(void) } /* Fetch the pgstat entry for this table */ - tabentry = pgstat_fetch_stat_tabentry_ext(classForm->relisshared, - relid); + tabentry = pgstat_fetch_stat_tabentry_by_locator(locator); relation_needs_vacanalyze(relid, relopts, classForm, tabentry, effective_multixact_freeze_max_age, @@ -2939,8 +2945,7 @@ recheck_relation_needs_vacanalyze(Oid relid, PgStat_StatTabEntry *tabentry; /* fetch the pgstat table entry */ - tabentry = pgstat_fetch_stat_tabentry_ext(classForm->relisshared, - relid); + tabentry = pgstat_fetch_stat_tabentry_ext(relid); relation_needs_vacanalyze(relid, avopts, classForm, tabentry, effective_multixact_freeze_max_age, diff --git a/src/backend/utils/activity/pgstat_relation.c b/src/backend/utils/activity/pgstat_relation.c index 55a10c299db..a267e93f8be 100644 --- a/src/backend/utils/activity/pgstat_relation.c +++ b/src/backend/utils/activity/pgstat_relation.c @@ -17,12 +17,17 @@ #include "postgres.h" +#include "access/htup_details.h" #include "access/twophase_rmgr.h" #include "access/xact.h" #include "catalog/catalog.h" +#include "catalog/pg_tablespace.h" +#include "storage/lmgr.h" #include "utils/memutils.h" #include "utils/pgstat_internal.h" #include "utils/rel.h" +#include "utils/relmapper.h" +#include "utils/syscache.h" #include "utils/timestamp.h" @@ -36,13 +41,12 @@ typedef struct TwoPhasePgStatRecord PgStat_Counter inserted_pre_truncdrop; PgStat_Counter updated_pre_truncdrop; PgStat_Counter deleted_pre_truncdrop; - Oid id; /* table's OID */ - bool shared; /* is it a shared catalog? */ + RelFileLocator locator; /* table's rd_locator */ bool truncdropped; /* was the relation truncated/dropped? */ } TwoPhasePgStatRecord; -static PgStat_TableStatus *pgstat_prep_relation_pending(Oid rel_id, bool isshared); +static PgStat_TableStatus *pgstat_prep_relation_pending(RelFileLocator locator); static void add_tabstat_xact_level(PgStat_TableStatus *pgstat_info, int nest_level); static void ensure_tabstat_xact_level(PgStat_TableStatus *pgstat_info); static void save_truncdrop_counters(PgStat_TableXactStatus *trans, bool is_drop); @@ -60,8 +64,7 @@ pgstat_copy_relation_stats(Relation dst, Relation src) PgStatShared_Relation *dstshstats; PgStat_EntryRef *dst_ref; - srcstats = pgstat_fetch_stat_tabentry_ext(src->rd_rel->relisshared, - RelationGetRelid(src)); + srcstats = pgstat_fetch_stat_tabentry_ext(RelationGetRelid(src)); if (!srcstats) return; @@ -94,8 +97,10 @@ pgstat_init_relation(Relation rel) /* * We only count stats for relations with storage and partitioned tables + * and we don't count stats generated during a rewrite. */ - if (!RELKIND_HAS_STORAGE(relkind) && relkind != RELKIND_PARTITIONED_TABLE) + if ((!RELKIND_HAS_STORAGE(relkind) && relkind != RELKIND_PARTITIONED_TABLE) || + OidIsValid(rel->rd_rel->relrewrite)) { rel->pgstat_enabled = false; rel->pgstat_info = NULL; @@ -130,12 +135,37 @@ pgstat_init_relation(Relation rel) void pgstat_assoc_relation(Relation rel) { + RelFileLocator locator; + Assert(rel->pgstat_enabled); Assert(rel->pgstat_info == NULL); + /* + * Don't associate stats for relations without storage and non partitioned + * tables. + */ + if (!RELKIND_HAS_STORAGE(rel->rd_rel->relkind) && + rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE) + return; + + if (rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE) + locator = rel->rd_locator; + else + { + /* + * Partitioned tables don't have storage, so construct a synthetic + * locator for statistics tracking. Use a reserved pseudo tablespace + * OID that cannot conflict with real tablespaces, and the relation + * OID as relNumber. This ensures no collision with regular relations + * even after OID wraparound. + */ + locator.dbOid = (rel->rd_rel->relisshared ? InvalidOid : MyDatabaseId); + locator.spcOid = PSEUDO_PARTITION_TABLE_SPCOID; + locator.relNumber = rel->rd_id; + } + /* Else find or make the PgStat_TableStatus entry, and update link */ - rel->pgstat_info = pgstat_prep_relation_pending(RelationGetRelid(rel), - rel->rd_rel->relisshared); + rel->pgstat_info = pgstat_prep_relation_pending(locator); /* don't allow link a stats to multiple relcache entries */ Assert(rel->pgstat_info->relation == NULL); @@ -167,9 +197,13 @@ pgstat_unlink_relation(Relation rel) void pgstat_create_relation(Relation rel) { + /* don't track stats for relations without storage */ + if (!RELKIND_HAS_STORAGE(rel->rd_rel->relkind)) + return; + pgstat_create_transactional(PGSTAT_KIND_RELATION, - rel->rd_rel->relisshared ? InvalidOid : MyDatabaseId, - RelationGetRelid(rel)); + rel->rd_locator.dbOid, + RelFileLocatorToPgStatObjid(rel->rd_locator)); } /* @@ -181,9 +215,13 @@ pgstat_drop_relation(Relation rel) int nest_level = GetCurrentTransactionNestLevel(); PgStat_TableStatus *pgstat_info; + /* don't track stats for relations without storage */ + if (!RELKIND_HAS_STORAGE(rel->rd_rel->relkind)) + return; + pgstat_drop_transactional(PGSTAT_KIND_RELATION, - rel->rd_rel->relisshared ? InvalidOid : MyDatabaseId, - RelationGetRelid(rel)); + rel->rd_locator.dbOid, + RelFileLocatorToPgStatObjid(rel->rd_locator)); if (!pgstat_should_count_relation(rel)) return; @@ -213,20 +251,23 @@ pgstat_report_vacuum(Relation rel, PgStat_Counter livetuples, PgStat_EntryRef *entry_ref; PgStatShared_Relation *shtabentry; PgStat_StatTabEntry *tabentry; - Oid dboid = (rel->rd_rel->relisshared ? InvalidOid : MyDatabaseId); TimestampTz ts; PgStat_Counter elapsedtime; + RelFileLocator locator; if (!pgstat_track_counts) return; + locator = rel->rd_locator; + /* Store the data in the table's hash table entry. */ ts = GetCurrentTimestamp(); elapsedtime = TimestampDifferenceMilliseconds(starttime, ts); /* block acquiring lock for the same reason as pgstat_report_autovac() */ - entry_ref = pgstat_get_entry_ref_locked(PGSTAT_KIND_RELATION, dboid, - RelationGetRelid(rel), false); + entry_ref = pgstat_get_entry_ref_locked(PGSTAT_KIND_RELATION, locator.dbOid, + RelFileLocatorToPgStatObjid(locator), + false); shtabentry = (PgStatShared_Relation *) entry_ref->shared_stats; tabentry = &shtabentry->stats; @@ -285,9 +326,9 @@ pgstat_report_analyze(Relation rel, PgStat_EntryRef *entry_ref; PgStatShared_Relation *shtabentry; PgStat_StatTabEntry *tabentry; - Oid dboid = (rel->rd_rel->relisshared ? InvalidOid : MyDatabaseId); TimestampTz ts; PgStat_Counter elapsedtime; + RelFileLocator locator; if (!pgstat_track_counts) return; @@ -325,9 +366,25 @@ pgstat_report_analyze(Relation rel, ts = GetCurrentTimestamp(); elapsedtime = TimestampDifferenceMilliseconds(starttime, ts); + if (rel->rd_rel->relkind != RELKIND_PARTITIONED_TABLE) + locator = rel->rd_locator; + else + { + /* + * Partitioned tables don't have storage, so construct a synthetic + * locator for statistics tracking. Use a reserved pseudo tablespace + * OID that cannot conflict with real tablespaces, and the relation + * OID as relNumber. This ensures no collision with regular relations + * even after OID wraparound. + */ + locator.dbOid = (rel->rd_rel->relisshared ? InvalidOid : MyDatabaseId); + locator.spcOid = PSEUDO_PARTITION_TABLE_SPCOID; + locator.relNumber = rel->rd_id; + } /* block acquiring lock for the same reason as pgstat_report_autovac() */ - entry_ref = pgstat_get_entry_ref_locked(PGSTAT_KIND_RELATION, dboid, - RelationGetRelid(rel), + entry_ref = pgstat_get_entry_ref_locked(PGSTAT_KIND_RELATION, + locator.dbOid, + RelFileLocatorToPgStatObjid(locator), false); /* can't get dropped while accessed */ Assert(entry_ref != NULL && entry_ref->shared_stats != NULL); @@ -468,7 +525,16 @@ pgstat_update_heap_dead_tuples(Relation rel, int delta) PgStat_StatTabEntry * pgstat_fetch_stat_tabentry(Oid relid) { - return pgstat_fetch_stat_tabentry_ext(IsSharedRelation(relid), relid); + return pgstat_fetch_stat_tabentry_ext(relid); +} + +PgStat_StatTabEntry * +pgstat_fetch_stat_tabentry_by_locator(RelFileLocator locator) +{ + return (PgStat_StatTabEntry *) pgstat_fetch_entry( + PGSTAT_KIND_RELATION, + locator.dbOid, + RelFileLocatorToPgStatObjid(locator)); } /* @@ -476,12 +542,14 @@ pgstat_fetch_stat_tabentry(Oid relid) * whether the to-be-accessed table is a shared relation or not. */ PgStat_StatTabEntry * -pgstat_fetch_stat_tabentry_ext(bool shared, Oid reloid) +pgstat_fetch_stat_tabentry_ext(Oid reloid) { - Oid dboid = (shared ? InvalidOid : MyDatabaseId); + RelFileLocator locator; - return (PgStat_StatTabEntry *) - pgstat_fetch_entry(PGSTAT_KIND_RELATION, dboid, reloid); + if (!pgstat_reloid_to_relfilelocator(reloid, &locator)) + return NULL; + + return pgstat_fetch_stat_tabentry_by_locator(locator); } /* @@ -503,14 +571,17 @@ find_tabstat_entry(Oid rel_id) PgStat_TableXactStatus *trans; PgStat_TableStatus *tabentry = NULL; PgStat_TableStatus *tablestatus = NULL; + RelFileLocator locator; + + if (!pgstat_reloid_to_relfilelocator(rel_id, &locator)) + return NULL; + + entry_ref = pgstat_fetch_pending_entry(PGSTAT_KIND_RELATION, + locator.dbOid, + RelFileLocatorToPgStatObjid(locator)); - entry_ref = pgstat_fetch_pending_entry(PGSTAT_KIND_RELATION, MyDatabaseId, rel_id); if (!entry_ref) - { - entry_ref = pgstat_fetch_pending_entry(PGSTAT_KIND_RELATION, InvalidOid, rel_id); - if (!entry_ref) - return tablestatus; - } + return tablestatus; tabentry = (PgStat_TableStatus *) entry_ref->pending; tablestatus = palloc_object(PgStat_TableStatus); @@ -706,8 +777,12 @@ AtPrepare_PgStat_Relations(PgStat_SubXactStatus *xact_state) record.inserted_pre_truncdrop = trans->inserted_pre_truncdrop; record.updated_pre_truncdrop = trans->updated_pre_truncdrop; record.deleted_pre_truncdrop = trans->deleted_pre_truncdrop; - record.id = tabstat->id; - record.shared = tabstat->shared; + + if (tabstat->relation != NULL) + record.locator = tabstat->relation->rd_locator; + else + record.locator = tabstat->locator; + record.truncdropped = trans->truncdropped; RegisterTwoPhaseRecord(TWOPHASE_RM_PGSTAT_ID, 0, @@ -750,7 +825,7 @@ pgstat_twophase_postcommit(FullTransactionId fxid, uint16 info, PgStat_TableStatus *pgstat_info; /* Find or create a tabstat entry for the rel */ - pgstat_info = pgstat_prep_relation_pending(rec->id, rec->shared); + pgstat_info = pgstat_prep_relation_pending(rec->locator); /* Same math as in AtEOXact_PgStat, commit case */ pgstat_info->counts.tuples_inserted += rec->tuples_inserted; @@ -785,8 +860,8 @@ pgstat_twophase_postabort(FullTransactionId fxid, uint16 info, TwoPhasePgStatRecord *rec = (TwoPhasePgStatRecord *) recdata; PgStat_TableStatus *pgstat_info; - /* Find or create a tabstat entry for the rel */ - pgstat_info = pgstat_prep_relation_pending(rec->id, rec->shared); + /* Find or create a tabstat entry for the target locator */ + pgstat_info = pgstat_prep_relation_pending(rec->locator); /* Same math as in AtEOXact_PgStat, abort case */ if (rec->truncdropped) @@ -920,17 +995,21 @@ pgstat_relation_reset_timestamp_cb(PgStatShared_Common *header, TimestampTz ts) * initialized if not exists. */ static PgStat_TableStatus * -pgstat_prep_relation_pending(Oid rel_id, bool isshared) +pgstat_prep_relation_pending(RelFileLocator locator) { PgStat_EntryRef *entry_ref; PgStat_TableStatus *pending; + uint64 objid; + + objid = RelFileLocatorToPgStatObjid(locator); entry_ref = pgstat_prep_pending_entry(PGSTAT_KIND_RELATION, - isshared ? InvalidOid : MyDatabaseId, - rel_id, NULL); + locator.dbOid, + objid, NULL); + pending = entry_ref->pending; - pending->id = rel_id; - pending->shared = isshared; + pending->id = objid; + pending->locator = locator; return pending; } @@ -1009,3 +1088,82 @@ restore_truncdrop_counters(PgStat_TableXactStatus *trans) trans->tuples_deleted = trans->deleted_pre_truncdrop; } } + +/* + * Convert a relation OID to its corresponding RelFileLocator for statistics + * tracking purposes. + * + * Returns true on success, false if the relation doesn't need statistics + * tracking. + * + * For partitioned tables, constructs a synthetic locator using the relation + * OID as relNumber, since they don't have storage. + */ +bool +pgstat_reloid_to_relfilelocator(Oid reloid, RelFileLocator *locator) +{ + HeapTuple tuple; + Form_pg_class relform; + bool result = true; + + /* get the relation's tuple from pg_class */ + tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(reloid)); + + if (!HeapTupleIsValid(tuple)) + return false; + + relform = (Form_pg_class) GETSTRUCT(tuple); + + /* skip relations without storage and non partitioned tables */ + if (!RELKIND_HAS_STORAGE(relform->relkind) && + relform->relkind != RELKIND_PARTITIONED_TABLE) + { + ReleaseSysCache(tuple); + return false; + } + + if (relform->relkind != RELKIND_PARTITIONED_TABLE) + { + /* build the RelFileLocator */ + locator->relNumber = relform->relfilenode; + locator->spcOid = relform->reltablespace; + + /* handle default tablespace */ + if (!OidIsValid(locator->spcOid)) + locator->spcOid = MyDatabaseTableSpace; + + /* handle dbOid for global vs local relations */ + if (locator->spcOid == GLOBALTABLESPACE_OID) + locator->dbOid = InvalidOid; + else + locator->dbOid = MyDatabaseId; + + /* handle mapped relations */ + if (!RelFileNumberIsValid(locator->relNumber)) + { + locator->relNumber = RelationMapOidToFilenumber(reloid, + relform->relisshared); + if (!RelFileNumberIsValid(locator->relNumber)) + { + ReleaseSysCache(tuple); + return false; + } + } + } + else + { + /* + * Partitioned tables don't have storage, so construct a synthetic + * locator for statistics tracking. Use a reserved pseudo tablespace + * OID that cannot conflict with real tablespaces, and the relation + * OID as relNumber. This ensures no collision with regular relations + * even after OID wraparound. + */ + locator->dbOid = (relform->relisshared ? InvalidOid : MyDatabaseId); + locator->spcOid = PSEUDO_PARTITION_TABLE_SPCOID; + locator->relNumber = relform->oid; + } + + ReleaseSysCache(tuple); + return result; +} diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c index ef6fffe60b9..60ffb1679ec 100644 --- a/src/backend/utils/adt/pgstatfuncs.c +++ b/src/backend/utils/adt/pgstatfuncs.c @@ -23,13 +23,13 @@ #include "common/ip.h" #include "funcapi.h" #include "miscadmin.h" -#include "pgstat.h" #include "postmaster/bgworker.h" #include "replication/logicallauncher.h" #include "storage/proc.h" #include "storage/procarray.h" #include "utils/acl.h" #include "utils/builtins.h" +#include "utils/pgstat_internal.h" #include "utils/timestamp.h" #define UINT32_ACCESS_ONCE(var) ((uint32)(*((volatile uint32 *)&(var)))) @@ -1949,9 +1949,14 @@ Datum pg_stat_reset_single_table_counters(PG_FUNCTION_ARGS) { Oid taboid = PG_GETARG_OID(0); - Oid dboid = (IsSharedRelation(taboid) ? InvalidOid : MyDatabaseId); + RelFileLocator locator; - pgstat_reset(PGSTAT_KIND_RELATION, dboid, taboid); + /* Get the stats locator from the relation OID */ + if (!pgstat_reloid_to_relfilelocator(taboid, &locator)) + PG_RETURN_VOID(); + + pgstat_reset(PGSTAT_KIND_RELATION, locator.dbOid, + RelFileLocatorToPgStatObjid(locator)); PG_RETURN_VOID(); } @@ -2305,5 +2310,16 @@ pg_stat_have_stats(PG_FUNCTION_ARGS) uint64 objid = PG_GETARG_INT64(2); PgStat_Kind kind = pgstat_get_kind_from_str(stats_type); + /* Convert relation OID to relfilenode objid */ + if (kind == PGSTAT_KIND_RELATION) + { + RelFileLocator locator; + + if (!pgstat_reloid_to_relfilelocator(objid, &locator)) + PG_RETURN_BOOL(false); + + objid = RelFileLocatorToPgStatObjid(locator); + } + PG_RETURN_BOOL(pgstat_have_entry(kind, dboid, objid)); } diff --git a/src/include/catalog/pg_tablespace.dat b/src/include/catalog/pg_tablespace.dat index 1302a3d75cd..9430970fffd 100644 --- a/src/include/catalog/pg_tablespace.dat +++ b/src/include/catalog/pg_tablespace.dat @@ -10,6 +10,10 @@ # #---------------------------------------------------------------------- +/* + * When adding a new one, ensure it does not conflict with + * PSEUDO_PARTITION_TABLE_SPCOID. + */ [ { oid => '1663', oid_symbol => 'DEFAULTTABLESPACE_OID', diff --git a/src/include/catalog/pg_tablespace.h b/src/include/catalog/pg_tablespace.h index 7816d779d8c..0e2d8051d69 100644 --- a/src/include/catalog/pg_tablespace.h +++ b/src/include/catalog/pg_tablespace.h @@ -21,6 +21,14 @@ #include "catalog/genbki.h" #include "catalog/pg_tablespace_d.h" /* IWYU pragma: export */ +/* + * Reserved tablespace OID for partitioned table pseudo locators. + * This is not an actual tablespace, just a reserved value to distinguish + * partitioned table statistics from regular table statistics. Ensures it does + * not conflict with the ones in pg_tablespace.dat. + */ +#define PSEUDO_PARTITION_TABLE_SPCOID 1665 + /* ---------------- * pg_tablespace definition. cpp turns this into * typedef struct FormData_pg_tablespace diff --git a/src/include/pgstat.h b/src/include/pgstat.h index 6714363144a..3102f86aa24 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -17,6 +17,7 @@ #include "postmaster/pgarch.h" /* for MAX_XFN_CHARS */ #include "replication/conflict.h" #include "replication/worker_internal.h" +#include "storage/relfilelocator.h" #include "utils/backend_progress.h" /* for backward compatibility */ /* IWYU pragma: export */ #include "utils/backend_status.h" /* for backward compatibility */ /* IWYU pragma: export */ #include "utils/pgstat_kind.h" @@ -35,6 +36,12 @@ /* Default directory to store temporary statistics data in */ #define PG_STAT_TMP_DIR "pg_stat_tmp" +/* + * Build a pgstat key Objid based on a RelFileLocator. + */ +#define RelFileLocatorToPgStatObjid(locator) \ + (((uint64) (locator).spcOid << 32) | (locator).relNumber) + /* Values for track_functions GUC variable --- order is significant! */ typedef enum TrackFunctionsLevel { @@ -175,11 +182,11 @@ typedef struct PgStat_TableCounts */ typedef struct PgStat_TableStatus { - Oid id; /* table's OID */ - bool shared; /* is it a shared catalog? */ + uint64 id; /* hash of relfilelocator for stats key */ struct PgStat_TableXactStatus *trans; /* lowest subxact's counts */ PgStat_TableCounts counts; /* event counts to be sent */ Relation relation; /* rel that is using this entry */ + RelFileLocator locator; /* table's relfilelocator */ } PgStat_TableStatus; /* ---------- @@ -735,8 +742,8 @@ extern void pgstat_twophase_postabort(FullTransactionId fxid, uint16 info, void *recdata, uint32 len); extern PgStat_StatTabEntry *pgstat_fetch_stat_tabentry(Oid relid); -extern PgStat_StatTabEntry *pgstat_fetch_stat_tabentry_ext(bool shared, - Oid reloid); +extern PgStat_StatTabEntry *pgstat_fetch_stat_tabentry_by_locator(RelFileLocator locator); +extern PgStat_StatTabEntry *pgstat_fetch_stat_tabentry_ext(Oid reloid); extern PgStat_TableStatus *find_tabstat_entry(Oid rel_id); diff --git a/src/include/utils/pgstat_internal.h b/src/include/utils/pgstat_internal.h index 5c1ce4d3d6a..7b24928b00d 100644 --- a/src/include/utils/pgstat_internal.h +++ b/src/include/utils/pgstat_internal.h @@ -764,6 +764,7 @@ extern void PostPrepare_PgStat_Relations(PgStat_SubXactStatus *xact_state); extern bool pgstat_relation_flush_cb(PgStat_EntryRef *entry_ref, bool nowait); extern void pgstat_relation_delete_pending_cb(PgStat_EntryRef *entry_ref); extern void pgstat_relation_reset_timestamp_cb(PgStatShared_Common *header, TimestampTz ts); +extern bool pgstat_reloid_to_relfilelocator(Oid reloid, RelFileLocator *locator); /* diff --git a/src/test/recovery/t/029_stats_restart.pl b/src/test/recovery/t/029_stats_restart.pl index 021e2bf361f..3a9c05eaf10 100644 --- a/src/test/recovery/t/029_stats_restart.pl +++ b/src/test/recovery/t/029_stats_restart.pl @@ -55,10 +55,10 @@ trigger_funcrel_stat(); # verify stats objects exist $sect = "initial"; -is(have_stats('database', $dboid, 0), 't', "$sect: db stats do exist"); -is(have_stats('function', $dboid, $funcoid), +is(have_stats($connect_db, 'database', $dboid, 0), 't', "$sect: db stats do exist"); +is(have_stats($db_under_test, 'function', $dboid, $funcoid), 't', "$sect: function stats do exist"); -is(have_stats('relation', $dboid, $tableoid), +is(have_stats($db_under_test, 'relation', $dboid, $tableoid), 't', "$sect: relation stats do exist"); # regular shutdown @@ -79,10 +79,10 @@ copy($og_stats, $statsfile) or die "Copy failed: $!"; $node->start; $sect = "copy"; -is(have_stats('database', $dboid, 0), 't', "$sect: db stats do exist"); -is(have_stats('function', $dboid, $funcoid), +is(have_stats($connect_db, 'database', $dboid, 0), 't', "$sect: db stats do exist"); +is(have_stats($db_under_test, 'function', $dboid, $funcoid), 't', "$sect: function stats do exist"); -is(have_stats('relation', $dboid, $tableoid), +is(have_stats($db_under_test, 'relation', $dboid, $tableoid), 't', "$sect: relation stats do exist"); $node->stop('immediate'); @@ -96,10 +96,10 @@ $node->start; # stats should have been discarded $sect = "post immediate"; -is(have_stats('database', $dboid, 0), 'f', "$sect: db stats do not exist"); -is(have_stats('function', $dboid, $funcoid), +is(have_stats($connect_db, 'database', $dboid, 0), 'f', "$sect: db stats do not exist"); +is(have_stats($db_under_test, 'function', $dboid, $funcoid), 'f', "$sect: function stats do exist"); -is(have_stats('relation', $dboid, $tableoid), +is(have_stats($db_under_test, 'relation', $dboid, $tableoid), 'f', "$sect: relation stats do not exist"); # get rid of backup statsfile @@ -110,10 +110,10 @@ unlink $statsfile or die "cannot unlink $statsfile $!"; trigger_funcrel_stat(); $sect = "post immediate, new"; -is(have_stats('database', $dboid, 0), 't', "$sect: db stats do exist"); -is(have_stats('function', $dboid, $funcoid), +is(have_stats($connect_db, 'database', $dboid, 0), 't', "$sect: db stats do exist"); +is(have_stats($db_under_test, 'function', $dboid, $funcoid), 't', "$sect: function stats do exist"); -is(have_stats('relation', $dboid, $tableoid), +is(have_stats($db_under_test, 'relation', $dboid, $tableoid), 't', "$sect: relation stats do exist"); # regular shutdown @@ -129,10 +129,10 @@ $node->start; # no stats present due to invalid stats file $sect = "invalid_overwrite"; -is(have_stats('database', $dboid, 0), 'f', "$sect: db stats do not exist"); -is(have_stats('function', $dboid, $funcoid), +is(have_stats($connect_db, 'database', $dboid, 0), 'f', "$sect: db stats do not exist"); +is(have_stats($db_under_test, 'function', $dboid, $funcoid), 'f', "$sect: function stats do not exist"); -is(have_stats('relation', $dboid, $tableoid), +is(have_stats($db_under_test, 'relation', $dboid, $tableoid), 'f', "$sect: relation stats do not exist"); @@ -145,10 +145,10 @@ append_file($og_stats, "XYZ"); $node->start; $sect = "invalid_append"; -is(have_stats('database', $dboid, 0), 'f', "$sect: db stats do not exist"); -is(have_stats('function', $dboid, $funcoid), +is(have_stats($connect_db, 'database', $dboid, 0), 'f', "$sect: db stats do not exist"); +is(have_stats($db_under_test, 'function', $dboid, $funcoid), 'f', "$sect: function stats do not exist"); -is(have_stats('relation', $dboid, $tableoid), +is(have_stats($db_under_test, 'relation', $dboid, $tableoid), 'f', "$sect: relation stats do not exist"); @@ -307,9 +307,9 @@ sub trigger_funcrel_stat sub have_stats { - my ($kind, $dboid, $objid) = @_; + my ($db, $kind, $dboid, $objid) = @_; - return $node->safe_psql($connect_db, + return $node->safe_psql($db, "SELECT pg_stat_have_stats('$kind', $dboid, $objid)"); } -- 2.34.1
>From 272952b5631f092d9bcfdc7865dbf7ae9b8a2b23 Mon Sep 17 00:00:00 2001 From: Bertrand Drouvot <[email protected]> Date: Tue, 4 Nov 2025 13:52:46 +0000 Subject: [PATCH v10 2/2] handle relation statistics correctly during rewrites Now that PGSTAT_KIND_RELATION is keyed by refilenode, we need to handle rewrites. To do so, this patch: - Adds PgStat_PendingRewrite, a new struct to track rewrite operations within a transaction, storing the old locator, new locator, and original locator (for rewrite chains). This allows stats to be copied from the original location to the final location at commit time. - Adds a new function, pgstat_mark_rewrite(), called when a table rewrite begins. It records the rewrite operation in a local list and detects rewrite chains by checking if the old_locator matches any existing new_locator, preserving the chain's original_locator. - Modifies pgstat_copy_relation_stats(), to accept RelFileLocators instead of Relations, with a new increment parameter to accumulate stats (needed for rewrite chains with DML between rewrites). - Ensures that AtEOXact_PgStat_Relations(), AtPrepare_PgStat_Relations(), pgstat_twophase_postcommit()/postabort() pgstat_drop_relation() handle the PgStat_PendingRewrite list correctly. Note that due to the new flush call in pgstat_twophase_postcommit() we can not call GetCurrentTransactionStopTimestamp() in pgstat_relation_flush_cb(). So, adding a check to handle this special case and call GetCurrentTimestamp() instead. Note that we'd call GetCurrentTimestamp() only if there is a rewrite, so that the GetCurrentTimestamp() extra cost should be negligible. Another solution could be to trigger the flush from FinishPreparedTransaction() but that's not worth the extra complexity. The new pending_rewrites list is traversed in multiple places. The overhead should be negligible in comparison to a rewrite and the list should not contain a lot of rewrites in practice. The pending_rewrites list is traversed in multiple places. In typical usage, the list will contain only a few entries so the traversal cost is negligible ( furthermore in comparison to a rewrite). --- src/backend/catalog/index.c | 2 +- src/backend/commands/cluster.c | 5 + src/backend/commands/tablecmds.c | 6 + src/backend/utils/activity/pgstat_relation.c | 391 ++++++++++++++++++- src/backend/utils/activity/pgstat_xact.c | 25 +- src/backend/utils/cache/relcache.c | 6 + src/include/pgstat.h | 5 +- src/tools/pgindent/typedefs.list | 1 + 8 files changed, 424 insertions(+), 17 deletions(-) 92.8% src/backend/utils/activity/ 4.9% src/backend/ diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 8dea58ad96b..b71925a22c3 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -1795,7 +1795,7 @@ index_concurrently_swap(Oid newIndexId, Oid oldIndexId, const char *oldName) changeDependenciesOn(RelationRelationId, oldIndexId, newIndexId); /* copy over statistics from old to new index */ - pgstat_copy_relation_stats(newClassRel, oldClassRel); + pgstat_copy_relation_stats(newClassRel->rd_locator, oldClassRel->rd_locator, false); /* Copy data of pg_statistic from the old index to the new one */ CopyStatistics(oldIndexId, newIndexId); diff --git a/src/backend/commands/cluster.c b/src/backend/commands/cluster.c index 2120c85ccb4..6155b12afab 100644 --- a/src/backend/commands/cluster.c +++ b/src/backend/commands/cluster.c @@ -1196,6 +1196,11 @@ swap_relation_files(Oid r1, Oid r2, bool target_is_pg_class, rel1 = relation_open(r1, NoLock); rel2 = relation_open(r2, NoLock); + + /* Mark that a rewrite happened */ + if (RELKIND_HAS_STORAGE(rel1->rd_rel->relkind)) + pgstat_mark_rewrite(rel1->rd_locator, rel2->rd_locator); + rel2->rd_createSubid = rel1->rd_createSubid; rel2->rd_newRelfilelocatorSubid = rel1->rd_newRelfilelocatorSubid; rel2->rd_firstRelfilelocatorSubid = rel1->rd_firstRelfilelocatorSubid; diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 6b1a00ed477..9de70f321ed 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -16884,6 +16884,7 @@ ATExecSetTableSpace(Oid tableOid, Oid newTableSpace, LOCKMODE lockmode) Oid reltoastrelid; RelFileNumber newrelfilenumber; RelFileLocator newrlocator; + RelFileLocator oldrlocator; List *reltoastidxids = NIL; ListCell *lc; @@ -16922,6 +16923,7 @@ ATExecSetTableSpace(Oid tableOid, Oid newTableSpace, LOCKMODE lockmode) newrlocator = rel->rd_locator; newrlocator.relNumber = newrelfilenumber; newrlocator.spcOid = newTableSpace; + oldrlocator = rel->rd_locator; /* hand off to AM to actually create new rel storage and copy the data */ if (rel->rd_rel->relkind == RELKIND_INDEX) @@ -16934,6 +16936,10 @@ ATExecSetTableSpace(Oid tableOid, Oid newTableSpace, LOCKMODE lockmode) table_relation_copy_data(rel, &newrlocator); } + /* mark that a rewrite happened */ + if (RELKIND_HAS_STORAGE(rel->rd_rel->relkind)) + pgstat_mark_rewrite(oldrlocator, newrlocator); + /* * Update the pg_class row. * diff --git a/src/backend/utils/activity/pgstat_relation.c b/src/backend/utils/activity/pgstat_relation.c index a267e93f8be..123fb50d98f 100644 --- a/src/backend/utils/activity/pgstat_relation.c +++ b/src/backend/utils/activity/pgstat_relation.c @@ -30,6 +30,19 @@ #include "utils/syscache.h" #include "utils/timestamp.h" +/* Pending rewrite operations for stats copying */ +typedef struct PgStat_PendingRewrite +{ + RelFileLocator old_locator; + RelFileLocator new_locator; + RelFileLocator original_locator; + int nest_level; /* Transaction nesting level where rewrite + * occurred */ + struct PgStat_PendingRewrite *next; +} PgStat_PendingRewrite; + +/* The pending rewrites list for current transaction */ +static PgStat_PendingRewrite *pending_rewrites = NULL; /* Record that's written to 2PC state file when pgstat state is persisted */ typedef struct TwoPhasePgStatRecord @@ -43,6 +56,8 @@ typedef struct TwoPhasePgStatRecord PgStat_Counter deleted_pre_truncdrop; RelFileLocator locator; /* table's rd_locator */ bool truncdropped; /* was the relation truncated/dropped? */ + RelFileLocator rewrite_old_locator; + int rewrite_nest_level; } TwoPhasePgStatRecord; @@ -54,27 +69,70 @@ static void restore_truncdrop_counters(PgStat_TableXactStatus *trans); /* - * Copy stats between relations. This is used for things like REINDEX + * Copy stats between RelFileLocator. This is used for things like REINDEX * CONCURRENTLY. */ void -pgstat_copy_relation_stats(Relation dst, Relation src) +pgstat_copy_relation_stats(RelFileLocator dst, RelFileLocator src, bool increment) { PgStat_StatTabEntry *srcstats; PgStatShared_Relation *dstshstats; PgStat_EntryRef *dst_ref; - srcstats = pgstat_fetch_stat_tabentry_ext(RelationGetRelid(src)); + srcstats = (PgStat_StatTabEntry *) pgstat_fetch_entry(PGSTAT_KIND_RELATION, + src.dbOid, + RelFileLocatorToPgStatObjid(src)); if (!srcstats) return; dst_ref = pgstat_get_entry_ref_locked(PGSTAT_KIND_RELATION, - dst->rd_rel->relisshared ? InvalidOid : MyDatabaseId, - RelationGetRelid(dst), + dst.dbOid, + RelFileLocatorToPgStatObjid(dst), false); dstshstats = (PgStatShared_Relation *) dst_ref->shared_stats; - dstshstats->stats = *srcstats; + + if (!increment) + dstshstats->stats = *srcstats; + else + { + /* Increment those statistics */ +#define RELFSTAT_ACC(fld, stats_to_add) \ + (dstshstats->stats.fld += stats_to_add->fld) + RELFSTAT_ACC(numscans, srcstats); + RELFSTAT_ACC(tuples_returned, srcstats); + RELFSTAT_ACC(tuples_fetched, srcstats); + RELFSTAT_ACC(tuples_inserted, srcstats); + RELFSTAT_ACC(tuples_updated, srcstats); + RELFSTAT_ACC(tuples_deleted, srcstats); + RELFSTAT_ACC(tuples_hot_updated, srcstats); + RELFSTAT_ACC(tuples_newpage_updated, srcstats); + RELFSTAT_ACC(live_tuples, srcstats); + RELFSTAT_ACC(dead_tuples, srcstats); + RELFSTAT_ACC(mod_since_analyze, srcstats); + RELFSTAT_ACC(ins_since_vacuum, srcstats); + RELFSTAT_ACC(blocks_fetched, srcstats); + RELFSTAT_ACC(blocks_hit, srcstats); + RELFSTAT_ACC(vacuum_count, srcstats); + RELFSTAT_ACC(autovacuum_count, srcstats); + RELFSTAT_ACC(analyze_count, srcstats); + RELFSTAT_ACC(autoanalyze_count, srcstats); + RELFSTAT_ACC(total_vacuum_time, srcstats); + RELFSTAT_ACC(total_autovacuum_time, srcstats); + RELFSTAT_ACC(total_analyze_time, srcstats); + RELFSTAT_ACC(total_autoanalyze_time, srcstats); +#undef RELFSTAT_ACC + + /* Replace those statistics */ +#define RELFSTAT_REP(fld, stats_to_rep) \ + (dstshstats->stats.fld = stats_to_rep->fld) + RELFSTAT_REP(lastscan, srcstats); + RELFSTAT_REP(last_vacuum_time, srcstats); + RELFSTAT_REP(last_autovacuum_time, srcstats); + RELFSTAT_REP(last_analyze_time, srcstats); + RELFSTAT_REP(last_autoanalyze_time, srcstats); +#undef RELFSTAT_REP + } pgstat_unlock_entry(dst_ref); } @@ -136,6 +194,7 @@ void pgstat_assoc_relation(Relation rel) { RelFileLocator locator; + PgStat_TableStatus *pgstat_info; Assert(rel->pgstat_enabled); Assert(rel->pgstat_info == NULL); @@ -164,14 +223,54 @@ pgstat_assoc_relation(Relation rel) locator.relNumber = rel->rd_id; } + /* + * If this relation was rewritten during the current transaction we may be + * reopening it with its new RelFileLocator. In that case, continue using + * the stats entry associated with the old locator rather than creating a + * new one. This ensures all stats from before and after the rewrite are + * tracked in a single entry which will be properly copied to the new + * locator at transaction commit. + */ + if (pending_rewrites != NULL) + { + PgStat_PendingRewrite *rewrite; + + for (rewrite = pending_rewrites; rewrite != NULL; rewrite = rewrite->next) + { + if (locator.dbOid == rewrite->new_locator.dbOid && + locator.spcOid == rewrite->new_locator.spcOid && + locator.relNumber == rewrite->new_locator.relNumber) + { + pgstat_info = pgstat_prep_relation_pending(rewrite->old_locator); + goto found_entry; + } + } + } + /* Else find or make the PgStat_TableStatus entry, and update link */ - rel->pgstat_info = pgstat_prep_relation_pending(locator); + pgstat_info = pgstat_prep_relation_pending(locator); + +found_entry: + rel->pgstat_info = pgstat_info; + + /* + * For relations stats, we key by physical file location, not by relation + * OID. This means during operations like ALTER TYPE it's possible that + * the relation OID changes but the relfilenode stays the same (no actual + * rewrite needed). Unlink the old relation first. + */ + if (pgstat_info->relation != NULL && + pgstat_info->relation != rel) + { + pgstat_info->relation->pgstat_info = NULL; + pgstat_info->relation = NULL; + } /* don't allow link a stats to multiple relcache entries */ - Assert(rel->pgstat_info->relation == NULL); + Assert(pgstat_info->relation == NULL); /* mark this relation as the owner */ - rel->pgstat_info->relation = rel; + pgstat_info->relation = rel; } /* @@ -214,14 +313,37 @@ pgstat_drop_relation(Relation rel) { int nest_level = GetCurrentTransactionNestLevel(); PgStat_TableStatus *pgstat_info; + bool skip_transactional_drop = false; /* don't track stats for relations without storage */ if (!RELKIND_HAS_STORAGE(rel->rd_rel->relkind)) return; - pgstat_drop_transactional(PGSTAT_KIND_RELATION, - rel->rd_locator.dbOid, - RelFileLocatorToPgStatObjid(rel->rd_locator)); + /* Check if this drop is part of a pending rewrite */ + if (pending_rewrites != NULL) + { + PgStat_PendingRewrite *rewrite; + + for (rewrite = pending_rewrites; rewrite != NULL; rewrite = rewrite->next) + { + if (rel->rd_locator.dbOid == rewrite->old_locator.dbOid && + rel->rd_locator.spcOid == rewrite->old_locator.spcOid && + rel->rd_locator.relNumber == rewrite->old_locator.relNumber) + { + skip_transactional_drop = true; + break; + } + } + } + + /* + * If it is part of a rewrite, drop its stats later, for example in + * AtEOXact_PgStat_Relations(), so skip it here. + */ + if (!skip_transactional_drop) + pgstat_drop_transactional(PGSTAT_KIND_RELATION, + rel->rd_locator.dbOid, + RelFileLocatorToPgStatObjid(rel->rd_locator)); if (!pgstat_should_count_relation(rel)) return; @@ -666,6 +788,48 @@ AtEOXact_PgStat_Relations(PgStat_SubXactStatus *xact_state, bool isCommit) } tabstat->trans = NULL; } + + /* preserve the stats in case of rewrite */ + if (isCommit && pending_rewrites != NULL) + { + PgStat_PendingRewrite *rewrite; + PgStat_PendingRewrite *prev = NULL; + PgStat_PendingRewrite *current = pending_rewrites; + PgStat_PendingRewrite *next; + + /* reverse the rewrites list to process in chronological order */ + while (current != NULL) + { + next = current->next; + current->next = prev; + prev = current; + current = next; + } + + /* now process rewrites in chronological order */ + for (rewrite = prev; rewrite != NULL; rewrite = rewrite->next) + { + PgStat_EntryRef *old_entry_ref; + + old_entry_ref = pgstat_fetch_pending_entry(PGSTAT_KIND_RELATION, + rewrite->old_locator.dbOid, + RelFileLocatorToPgStatObjid(rewrite->old_locator)); + + if (old_entry_ref && old_entry_ref->pending) + pgstat_relation_flush_cb(old_entry_ref, false); + + pgstat_copy_relation_stats(rewrite->new_locator, + rewrite->old_locator, true); + + /* drop old locator's stats */ + if (!pgstat_drop_entry(PGSTAT_KIND_RELATION, + rewrite->old_locator.dbOid, + RelFileLocatorToPgStatObjid(rewrite->old_locator))) + pgstat_request_entry_refs_gc(); + } + } + + pending_rewrites = NULL; } /* @@ -681,6 +845,30 @@ AtEOSubXact_PgStat_Relations(PgStat_SubXactStatus *xact_state, bool isCommit, in PgStat_TableXactStatus *trans; PgStat_TableXactStatus *next_trans; + /* + * If we don't commit then remove the associated rewrites if any, to keep + * the rewrite chain in sync with what will be eventually committed. + */ + if (!isCommit) + { + PgStat_PendingRewrite **rewrite_ptr = &pending_rewrites; + + while (*rewrite_ptr != NULL) + { + if ((*rewrite_ptr)->nest_level >= nestDepth) + { + PgStat_PendingRewrite *to_remove = *rewrite_ptr; + + *rewrite_ptr = (*rewrite_ptr)->next; + pfree(to_remove); + } + else + { + rewrite_ptr = &((*rewrite_ptr)->next); + } + } + } + for (trans = xact_state->first; trans != NULL; trans = next_trans) { PgStat_TableStatus *tabstat; @@ -760,11 +948,19 @@ void AtPrepare_PgStat_Relations(PgStat_SubXactStatus *xact_state) { PgStat_TableXactStatus *trans; + PgStat_PendingRewrite *rewrite; + /* + * For each tabstat, find its matching rewrite and remove it from the + * pending rewrites list. This way, after processing all tabstats, pending + * rewrites will only contain rewrite only transactions. + */ for (trans = xact_state->first; trans != NULL; trans = trans->next) { PgStat_TableStatus *tabstat PG_USED_FOR_ASSERTS_ONLY; TwoPhasePgStatRecord record; + PgStat_PendingRewrite **rewrite_ptr; + bool found_rewrite = false; Assert(trans->nest_level == 1); Assert(trans->upper == NULL); @@ -784,10 +980,83 @@ AtPrepare_PgStat_Relations(PgStat_SubXactStatus *xact_state) record.locator = tabstat->locator; record.truncdropped = trans->truncdropped; + record.rewrite_nest_level = 0; + + /* + * Look for a matching rewrite and remove it from pending rewrites. We + * check three possible matches: + * + * The new_locator when stats have been added after the rewrite. The + * old_locator when stats have been added before the rewrite but not + * after. The original_locator when this tabstat is part of a rewrite + * chain. + */ + rewrite_ptr = &pending_rewrites; + while (*rewrite_ptr != NULL) + { + rewrite = *rewrite_ptr; + + if ((record.locator.dbOid == rewrite->new_locator.dbOid && + record.locator.spcOid == rewrite->new_locator.spcOid && + record.locator.relNumber == rewrite->new_locator.relNumber) || + (tabstat->locator.dbOid == rewrite->old_locator.dbOid && + tabstat->locator.spcOid == rewrite->old_locator.spcOid && + tabstat->locator.relNumber == rewrite->old_locator.relNumber) || + (tabstat->locator.dbOid == rewrite->original_locator.dbOid && + tabstat->locator.spcOid == rewrite->original_locator.spcOid && + tabstat->locator.relNumber == rewrite->original_locator.relNumber)) + { + /* + * Found matching rewrite. Record the rewrite information and + * remove this rewrite from the list since it's now handled. + */ + record.rewrite_old_locator = rewrite->original_locator; + record.rewrite_nest_level = rewrite->nest_level; + record.locator = rewrite->new_locator; + found_rewrite = true; + + /* Remove from pending_rewrites list */ + *rewrite_ptr = rewrite->next; + pfree(rewrite); + break; + } + else + { + /* Move to next rewrite in the list */ + rewrite_ptr = &(rewrite->next); + } + } + + /* If no rewrite found, clear the rewrite fields */ + if (!found_rewrite) + { + memset(&record.rewrite_old_locator, 0, sizeof(RelFileLocator)); + } + + RegisterTwoPhaseRecord(TWOPHASE_RM_PGSTAT_ID, 0, + &record, sizeof(TwoPhasePgStatRecord)); + } + + /* + * Now process any rewrites still pending. These are rewrite only + * transactions. We need to preserve their stats even though there's no + * tabstat entry for them. + */ + for (rewrite = pending_rewrites; rewrite != NULL; rewrite = rewrite->next) + { + TwoPhasePgStatRecord record; + + memset(&record, 0, sizeof(TwoPhasePgStatRecord)); + record.locator = rewrite->new_locator; + record.rewrite_old_locator = rewrite->original_locator; + record.rewrite_nest_level = rewrite->nest_level; + record.truncdropped = false; RegisterTwoPhaseRecord(TWOPHASE_RM_PGSTAT_ID, 0, &record, sizeof(TwoPhasePgStatRecord)); } + + pending_rewrites = NULL; } /* @@ -810,6 +1079,8 @@ PostPrepare_PgStat_Relations(PgStat_SubXactStatus *xact_state) tabstat = trans->parent; tabstat->trans = NULL; } + + pending_rewrites = NULL; } /* @@ -845,6 +1116,29 @@ pgstat_twophase_postcommit(FullTransactionId fxid, uint16 info, pgstat_info->counts.changed_tuples += rec->tuples_inserted + rec->tuples_updated + rec->tuples_deleted; + + if (rec->rewrite_nest_level > 0) + { + PgStat_EntryRef *old_entry_ref; + + /* Flush any pending stats for old locator first */ + old_entry_ref = pgstat_fetch_pending_entry(PGSTAT_KIND_RELATION, + rec->rewrite_old_locator.dbOid, + RelFileLocatorToPgStatObjid(rec->rewrite_old_locator)); + + if (old_entry_ref && old_entry_ref->pending) + pgstat_relation_flush_cb(old_entry_ref, false); + + /* Copy stats from old to new locator */ + pgstat_copy_relation_stats(rec->locator, rec->rewrite_old_locator, + true); + + /* Drop old locator's stats */ + if (!pgstat_drop_entry(PGSTAT_KIND_RELATION, + rec->rewrite_old_locator.dbOid, + RelFileLocatorToPgStatObjid(rec->rewrite_old_locator))) + pgstat_request_entry_refs_gc(); + } } /* @@ -859,9 +1153,26 @@ pgstat_twophase_postabort(FullTransactionId fxid, uint16 info, { TwoPhasePgStatRecord *rec = (TwoPhasePgStatRecord *) recdata; PgStat_TableStatus *pgstat_info; + RelFileLocator target_locator; + + /* + * For aborted transactions with rewrites (like TRUNCATE), we need to + * restore stats to the old locator, not the new one. The new locator + * should be dropped since the rewrite is being rolled back. + */ + if (rec->rewrite_nest_level > 0) + { + /* Use the old locator */ + target_locator = rec->rewrite_old_locator; + } + else + { + /* No rewrite, use the original locator */ + target_locator = rec->locator; + } /* Find or create a tabstat entry for the target locator */ - pgstat_info = pgstat_prep_relation_pending(rec->locator); + pgstat_info = pgstat_prep_relation_pending(target_locator); /* Same math as in AtEOXact_PgStat, abort case */ if (rec->truncdropped) @@ -916,7 +1227,17 @@ pgstat_relation_flush_cb(PgStat_EntryRef *entry_ref, bool nowait) tabentry->numscans += lstats->counts.numscans; if (lstats->counts.numscans) { - TimestampTz t = GetCurrentTransactionStopTimestamp(); + TimestampTz t; + + /* + * Checking the transaction state due to the flush call in + * pgstat_twophase_postcommit() that would break the assertion on the + * state in GetCurrentTransactionStopTimestamp(). + */ + if (!IsTransactionState()) + t = GetCurrentTransactionStopTimestamp(); + else + t = GetCurrentTimestamp(); if (t > tabentry->lastscan) tabentry->lastscan = t; @@ -1167,3 +1488,45 @@ pgstat_reloid_to_relfilelocator(Oid reloid, RelFileLocator *locator) ReleaseSysCache(tuple); return result; } + +/* + * Mark that a relation rewrite has occurred, preserving the original locator + * so stats can be copied at transaction commit. + */ +void +pgstat_mark_rewrite(RelFileLocator old_locator, RelFileLocator new_locator) +{ + PgStat_PendingRewrite *rewrite; + PgStat_PendingRewrite *existing; + RelFileLocator original_locator = old_locator; + + for (existing = pending_rewrites; existing != NULL; existing = existing->next) + { + if (old_locator.dbOid == existing->new_locator.dbOid && + old_locator.spcOid == existing->new_locator.spcOid && + old_locator.relNumber == existing->new_locator.relNumber) + { + original_locator = existing->original_locator; + break; + } + } + + /* Allocate in TopTransactionContext memory context */ + rewrite = MemoryContextAlloc(TopTransactionContext, + sizeof(PgStat_PendingRewrite)); + + rewrite->old_locator = old_locator; + rewrite->new_locator = new_locator; + rewrite->original_locator = original_locator; + rewrite->nest_level = GetCurrentTransactionNestLevel(); + + /* Add to the list */ + rewrite->next = pending_rewrites; + pending_rewrites = rewrite; +} + +void +pgstat_clear_rewrite(void) +{ + pending_rewrites = NULL; +} diff --git a/src/backend/utils/activity/pgstat_xact.c b/src/backend/utils/activity/pgstat_xact.c index bc9864bd8d9..f8cf3755ce2 100644 --- a/src/backend/utils/activity/pgstat_xact.c +++ b/src/backend/utils/activity/pgstat_xact.c @@ -55,6 +55,8 @@ AtEOXact_PgStat(bool isCommit, bool parallel) } pgStatXactStack = NULL; + pgstat_clear_rewrite(); + /* Make sure any stats snapshot is thrown away */ pgstat_clear_snapshot(); } @@ -360,8 +362,29 @@ create_drop_transactional_internal(PgStat_Kind kind, Oid dboid, uint64 objid, bo void pgstat_create_transactional(PgStat_Kind kind, Oid dboid, uint64 objid) { - if (pgstat_get_entry_ref(kind, dboid, objid, false, NULL)) + PgStat_EntryRef *entry_ref; + + entry_ref = pgstat_get_entry_ref(kind, dboid, objid, false, NULL); + + if (entry_ref) { + /* + * For relations stats, we key by physical file location, not by + * relation OID. This means during operations like ALTER TYPE where + * the relation OID changes but the relfilenode stays the same (no + * actual rewrite needed), we'll find an existing entry. + * + * This is expected behavior, we want to preserve stats across the + * catalog change. Simply reset and recreate the entry for the new + * relation OID without warning. + */ + if (kind == PGSTAT_KIND_RELATION) + { + pgstat_reset(kind, dboid, objid); + create_drop_transactional_internal(kind, dboid, objid, true); + return; + } + ereport(WARNING, errmsg("resetting existing statistics for kind %s, db=%u, oid=%" PRIu64, (pgstat_get_kind_info(kind))->name, dboid, diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index 2d0cb7bcfd4..c98e5c51d63 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -85,6 +85,7 @@ #include "utils/inval.h" #include "utils/lsyscache.h" #include "utils/memutils.h" +#include "utils/pgstat_internal.h" #include "utils/relmapper.h" #include "utils/resowner.h" #include "utils/snapmgr.h" @@ -3780,6 +3781,7 @@ RelationSetNewRelfilenumber(Relation relation, char persistence) MultiXactId minmulti = InvalidMultiXactId; TransactionId freezeXid = InvalidTransactionId; RelFileLocator newrlocator; + RelFileLocator oldrlocator = relation->rd_locator; if (!IsBinaryUpgrade) { @@ -3951,6 +3953,10 @@ RelationSetNewRelfilenumber(Relation relation, char persistence) table_close(pg_class, RowExclusiveLock); + /* Mark that a rewrite happened */ + if (RELKIND_HAS_STORAGE(relation->rd_rel->relkind)) + pgstat_mark_rewrite(oldrlocator, newrlocator); + /* * Make the pg_class row change or relation map change visible. This will * cause the relcache entry to get updated, too. diff --git a/src/include/pgstat.h b/src/include/pgstat.h index 3102f86aa24..8750c025bbe 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -670,7 +670,7 @@ extern PgStat_FunctionCounts *find_funcstat_entry(Oid func_id); extern void pgstat_create_relation(Relation rel); extern void pgstat_drop_relation(Relation rel); -extern void pgstat_copy_relation_stats(Relation dst, Relation src); +extern void pgstat_copy_relation_stats(RelFileLocator dst, RelFileLocator src, bool increment); extern void pgstat_init_relation(Relation rel); extern void pgstat_assoc_relation(Relation rel); @@ -682,6 +682,9 @@ extern void pgstat_report_vacuum(Relation rel, PgStat_Counter livetuples, extern void pgstat_report_analyze(Relation rel, PgStat_Counter livetuples, PgStat_Counter deadtuples, bool resetcounter, TimestampTz starttime); +extern void pgstat_mark_rewrite(RelFileLocator old_locator, + RelFileLocator new_locator); +extern void pgstat_clear_rewrite(void); /* * If stats are enabled, but pending data hasn't been prepared yet, call diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 04845d5e680..3184e9c464b 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -2269,6 +2269,7 @@ PgStat_KindInfo PgStat_LocalState PgStat_PendingDroppedStatsItem PgStat_PendingIO +PgStat_PendingRewrite PgStat_SLRUStats PgStat_ShmemControl PgStat_Snapshot -- 2.34.1
