This is an automated email from the ASF dual-hosted git repository.
maxyang pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/cloudberry.git
The following commit(s) were added to refs/heads/main by this push:
new f225eeb2f3d FIX: Invalid relcache leak WARNING logged in autovacuum
f225eeb2f3d is described below
commit f225eeb2f3da9018d5a120df4f07a35c3e6e9810
Author: zhoujiaqi <[email protected]>
AuthorDate: Mon Aug 11 13:52:15 2025 +0800
FIX: Invalid relcache leak WARNING logged in autovacuum
The autovacuum launcher process periodically launches workers to vacuum the
table.
During this process, the UDF `pg_catalog.gp_acquire_sample_rows` will be
called.
Also the vacuum task always be canceled by launcher.
The plan of `pg_catalog.gp_acquire_sample_rows` is:
```
QUERY PLAN
---------------------------------------------------------------------------------
Gather Motion 3:1 (slice1; segments: 3) (cost=0.00..45.02 rows=3000
width=32)
Output: (gp_acquire_sample_rows('17018'::oid, 1250, false))
-> ProjectSet (cost=0.00..5.02 rows=1000 width=32)
Output: gp_acquire_sample_rows('17018'::oid, 1250, false)
-> Result (cost=0.00..0.01 rows=1 width=0)
Optimizer: Postgres query optimizer
(6 rows)
```
In actual examples, we often encounter relcache leaks caused by
`pg_catalog.gp_acquire_sample_rows`.
In fact, this warning is not caused by the UDF itself.
The following are the complete steps to reproduce(not stable reproduce)
1. User use the insert/update/delete SQL. Auto-vacuum is enabled.
2. The auto-vacuum worker process call the
`pg_catalog.gp_acquire_sample_rows`
2.1 The vacuum launches in master cancel the vacuum query.
2.2 The vacuum worker in master process the interrupt in the
intercontect.
So the gather motion will be aborted.
2.3 The segment do the tuple sender in the motion(`doSendTuple`).But it
found the connection is NOT alive. Also it have not recv the SIGN
INT
in this time.
So segment mark the `StopRequested` to true, and finish the current
motion,
and the function `pg_catalog.gp_acquire_sample_rows` in project set
can't
call the `table_close` in this time.
2.4 The segment call the `PortalDrop` to destory the resowner which
inside the current
portal, and current portal status won't be FAIL, because current
segment still
have not recv the SIGN INT.
The resowner found the leaked relcache, log the WARNING.
3. After step2, segments recv the SIGN INT, But nothing to do.
---
contrib/interconnect/udp/ic_udpifc.c | 5 ++++-
src/backend/executor/nodeMotion.c | 3 ++-
src/backend/utils/mmgr/portalmem.c | 3 ++-
src/include/utils/portal.h | 3 +++
src/test/regress/expected/vacuum_gp.out | 28 ++++++++++++++++++++++++++++
src/test/regress/sql/vacuum_gp.sql | 17 ++++++++++++++++-
6 files changed, 55 insertions(+), 4 deletions(-)
diff --git a/contrib/interconnect/udp/ic_udpifc.c
b/contrib/interconnect/udp/ic_udpifc.c
index 63e8c9301dd..c28511fe828 100644
--- a/contrib/interconnect/udp/ic_udpifc.c
+++ b/contrib/interconnect/udp/ic_udpifc.c
@@ -3965,7 +3965,10 @@ receiveChunksUDPIFCLoop(ChunkTransportState
*pTransportStates, ChunkTransportSta
/* check the potential errors in rx thread. */
checkRxThreadError();
- /* do not check interrupts when holding the lock */
+ FaultInjector_InjectFaultIfSet("interconnect_stop_recv_chunk",
+
DDLNotSpecified,
+
"" /* databaseName */ ,
+
"" /* tableName */ );
ML_CHECK_FOR_INTERRUPTS(pTransportStates->teardownActive);
/*
diff --git a/src/backend/executor/nodeMotion.c
b/src/backend/executor/nodeMotion.c
index b6514f2a0ce..1229d460a55 100644
--- a/src/backend/executor/nodeMotion.c
+++ b/src/backend/executor/nodeMotion.c
@@ -31,7 +31,7 @@
#include "utils/wait_event.h"
#include "miscadmin.h"
#include "utils/memutils.h"
-
+#include "tcop/pquery.h" /* ActivePortal */
/* #define MEASURE_MOTION_TIME */
@@ -269,6 +269,7 @@ execMotionSender(MotionState *node)
if (node->stopRequested)
{
+ ActivePortal->stop_requested_in_motion = true;
elog(gp_workfile_caching_loglevel, "Motion
calling Squelch on child node");
/* propagate stop notification to our children
*/
ExecSquelchNode(outerNode, true);
diff --git a/src/backend/utils/mmgr/portalmem.c
b/src/backend/utils/mmgr/portalmem.c
index 45812a608e0..2835fbd10f1 100644
--- a/src/backend/utils/mmgr/portalmem.c
+++ b/src/backend/utils/mmgr/portalmem.c
@@ -224,6 +224,7 @@ CreatePortal(const char *name, bool allowDup, bool
dupSilent)
portal->atEnd = true; /* disallow fetches until query is set
*/
portal->visible = true;
portal->creation_time = GetCurrentStatementStartTimestamp();
+ portal->stop_requested_in_motion = false;
if (IsResQueueEnabled())
{
@@ -587,7 +588,7 @@ PortalDrop(Portal portal, bool isTopCommit)
if (portal->resowner &&
(!isTopCommit || portal->status == PORTAL_FAILED))
{
- bool isCommit = (portal->status != PORTAL_FAILED);
+ bool isCommit = (portal->status != PORTAL_FAILED) &&
!portal->stop_requested_in_motion;
ResourceOwnerRelease(portal->resowner,
RESOURCE_RELEASE_BEFORE_LOCKS,
diff --git a/src/include/utils/portal.h b/src/include/utils/portal.h
index fb940fa1ecf..e592d1e8863 100644
--- a/src/include/utils/portal.h
+++ b/src/include/utils/portal.h
@@ -225,6 +225,9 @@ typedef struct PortalData
/* MPP: is this portal a CURSOR, or protocol level portal? */
bool is_extended_query; /* simple or extended query
protocol? */
+
+ /* current motion stop requested? */
+ bool stop_requested_in_motion;
} PortalData;
/*
diff --git a/src/test/regress/expected/vacuum_gp.out
b/src/test/regress/expected/vacuum_gp.out
index 35d1a1e4fed..daeb2504559 100644
--- a/src/test/regress/expected/vacuum_gp.out
+++ b/src/test/regress/expected/vacuum_gp.out
@@ -437,3 +437,31 @@ SELECT reltuples, relname FROM pg_class WHERE
oid='vac_reltuple_distortion'::reg
1e+06 | vac_reltuple_distortion
(1 row)
+-- test wrong log relcache leak in pg_catalog.gp_acquire_sample_rows
+-- start_ignore
+drop table if exists relcache_leak_in_motion;
+NOTICE: table "relcache_leak_in_motion" does not exist, skipping
+-- end_ignore
+create table relcache_leak_in_motion(v1 int);
+NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'v1'
as the Apache Cloudberry data distribution key for this table.
+HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make
sure column(s) chosen are the optimal data distribution key to minimize skew.
+insert into relcache_leak_in_motion values(generate_series(0, 10000));
+SELECT gp_inject_fault('interconnect_stop_recv_chunk', 'interrupt', dbid)
+ FROM gp_segment_configuration WHERE content = -1 and role='p';
+ gp_inject_fault
+-----------------
+ Success:
+(1 row)
+
+analyze relcache_leak_in_motion;
+ERROR: canceling statement due to user request
+SELECT gp_inject_fault('interconnect_stop_recv_chunk', 'reset', dbid)
+ FROM gp_segment_configuration WHERE content = -1 and role='p';
+ gp_inject_fault
+-----------------
+ Success:
+(1 row)
+
+-- start_ignore
+drop table if exists relcache_leak_in_motion;
+-- end_ignore
diff --git a/src/test/regress/sql/vacuum_gp.sql
b/src/test/regress/sql/vacuum_gp.sql
index 04f88df9e0c..198a80f4a93 100644
--- a/src/test/regress/sql/vacuum_gp.sql
+++ b/src/test/regress/sql/vacuum_gp.sql
@@ -290,4 +290,19 @@ VACUUM vac_reltuple_distortion;
VACUUM vac_reltuple_distortion; -- 2nd call to VACUUM after ANALYZE
SELECT reltuples, relname FROM pg_class WHERE
oid='vac_reltuple_distortion'::regclass;
VACUUM vac_reltuple_distortion;
-SELECT reltuples, relname FROM pg_class WHERE
oid='vac_reltuple_distortion'::regclass;
\ No newline at end of file
+SELECT reltuples, relname FROM pg_class WHERE
oid='vac_reltuple_distortion'::regclass;
+
+-- test wrong log relcache leak in pg_catalog.gp_acquire_sample_rows
+-- start_ignore
+drop table if exists relcache_leak_in_motion;
+-- end_ignore
+create table relcache_leak_in_motion(v1 int);
+insert into relcache_leak_in_motion values(generate_series(0, 10000));
+SELECT gp_inject_fault('interconnect_stop_recv_chunk', 'interrupt', dbid)
+ FROM gp_segment_configuration WHERE content = -1 and role='p';
+analyze relcache_leak_in_motion;
+SELECT gp_inject_fault('interconnect_stop_recv_chunk', 'reset', dbid)
+ FROM gp_segment_configuration WHERE content = -1 and role='p';
+-- start_ignore
+drop table if exists relcache_leak_in_motion;
+-- end_ignore
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]