This is an automated email from the ASF dual-hosted git repository.

gfphoenix78 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/cloudberry.git


The following commit(s) were added to refs/heads/main by this push:
     new e51dd0806a5 Fix: the reader's wait for the writer proc entry should be
synchronized with the creation timeout retry logic of cdbgang_createGang_async,
so that slow gang creation (due to platform, container, network or other
reasons) does not cause the reader to prematurely consider it an abnormal
termination.
e51dd0806a5 is described below

commit e51dd0806a5cc9f10d2a0951c9c53a6e20cfdff1
Author: zhaoxi <[email protected]>
AuthorDate: Fri Nov 21 21:09:57 2025 +0800

    Fix: the reader's wait for the writer proc entry should be synchronized
with the creation timeout retry logic of cdbgang_createGang_async, so that slow
gang creation (due to platform, container, network or other reasons) does not
cause the reader to prematurely consider it an abnormal termination.
    
    2025-11-20 11:48:27.925475 
CST,"gpadmin","regression",p14056,th-1958096896,"172.18.0.2","40060",2025-11-20 
11:48:27 CST,0,con33,,seg0,,,,sx1,"WARNING","58M01","reader could not find 
writer proc entry","lock [0,1260] AccessShareLock 0. Probably because writer 
gang is gone somehow. Maybe try rerunning.",,,,,,0,,"lock.c",963,"Stack trace:
    1    0xaaaab4db9f14 postgres errstart + 0x494
    2    0xaaaab4b9b064 postgres LockAcquireExtended + 0x76c
    3    0xaaaab4b97d98 postgres LockRelationOid + 0x3c
    4    0xaaaab44a6e30 postgres relation_open + 0x60
    5    0xaaaab45a04e8 postgres table_open + 0x1c
    6    0xaaaab4d7f3e8 postgres <symbol not found> + 0xb4d7f3e8
    7    0xaaaab4d7fdcc postgres <symbol not found> + 0xb4d7fdcc
    8    0xaaaab4d7fc5c postgres SearchCatCache1 + 0x2c
    9    0xaaaab4da0258 postgres SearchSysCache1 + 0xb4
    10   0xaaaab4dd6f48 postgres InitializeSessionUserId + 0x98
    11   0xaaaab4dda874 postgres InitPostgres + 0x504
    12   0xaaaab4bc93bc postgres PostgresMain + 0x390
    13   0xaaaab4ac90b8 postgres <symbol not found> + 0xb4ac90b8
    14   0xaaaab4ac8918 postgres <symbol not found> + 0xb4ac8918
    15   0xaaaab4ac3114 postgres <symbol not found> + 0xb4ac3114
    16   0xaaaab4ac2804 postgres PostmasterMain + 0x1668
    17   0xaaaab4936b50 postgres <symbol not found> + 0xb4936b50
    18   0xffff8b4f1724 libc.so.6 __libc_start_main + 0xf0
    19   0xaaaab448327c postgres <symbol not found> + 0xb448327c
---
 src/backend/storage/lmgr/lock.c | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c
index cbc873e0b70..faf69b1581f 100644
--- a/src/backend/storage/lmgr/lock.c
+++ b/src/backend/storage/lmgr/lock.c
@@ -939,11 +939,26 @@ LockAcquireExtended(const LOCKTAG *locktag,
                        {
                                /* Find the guy who should manage our locks */
                                volatile PGPROC * proc = 
FindProcByGpSessionId(gp_session_id);
-                               int count = 0;
-                               while(proc==NULL && count < 
find_writer_proc_retry_time)
+                               TimestampTz current_time;
+                               TimestampTz start_time;
+                               long        elapsed_secs;
+                               int         elapsed_usecs;
+                               start_time = GetCurrentTimestamp();
+
+                               while (proc == NULL)
                                {
+                                       /*
+                                        * The creation timeout retry logic of 
cdbgang_createGang_async
+                                        * should be synchronized with the 
reader to avoid slow creation
+                                        * due to platform, container, network 
and other reasons, 
+                                        * which would cause the reader to 
prematurely consider it an abnormal termination.
+                                        */
+                                       current_time = GetCurrentTimestamp();
+                                       TimestampDifference(start_time, 
current_time, &elapsed_secs, &elapsed_usecs);
+                                       if (elapsed_secs >= 
gp_segment_connect_timeout / 2)
+                                               break;
+
                                        pg_usleep( /* microseconds */ 2000);
-                                       count++;
                                        CHECK_FOR_INTERRUPTS();
                                        /*
                                         * The reason for using 
pg_memory_barrier() is to ensure that
@@ -954,7 +969,7 @@ LockAcquireExtended(const LOCKTAG *locktag,
                                }
                                if (proc != NULL)
                                {
-                                       elog(DEBUG1,"Found writer proc entry.  
My Pid %d, his pid %d", MyProc-> pid, proc->pid);
+                                       elog(DEBUG1, "Found writer proc entry.  
My Pid %d, his pid %d", MyProc-> pid, proc->pid);
                                        lockHolderProcPtr = (PGPROC*) proc;
                                }
                                else


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to