Hello hackers,
02.07.2025 02:53, Michael Paquier wrote:
Catching up on this thread after-the-fact, specifically looking at
29213636e6cd as I did the original check tweaked here for O_RDONLY.
Agreed that a backpatch should be OK as done. The buildfarm looks OK
currently.
Three months later we can see a number of failures produced by that
animal on several branches, e.g. [1]:
timed out after 3600 secs
---
# +++ regress install-check in src/test/modules/test_shm_mq +++
# using postmaster on
/home/demo/build-farm-19.1/buildroot/tmp/buildfarm-GBEDDQ, port 5678
---- hang ----
I've spun up Debian Hurd locally, using [2], and reproduced this, just by
running: `make -s check -C src/test/modules/test_shm_mq/` in a loop.
It's not that easy to see a backtrace of the running processes on that OS,
but with some debug logging (attached), I can see that the test backend
process or shm_mq background worker just gets stuck shortly after poll().
One example:
echo "
log_min_messages = DEBUG5
log_line_prefix = '%m [%p:%l] %q%a '
log_connections = 'true'
log_disconnections = 'true'
log_statement = 'all'
autovacuum = off
" > /tmp/extra.config
for i in {1..100}; do echo "ITERATION $i"; TEMP_CONFIG=/tmp/extra.config \
timeout 60 make -s check -C src/test/modules/test_shm_mq/ || break; done; \
ps -A | grep postgres
...
ITERATION 9
ok 1 - test_shm_mq 14498 ms
ITERATION 10
ok 1 - test_shm_mq 34759 ms
ITERATION 11
# +++ regress check in src/test/modules/test_shm_mq +++
# initializing database system by copying initdb template
# using temp instance on port 58928 with PID 28848
make: *** [../../../../src/makefiles/pgxs.mk:451: check] Terminated
demo 28848 p0 So 0:00.03 postgres -D /home/demo/postgresql/src/test/modules/test_shm_mq/tmp_check/data -F -c
listen_addresses= -k /tmp/pg_regress-McfSYJ
demo 28849 - Ssfo 0:00.06 postgres -D /home/demo/postgresql/src/test/modules/test_shm_mq/tmp_check/data -F -c
listen_addresses= -k /tmp/pg_regress-McfSYJ
demo 28850 - Ssfo 0:00.00 postgres -D /home/demo/postgresql/src/test/modules/test_shm_mq/tmp_check/data -F -c
listen_addresses= -k /tmp/pg_regress-McfSYJ
demo 28851 - Ssfo 0:00.00 postgres -D /home/demo/postgresql/src/test/modules/test_shm_mq/tmp_check/data -F -c
listen_addresses= -k /tmp/pg_regress-McfSYJ
demo 28852 - Ssfo 0:00.00 postgres -D /home/demo/postgresql/src/test/modules/test_shm_mq/tmp_check/data -F -c
listen_addresses= -k /tmp/pg_regress-McfSYJ
demo 28853 - Ssfo 0:00.00 postgres -D /home/demo/postgresql/src/test/modules/test_shm_mq/tmp_check/data -F -c
listen_addresses= -k /tmp/pg_regress-McfSYJ
demo 28855 - Ssfo 0:00.00 postgres -D /home/demo/postgresql/src/test/modules/test_shm_mq/tmp_check/data -F -c
listen_addresses= -k /tmp/pg_regress-McfSYJ
demo 28856 - Ssfo 0:00.00 postgres -D /home/demo/postgresql/src/test/modules/test_shm_mq/tmp_check/data -F -c
listen_addresses= -k /tmp/pg_regress-McfSYJ
demo 28862 - Ssfo 0:01.13 postgres -D /home/demo/postgresql/src/test/modules/test_shm_mq/tmp_check/data -F -c
listen_addresses= -k /tmp/pg_regress-McfSYJ
demo 28866 - Ssfo 0:00.87 postgres -D /home/demo/postgresql/src/test/modules/test_shm_mq/tmp_check/data -F -c
listen_addresses= -k /tmp/pg_regress-McfSYJ
The process 28866 is stuck, it doesn't response to SIGTERM.
src/test/modules/test_shm_mq/log/postmaster.log contains:
2025-09-21 05:48:00.749 BST [28862:23] pg_regress/test_shm_mq LOG: statement: SELECT test_shm_mq(100, (select
string_agg(chr(32+(random()*95)::int), '') from generate_series(1,(100+200*random())::int)), 10000, 1);
2025-09-21 05:48:00.750 BST [28848:59] DEBUG: postmaster received pmsignal
signal
2025-09-21 05:48:00.750 BST [28848:60] DEBUG: registering background worker ""
2025-09-21 05:48:00.750 BST [28848:61] DEBUG: assigned pm child slot 240 for
bgworker
2025-09-21 05:48:00.750 BST [28848:62] DEBUG: starting background worker process
""
2025-09-21 05:48:00.751 BST [28866:1] DEBUG: find_in_path: trying
"/home/demo/postgresql/tmp_install/usr/local/pgsql/lib/test_shm_mq"
2025-09-21 05:48:00.751 BST [28866:2] DEBUG: find_in_path: trying
"/home/demo/postgresql/tmp_install/usr/local/pgsql/lib/test_shm_mq.so"
...
!!!shm_mq_receive_bytes[28866]| before WaitLatch
!!!WaitEventSetWaitBlock POLL[28866]| rc: 1
^^^ the last message from that process ^^^
Other run:
2025-09-21 06:03:21.041 BST [29427:23] pg_regress/test_shm_mq LOG: statement: SELECT test_shm_mq(100, (select
string_agg(chr(32+(random()*95)::int), '') from generate_series(1,(100+200*random())::int)), 10000, 1);
2025-09-21 06:03:21.042 BST [29412:64] DEBUG: postmaster received pmsignal
signal
2025-09-21 06:03:21.042 BST [29412:65] DEBUG: registering background worker ""
2025-09-21 06:03:21.042 BST [29412:66] DEBUG: assigned pm child slot 239 for
bgworker
2025-09-21 06:03:21.042 BST [29412:67] DEBUG: starting background worker process
""
2025-09-21 06:03:21.043 BST [29431:1] DEBUG: find_in_path: trying
"/home/demo/postgresql/tmp_install/usr/local/pgsql/lib/test_shm_mq"
2025-09-21 06:03:21.043 BST [29431:2] DEBUG: find_in_path: trying
"/home/demo/postgresql/tmp_install/usr/local/pgsql/lib/test_shm_mq.so"
!!!WaitEventSetWaitBlock POLL[29431]| rc: 1
!!!WaitEventSetWaitBlock[29431] 1
!!!WaitEventSetWaitBlock[29431] 2
!!!WaitEventSetWaitBlock[29431] 3
!!!WaitEventSetWaitBlock[29431] 4
!!!WaitEventSetWaitBlock[29431] 5
^^^ the last message from the process 29431 ^^^
So it seems to me that Hurd is not mature enough yet to test Postgres.
[1]
https://buildfarm.postgresql.org/cgi-bin/show_log.pl?nm=fruitcrow&dt=2025-09-19%2007%3A29%3A06
[2]
https://cdimage.debian.org/cdimage/ports/latest/hurd-amd64/debian-hurd-amd64-20250807.img.tar.xz
Best regards,
Alexander
diff --git a/src/backend/storage/ipc/shm_mq.c b/src/backend/storage/ipc/shm_mq.c
index 2c79a649f46..bb8eff28581 100644
--- a/src/backend/storage/ipc/shm_mq.c
+++ b/src/backend/storage/ipc/shm_mq.c
@@ -1015,8 +1015,10 @@ shm_mq_send_bytes(shm_mq_handle *mqh, Size nbytes, const void *data,
* at top of loop, because setting an already-set latch is much
* cheaper than setting one that has been reset.
*/
+fprintf(stderr, "!!!shm_mq_send_bytes[%d]| before WaitLatch\n", getpid());
(void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, 0,
WAIT_EVENT_MESSAGE_QUEUE_SEND);
+fprintf(stderr, "!!!shm_mq_send_bytes[%d]| after WaitLatch\n", getpid());
/* Reset the latch so we don't spin. */
ResetLatch(MyLatch);
@@ -1161,8 +1163,10 @@ shm_mq_receive_bytes(shm_mq_handle *mqh, Size bytes_needed, bool nowait,
* loop, because setting an already-set latch is much cheaper than
* setting one that has been reset.
*/
+fprintf(stderr, "!!!shm_mq_receive_bytes[%d]| before WaitLatch\n", getpid());
(void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, 0,
WAIT_EVENT_MESSAGE_QUEUE_RECEIVE);
+fprintf(stderr, "!!!shm_mq_receive_bytes[%d]| after WaitLatch\n", getpid());
/* Reset the latch so we don't spin. */
ResetLatch(MyLatch);
diff --git a/src/backend/storage/ipc/waiteventset.c b/src/backend/storage/ipc/waiteventset.c
index 7c0e66900f9..dcf7b2c111d 100644
--- a/src/backend/storage/ipc/waiteventset.c
+++ b/src/backend/storage/ipc/waiteventset.c
@@ -1497,6 +1497,18 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
/* timeout exceeded */
return -1;
}
+fprintf(stderr, "!!!WaitEventSetWaitBlock POLL[%d]| rc: %d\n", getpid(), rc);
+fprintf(stderr, "!!!WaitEventSetWaitBlock[%d] 1\n", getpid());
+fprintf(stderr, "!!!WaitEventSetWaitBlock[%d] 2\n", getpid());
+fprintf(stderr, "!!!WaitEventSetWaitBlock[%d] 3\n", getpid());
+fprintf(stderr, "!!!WaitEventSetWaitBlock[%d] 4\n", getpid());
+fprintf(stderr, "!!!WaitEventSetWaitBlock[%d] 5\n", getpid());
+fprintf(stderr, "!!!WaitEventSetWaitBlock[%d] 5\n", getpid());
+fprintf(stderr, "!!!WaitEventSetWaitBlock[%d] 6\n", getpid());
+fprintf(stderr, "!!!WaitEventSetWaitBlock[%d] 7\n", getpid());
+fprintf(stderr, "!!!WaitEventSetWaitBlock[%d] 8\n", getpid());
+fprintf(stderr, "!!!WaitEventSetWaitBlock[%d] 9\n", getpid());
+fprintf(stderr, "!!!WaitEventSetWaitBlock[%d] 10\n", getpid());
for (cur_event = set->events, cur_pollfd = set->pollfds;
cur_event < (set->events + set->nevents) &&