Hello Andres,
11.02.2022 05:22, Andres Freund wrote:
> Over in another thread I made some wild unsubstantiated guesses that the
> windows issues could have been made much more likely by a somewhat odd bit of
> code in PQisBusy():
>
> https://postgr.es/m/1959196.1644544971%40sss.pgh.pa.us
>
> Alexander, any chance you'd try if that changes the likelihood of the problem
> occurring, without any other fixes / reverts applied?
Unfortunately I haven't seen an improvement for the test in question.
With the PQisBusy-fix.patch from [1] and without any other changes on
the master branch (52377bb8) it still fails (on iterations 13, 5, 2, 2
for me).
The diagnostic logging (in attachment) added:
2022-02-12 01:04:16.341 PST [4912] LOG: libpqrcv_receive: PQgetCopyData
returned 0
2022-02-12 01:04:16.341 PST [4912] LOG: libpqrcv_receive: PQgetCopyData
2 returned -1
2022-02-12 01:04:16.341 PST [4912] LOG: libpqrcv_receive:
end-of-streaming or error: -1
2022-02-12 01:04:16.341 PST [4912] LOG: libpqrcv_PQgetResult:
streamConn->asyncStatus: 1 && streamConn->status: 0
2022-02-12 01:04:16.341 PST [4912] LOG: libpqrcv_receive
libpqrcv_PQgetResult returned 10551584, 1
2022-02-12 01:04:16.341 PST [4912] LOG: libpqrcv_receive
libpqrcv_PQgetResult PGRES_COMMAND_OK
2022-02-12 01:04:16.341 PST [4912] LOG: libpqrcv_PQgetResult:
streamConn->asyncStatus: 1 && streamConn->status: 0
2022-02-12 01:04:16.341 PST [4912] LOG: libpqrcv_PQgetResult loop
before WaitLatchOrSocket
2022-02-12 01:04:16.341 PST [4912] LOG: WSAEventSelect event->fd: 948,
flags: 21
2022-02-12 01:04:16.341 PST [4912] LOG: WaitLatchOrSocket before
WaitEventSetWait
2022-02-12 01:04:16.341 PST [4912] LOG: WaitEventSetWait before
WaitEventSetWaitBlock
2022-02-12 01:04:16.341 PST [4912] LOG: WaitEventSetWaitBlock before
WaitForMultipleObjects: 3
...
shows that before the doomed WaitForMultipleObjects() call the field
conn->status is 0 (CONNECTION_OK).
[1] https://www.postgresql.org/message-id/2187263.1644616494%40sss.pgh.pa.us
Best regards,
Alexander
diff --git a/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c b/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c
index 0d89db4e6a..0776885306 100644
--- a/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c
+++ b/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c
@@ -34,6 +34,9 @@
#include "utils/pg_lsn.h"
#include "utils/tuplestore.h"
+#undef ENABLE_THREAD_SAFETY
+#include "libpq-int.h"
+
PG_MODULE_MAGIC;
void _PG_init(void);
@@ -696,10 +699,13 @@ libpqrcv_PQgetResult(PGconn *streamConn)
* Collect data until PQgetResult is ready to get the result without
* blocking.
*/
+elog(LOG, "libpqrcv_PQgetResult: streamConn->asyncStatus: %d && streamConn->status: %d", streamConn->asyncStatus, streamConn->status);
+
while (PQisBusy(streamConn))
{
int rc;
+elog(LOG, "libpqrcv_PQgetResult loop before WaitLatchOrSocket");
/*
* We don't need to break down the sleep into smaller increments,
* since we'll get interrupted by signals and can handle any
@@ -711,6 +717,7 @@ libpqrcv_PQgetResult(PGconn *streamConn)
PQsocket(streamConn),
0,
WAIT_EVENT_LIBPQWALRECEIVER_RECEIVE);
+elog(LOG, "libpqrcv_PQgetResult loop after WaitLatchOrSocket: %d");
/* Interrupted? */
if (rc & WL_LATCH_SET)
@@ -771,6 +778,7 @@ libpqrcv_receive(WalReceiverConn *conn, char **buffer,
/* Try to receive a CopyData message */
rawlen = PQgetCopyData(conn->streamConn, &conn->recvBuf, 1);
+elog(LOG, "libpqrcv_receive: PQgetCopyData returned %d", rawlen);
if (rawlen == 0)
{
/* Try consuming some data. */
@@ -782,6 +790,7 @@ libpqrcv_receive(WalReceiverConn *conn, char **buffer,
/* Now that we've consumed some input, try again */
rawlen = PQgetCopyData(conn->streamConn, &conn->recvBuf, 1);
+elog(LOG, "libpqrcv_receive: PQgetCopyData 2 returned %d", rawlen);
if (rawlen == 0)
{
/* Tell caller to try again when our socket is ready. */
@@ -791,15 +800,20 @@ libpqrcv_receive(WalReceiverConn *conn, char **buffer,
}
if (rawlen == -1) /* end-of-streaming or error */
{
+elog(LOG, "libpqrcv_receive: end-of-streaming or error: %d", rawlen);
+
PGresult *res;
res = libpqrcv_PQgetResult(conn->streamConn);
+elog(LOG, "libpqrcv_receive libpqrcv_PQgetResult returned %d, %d", res, PQresultStatus(res));
if (PQresultStatus(res) == PGRES_COMMAND_OK)
{
+elog(LOG, "libpqrcv_receive libpqrcv_PQgetResult PGRES_COMMAND_OK");
PQclear(res);
/* Verify that there are no more results. */
res = libpqrcv_PQgetResult(conn->streamConn);
+elog(LOG, "libpqrcv_receive libpqrcv_PQgetResult 2 returned %p", res);
if (res != NULL)
{
PQclear(res);
@@ -809,6 +823,7 @@ libpqrcv_receive(WalReceiverConn *conn, char **buffer,
* we'd seen an error, or PGRES_COPY_IN) don't report an error
* here, but let callers deal with it.
*/
+elog(LOG, "libpqrcv_receive PQstatus(conn->streamConn) returned %d", PQstatus(conn->streamConn));
if (PQstatus(conn->streamConn) == CONNECTION_BAD)
return -1;
diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c
index b39fce8c23..6bbbf67344 100644
--- a/src/backend/replication/walreceiver.c
+++ b/src/backend/replication/walreceiver.c
@@ -425,6 +425,7 @@ WalReceiverMain(void)
bool endofwal = false;
pgsocket wait_fd = PGINVALID_SOCKET;
int rc;
+elog(LOG, "!!!wal receiver loop");
/*
* Exit walreceiver if we're not in recovery. This should not
@@ -447,6 +448,8 @@ WalReceiverMain(void)
/* See if we can read data immediately */
len = walrcv_receive(wrconn, &buf, &wait_fd);
+elog(LOG, "!!!walrcv_receive returned %d", len);
+
if (len != 0)
{
/*
@@ -514,6 +517,7 @@ WalReceiverMain(void)
wait_fd,
NAPTIME_PER_CYCLE,
WAIT_EVENT_WAL_RECEIVER_MAIN);
+elog(LOG, "!!!WaitLatchOrSocket returned %d", rc);
if (rc & WL_LATCH_SET)
{
ResetLatch(MyLatch);
@@ -583,6 +587,7 @@ WalReceiverMain(void)
XLogWalRcvSendHSFeedback(false);
}
}
+elog(LOG, "!!!wal receiver loop finished");
/*
* The backend finished streaming. Exit streaming COPY-mode from
diff --git a/src/backend/storage/ipc/latch.c b/src/backend/storage/ipc/latch.c
index 5bb609b368..11afc03072 100644
--- a/src/backend/storage/ipc/latch.c
+++ b/src/backend/storage/ipc/latch.c
@@ -529,12 +529,12 @@ WaitLatchOrSocket(Latch *latch, int wakeEvents, pgsocket sock,
if (wakeEvents & WL_SOCKET_MASK)
{
int ev;
-
ev = wakeEvents & WL_SOCKET_MASK;
AddWaitEventToSet(set, ev, sock, NULL, NULL);
}
-
+elog(LOG, "WaitLatchOrSocket before WaitEventSetWait");
rc = WaitEventSetWait(set, timeout, &event, 1, wait_event_info);
+elog(LOG, "WaitLatchOrSocket after WaitEventSetWait");
if (rc == 0)
ret |= WL_TIMEOUT;
@@ -1280,6 +1280,7 @@ WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event)
elog(ERROR, "failed to create event for socket: error code %d",
WSAGetLastError());
}
+elog(LOG, "WSAEventSelect event->fd: %d, flags: %x", event->fd, flags);
if (WSAEventSelect(event->fd, *handle, flags) != 0)
elog(ERROR, "failed to set up event for socket: error code %d",
WSAGetLastError());
@@ -1392,8 +1393,10 @@ WaitEventSetWait(WaitEventSet *set, long timeout,
* this file. If -1 is returned, a timeout has occurred, if 0 we have
* to retry, everything >= 1 is the number of returned events.
*/
+elog(LOG, "WaitEventSetWait before WaitEventSetWaitBlock");
rc = WaitEventSetWaitBlock(set, cur_timeout,
occurred_events, nevents);
+elog(LOG, "WaitEventSetWait after WaitEventSetWaitBlock");
if (set->latch)
{
@@ -1888,9 +1891,11 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
*
* Need to wait for ->nevents + 1, because signal handle is in [0].
*/
+elog(LOG, "WaitEventSetWaitBlock before WaitForMultipleObjects: %d", set->nevents);
rc = WaitForMultipleObjects(set->nevents + 1, set->handles, FALSE,
cur_timeout);
+elog(LOG, "WaitEventSetWaitBlock after WaitForMultipleObjects");
/* Check return code */
if (rc == WAIT_FAILED)
elog(ERROR, "WaitForMultipleObjects() failed: error code %lu",