Hello Andres, 11.02.2022 05:22, Andres Freund wrote: > Over in another thread I made some wild unsubstantiated guesses that the > windows issues could have been made much more likely by a somewhat odd bit of > code in PQisBusy(): > > https://postgr.es/m/1959196.1644544971%40sss.pgh.pa.us > > Alexander, any chance you'd try if that changes the likelihood of the problem > occurring, without any other fixes / reverts applied? Unfortunately I haven't seen an improvement for the test in question. With the PQisBusy-fix.patch from [1] and without any other changes on the master branch (52377bb8) it still fails (on iterations 13, 5, 2, 2 for me). The diagnostic logging (in attachment) added: 2022-02-12 01:04:16.341 PST [4912] LOG: libpqrcv_receive: PQgetCopyData returned 0 2022-02-12 01:04:16.341 PST [4912] LOG: libpqrcv_receive: PQgetCopyData 2 returned -1 2022-02-12 01:04:16.341 PST [4912] LOG: libpqrcv_receive: end-of-streaming or error: -1 2022-02-12 01:04:16.341 PST [4912] LOG: libpqrcv_PQgetResult: streamConn->asyncStatus: 1 && streamConn->status: 0 2022-02-12 01:04:16.341 PST [4912] LOG: libpqrcv_receive libpqrcv_PQgetResult returned 10551584, 1 2022-02-12 01:04:16.341 PST [4912] LOG: libpqrcv_receive libpqrcv_PQgetResult PGRES_COMMAND_OK 2022-02-12 01:04:16.341 PST [4912] LOG: libpqrcv_PQgetResult: streamConn->asyncStatus: 1 && streamConn->status: 0 2022-02-12 01:04:16.341 PST [4912] LOG: libpqrcv_PQgetResult loop before WaitLatchOrSocket 2022-02-12 01:04:16.341 PST [4912] LOG: WSAEventSelect event->fd: 948, flags: 21 2022-02-12 01:04:16.341 PST [4912] LOG: WaitLatchOrSocket before WaitEventSetWait 2022-02-12 01:04:16.341 PST [4912] LOG: WaitEventSetWait before WaitEventSetWaitBlock 2022-02-12 01:04:16.341 PST [4912] LOG: WaitEventSetWaitBlock before WaitForMultipleObjects: 3 ... shows that before the doomed WaitForMultipleObjects() call the field conn->status is 0 (CONNECTION_OK).
[1] https://www.postgresql.org/message-id/2187263.1644616494%40sss.pgh.pa.us Best regards, Alexander
diff --git a/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c b/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c index 0d89db4e6a..0776885306 100644 --- a/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c +++ b/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c @@ -34,6 +34,9 @@ #include "utils/pg_lsn.h" #include "utils/tuplestore.h" +#undef ENABLE_THREAD_SAFETY +#include "libpq-int.h" + PG_MODULE_MAGIC; void _PG_init(void); @@ -696,10 +699,13 @@ libpqrcv_PQgetResult(PGconn *streamConn) * Collect data until PQgetResult is ready to get the result without * blocking. */ +elog(LOG, "libpqrcv_PQgetResult: streamConn->asyncStatus: %d && streamConn->status: %d", streamConn->asyncStatus, streamConn->status); + while (PQisBusy(streamConn)) { int rc; +elog(LOG, "libpqrcv_PQgetResult loop before WaitLatchOrSocket"); /* * We don't need to break down the sleep into smaller increments, * since we'll get interrupted by signals and can handle any @@ -711,6 +717,7 @@ libpqrcv_PQgetResult(PGconn *streamConn) PQsocket(streamConn), 0, WAIT_EVENT_LIBPQWALRECEIVER_RECEIVE); +elog(LOG, "libpqrcv_PQgetResult loop after WaitLatchOrSocket: %d"); /* Interrupted? */ if (rc & WL_LATCH_SET) @@ -771,6 +778,7 @@ libpqrcv_receive(WalReceiverConn *conn, char **buffer, /* Try to receive a CopyData message */ rawlen = PQgetCopyData(conn->streamConn, &conn->recvBuf, 1); +elog(LOG, "libpqrcv_receive: PQgetCopyData returned %d", rawlen); if (rawlen == 0) { /* Try consuming some data. */ @@ -782,6 +790,7 @@ libpqrcv_receive(WalReceiverConn *conn, char **buffer, /* Now that we've consumed some input, try again */ rawlen = PQgetCopyData(conn->streamConn, &conn->recvBuf, 1); +elog(LOG, "libpqrcv_receive: PQgetCopyData 2 returned %d", rawlen); if (rawlen == 0) { /* Tell caller to try again when our socket is ready. */ @@ -791,15 +800,20 @@ libpqrcv_receive(WalReceiverConn *conn, char **buffer, } if (rawlen == -1) /* end-of-streaming or error */ { +elog(LOG, "libpqrcv_receive: end-of-streaming or error: %d", rawlen); + PGresult *res; res = libpqrcv_PQgetResult(conn->streamConn); +elog(LOG, "libpqrcv_receive libpqrcv_PQgetResult returned %d, %d", res, PQresultStatus(res)); if (PQresultStatus(res) == PGRES_COMMAND_OK) { +elog(LOG, "libpqrcv_receive libpqrcv_PQgetResult PGRES_COMMAND_OK"); PQclear(res); /* Verify that there are no more results. */ res = libpqrcv_PQgetResult(conn->streamConn); +elog(LOG, "libpqrcv_receive libpqrcv_PQgetResult 2 returned %p", res); if (res != NULL) { PQclear(res); @@ -809,6 +823,7 @@ libpqrcv_receive(WalReceiverConn *conn, char **buffer, * we'd seen an error, or PGRES_COPY_IN) don't report an error * here, but let callers deal with it. */ +elog(LOG, "libpqrcv_receive PQstatus(conn->streamConn) returned %d", PQstatus(conn->streamConn)); if (PQstatus(conn->streamConn) == CONNECTION_BAD) return -1; diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c index b39fce8c23..6bbbf67344 100644 --- a/src/backend/replication/walreceiver.c +++ b/src/backend/replication/walreceiver.c @@ -425,6 +425,7 @@ WalReceiverMain(void) bool endofwal = false; pgsocket wait_fd = PGINVALID_SOCKET; int rc; +elog(LOG, "!!!wal receiver loop"); /* * Exit walreceiver if we're not in recovery. This should not @@ -447,6 +448,8 @@ WalReceiverMain(void) /* See if we can read data immediately */ len = walrcv_receive(wrconn, &buf, &wait_fd); +elog(LOG, "!!!walrcv_receive returned %d", len); + if (len != 0) { /* @@ -514,6 +517,7 @@ WalReceiverMain(void) wait_fd, NAPTIME_PER_CYCLE, WAIT_EVENT_WAL_RECEIVER_MAIN); +elog(LOG, "!!!WaitLatchOrSocket returned %d", rc); if (rc & WL_LATCH_SET) { ResetLatch(MyLatch); @@ -583,6 +587,7 @@ WalReceiverMain(void) XLogWalRcvSendHSFeedback(false); } } +elog(LOG, "!!!wal receiver loop finished"); /* * The backend finished streaming. Exit streaming COPY-mode from diff --git a/src/backend/storage/ipc/latch.c b/src/backend/storage/ipc/latch.c index 5bb609b368..11afc03072 100644 --- a/src/backend/storage/ipc/latch.c +++ b/src/backend/storage/ipc/latch.c @@ -529,12 +529,12 @@ WaitLatchOrSocket(Latch *latch, int wakeEvents, pgsocket sock, if (wakeEvents & WL_SOCKET_MASK) { int ev; - ev = wakeEvents & WL_SOCKET_MASK; AddWaitEventToSet(set, ev, sock, NULL, NULL); } - +elog(LOG, "WaitLatchOrSocket before WaitEventSetWait"); rc = WaitEventSetWait(set, timeout, &event, 1, wait_event_info); +elog(LOG, "WaitLatchOrSocket after WaitEventSetWait"); if (rc == 0) ret |= WL_TIMEOUT; @@ -1280,6 +1280,7 @@ WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event) elog(ERROR, "failed to create event for socket: error code %d", WSAGetLastError()); } +elog(LOG, "WSAEventSelect event->fd: %d, flags: %x", event->fd, flags); if (WSAEventSelect(event->fd, *handle, flags) != 0) elog(ERROR, "failed to set up event for socket: error code %d", WSAGetLastError()); @@ -1392,8 +1393,10 @@ WaitEventSetWait(WaitEventSet *set, long timeout, * this file. If -1 is returned, a timeout has occurred, if 0 we have * to retry, everything >= 1 is the number of returned events. */ +elog(LOG, "WaitEventSetWait before WaitEventSetWaitBlock"); rc = WaitEventSetWaitBlock(set, cur_timeout, occurred_events, nevents); +elog(LOG, "WaitEventSetWait after WaitEventSetWaitBlock"); if (set->latch) { @@ -1888,9 +1891,11 @@ WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout, * * Need to wait for ->nevents + 1, because signal handle is in [0]. */ +elog(LOG, "WaitEventSetWaitBlock before WaitForMultipleObjects: %d", set->nevents); rc = WaitForMultipleObjects(set->nevents + 1, set->handles, FALSE, cur_timeout); +elog(LOG, "WaitEventSetWaitBlock after WaitForMultipleObjects"); /* Check return code */ if (rc == WAIT_FAILED) elog(ERROR, "WaitForMultipleObjects() failed: error code %lu",