Hello Alvaro,
Let me show you another related anomaly, which drongo kindly discovered
recently: [1]. That test failed with:
SELECT dblink_cancel_query('dtest1');
- dblink_cancel_query
----------------------
- OK
+ dblink_cancel_query
+--------------------------
+ cancel request timed out
(1 row)
I've managed to reproduce this when running 20 dblink tests in parallel,
and with extra logging added (see attached) I've got:
...
2024-08-28 10:17:12.949 PDT [8236:204] pg_regress/dblink LOG: statement: SELECT
dblink_cancel_query('dtest1');
!!!PQcancelPoll|8236| conn->status: 2
!!!PQcancelPoll|8236| conn->status: 3
!!!PQconnectPoll|8236| before pqPacketSend(..., &cancelpacket, ...)
!!!pqPacketSend|8236| before pqFlush
!!!pqsecure_raw_write|8236| could not send data to server: Socket is not
connected (0x00002749/10057)
!!!pqPacketSend|8236| after pqFlush, STATUS_OK
!!!PQconnectPoll|8236| after pqPacketSend, STATUS_OK
2024-08-28 10:17:12.950 PDT [5548:7] pg_regress LOG: statement: select * from foo
where f1 < 3
2024-08-28 10:17:12.951 PDT [8692:157] DEBUG: forked new backend, pid=4644
socket=5160
2024-08-28 10:17:12.973 PDT [4644:1] [unknown] LOG: connection received:
host=::1 port=55073
2024-08-28 10:17:12.973 PDT [4644:2] [unknown] LOG: !!!BackendInitialize|
before ProcessSSLStartup()
!!!PQcancelPoll|8236| conn->status: 4
!!!PQcancelPoll|8236| conn->status: 4
2024-08-28 10:17:24.060 PDT [1436:1] DEBUG: snapshot of 0+0 running transaction ids (lsn 0/194C4E0 oldest xid 780
latest complete 779 next xid 780)
!!!PQcancelPoll|8236| conn->status: 4
2024-08-28 10:17:42.951 PDT [4644:3] [unknown] LOG: !!!BackendInitialize|
ProcessSSLStartup() returned -1
2024-08-28 10:17:42.951 PDT [4644:4] [unknown] DEBUG: shmem_exit(0): 0
before_shmem_exit callbacks to make
...
Thus, pqsecure_raw_write(), called via PQcancelPoll() -> PQconnectPoll() ->
pqPacketSend() -> pqFlush) -> pqSendSome() -> pqsecure_write(), returned
the WSAENOTCONN error, but it wasn't noticed at upper levels.
Consequently, the cancelling backend waited for the cancel packet that was
never sent.
The first commit, that I could reproduce this test failure on, is 2466d6654.
[1]
https://buildfarm.postgresql.org/cgi-bin/show_log.pl?nm=drongo&dt=2024-08-26%2021%3A35%3A04
Best regards,
Alexander
diff --git a/src/backend/tcop/backend_startup.c b/src/backend/tcop/backend_startup.c
index cfa2755196..fc9abbfe53 100644
--- a/src/backend/tcop/backend_startup.c
+++ b/src/backend/tcop/backend_startup.c
@@ -252,8 +252,10 @@ BackendInitialize(ClientSocket *client_sock, CAC_state cac)
RegisterTimeout(STARTUP_PACKET_TIMEOUT, StartupPacketTimeoutHandler);
enable_timeout_after(STARTUP_PACKET_TIMEOUT, AuthenticationTimeout * 1000);
+elog(LOG, "!!!BackendInitialize| before ProcessSSLStartup()", status);
/* Handle direct SSL handshake */
status = ProcessSSLStartup(port);
+elog(LOG, "!!!BackendInitialize| ProcessSSLStartup() returned %d", status);
/*
* Receive the startup packet (which might turn out to be a cancel request
diff --git a/src/interfaces/libpq/fe-cancel.c b/src/interfaces/libpq/fe-cancel.c
index 213a6f43c2..8482dfa3e8 100644
--- a/src/interfaces/libpq/fe-cancel.c
+++ b/src/interfaces/libpq/fe-cancel.c
@@ -209,6 +209,7 @@ PQcancelPoll(PGcancelConn *cancelConn)
PGconn *conn = &cancelConn->conn;
int n;
+fprintf(stderr, "!!!PQcancelPoll|%d| conn->status: %d\n", getpid(), conn->status);
/*
* We leave most of the connection establishment to PQconnectPoll, since
* it's very similar to normal connection establishment. But once we get
diff --git a/src/interfaces/libpq/fe-connect.c b/src/interfaces/libpq/fe-connect.c
index 360d9a4547..72a31d29ed 100644
--- a/src/interfaces/libpq/fe-connect.c
+++ b/src/interfaces/libpq/fe-connect.c
@@ -3407,12 +3407,14 @@ keep_going: /* We will come back to here until there is
cancelpacket.cancelRequestCode = (MsgType) pg_hton32(CANCEL_REQUEST_CODE);
cancelpacket.backendPID = pg_hton32(conn->be_pid);
cancelpacket.cancelAuthCode = pg_hton32(conn->be_key);
+fprintf(stderr, "!!!PQconnectPoll|%d| before pqPacketSend(..., &cancelpacket, ...)\n", getpid());
if (pqPacketSend(conn, 0, &cancelpacket, packetlen) != STATUS_OK)
{
libpq_append_conn_error(conn, "could not send cancel packet: %s",
SOCK_STRERROR(SOCK_ERRNO, sebuf, sizeof(sebuf)));
goto error_return;
}
+fprintf(stderr, "!!!PQconnectPoll|%d| after pqPacketSend, STATUS_OK\n", getpid());
conn->status = CONNECTION_AWAITING_RESPONSE;
return PGRES_POLLING_READING;
}
@@ -5012,9 +5014,13 @@ pqPacketSend(PGconn *conn, char pack_type,
if (pqPutMsgEnd(conn))
return STATUS_ERROR;
+if (buf_len == 12)
+fprintf(stderr, "!!!pqPacketSend|%d| before pqFlush\n", getpid());
/* Flush to ensure backend gets it. */
if (pqFlush(conn))
return STATUS_ERROR;
+if (buf_len == 12)
+fprintf(stderr, "!!!pqPacketSend|%d| after pqFlush, STATUS_OK\n", getpid());
return STATUS_OK;
}
diff --git a/src/interfaces/libpq/fe-secure.c b/src/interfaces/libpq/fe-secure.c
index f628082337..1a85d9a40f 100644
--- a/src/interfaces/libpq/fe-secure.c
+++ b/src/interfaces/libpq/fe-secure.c
@@ -421,6 +421,7 @@ retry_masked:
sebuf, sizeof(sebuf)));
/* keep newline out of translated string */
strlcat(msgbuf, "\n", sizeof(msgbuf));
+fprintf(stderr, "!!!pqsecure_raw_write|%d| %s", getpid(), msgbuf);
conn->write_err_msg = strdup(msgbuf);
/* Now claim the write succeeded */
n = len;