From: [email protected]
> [mailto:[email protected]] On Behalf Of Tsunakawa, Takayuki
> Instead, I think we should fix the program to match the documented behavior.
> Otherwise, if the first database machine is down, libpq might wait for about
> 2 hours (depending on the OS's TCP keepalive setting), during which it times
> out after connect_timeout and does not attempt to connect to other hosts.
>
> I'll add this item in the PostgreSQL 10 Open Items.
Please use the attached patch to fix the problem. I confirmed that it works as
follows:
$ add "post_auth_delay = 10" in postgresql.conf on host1
$ start database servers on host1 and host2
$ psql -h host1,host2 -p 5432,5433 -d "dbname=postgres connect_timeout=3"
(psql connected to host2 after 3 seconds, which I checked with \conninfo)
Regards
Takayuki Tsunakawa
diff --git a/src/interfaces/libpq/fe-connect.c
b/src/interfaces/libpq/fe-connect.c
index eb5aaf7098..d02e5201fa 100644
--- a/src/interfaces/libpq/fe-connect.c
+++ b/src/interfaces/libpq/fe-connect.c
@@ -1720,6 +1720,8 @@ connectDBComplete(PGconn *conn)
{
PostgresPollingStatusType flag = PGRES_POLLING_WRITING;
time_t finish_time = ((time_t) -1);
+ int ret = 0;
+ int timeout = 0;
if (conn == NULL || conn->status == CONNECTION_BAD)
return 0;
@@ -1729,8 +1731,7 @@ connectDBComplete(PGconn *conn)
*/
if (conn->connect_timeout != NULL)
{
- int timeout = atoi(conn->connect_timeout);
-
+ timeout = atoi(conn->connect_timeout);
if (timeout > 0)
{
/*
@@ -1761,7 +1762,8 @@ connectDBComplete(PGconn *conn)
return 1; /* success! */
case PGRES_POLLING_READING:
- if (pqWaitTimed(1, 0, conn, finish_time))
+ ret = pqWaitTimed(1, 0, conn, finish_time);
+ if (ret == -1)
{
conn->status = CONNECTION_BAD;
return 0;
@@ -1769,7 +1771,8 @@ connectDBComplete(PGconn *conn)
break;
case PGRES_POLLING_WRITING:
- if (pqWaitTimed(0, 1, conn, finish_time))
+ ret = pqWaitTimed(0, 1, conn, finish_time);
+ if (ret == -1)
{
conn->status = CONNECTION_BAD;
return 0;
@@ -1782,6 +1785,22 @@ connectDBComplete(PGconn *conn)
return 0;
}
+ if (ret == 1) /* connect_timeout elapsed */
+ {
+ /* If there are no more hosts, return (the error
message is already set) */
+ if (++conn->whichhost >= conn->nconnhost)
+ {
+ conn->whichhost = 0;
+ conn->status = CONNECTION_BAD;
+ return 0;
+ }
+ /* Attempt connection to the next host, starting the
connect_timeout timer */
+ pqDropConnection(conn, true);
+ conn->addr_cur =
conn->connhost[conn->whichhost].addrlist;
+ conn->status = CONNECTION_NEEDED;
+ finish_time = time(NULL) + timeout;
+ }
+
/*
* Now try to advance the state machine.
*/
diff --git a/src/interfaces/libpq/fe-misc.c b/src/interfaces/libpq/fe-misc.c
index 756c6d7779..1d6ea93a0a 100644
--- a/src/interfaces/libpq/fe-misc.c
+++ b/src/interfaces/libpq/fe-misc.c
@@ -991,11 +991,9 @@ pqWait(int forRead, int forWrite, PGconn *conn)
/*
* pqWaitTimed: wait, but not past finish_time.
*
- * If finish_time is exceeded then we return failure (EOF). This is like
- * the response for a kernel exception because we don't want the caller
- * to try to read/write in that case.
- *
* finish_time = ((time_t) -1) disables the wait limit.
+ *
+ * Returns -1 on failure, 0 if the socket is readable/writable, 1 if it timed
out.
*/
int
pqWaitTimed(int forRead, int forWrite, PGconn *conn, time_t finish_time)
@@ -1005,13 +1003,13 @@ pqWaitTimed(int forRead, int forWrite, PGconn *conn,
time_t finish_time)
result = pqSocketCheck(conn, forRead, forWrite, finish_time);
if (result < 0)
- return EOF; /* errorMessage is
already set */
+ return -1; /* errorMessage is
already set */
if (result == 0)
{
printfPQExpBuffer(&conn->errorMessage,
libpq_gettext("timeout
expired\n"));
- return EOF;
+ return 1;
}
return 0;
--
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers