From: pgsql-hackers-ow...@postgresql.org
> [mailto:pgsql-hackers-ow...@postgresql.org] On Behalf Of Tsunakawa, Takayuki
> Instead, I think we should fix the program to match the documented behavior.
> Otherwise, if the first database machine is down, libpq might wait for about
> 2 hours (depending on the OS's TCP keepalive setting), during which it times
> out after connect_timeout and does not attempt to connect to other hosts.
> 
> I'll add this item in the PostgreSQL 10 Open Items.

Please use the attached patch to fix the problem.  I confirmed that it works as
follows:

$ add "post_auth_delay = 10" in postgresql.conf on host1
$ start database servers on host1 and host2
$ psql -h host1,host2 -p 5432,5433 -d "dbname=postgres connect_timeout=3"
(psql connected to host2 after 3 seconds, which I checked with \conninfo)

Regards
Takayuki Tsunakawa

diff --git a/src/interfaces/libpq/fe-connect.c 
b/src/interfaces/libpq/fe-connect.c
index eb5aaf7098..d02e5201fa 100644
--- a/src/interfaces/libpq/fe-connect.c
+++ b/src/interfaces/libpq/fe-connect.c
@@ -1720,6 +1720,8 @@ connectDBComplete(PGconn *conn)
 {
        PostgresPollingStatusType flag = PGRES_POLLING_WRITING;
        time_t          finish_time = ((time_t) -1);
+       int                     ret = 0;
+       int                     timeout = 0;
 
        if (conn == NULL || conn->status == CONNECTION_BAD)
                return 0;
@@ -1729,8 +1731,7 @@ connectDBComplete(PGconn *conn)
         */
        if (conn->connect_timeout != NULL)
        {
-               int                     timeout = atoi(conn->connect_timeout);
-
+               timeout = atoi(conn->connect_timeout);
                if (timeout > 0)
                {
                        /*
@@ -1761,7 +1762,8 @@ connectDBComplete(PGconn *conn)
                                return 1;               /* success! */
 
                        case PGRES_POLLING_READING:
-                               if (pqWaitTimed(1, 0, conn, finish_time))
+                               ret = pqWaitTimed(1, 0, conn, finish_time);
+                               if (ret == -1)
                                {
                                        conn->status = CONNECTION_BAD;
                                        return 0;
@@ -1769,7 +1771,8 @@ connectDBComplete(PGconn *conn)
                                break;
 
                        case PGRES_POLLING_WRITING:
-                               if (pqWaitTimed(0, 1, conn, finish_time))
+                               ret = pqWaitTimed(0, 1, conn, finish_time);
+                               if (ret == -1)
                                {
                                        conn->status = CONNECTION_BAD;
                                        return 0;
@@ -1782,6 +1785,22 @@ connectDBComplete(PGconn *conn)
                                return 0;
                }
 
+               if (ret == 1)   /* connect_timeout elapsed */
+               {
+                       /* If there are no more hosts, return (the error 
message is already set) */
+                       if (++conn->whichhost >= conn->nconnhost)
+                       {
+                               conn->whichhost = 0;
+                               conn->status = CONNECTION_BAD;
+                               return 0;
+                       }
+                       /* Attempt connection to the next host, starting the 
connect_timeout timer */
+                       pqDropConnection(conn, true);
+                       conn->addr_cur = 
conn->connhost[conn->whichhost].addrlist;
+                       conn->status = CONNECTION_NEEDED;
+                       finish_time = time(NULL) + timeout;
+               }
+
                /*
                 * Now try to advance the state machine.
                 */
diff --git a/src/interfaces/libpq/fe-misc.c b/src/interfaces/libpq/fe-misc.c
index 756c6d7779..1d6ea93a0a 100644
--- a/src/interfaces/libpq/fe-misc.c
+++ b/src/interfaces/libpq/fe-misc.c
@@ -991,11 +991,9 @@ pqWait(int forRead, int forWrite, PGconn *conn)
 /*
  * pqWaitTimed: wait, but not past finish_time.
  *
- * If finish_time is exceeded then we return failure (EOF).  This is like
- * the response for a kernel exception because we don't want the caller
- * to try to read/write in that case.
- *
  * finish_time = ((time_t) -1) disables the wait limit.
+ *
+ * Returns -1 on failure, 0 if the socket is readable/writable, 1 if it timed 
out.
  */
 int
 pqWaitTimed(int forRead, int forWrite, PGconn *conn, time_t finish_time)
@@ -1005,13 +1003,13 @@ pqWaitTimed(int forRead, int forWrite, PGconn *conn, 
time_t finish_time)
        result = pqSocketCheck(conn, forRead, forWrite, finish_time);
 
        if (result < 0)
-               return EOF;                             /* errorMessage is 
already set */
+               return -1;                              /* errorMessage is 
already set */
 
        if (result == 0)
        {
                printfPQExpBuffer(&conn->errorMessage,
                                                  libpq_gettext("timeout 
expired\n"));
-               return EOF;
+               return 1;
        }
 
        return 0;
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to