From cc911b79ed1bbcf1a330977b69cc4fe291c2a3a0 Mon Sep 17 00:00:00 2001
From: Bharath Rupireddy <bharath.rupireddy@enterprisedb.com>
Date: Fri, 25 Sep 2020 10:11:59 +0530
Subject: [PATCH] Retry Cached Remote Connections For postgres_fdw

Remote connections are cached for postgres_fdw in the local
backend. There are high chances that the remote backend may
not be available i.e. it can get killed, the subsequent foreign
queries from local backend/session that use cached connection
will fail as the remote backend would have become unavailable.

This patch proposes a retry mechanism. Local backend/session
uses cached connection and if it receives connection bad error,
it deletes the cached entry and retry to get a new connection.
This retry happens only at the start of remote xact.
---
 contrib/postgres_fdw/connection.c             | 62 ++++++++++++++++---
 .../postgres_fdw/expected/postgres_fdw.out    | 55 ++++++++++++++++
 contrib/postgres_fdw/sql/postgres_fdw.sql     | 31 ++++++++++
 3 files changed, 140 insertions(+), 8 deletions(-)

diff --git a/contrib/postgres_fdw/connection.c b/contrib/postgres_fdw/connection.c
index 08daf26fdf..ab8d505d64 100644
--- a/contrib/postgres_fdw/connection.c
+++ b/contrib/postgres_fdw/connection.c
@@ -170,12 +170,6 @@ GetConnection(UserMapping *user, bool will_prep_stmt)
 		disconnect_pg_server(entry);
 	}
 
-	/*
-	 * We don't check the health of cached connection here, because it would
-	 * require some overhead.  Broken connection will be detected when the
-	 * connection is actually used.
-	 */
-
 	/*
 	 * If cache entry doesn't have a connection, we have to establish a new
 	 * connection.  (If connect_pg_server throws an error, the cache entry
@@ -206,9 +200,61 @@ GetConnection(UserMapping *user, bool will_prep_stmt)
 	}
 
 	/*
-	 * Start a new transaction or subtransaction if needed.
+	 * We check the health of the cached connection here, while the remote
+	 * xact is going to begin.  Broken connection will be detected and the
+	 * connection is retried. We do this only once during each begin remote
+	 * xact call, if the connection is still broken after the retry attempt,
+	 * then the query will fail.
 	 */
-	begin_remote_xact(entry);
+	PG_TRY();
+	{
+		/* Start a new transaction or subtransaction if needed. */
+		begin_remote_xact(entry);
+	}
+	PG_CATCH();
+	{
+		if (entry->conn &&
+			PQstatus(entry->conn) == CONNECTION_BAD &&
+			entry->xact_depth <= 0)
+		{
+			/* server object will be used for log messages. */
+			ForeignServer *server = GetForeignServer(user->serverid);
+
+			/*
+			 * xact_depth <=0, which means we are retrying only at
+			 * the beginning of remote xact. This is sensible,
+			 * since we can't retry in the middle of a remote xact.
+			 */
+			elog(DEBUG3, "postgres_fdw connection %p to the server \"%s\" is broken, retrying to connect",
+				entry->conn, server->servername);
+
+			disconnect_pg_server(entry);
+
+			/*
+			 * Reset the transient state fields that may have been
+			 * modified in the first attempt.
+			 */
+			entry->changing_xact_state = false;
+
+			entry->conn = connect_pg_server(server, user);
+
+			if (entry->conn && PQstatus(entry->conn) == CONNECTION_OK)
+				elog(DEBUG3, "postgres_fdw connection retry to the server  \"%s\" is successful",
+					server->servername);
+			else
+				ereport(ERROR,
+					(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
+					 errmsg("could not connect to server \"%s\"",
+							server->servername),
+					 errdetail_internal("%s", pchomp(PQerrorMessage(entry->conn)))));
+
+			/* Retry starting a new transaction. */
+			begin_remote_xact(entry);
+		}
+		else
+			PG_RE_THROW();
+	}
+	PG_END_TRY();
 
 	/* Remember if caller will prepare statements */
 	entry->have_prep_stmt |= will_prep_stmt;
diff --git a/contrib/postgres_fdw/expected/postgres_fdw.out b/contrib/postgres_fdw/expected/postgres_fdw.out
index 10e23d02ed..d72b11a3ac 100644
--- a/contrib/postgres_fdw/expected/postgres_fdw.out
+++ b/contrib/postgres_fdw/expected/postgres_fdw.out
@@ -8987,3 +8987,58 @@ PREPARE TRANSACTION 'fdw_tpc';
 ERROR:  cannot PREPARE a transaction that has operated on postgres_fdw foreign tables
 ROLLBACK;
 WARNING:  there is no transaction in progress
+-- Retry cached connections at the beginning of the remote xact
+-- in case remote backend is killed.
+-- Let's use a different application name for remote connection,
+-- so that this test will not kill other backends wrongly.
+ALTER SERVER loopback OPTIONS (application_name 'fdw_retry_check');
+-- Generate a connection to remote. Local backend will cache it.
+SELECT * FROM ft1 LIMIT 1;
+  c1  | c2 | c3 | c4 | c5 | c6 |     c7     | c8 
+------+----+----+----+----+----+------------+----
+ 1111 |  2 |    |    |    |    | ft1        | 
+(1 row)
+
+-- Retrieve pid of remote backend with application name fdw_retry_check
+-- and kill it intentionally here. Note that, local backend still has
+-- the remote connection/backend info in it's cache.
+SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE
+backend_type = 'client backend' AND application_name = 'fdw_retry_check';
+ pg_terminate_backend 
+----------------------
+ t
+(1 row)
+
+-- Next query using the same foreign server should succeed if connection
+-- retry works.
+SELECT * FROM ft1 LIMIT 1;
+  c1  | c2 | c3 | c4 | c5 | c6 |     c7     | c8 
+------+----+----+----+----+----+------------+----
+ 1111 |  2 |    |    |    |    | ft1        | 
+(1 row)
+
+-- Subtransaction - remote backend is killed in the middle of a remote
+-- xact, and we don't do retry connection, hence the subsequent query
+-- using the same foreign server should fail.
+BEGIN;
+DECLARE c CURSOR FOR SELECT * FROM ft1 LIMIT 1;
+FETCH c;
+  c1  | c2 | c3 | c4 | c5 | c6 |     c7     | c8 
+------+----+----+----+----+----+------------+----
+ 1111 |  2 |    |    |    |    | ft1        | 
+(1 row)
+
+SAVEPOINT s;
+SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE
+backend_type = 'client backend' AND application_name = 'fdw_retry_check';
+ pg_terminate_backend 
+----------------------
+ t
+(1 row)
+
+SELECT * FROM ft1 LIMIT 1;    -- should fail
+ERROR:  server closed the connection unexpectedly
+	This probably means the server terminated abnormally
+	before or while processing the request.
+CONTEXT:  remote SQL command: SAVEPOINT s2
+COMMIT;
diff --git a/contrib/postgres_fdw/sql/postgres_fdw.sql b/contrib/postgres_fdw/sql/postgres_fdw.sql
index 78156d10b4..7a1b4c78bd 100644
--- a/contrib/postgres_fdw/sql/postgres_fdw.sql
+++ b/contrib/postgres_fdw/sql/postgres_fdw.sql
@@ -2653,3 +2653,34 @@ SELECT count(*) FROM ft1;
 -- error here
 PREPARE TRANSACTION 'fdw_tpc';
 ROLLBACK;
+
+-- Retry cached connections at the beginning of the remote xact
+-- in case remote backend is killed.
+-- Let's use a different application name for remote connection,
+-- so that this test will not kill other backends wrongly.
+ALTER SERVER loopback OPTIONS (application_name 'fdw_retry_check');
+
+-- Generate a connection to remote. Local backend will cache it.
+SELECT * FROM ft1 LIMIT 1;
+
+-- Retrieve pid of remote backend with application name fdw_retry_check
+-- and kill it intentionally here. Note that, local backend still has
+-- the remote connection/backend info in it's cache.
+SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE
+backend_type = 'client backend' AND application_name = 'fdw_retry_check';
+
+-- Next query using the same foreign server should succeed if connection
+-- retry works.
+SELECT * FROM ft1 LIMIT 1;
+
+-- Subtransaction - remote backend is killed in the middle of a remote
+-- xact, and we don't do retry connection, hence the subsequent query
+-- using the same foreign server should fail.
+BEGIN;
+DECLARE c CURSOR FOR SELECT * FROM ft1 LIMIT 1;
+FETCH c;
+SAVEPOINT s;
+SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE
+backend_type = 'client backend' AND application_name = 'fdw_retry_check';
+SELECT * FROM ft1 LIMIT 1;    -- should fail
+COMMIT;
-- 
2.25.1

