From f555720f8910733d50793933d9fa377bf35378ce Mon Sep 17 00:00:00 2001
From: Euler Taveira <euler.taveira@enterprisedb.com>
Date: Mon, 25 Mar 2024 22:01:52 -0300
Subject: [PATCH v2 1/5] Improve the code that checks if the recovery is
 finishing

The recovery process has a window between the walreceiver shutdown and
the moment the pg_is_in_recovery function starts returning false. It
means that the pg_stat_wal_receiver checks can cause the server to finish
the recovery (even if it has already reached the recovery target). Since
pg_stat_wal_receiver is checked to verify that the primary is available,
PQping the primary server if the view does not return a row. If the
primary is up and running, it can indicate that the target server is
finishing the recovery process; hence, we shouldn't count it as an
attempt. This avoids premature failures on slow hosts.

While at it, increase the number of attempts (from 10 to 60). The wait
time is the same one that the pg_promote function uses by default.
---
 src/bin/pg_basebackup/pg_createsubscriber.c | 30 +++++++++++++--------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/src/bin/pg_basebackup/pg_createsubscriber.c b/src/bin/pg_basebackup/pg_createsubscriber.c
index 90cc580811..b4424861b2 100644
--- a/src/bin/pg_basebackup/pg_createsubscriber.c
+++ b/src/bin/pg_basebackup/pg_createsubscriber.c
@@ -30,6 +30,8 @@
 
 #define	DEFAULT_SUB_PORT	"50432"
 
+#define NUM_ATTEMPTS		60
+
 /* Command-line options */
 struct CreateSubscriberOptions
 {
@@ -93,7 +95,7 @@ static void pg_ctl_status(const char *pg_ctl_cmd, int rc);
 static void start_standby_server(const struct CreateSubscriberOptions *opt,
 								 bool restricted_access);
 static void stop_standby_server(const char *datadir);
-static void wait_for_end_recovery(const char *conninfo,
+static void wait_for_end_recovery(const struct LogicalRepInfo *dbinfo,
 								  const struct CreateSubscriberOptions *opt);
 static void create_publication(PGconn *conn, struct LogicalRepInfo *dbinfo);
 static void drop_publication(PGconn *conn, struct LogicalRepInfo *dbinfo);
@@ -1362,18 +1364,16 @@ stop_standby_server(const char *datadir)
  * the recovery process. By default, it waits forever.
  */
 static void
-wait_for_end_recovery(const char *conninfo, const struct CreateSubscriberOptions *opt)
+wait_for_end_recovery(const struct LogicalRepInfo *dbinfo, const struct CreateSubscriberOptions *opt)
 {
 	PGconn	   *conn;
 	int			status = POSTMASTER_STILL_STARTING;
 	int			timer = 0;
 	int			count = 0;		/* number of consecutive connection attempts */
 
-#define NUM_CONN_ATTEMPTS	10
-
 	pg_log_info("waiting for the target server to reach the consistent state");
 
-	conn = connect_database(conninfo, true);
+	conn = connect_database(dbinfo->subconninfo, true);
 
 	for (;;)
 	{
@@ -1392,16 +1392,24 @@ wait_for_end_recovery(const char *conninfo, const struct CreateSubscriberOptions
 		}
 
 		/*
-		 * If it is still in recovery, make sure the target server is
-		 * connected to the primary so it can receive the required WAL to
-		 * finish the recovery process. If it is disconnected try
-		 * NUM_CONN_ATTEMPTS in a row and bail out if not succeed.
+		 * If it is still in recovery, make sure the target server is connected
+		 * to the primary so it can receive the required WAL to finish the
+		 * recovery process. If the walreceiver process is not running it
+		 * should indicate that (i) the recovery is almost finished or (ii) the
+		 * primary is not running or is not accepting connections. It should
+		 * count as an attempt iff (ii) is true. In this case, try NUM_ATTEMPTS
+		 * in a row and bail out if it does not succeed.
 		 */
 		res = PQexec(conn,
 					 "SELECT 1 FROM pg_catalog.pg_stat_wal_receiver");
 		if (PQntuples(res) == 0)
 		{
-			if (++count > NUM_CONN_ATTEMPTS)
+			if (PQping(dbinfo->pubconninfo) != PQPING_OK)
+				count++;
+			else
+				count = 0;		/* reset counter if it connects again */
+
+			if (count > NUM_ATTEMPTS)
 			{
 				stop_standby_server(subscriber_dir);
 				pg_log_error("standby server disconnected from the primary");
@@ -2121,7 +2129,7 @@ main(int argc, char **argv)
 	start_standby_server(&opt, true);
 
 	/* Waiting the subscriber to be promoted */
-	wait_for_end_recovery(dbinfo[0].subconninfo, &opt);
+	wait_for_end_recovery(&dbinfo[0], &opt);
 
 	/*
 	 * Create the subscription for each database on subscriber. It does not
-- 
2.34.1

