From ed1c90cf2cd328be9fb775783961d29d5d450ff5 Mon Sep 17 00:00:00 2001
From: Ajin Cherian <ajinc@fast.au.fujitsu.com>
Date: Fri, 27 Aug 2021 06:46:33 -0400
Subject: [PATCH v3] Fix the random test failure in 001_rep_changes.

The check to test whether the subscription workers were restarting after a
change in the subscription was failing. The reason was that the test was
assuming the walsender started before it reaches the 'streaming' state and
The check to test whether the subscription workers were restarting after a
change in the subscription was failing. The reason was that the test was
assuming the walsender started before it reaches the 'streaming' state and
the walsender was exiting due to an error before that. Now, the walsender
was erroring out before reaching the 'streaming' state because it tries to
acquire the slot before the previous walsender has exited.

Reported-by: Michael Paquier, as per buildfarm
Author: Ajin Cherian
Reviewed-by: Amit Kapila
Backpatch-through: 10, where this test was introduced
Discussion: https://postgr.es/m/YRnhFxa9bo73wfpV@paquier.xyz
---
 src/test/subscription/t/001_rep_changes.pl      | 20 +++++++++++---------
 src/test/subscription/t/022_twophase_cascade.pl |  8 ++++----
 2 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/src/test/subscription/t/001_rep_changes.pl b/src/test/subscription/t/001_rep_changes.pl
index 0c84d87..f19e4d2 100644
--- a/src/test/subscription/t/001_rep_changes.pl
+++ b/src/test/subscription/t/001_rep_changes.pl
@@ -414,27 +414,29 @@ is( $result, qq(11.11|baz|1
 	'update works with dropped subscriber column');
 
 # check that change of connection string and/or publication list causes
-# restart of subscription workers. Not all of these are registered as tests
-# as we need to poll for a change but the test suite will fail none the less
-# when something goes wrong.
+# restart of subscription workers. We check the state along with application_name
+# to ensure that the walsender is (re)started.
+#
+# Not all of these are registered as tests as we need to poll for a change
+# but the test suite will fail none the less when something goes wrong.
 my $oldpid = $node_publisher->safe_psql('postgres',
-	"SELECT pid FROM pg_stat_replication WHERE application_name = 'tap_sub';"
+	"SELECT pid FROM pg_stat_replication WHERE application_name = 'tap_sub' AND state = 'streaming';"
 );
 $node_subscriber->safe_psql('postgres',
 	"ALTER SUBSCRIPTION tap_sub CONNECTION '$publisher_connstr sslmode=disable'"
 );
 $node_publisher->poll_query_until('postgres',
-	"SELECT pid != $oldpid FROM pg_stat_replication WHERE application_name = 'tap_sub';"
+	"SELECT pid != $oldpid FROM pg_stat_replication WHERE application_name = 'tap_sub' AND state = 'streaming';"
 ) or die "Timed out while waiting for apply to restart";
 
 $oldpid = $node_publisher->safe_psql('postgres',
-	"SELECT pid FROM pg_stat_replication WHERE application_name = 'tap_sub';"
+	"SELECT pid FROM pg_stat_replication WHERE application_name = 'tap_sub' AND state = 'streaming';;"
 );
 $node_subscriber->safe_psql('postgres',
 	"ALTER SUBSCRIPTION tap_sub SET PUBLICATION tap_pub_ins_only WITH (copy_data = false)"
 );
 $node_publisher->poll_query_until('postgres',
-	"SELECT pid != $oldpid FROM pg_stat_replication WHERE application_name = 'tap_sub';"
+	"SELECT pid != $oldpid FROM pg_stat_replication WHERE application_name = 'tap_sub' AND state = 'streaming';"
 ) or die "Timed out while waiting for apply to restart";
 
 $node_publisher->safe_psql('postgres',
@@ -483,12 +485,12 @@ is($result, qq(21|0|100), 'check replicated insert after alter publication');
 
 # check restart on rename
 $oldpid = $node_publisher->safe_psql('postgres',
-	"SELECT pid FROM pg_stat_replication WHERE application_name = 'tap_sub';"
+	"SELECT pid FROM pg_stat_replication WHERE application_name = 'tap_sub' AND state = 'streaming';"
 );
 $node_subscriber->safe_psql('postgres',
 	"ALTER SUBSCRIPTION tap_sub RENAME TO tap_sub_renamed");
 $node_publisher->poll_query_until('postgres',
-	"SELECT pid != $oldpid FROM pg_stat_replication WHERE application_name = 'tap_sub_renamed';"
+	"SELECT pid != $oldpid FROM pg_stat_replication WHERE application_name = 'tap_sub_renamed' AND state = 'streaming';"
 ) or die "Timed out while waiting for apply to restart";
 
 # check all the cleanup
diff --git a/src/test/subscription/t/022_twophase_cascade.pl b/src/test/subscription/t/022_twophase_cascade.pl
index adda874..a38c5e7 100644
--- a/src/test/subscription/t/022_twophase_cascade.pl
+++ b/src/test/subscription/t/022_twophase_cascade.pl
@@ -235,10 +235,10 @@ is($result, qq(21), 'Rows committed are present on subscriber C');
 
 my $oldpid_B = $node_A->safe_psql('postgres', "
 	SELECT pid FROM pg_stat_replication
-	WHERE application_name = '$appname_B';");
+	WHERE application_name = '$appname_B' AND state = 'streaming';");
 my $oldpid_C = $node_B->safe_psql('postgres', "
 	SELECT pid FROM pg_stat_replication
-	WHERE application_name = '$appname_C';");
+	WHERE application_name = '$appname_C' AND state = 'streaming';");
 
 # Setup logical replication (streaming = on)
 
@@ -253,11 +253,11 @@ $node_C->safe_psql('postgres', "
 
 $node_A->poll_query_until('postgres', "
 	SELECT pid != $oldpid_B FROM pg_stat_replication
-	WHERE application_name = '$appname_B';"
+	WHERE application_name = '$appname_B' AND state = 'streaming';"
 ) or die "Timed out while waiting for apply to restart";
 $node_B->poll_query_until('postgres', "
 	SELECT pid != $oldpid_C FROM pg_stat_replication
-	WHERE application_name = '$appname_C';"
+	WHERE application_name = '$appname_C' AND state = 'streaming';"
 ) or die "Timed out while waiting for apply to restart";
 
 ###############################
-- 
1.8.3.1

