Hi hackers,

Please find attached a patch proposal to $SUBJECT.

Indeed, we have seen occurrences in [1] where some slots were
not invalidated (while we expected vacuum to remove dead rows,
leading to slot invalidation on the standby).

Though we don't have strong evidence that this
was due to transactions holding back the global xmin (as vacuum did
not run in verbose mode), suspicion is high enough, as Tom pointed
out that the test is broken on its face (see [1]).

The proposed patch:

- Set autovacuum = off on the primary (as autovacuum is the usual suspect
for holding back the global xmin).
- Ensure that vacuum is able to remove dead rows before launching
the slots invalidation tests.
- If after 10 attempts the vacuum is still not able to remove the dead
rows then the slots invalidation tests are skipped: that should be pretty
rare, as currently the majority of the tests are green (with only one attempt).

While at it, the patch also addresses the nitpicks mentioned by Robert in [2].

[1]: 
https://www.postgresql.org/message-id/flat/OSZPR01MB6310CFFD7D0DCD60A05DB1C3FD4A9%40OSZPR01MB6310.jpnprd01.prod.outlook.com#71898e088d2a57564a1bd9c41f3e6f36
[2]: 
https://www.postgresql.org/message-id/CA%2BTgmobHGpU2ZkChgKifGDLaf_%2BmFA7njEpeTjfyNf_msCZYew%40mail.gmail.com

Regards,

--
Bertrand Drouvot
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com
From a5c6ef80cae1fb7606ff46679422238fdceb5cc8 Mon Sep 17 00:00:00 2001
From: Bertrand Drouvot <bertranddrouvot...@gmail.com>
Date: Tue, 30 May 2023 07:54:02 +0000
Subject: [PATCH v1] Ensure vacuum did remove dead rows in
 035_standby_logical_decoding

We want to ensure that vacuum was able to remove dead rows (i.e. that no other
transactions were holding back the global xmin) before testing for slot
invalidation on the standby.
---
 .../t/035_standby_logical_decoding.pl         | 263 ++++++++++--------
 1 file changed, 150 insertions(+), 113 deletions(-)
 100.0% src/test/recovery/t/

diff --git a/src/test/recovery/t/035_standby_logical_decoding.pl 
b/src/test/recovery/t/035_standby_logical_decoding.pl
index 64beec4bd3..d26d9bcace 100644
--- a/src/test/recovery/t/035_standby_logical_decoding.pl
+++ b/src/test/recovery/t/035_standby_logical_decoding.pl
@@ -9,6 +9,7 @@ use warnings;
 use PostgreSQL::Test::Cluster;
 use PostgreSQL::Test::Utils;
 use Test::More;
+use Time::HiRes qw(usleep);
 
 my ($stdin, $stdout, $stderr,
        $cascading_stdout, $cascading_stderr, $subscriber_stdin,
@@ -23,6 +24,9 @@ my $node_subscriber = 
PostgreSQL::Test::Cluster->new('subscriber');
 my $default_timeout = $PostgreSQL::Test::Utils::timeout_default;
 my $psql_timeout = IPC::Run::timer($default_timeout);
 my $res;
+my ($vacret, $vacstdout, $vacstderr) = ('', '', '');
+my $nb_vac = 0;
+my $vac_attempts = 10;
 
 # Name for the physical slot on primary
 my $primary_slotname = 'primary_physical';
@@ -185,7 +189,7 @@ sub change_hot_standby_feedback_and_wait_for_xmins
 # Check conflicting status in pg_replication_slots.
 sub check_slots_conflicting_status
 {
-       my ($conflicting) = @_;
+       my ($conflicting, $details) = @_;
 
        if ($conflicting)
        {
@@ -193,7 +197,7 @@ sub check_slots_conflicting_status
                        'postgres', qq(
                                 select bool_and(conflicting) from 
pg_replication_slots;));
 
-               is($res, 't', "Logical slots are reported as conflicting");
+               is($res, 't', "logical slots are reported as conflicting 
$details");
        }
        else
        {
@@ -201,7 +205,7 @@ sub check_slots_conflicting_status
                        'postgres', qq(
                                select bool_or(conflicting) from 
pg_replication_slots;));
 
-               is($res, 'f', "Logical slots are reported as non conflicting");
+               is($res, 'f', "logical slots are reported as non conflicting 
$details");
        }
 }
 
@@ -256,6 +260,29 @@ sub check_for_invalidation
        ) or die "Timed out waiting confl_active_logicalslot to be updated";
 }
 
+# Try ($vac_attempts times max) to vacuum until it is able to remove dead rows.
+sub vacuum_until_can_remove
+{
+       my ($vac_option, $sql, $to_vac) = @_;
+
+       ($vacret, $vacstdout, $vacstderr) = ('', '', '');
+       $nb_vac = 0;
+
+       while ($nb_vac <= $vac_attempts)
+       {
+               last if ($vacstderr =~ /0 dead row versions cannot be removed 
yet/ ||
+                                $vacstderr =~ /0 are dead but not yet 
removable/);
+               # This should trigger the conflict
+               ($vacret, $vacstdout, $vacstderr) = $node_primary->psql(
+                       'testdb', qq[
+                         $sql
+                         VACUUM $vac_option verbose $to_vac;
+                         INSERT INTO flush_wal DEFAULT VALUES; -- see create 
table flush_wal
+               ]);
+               $nb_vac++;
+               usleep(100_000);
+       }
+}
 ########################
 # Initialize primary node
 ########################
@@ -266,6 +293,7 @@ $node_primary->append_conf(
 wal_level = 'logical'
 max_replication_slots = 4
 max_wal_senders = 4
+autovacuum = off
 });
 $node_primary->dump_info;
 $node_primary->start;
@@ -485,30 +513,31 @@ $node_subscriber->stop;
 reactive_slots_change_hfs_and_wait_for_xmins('behaves_ok_', 'vacuum_full_',
        0, 1);
 
-# This should trigger the conflict
-$node_primary->safe_psql(
-       'testdb', qq[
-  CREATE TABLE conflict_test(x integer, y text);
-  DROP TABLE conflict_test;
-  VACUUM full pg_class;
-  INSERT INTO flush_wal DEFAULT VALUES; -- see create table flush_wal
-]);
+vacuum_until_can_remove('full', 'CREATE TABLE conflict_test(x integer, y text);
+                                                                DROP TABLE 
conflict_test;', 'pg_class');
 
 $node_primary->wait_for_replay_catchup($node_standby);
 
-# Check invalidation in the logfile and in pg_stat_database_conflicts
-check_for_invalidation('vacuum_full_', 1, 'with vacuum FULL on pg_class');
+SKIP:
+{
+       # Checking for invalidation only if vacuum did remove dead rows
+       skip 'vacuum full did not remove dead rows', 1
+               unless ($nb_vac < $vac_attempts);
 
-# Verify slots are reported as conflicting in pg_replication_slots
-check_slots_conflicting_status(1);
+       # Check invalidation in the logfile and in pg_stat_database_conflicts
+       check_for_invalidation('vacuum_full_', 1, 'with vacuum FULL on 
pg_class');
 
-$handle =
-  make_slot_active($node_standby, 'vacuum_full_', 0, \$stdout, \$stderr);
+       # Verify slots are reported as conflicting in pg_replication_slots
+       check_slots_conflicting_status(1, 'after vacuum FULL on pg_class');
 
-# We are not able to read from the slot as it has been invalidated
-check_pg_recvlogical_stderr($handle,
-       "can no longer get changes from replication slot 
\"vacuum_full_activeslot\""
-);
+       $handle =
+         make_slot_active($node_standby, 'vacuum_full_', 0, \$stdout, 
\$stderr);
+
+       # We are not able to read from the slot as it has been invalidated
+       check_pg_recvlogical_stderr($handle,
+               "can no longer get changes from replication slot 
\"vacuum_full_activeslot\""
+       );
+}
 
 # Turn hot_standby_feedback back on
 change_hot_standby_feedback_and_wait_for_xmins(1, 1);
@@ -518,45 +547,49 @@ change_hot_standby_feedback_and_wait_for_xmins(1, 1);
 ##################################################
 $node_standby->restart;
 
-# Verify slots are reported as conflicting in pg_replication_slots
-check_slots_conflicting_status(1);
+SKIP:
+{
+       skip 'vacuum full on pg_class did not remove dead rows', 1
+               unless ($nb_vac < $vac_attempts);
+       # Verify slots are reported as conflicting in pg_replication_slots
+       check_slots_conflicting_status(1, 'after a restart');
 
 ##################################################
 # Verify that invalidated logical slots do not lead to retaining WAL.
 ##################################################
 
-# Get the restart_lsn from an invalidated slot
-my $restart_lsn = $node_standby->safe_psql('postgres',
-       "SELECT restart_lsn from pg_replication_slots WHERE slot_name = 
'vacuum_full_activeslot' and conflicting is true;"
-);
-
-chomp($restart_lsn);
+       # Get the restart_lsn from an invalidated slot
+       my $restart_lsn = $node_standby->safe_psql('postgres',
+               "SELECT restart_lsn from pg_replication_slots WHERE slot_name = 
'vacuum_full_activeslot' and conflicting is true;"
+       );
 
-# As pg_walfile_name() can not be executed on the standby,
-# get the WAL file name associated to this lsn from the primary
-my $walfile_name = $node_primary->safe_psql('postgres',
-       "SELECT pg_walfile_name('$restart_lsn')");
+       chomp($restart_lsn);
 
-chomp($walfile_name);
+       # As pg_walfile_name() can not be executed on the standby,
+       # get the WAL file name associated to this lsn from the primary
+       my $walfile_name = $node_primary->safe_psql('postgres',
+               "SELECT pg_walfile_name('$restart_lsn')");
 
-# Generate some activity and switch WAL file on the primary
-$node_primary->safe_psql(
-       'postgres', "create table retain_test(a int);
-                                                                        select 
pg_switch_wal();
-                                                                        insert 
into retain_test values(1);
-                                                                        
checkpoint;");
+       chomp($walfile_name);
 
-# Wait for the standby to catch up
-$node_primary->wait_for_replay_catchup($node_standby);
+       # Generate some activity and switch WAL file on the primary
+       $node_primary->safe_psql(
+               'postgres', "create table retain_test(a int);
+                                                                               
 select pg_switch_wal();
+                                                                               
 insert into retain_test values(1);
+                                                                               
 checkpoint;");
 
-# Request a checkpoint on the standby to trigger the WAL file(s) removal
-$node_standby->safe_psql('postgres', 'checkpoint;');
+       # Wait for the standby to catch up
+       $node_primary->wait_for_replay_catchup($node_standby);
 
-# Verify that the WAL file has not been retained on the standby
-my $standby_walfile = $node_standby->data_dir . '/pg_wal/' . $walfile_name;
-ok(!-f "$standby_walfile",
-       "invalidated logical slots do not lead to retaining WAL");
+       # Request a checkpoint on the standby to trigger the WAL file(s) removal
+       $node_standby->safe_psql('postgres', 'checkpoint;');
 
+       # Verify that the WAL file has not been retained on the standby
+       my $standby_walfile = $node_standby->data_dir . '/pg_wal/' . 
$walfile_name;
+       ok(!-f "$standby_walfile",
+               "invalidated logical slots do not lead to retaining WAL");
+}
 ##################################################
 # Recovery conflict: Invalidate conflicting slots, including in-use slots
 # Scenario 2: conflict due to row removal with hot_standby_feedback off.
@@ -571,29 +604,30 @@ 
reactive_slots_change_hfs_and_wait_for_xmins('vacuum_full_', 'row_removal_',
        0, 1);
 
 # This should trigger the conflict
-$node_primary->safe_psql(
-       'testdb', qq[
-  CREATE TABLE conflict_test(x integer, y text);
-  DROP TABLE conflict_test;
-  VACUUM pg_class;
-  INSERT INTO flush_wal DEFAULT VALUES; -- see create table flush_wal
-]);
+vacuum_until_can_remove('', 'CREATE TABLE conflict_test(x integer, y text);
+                                                        DROP TABLE 
conflict_test;', 'pg_class');
 
 $node_primary->wait_for_replay_catchup($node_standby);
 
-# Check invalidation in the logfile and in pg_stat_database_conflicts
-check_for_invalidation('row_removal_', $logstart, 'with vacuum on pg_class');
+SKIP:
+{
+       skip 'vacuum on pg_class did not remove dead rows', 1
+               unless ($nb_vac < $vac_attempts);
 
-# Verify slots are reported as conflicting in pg_replication_slots
-check_slots_conflicting_status(1);
+       # Check invalidation in the logfile and in pg_stat_database_conflicts
+       check_for_invalidation('row_removal_', $logstart, 'with vacuum on 
pg_class');
 
-$handle =
-  make_slot_active($node_standby, 'row_removal_', 0, \$stdout, \$stderr);
+       # Verify slots are reported as conflicting in pg_replication_slots
+       check_slots_conflicting_status(1, 'after vacuum on pg_class');
 
-# We are not able to read from the slot as it has been invalidated
-check_pg_recvlogical_stderr($handle,
-       "can no longer get changes from replication slot 
\"row_removal_activeslot\""
-);
+       $handle =
+         make_slot_active($node_standby, 'row_removal_', 0, \$stdout, 
\$stderr);
+
+       # We are not able to read from the slot as it has been invalidated
+       check_pg_recvlogical_stderr($handle,
+               "can no longer get changes from replication slot 
\"row_removal_activeslot\""
+       );
+}
 
 ##################################################
 # Recovery conflict: Same as Scenario 2 but on a shared catalog table
@@ -609,30 +643,30 @@ 
reactive_slots_change_hfs_and_wait_for_xmins('row_removal_',
        'shared_row_removal_', 0, 1);
 
 # Trigger the conflict
-$node_primary->safe_psql(
-       'testdb', qq[
-  CREATE ROLE create_trash;
-  DROP ROLE create_trash;
-  VACUUM pg_authid;
-  INSERT INTO flush_wal DEFAULT VALUES; -- see create table flush_wal
-]);
+vacuum_until_can_remove('', 'CREATE ROLE create_trash;
+                                                        DROP ROLE 
create_trash;', 'pg_authid');
 
 $node_primary->wait_for_replay_catchup($node_standby);
+SKIP:
+{
+       skip 'vacuum on pg_authid did not remove dead rows', 1
+               unless ($nb_vac < $vac_attempts);
 
-# Check invalidation in the logfile and in pg_stat_database_conflicts
-check_for_invalidation('shared_row_removal_', $logstart,
-       'with vacuum on pg_authid');
+       # Check invalidation in the logfile and in pg_stat_database_conflicts
+       check_for_invalidation('shared_row_removal_', $logstart,
+               'with vacuum on pg_authid');
 
-# Verify slots are reported as conflicting in pg_replication_slots
-check_slots_conflicting_status(1);
+       # Verify slots are reported as conflicting in pg_replication_slots
+       check_slots_conflicting_status(1, 'after vacuum on pg_authid');
 
-$handle = make_slot_active($node_standby, 'shared_row_removal_', 0, \$stdout,
-       \$stderr);
+       $handle = make_slot_active($node_standby, 'shared_row_removal_', 0, 
\$stdout,
+               \$stderr);
 
-# We are not able to read from the slot as it has been invalidated
-check_pg_recvlogical_stderr($handle,
-       "can no longer get changes from replication slot 
\"shared_row_removal_activeslot\""
-);
+       # We are not able to read from the slot as it has been invalidated
+       check_pg_recvlogical_stderr($handle,
+               "can no longer get changes from replication slot 
\"shared_row_removal_activeslot\""
+       );
+}
 
 ##################################################
 # Recovery conflict: Same as Scenario 2 but on a non catalog table
@@ -646,39 +680,42 @@ 
reactive_slots_change_hfs_and_wait_for_xmins('shared_row_removal_',
        'no_conflict_', 0, 1);
 
 # This should not trigger a conflict
-$node_primary->safe_psql(
-       'testdb', qq[
-  CREATE TABLE conflict_test(x integer, y text);
-  INSERT INTO conflict_test(x,y) SELECT s, s::text FROM generate_series(1,4) s;
-  UPDATE conflict_test set x=1, y=1;
-  VACUUM conflict_test;
-  INSERT INTO flush_wal DEFAULT VALUES; -- see create table flush_wal
-]);
+vacuum_until_can_remove('', 'CREATE TABLE conflict_test(x integer, y text);
+                                                        INSERT INTO 
conflict_test(x,y) SELECT s, s::text FROM generate_series(1,4) s;
+                                                        UPDATE conflict_test 
set x=1, y=1;', 'conflict_test');
+
 $node_primary->wait_for_replay_catchup($node_standby);
 
-# message should not be issued
-ok( !find_in_log(
-               $node_standby,
-               "invalidating obsolete slot \"no_conflict_inactiveslot\"", 
$logstart),
-       'inactiveslot slot invalidation is not logged with vacuum on 
conflict_test'
-);
+SKIP:
+{
+       # Checking for invalidation only if vacuum did remove dead rows
+       skip 'vacuum did not remove dead rows on a non catalog table', 1
+               unless ($nb_vac < $vac_attempts);
 
-ok( !find_in_log(
-               $node_standby,
-               "invalidating obsolete slot \"no_conflict_activeslot\"", 
$logstart),
-       'activeslot slot invalidation is not logged with vacuum on 
conflict_test'
-);
+       # message should not be issued
+       ok( !find_in_log(
+                       $node_standby,
+                       "invalidating obsolete slot 
\"no_conflict_inactiveslot\"", $logstart),
+               'inactiveslot slot invalidation is not logged with vacuum on 
conflict_test'
+       );
 
-# Verify that pg_stat_database_conflicts.confl_active_logicalslot has not been 
updated
-ok( $node_standby->poll_query_until(
-               'postgres',
-               "select (confl_active_logicalslot = 0) from 
pg_stat_database_conflicts where datname = 'testdb'",
-               't'),
-       'confl_active_logicalslot not updated'
-) or die "Timed out waiting confl_active_logicalslot to be updated";
+       ok( !find_in_log(
+                       $node_standby,
+                       "invalidating obsolete slot 
\"no_conflict_activeslot\"", $logstart),
+               'activeslot slot invalidation is not logged with vacuum on 
conflict_test'
+       );
 
-# Verify slots are reported as non conflicting in pg_replication_slots
-check_slots_conflicting_status(0);
+       # Verify that pg_stat_database_conflicts.confl_active_logicalslot has 
not been updated
+       ok( $node_standby->poll_query_until(
+                       'postgres',
+                       "select (confl_active_logicalslot = 0) from 
pg_stat_database_conflicts where datname = 'testdb'",
+                       't'),
+               'confl_active_logicalslot not updated'
+       ) or die "Timed out waiting confl_active_logicalslot to be updated";
+
+       # Verify slots are reported as non conflicting in pg_replication_slots
+       check_slots_conflicting_status(0, 'after vacuum on a non catalog 
table');
+}
 
 # Turn hot_standby_feedback back on
 change_hot_standby_feedback_and_wait_for_xmins(1, 0);
@@ -715,7 +752,7 @@ $node_primary->wait_for_replay_catchup($node_standby);
 check_for_invalidation('pruning_', $logstart, 'with on-access pruning');
 
 # Verify slots are reported as conflicting in pg_replication_slots
-check_slots_conflicting_status(1);
+check_slots_conflicting_status(1, 'after on-access pruning');
 
 $handle = make_slot_active($node_standby, 'pruning_', 0, \$stdout, \$stderr);
 
@@ -759,7 +796,7 @@ $node_primary->wait_for_replay_catchup($node_standby);
 check_for_invalidation('wal_level_', $logstart, 'due to wal_level');
 
 # Verify slots are reported as conflicting in pg_replication_slots
-check_slots_conflicting_status(1);
+check_slots_conflicting_status(1, 'due to wal_level');
 
 $handle =
   make_slot_active($node_standby, 'wal_level_', 0, \$stdout, \$stderr);
-- 
2.34.1

Reply via email to