On 2022-Feb-22, Imseih (AWS), Sami wrote:

> On 13.5 a wal flush PANIC is encountered after a standby is promoted.
> 
> With debugging, it was found that when a standby skips a missing
> continuation record on recovery, the missingContrecPtr is not
> invalidated after the record is skipped. Therefore, when the standby
> is promoted to a primary it writes an overwrite_contrecord with an LSN
> of the missingContrecPtr, which is now in the past. On flush time,
> this causes a PANIC. From what I can see, this failure scenario can
> only occur after a standby is promoted.

Ooh, nice find and diagnosis.  I can confirm that the test fails as you
described without the code fix, and doesn't fail with it.
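
For anyone skimming the thread, here is a minimal standalone sketch of the
decision the patch makes at promotion time (plain integers stand in for
XLogRecPtr and the recovery globals; this is only a toy model of the xlog.c
hunk below, not actual PostgreSQL code):

#include <stdint.h>
#include <stdio.h>

typedef uint64_t XLogRecPtr;
#define InvalidXLogRecPtr ((XLogRecPtr) 0)

/*
 * Back EndOfLog up to the missing continuation record only if recovery
 * stopped there (lastRec < missingContrecPtr).  If replay already went
 * past that point, the saved aborted-record state is stale, so drop it
 * rather than writing an OVERWRITE_CONTRECORD at an LSN in the past.
 */
static XLogRecPtr
choose_end_of_log(XLogRecPtr lastRec, XLogRecPtr endOfReplay,
				  XLogRecPtr missingContrecPtr, XLogRecPtr *abortedRecPtr)
{
	if (missingContrecPtr != InvalidXLogRecPtr)
	{
		if (lastRec < missingContrecPtr)
			return missingContrecPtr;	/* normal crash-recovery case */
		*abortedRecPtr = InvalidXLogRecPtr;	/* promoted standby: stale state */
	}
	return endOfReplay;
}

int
main(void)
{
	XLogRecPtr	aborted = 0x10FF000;

	/* the standby already replayed to 0x2000000, past the 0x1100000 gap */
	printf("EndOfLog = %llX\n",
		   (unsigned long long) choose_end_of_log(0x2000000, 0x2000000,
												   0x1100000, &aborted));
	return 0;
}

The real code keeps the Assert on abortedRecPtr and emits the DEBUG2
messages; the sketch only shows the branch structure.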

I attach the same patch, with the test file put in its final place
rather than as a patch.  Due to recent xlog.c changes this needs a bit
of work to apply to back branches; I'll see about getting it into all
branches soon.

-- 
Álvaro Herrera         PostgreSQL Developer  —  https://www.EnterpriseDB.com/
"I'm impressed how quickly you are fixing this obscure issue. I came from 
MS SQL and it would be hard for me to put into words how much of a better job
you all are doing on [PostgreSQL]."
 Steve Midgley, http://archives.postgresql.org/pgsql-sql/2008-08/msg00000.php
From 7948d1acdcd25b5b425c7804fad0d46cfb4b14b0 Mon Sep 17 00:00:00 2001
From: "Sami Imseih (AWS)" <sims...@amazon.com>
Date: Tue, 22 Feb 2022 19:09:36 +0000
Subject: [PATCH v2] Fix "missing continuation record" after standby promotion

Fix a condition where a recently promoted standby attempts to write an
OVERWRITE_RECORD with an LSN of the previously read aborted record.

Author: Sami Imseih <sims...@amazon.com>
Discussion: https://postgr.es/m/44d259de-7542-49c4-8a52-2ab01534d...@amazon.com
---
 src/backend/access/transam/xlog.c             |  16 ++-
 .../t/029_overwrite_contrecord_promotion.pl   | 111 ++++++++++++++++++
 2 files changed, 126 insertions(+), 1 deletion(-)
 create mode 100644 src/test/recovery/t/029_overwrite_contrecord_promotion.pl

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 0d2bd7a357..62e942f41a 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -5423,11 +5423,25 @@ StartupXLOG(void)
 	 * made it through and start writing after the portion that persisted.
 	 * (It's critical to first write an OVERWRITE_CONTRECORD message, which
 	 * we'll do as soon as we're open for writing new WAL.)
+	 *
+	 * If the last wal record is ahead of the missing contrecord, this is a
+	 * recently promoted primary and we should not write an overwrite
+	 * contrecord.
 	 */
 	if (!XLogRecPtrIsInvalid(missingContrecPtr))
 	{
 		Assert(!XLogRecPtrIsInvalid(abortedRecPtr));
-		EndOfLog = missingContrecPtr;
+		if (endOfRecoveryInfo->lastRec < missingContrecPtr)
+		{
+			elog(DEBUG2, "setting end of wal to missing continuation record %X/%X",
+				 LSN_FORMAT_ARGS(missingContrecPtr));
+			EndOfLog = missingContrecPtr;
+		}
+		else
+		{
+			elog(DEBUG2, "resetting aborted record");
+			abortedRecPtr = InvalidXLogRecPtr;
+		}
 	}
 
 	/*
diff --git a/src/test/recovery/t/029_overwrite_contrecord_promotion.pl b/src/test/recovery/t/029_overwrite_contrecord_promotion.pl
new file mode 100644
index 0000000000..ea4ebb32c0
--- /dev/null
+++ b/src/test/recovery/t/029_overwrite_contrecord_promotion.pl
@@ -0,0 +1,111 @@
+# Copyright (c) 2021-2022, PostgreSQL Global Development Group
+
+# Tests for resetting the "aborted record" after a promotion.
+
+use strict;
+use warnings;
+
+use FindBin;
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+# Test: Create a physical replica that's missing the last WAL file,
+# then restart the primary to create a divergent WAL file and observe
+# that the replica resets the "aborted record" after a promotion.
+
+my $node = PostgreSQL::Test::Cluster->new('primary');
+$node->init(allows_streaming => 1);
+# We need these settings for stability of WAL behavior.
+$node->append_conf(
+	'postgresql.conf', qq(
+autovacuum = off
+wal_keep_size = 1GB
+log_min_messages = DEBUG2
+));
+$node->start;
+
+$node->safe_psql('postgres', 'create table filler (a int, b text)');
+
+# Now consume all remaining room in the current WAL segment, leaving
+# space enough only for the start of a largish record.
+$node->safe_psql(
+	'postgres', q{
+DO $$
+DECLARE
+    wal_segsize int := setting::int FROM pg_settings WHERE name = 'wal_segment_size';
+    remain int;
+    iters  int := 0;
+BEGIN
+    LOOP
+        INSERT into filler
+        select g, repeat(md5(g::text), (random() * 60 + 1)::int)
+        from generate_series(1, 10) g;
+
+        remain := wal_segsize - (pg_current_wal_insert_lsn() - '0/0') % wal_segsize;
+        IF remain < 2 * setting::int from pg_settings where name = 'block_size' THEN
+            RAISE log 'exiting after % iterations, % bytes to end of WAL segment', iters, remain;
+            EXIT;
+        END IF;
+        iters := iters + 1;
+    END LOOP;
+END
+$$;
+});
+
+my $initfile = $node->safe_psql('postgres',
+	'SELECT pg_walfile_name(pg_current_wal_insert_lsn())');
+$node->safe_psql('postgres',
+	qq{SELECT pg_logical_emit_message(true, 'test 026', repeat('xyzxz', 123456))}
+);
+#$node->safe_psql('postgres', qq{create table foo ()});
+my $endfile = $node->safe_psql('postgres',
+	'SELECT pg_walfile_name(pg_current_wal_insert_lsn())');
+ok($initfile ne $endfile, "$initfile differs from $endfile");
+
+# Now stop abruptly, to avoid a stop checkpoint.  We can remove the tail file
+# afterwards, and on startup the large message should be overwritten with new
+# contents
+$node->stop('immediate');
+
+unlink $node->basedir . "/pgdata/pg_wal/$endfile"
+  or die "could not unlink " . $node->basedir . "/pgdata/pg_wal/$endfile: $!";
+
+# OK, create a standby at this spot.
+$node->backup_fs_cold('backup');
+my $node_standby = PostgreSQL::Test::Cluster->new('standby');
+$node_standby->init_from_backup($node, 'backup', has_streaming => 1);
+
+$node_standby->start;
+$node->start;
+
+$node->safe_psql('postgres',
+	qq{create table foo (a text); insert into foo values ('hello')});
+$node->safe_psql('postgres',
+	qq{SELECT pg_logical_emit_message(true, 'test 026', 'AABBCC')});
+
+my $until_lsn = $node->safe_psql('postgres', "SELECT pg_current_wal_lsn()");
+my $caughtup_query =
+  "SELECT '$until_lsn'::pg_lsn <= pg_last_wal_replay_lsn()";
+$node_standby->poll_query_until('postgres', $caughtup_query)
+  or die "Timed out while waiting for standby to catch up";
+
+ok($node_standby->safe_psql('postgres', 'select * from foo') eq 'hello',
+	'standby replays past overwritten contrecord');
+
+$ENV{PGDATA} = $node_standby->data_dir;
+$ENV{PGPORT} = $node_standby->port;
+$ENV{PGHOST} = $node_standby->host;
+system "psql -c 'select pg_promote()'";
+
+# Verify message appears in standby's log
+my $log = slurp_file($node_standby->logfile);
+like(
+    $log,
+    qr[resetting aborted record],
+    "found log line in standby");
+
+$node->stop;
+$node_standby->stop;
+
+done_testing();
-- 
2.30.2
