Hello.

Added a check for standby promotion with the long transaction to the
test (code and docs are unchanged).

Thanks,
Michail.
From c5e1053805c537b50b0922151bcf127754500adb Mon Sep 17 00:00:00 2001
From: Michail Nikolaev <michail.nikol...@gmail.com>
Date: Fri, 14 May 2021 00:32:30 +0300
Subject: [PATCH v3 3/4] test

---
 src/test/recovery/Makefile                    |   1 +
 .../recovery/t/022_standby_index_lp_dead.pl   | 265 ++++++++++++++++++
 2 files changed, 266 insertions(+)
 create mode 100644 src/test/recovery/t/022_standby_index_lp_dead.pl

diff --git a/src/test/recovery/Makefile b/src/test/recovery/Makefile
index 96442ceb4e..6399184a8c 100644
--- a/src/test/recovery/Makefile
+++ b/src/test/recovery/Makefile
@@ -10,6 +10,7 @@
 #-------------------------------------------------------------------------
 
 EXTRA_INSTALL=contrib/test_decoding
+EXTRA_INSTALL+=contrib/pageinspect
 
 subdir = src/test/recovery
 top_builddir = ../../..
diff --git a/src/test/recovery/t/022_standby_index_lp_dead.pl b/src/test/recovery/t/022_standby_index_lp_dead.pl
new file mode 100644
index 0000000000..fc91f789a1
--- /dev/null
+++ b/src/test/recovery/t/022_standby_index_lp_dead.pl
@@ -0,0 +1,265 @@
+# Checks that index hints on standby work as excepted.
+use strict;
+use warnings;
+
+use PostgresNode;
+use TestLib;
+use Test::More tests => 18;
+use Config;
+
+# Initialize primary node
+my $node_primary = get_new_node('primary');
+$node_primary->init(allows_streaming => 1);
+$node_primary->append_conf('postgresql.conf', qq{
+    autovacuum = off
+    enable_seqscan = off
+    enable_indexonlyscan = off
+});
+$node_primary->start;
+
+$node_primary->safe_psql('postgres', 'CREATE EXTENSION pageinspect');
+# Create test table with primary index
+$node_primary->safe_psql(
+    'postgres', 'CREATE TABLE test_table (id int, value int)');
+$node_primary->safe_psql(
+    'postgres', 'CREATE INDEX test_index ON test_table (value, id)');
+# Fill some data to it, note to not put a lot of records to avoid
+# heap_page_prune_opt call which cause conflict on recovery hiding conflict
+# caused due index hint bits
+$node_primary->safe_psql('postgres',
+    'INSERT INTO test_table VALUES (generate_series(1, 30), 0)');
+# And vacuum to allow index hint bits to be set
+$node_primary->safe_psql('postgres', 'VACUUM test_table');
+# For fail-fast in case FPW from primary
+$node_primary->safe_psql('postgres', 'CHECKPOINT');
+
+# Take backup
+my $backup_name = 'my_backup';
+$node_primary->backup($backup_name);
+
+# Restore standby node from backup backup
+my $node_standby_1 = get_new_node('standby_1');
+$node_standby_1->init_from_backup($node_primary, $backup_name,
+    has_streaming => 1);
+
+my $standby_settings = qq{
+    max_standby_streaming_delay = 1
+    wal_receiver_status_interval = 1
+    hot_standby_feedback = off
+    enable_seqscan = off
+    enable_indexonlyscan = off
+};
+$node_standby_1->append_conf('postgresql.conf', $standby_settings);
+$node_standby_1->start;
+
+$node_standby_1->backup($backup_name);
+
+# Create second standby node linking to standby 1
+my $node_standby_2 = get_new_node('standby_2');
+$node_standby_2->init_from_backup($node_standby_1, $backup_name,
+    has_streaming => 1);
+$node_standby_2->append_conf('postgresql.conf', $standby_settings);
+$node_standby_2->start;
+
+# To avoid hanging while expecting some specific input from a psql
+# instance being driven by us, add a timeout high enough that it
+# should never trigger even on very slow machines, unless something
+# is really wrong.
+my $psql_timeout = IPC::Run::timer(10);
+
+# One psql to run command in repeatable read isolation level
+my %psql_standby_repeatable_read = ('stdin' => '', 'stdout' => '', 'stderr' => '');
+$psql_standby_repeatable_read{run} =
+    IPC::Run::start(
+        [ 'psql', '-XAb', '-f', '-', '-d', $node_standby_1->connstr('postgres') ],
+        '<', \$psql_standby_repeatable_read{stdin},
+        '>', \$psql_standby_repeatable_read{stdout},
+        '2>', \$psql_standby_repeatable_read{stderr},
+        $psql_timeout);
+
+# Another psql to run command in read committed isolation level
+my %psql_standby_read_committed = ('stdin' => '', 'stdout' => '', 'stderr' => '');
+$psql_standby_read_committed{run} =
+    IPC::Run::start(
+        [ 'psql', '-XAb', '-f', '-', '-d', $node_standby_1->connstr('postgres') ],
+        '<', \$psql_standby_read_committed{stdin},
+        '>', \$psql_standby_read_committed{stdout},
+        '2>', \$psql_standby_read_committed{stderr},
+        $psql_timeout);
+
+# Start RR transaction and read first row from index
+ok(send_query_and_wait(\%psql_standby_repeatable_read,
+    q[
+BEGIN TRANSACTION ISOLATION LEVEL REPEATABLE READ;
+SELECT id FROM test_table WHERE value = 0 ORDER BY id LIMIT 1;
+],
+    qr/1\n\(1 row\)/m),
+    'row is visible in repeatable read');
+
+# Start RC transaction and read first row from index
+ok(send_query_and_wait(\%psql_standby_read_committed,
+    q[
+BEGIN TRANSACTION ISOLATION LEVEL READ COMMITTED;
+SELECT id FROM test_table WHERE value = 0 ORDER BY id LIMIT 1;
+],
+    qr/1\n\(1 row\)/m),
+    'row is visible in read committed');
+
+# Now delete first 10 rows in index
+$node_primary->safe_psql('postgres',
+    'UPDATE test_table SET value = 1 WHERE id <= 10');
+
+# Make sure hint bits are not set on primary
+is(hints_num($node_primary), qq(0), 'no index hint bits are set on primary yet');
+
+# Make sure page is not processed by heap_page_prune_opt
+is(non_normal_num($node_primary), qq(0), 'all items are normal in heap');
+
+# Wait for standbys to catch up transaction
+wait_for_catchup_all();
+
+is(hints_num($node_standby_1), qq(0), 'no index hint bits are set on standby 1 yet');
+is(hints_num($node_standby_2), qq(0), 'no index hint bits are set on standby 2 yet');
+
+# Try to set hint bits in index on standbys
+try_to_set_hint_bits();
+
+# Make sure previous queries not set the hints on standby because
+# of RR snapshot
+is(hints_num($node_standby_1), qq(0), 'no index hint bits are set on standby 1 yet');
+# At the same time hint bits are set on second standby
+is(hints_num($node_standby_2), qq(10), 'index hint bits already set on second standby 2');
+
+# Make sure read committed transaction is able to see correct data
+ok(send_query_and_wait(\%psql_standby_read_committed,
+    q/SELECT id FROM test_table WHERE value = 0 ORDER BY id LIMIT 1;/,
+    qr/11\n\(1 row\)/m),
+    'row is not visible in read committed');
+
+ok(send_query_and_wait(\%psql_standby_repeatable_read,
+    q/SELECT id FROM test_table WHERE value = 0 ORDER BY id LIMIT 1;/,
+    qr/1\n\(1 row\)/m),
+    'row is visible in repeatable read');
+
+# Make checkpoint to cause FPI by LP_DEAD on primary
+$node_primary->safe_psql('postgres', "CHECKPOINT");
+
+# Set index hint bits and replicate to standby as FPI
+$node_primary->safe_psql('postgres',
+    'SELECT id FROM test_table WHERE value = 0 ORDER BY id LIMIT 1;');
+
+# Make sure page is not processed by heap_page_prune_opt
+is(non_normal_num($node_primary), qq(0), 'all items are normal in heap');
+# Make sure hint bits are set
+is(hints_num($node_primary), qq(10), 'hint bits are set on primary already');
+
+## Wait for standbys to catch up hint bits
+wait_for_catchup_all();
+
+is(hints_num($node_standby_1), qq(10),
+    'hints are set on standby1 because FPI but marked as non-safe');
+is(hints_num($node_standby_2), qq(10),
+    'hints are set on standby1 because FPI but masked as non-safe');
+
+# Make sure read committed transaction is able to see correct data
+ok(send_query_and_wait(\%psql_standby_read_committed,
+    q/SELECT id FROM test_table WHERE value = 0 ORDER BY id LIMIT 1;/,
+    qr/11\n\(1 row\)/m),
+    'row is not visible in read committed');
+
+# Make sure repeatable read transaction able to see correct data
+# because hint bits are marked as non-safe
+ok(send_query_and_wait(\%psql_standby_repeatable_read,
+    q/SELECT id FROM test_table WHERE value = 0 ORDER BY id LIMIT 1;/,
+    qr/1\n\(1 row\)/m),
+    'row is visible in repeatable read');
+
+$node_primary->stop();
+
+# promote standby to new primary
+$node_standby_1->promote();
+
+# Make sure read committed transaction is able to see correct data
+ok(send_query_and_wait(\%psql_standby_read_committed,
+    q/SELECT id FROM test_table WHERE value = 0 ORDER BY id LIMIT 1;/,
+    qr/11\n\(1 row\)/m),
+    'row is not visible in read committed after promote');
+
+# Make sure repeatable read transaction able to see correct data
+# because hint bits are marked as non-safe and transaction was started on standby
+ok(send_query_and_wait(\%psql_standby_repeatable_read,
+    q/SELECT id FROM test_table WHERE value = 0 ORDER BY id LIMIT 1;/,
+    qr/1\n\(1 row\)/m),
+    'row is visible in repeatable read after promote');
+
+$node_standby_1->stop();
+$node_standby_2->stop();
+
+# Send query, wait until string matches
+sub send_query_and_wait {
+    my ($psql, $query, $untl) = @_;
+
+    # send query
+    $$psql{stdin} .= $query;
+    $$psql{stdin} .= "\n";
+
+    # wait for query results
+    $$psql{run}->pump_nb();
+    while (1) {
+        # See PostgresNode.pm's psql()
+        $$psql{stdout} =~ s/\r\n/\n/g if $Config{osname} eq 'msys';
+
+        # diag("\n" . $$psql{stdout}); # for debugging
+        # diag("\n" . $$psql{stderr}); # for debugging
+
+        last if $$psql{stdout} =~ /$untl/;
+        last if $$psql{stderr} =~ /$untl/;
+
+        if ($psql_timeout->is_expired) {
+            BAIL_OUT("aborting wait: program timed out \n" .
+                "stream contents: >>$$psql{stdout}<< \n" .
+                "pattern searched for: $untl");
+            return 0;
+        }
+        if (not $$psql{run}->pumpable()) {
+            # This is fine for some tests, keep running
+            return 0;
+        }
+        $$psql{run}->pump();
+        select(undef, undef, undef, 0.01); # sleep a little
+
+    }
+
+    $$psql{stdout} = '';
+
+    return 1;
+}
+
+sub try_to_set_hint_bits {
+    # Try to set hint bits in index on standby
+    foreach (0 .. 3) {
+        $node_standby_1->safe_psql('postgres',
+            'SELECT * FROM test_table WHERE value = 0 ORDER BY id LIMIT 1;');
+        $node_standby_2->safe_psql('postgres',
+            'SELECT * FROM test_table WHERE value = 0 ORDER BY id LIMIT 1;');
+    }
+}
+
+sub wait_for_catchup_all {
+    $node_primary->wait_for_catchup($node_standby_1, 'replay',
+        $node_primary->lsn('insert'));
+    $node_standby_1->wait_for_catchup($node_standby_2, 'replay',
+        $node_standby_1->lsn('replay'));
+}
+
+sub hints_num {
+    my ($node) = @_;
+    return $node->safe_psql('postgres',
+        "SELECT count(*) FROM bt_page_items('test_index', 1) WHERE dead = true");
+}
+
+sub non_normal_num {
+    my ($node) = @_;
+    return $node->safe_psql('postgres',
+        "SELECT COUNT(*) FROM heap_page_items(get_raw_page('test_table', 0)) WHERE lp_flags != 1");
+}
\ No newline at end of file
-- 
2.25.1

From 2a1e6241e3fa90fa04219c7b215f4c5f2faacba0 Mon Sep 17 00:00:00 2001
From: Michail Nikolaev <michail.nikol...@gmail.com>
Date: Wed, 12 May 2021 22:55:50 +0300
Subject: [PATCH v3 1/4] code

---
 src/backend/access/gist/gistget.c        | 43 +++++++++++++++++--
 src/backend/access/gist/gistxlog.c       | 15 +++++++
 src/backend/access/hash/hash.c           |  4 +-
 src/backend/access/hash/hash_xlog.c      | 17 ++++++++
 src/backend/access/hash/hashsearch.c     | 18 ++++++--
 src/backend/access/hash/hashutil.c       | 33 ++++++++++++++-
 src/backend/access/heap/heapam.c         | 42 ++++++++++++++-----
 src/backend/access/heap/heapam_handler.c |  5 ++-
 src/backend/access/index/genam.c         | 20 ++++-----
 src/backend/access/index/indexam.c       | 26 ++++++++----
 src/backend/access/nbtree/nbtinsert.c    | 22 +++++++---
 src/backend/access/nbtree/nbtree.c       |  4 +-
 src/backend/access/nbtree/nbtsearch.c    | 14 ++++++-
 src/backend/access/nbtree/nbtutils.c     | 33 ++++++++++++++-
 src/backend/access/nbtree/nbtxlog.c      | 16 +++++++
 src/backend/access/table/tableam.c       |  4 +-
 src/backend/access/transam/rmgr.c        |  4 +-
 src/backend/access/transam/xlogutils.c   |  6 +++
 src/backend/storage/buffer/bufmgr.c      | 53 ++++++++++++++++++++++++
 src/bin/pg_rewind/parsexlog.c            |  2 +-
 src/bin/pg_waldump/rmgrdesc.c            |  2 +-
 src/include/access/gist.h                |  5 +++
 src/include/access/gistxlog.h            |  1 +
 src/include/access/hash.h                |  2 +
 src/include/access/hash_xlog.h           |  1 +
 src/include/access/heapam.h              |  2 +-
 src/include/access/nbtree.h              |  2 +
 src/include/access/nbtxlog.h             |  1 +
 src/include/access/relscan.h             | 15 +++++--
 src/include/access/rmgr.h                |  2 +-
 src/include/access/rmgrlist.h            | 46 ++++++++++----------
 src/include/access/tableam.h             | 14 ++++---
 src/include/access/xlog_internal.h       |  4 ++
 src/include/storage/bufmgr.h             | 11 +++++
 34 files changed, 399 insertions(+), 90 deletions(-)

diff --git a/src/backend/access/gist/gistget.c b/src/backend/access/gist/gistget.c
index c8f7e781c6..10bb60aff4 100644
--- a/src/backend/access/gist/gistget.c
+++ b/src/backend/access/gist/gistget.c
@@ -14,6 +14,7 @@
  */
 #include "postgres.h"
 
+#include "access/bufmask.h"
 #include "access/genam.h"
 #include "access/gist_private.h"
 #include "access/relscan.h"
@@ -49,6 +50,7 @@ gistkillitems(IndexScanDesc scan)
 	Assert(so->curBlkno != InvalidBlockNumber);
 	Assert(!XLogRecPtrIsInvalid(so->curPageLSN));
 	Assert(so->killedItems != NULL);
+	Assert(so->numKilled > 0);
 
 	buffer = ReadBuffer(scan->indexRelation, so->curBlkno);
 	if (!BufferIsValid(buffer))
@@ -62,8 +64,13 @@ gistkillitems(IndexScanDesc scan)
 	 * If page LSN differs it means that the page was modified since the last
 	 * read. killedItems could be not valid so LP_DEAD hints applying is not
 	 * safe.
+	 *
+	 * Another case - standby was promoted after start of current transaction.
+	 * It is not required for correctness, but it is better to just skip
+	 * everything.
 	 */
-	if (BufferGetLSNAtomic(buffer) != so->curPageLSN)
+	if ((BufferGetLSNAtomic(buffer) != so->curPageLSN) ||
+			(scan->xactStartedInRecovery && !RecoveryInProgress()))
 	{
 		UnlockReleaseBuffer(buffer);
 		so->numKilled = 0;		/* reset counter */
@@ -71,6 +78,20 @@ gistkillitems(IndexScanDesc scan)
 	}
 
 	Assert(GistPageIsLeaf(page));
+	if (GistPageHasLpSafeOnStandby(page) && !scan->xactStartedInRecovery)
+	{
+		/* Seems like server was promoted some time ago,
+		 * clear the flag just for accuracy. */
+		GistClearPageHasLpSafeOnStandby(page);
+	}
+	else if (!GistPageHasLpSafeOnStandby(page) && scan->xactStartedInRecovery)
+	{
+		/* LP_DEAD flags were set by primary. We need to clear them,
+		 * and allow standby to set own. */
+		mask_lp_flags(page);
+		pg_memory_barrier();
+		GistMarkPageHasLpSafeOnStandby(page);
+	}
 
 	/*
 	 * Mark all killedItems as dead. We need no additional recheck, because,
@@ -338,6 +359,7 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem,
 	OffsetNumber maxoff;
 	OffsetNumber i;
 	MemoryContext oldcxt;
+	bool ignore_killed_tuples;
 
 	Assert(!GISTSearchItemIsHeap(*pageItem));
 
@@ -412,6 +434,15 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem,
 	 * check all tuples on page
 	 */
 	maxoff = PageGetMaxOffsetNumber(page);
+	/*
+	 * Check whether is it allowed to see LP_DEAD bits - always true for primary,
+	 * on secondary we should avoid flags that were set by primary.
+	 * In case of promotion xactStartedInRecovery may still be equal
+	 * to true on primary so, old standby-safe bits are used (case of old
+	 * transaction in promoted server).
+	 */
+	ignore_killed_tuples = !scan->xactStartedInRecovery ||
+									GistPageHasLpSafeOnStandby(page);
 	for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
 	{
 		ItemId		iid = PageGetItemId(page, i);
@@ -424,7 +455,7 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem,
 		 * If the scan specifies not to return killed tuples, then we treat a
 		 * killed tuple as not passing the qual.
 		 */
-		if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
+		if (ignore_killed_tuples && ItemIdIsDead(iid))
 			continue;
 
 		it = (IndexTuple) PageGetItem(page, iid);
@@ -651,7 +682,9 @@ gistgettuple(IndexScanDesc scan, ScanDirection dir)
 		{
 			if (so->curPageData < so->nPageData)
 			{
-				if (scan->kill_prior_tuple && so->curPageData > 0)
+				if (scan->kill_prior_tuple && so->curPageData > 0 &&
+					(XLogRecPtrIsInvalid(scan->kill_prior_tuple_min_lsn) ||
+						scan->kill_prior_tuple_min_lsn < so->curPageLSN))
 				{
 
 					if (so->killedItems == NULL)
@@ -688,7 +721,9 @@ gistgettuple(IndexScanDesc scan, ScanDirection dir)
 			 */
 			if (scan->kill_prior_tuple
 				&& so->curPageData > 0
-				&& so->curPageData == so->nPageData)
+				&& so->curPageData == so->nPageData
+				&& (XLogRecPtrIsInvalid(scan->kill_prior_tuple_min_lsn) ||
+						scan->kill_prior_tuple_min_lsn < so->curPageLSN))
 			{
 
 				if (so->killedItems == NULL)
diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c
index 6464cb9281..210044e512 100644
--- a/src/backend/access/gist/gistxlog.c
+++ b/src/backend/access/gist/gistxlog.c
@@ -451,6 +451,20 @@ gist_xlog_cleanup(void)
 	MemoryContextDelete(opCtx);
 }
 
+/*
+ * Mask a Gist page that LP_DEAD bits are not safe for the standby.
+ */
+void
+gist_fpi_mask(char *pagedata, BlockNumber blkno)
+{
+	Page		page = (Page) pagedata;
+
+	if (GistPageIsLeaf(page))
+	{
+		GistClearPageHasLpSafeOnStandby(page);
+	}
+}
+
 /*
  * Mask a Gist page before running consistency checks on it.
  */
@@ -459,6 +473,7 @@ gist_mask(char *pagedata, BlockNumber blkno)
 {
 	Page		page = (Page) pagedata;
 
+	gist_fpi_mask(pagedata, blkno);
 	mask_page_lsn_and_checksum(page);
 
 	mask_page_hint_bits(page);
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
index 0752fb38a9..339a6bf8b7 100644
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -295,8 +295,10 @@ hashgettuple(IndexScanDesc scan, ScanDirection dir)
 	{
 		/*
 		 * Check to see if we should kill the previously-fetched tuple.
+		 * If the tuple is marked as dead but with min LSN - treat it as alive.
 		 */
-		if (scan->kill_prior_tuple)
+		if (scan->kill_prior_tuple &&
+				XLogRecPtrIsInvalid(scan->kill_prior_tuple_min_lsn))
 		{
 			/*
 			 * Yes, so remember it for later. (We'll deal with all such tuples
diff --git a/src/backend/access/hash/hash_xlog.c b/src/backend/access/hash/hash_xlog.c
index af35a991fc..093315e8c4 100644
--- a/src/backend/access/hash/hash_xlog.c
+++ b/src/backend/access/hash/hash_xlog.c
@@ -1101,6 +1101,22 @@ hash_redo(XLogReaderState *record)
 	}
 }
 
+/*
+ * Mask a hash page that LP_DEAD bits are not safe for the standby.
+ */
+void
+hash_fpi_mask(char *pagedata, BlockNumber blkno)
+{
+	Page		page = (Page) pagedata;
+	HashPageOpaque opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+	int			pagetype = opaque->hasho_flag & LH_PAGE_TYPE;
+
+	if (pagetype == LH_BUCKET_PAGE || pagetype == LH_OVERFLOW_PAGE)
+	{
+		opaque->hasho_flag &= ~LH_LP_SAFE_ON_STANDBY;
+	}
+}
+
 /*
  * Mask a hash page before performing consistency checks on it.
  */
@@ -1111,6 +1127,7 @@ hash_mask(char *pagedata, BlockNumber blkno)
 	HashPageOpaque opaque;
 	int			pagetype;
 
+	hash_fpi_mask(pagedata, blkno);
 	mask_page_lsn_and_checksum(page);
 
 	mask_page_hint_bits(page);
diff --git a/src/backend/access/hash/hashsearch.c b/src/backend/access/hash/hashsearch.c
index 2ffa28e8f7..2ea8ecd173 100644
--- a/src/backend/access/hash/hashsearch.c
+++ b/src/backend/access/hash/hashsearch.c
@@ -612,9 +612,21 @@ _hash_load_qualified_items(IndexScanDesc scan, Page page,
 	IndexTuple	itup;
 	int			itemIndex;
 	OffsetNumber maxoff;
+	bool		ignore_killed_tuples;
+	HashPageOpaque bucket_opaque;
 
 	maxoff = PageGetMaxOffsetNumber(page);
+	bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page);
 
+	/*
+	 * Check whether is it allowed to see LP_DEAD bits - always true for primary,
+	 * on secondary we should avoid flags that were set by primary.
+	 * In case of promotion xactStartedInRecovery may still be equal
+	 * to true on primary so, old standby-safe bits are used (case of old
+	 * transaction in promoted server).
+	 */
+	ignore_killed_tuples = !scan->xactStartedInRecovery ||
+									H_LP_SAFE_ON_STANDBY(bucket_opaque);
 	if (ScanDirectionIsForward(dir))
 	{
 		/* load items[] in ascending order */
@@ -632,8 +644,7 @@ _hash_load_qualified_items(IndexScanDesc scan, Page page,
 			 */
 			if ((so->hashso_buc_populated && !so->hashso_buc_split &&
 				 (itup->t_info & INDEX_MOVED_BY_SPLIT_MASK)) ||
-				(scan->ignore_killed_tuples &&
-				 (ItemIdIsDead(PageGetItemId(page, offnum)))))
+				(ignore_killed_tuples && (ItemIdIsDead(PageGetItemId(page, offnum)))))
 			{
 				offnum = OffsetNumberNext(offnum);	/* move forward */
 				continue;
@@ -678,8 +689,7 @@ _hash_load_qualified_items(IndexScanDesc scan, Page page,
 			 */
 			if ((so->hashso_buc_populated && !so->hashso_buc_split &&
 				 (itup->t_info & INDEX_MOVED_BY_SPLIT_MASK)) ||
-				(scan->ignore_killed_tuples &&
-				 (ItemIdIsDead(PageGetItemId(page, offnum)))))
+				(ignore_killed_tuples && (ItemIdIsDead(PageGetItemId(page, offnum)))))
 			{
 				offnum = OffsetNumberPrev(offnum);	/* move back */
 				continue;
diff --git a/src/backend/access/hash/hashutil.c b/src/backend/access/hash/hashutil.c
index 519872850e..fa8453660d 100644
--- a/src/backend/access/hash/hashutil.c
+++ b/src/backend/access/hash/hashutil.c
@@ -14,6 +14,7 @@
  */
 #include "postgres.h"
 
+#include "access/bufmask.h"
 #include "access/hash.h"
 #include "access/reloptions.h"
 #include "access/relscan.h"
@@ -547,6 +548,7 @@ _hash_kill_items(IndexScanDesc scan)
 	int			numKilled = so->numKilled;
 	int			i;
 	bool		killedsomething = false;
+	bool		dirty = false;
 	bool		havePin = false;
 
 	Assert(so->numKilled > 0);
@@ -559,6 +561,15 @@ _hash_kill_items(IndexScanDesc scan)
 	 */
 	so->numKilled = 0;
 
+	/*
+	 * Standby was promoted after start of current transaction. It is not
+	 * required for correctness, but it is better to just skip everything.
+	 */
+	if (scan->xactStartedInRecovery && !RecoveryInProgress())
+	{
+		return;
+	}
+
 	blkno = so->currPos.currPage;
 	if (HashScanPosIsPinned(so->currPos))
 	{
@@ -577,6 +588,23 @@ _hash_kill_items(IndexScanDesc scan)
 	opaque = (HashPageOpaque) PageGetSpecialPointer(page);
 	maxoff = PageGetMaxOffsetNumber(page);
 
+	if (H_LP_SAFE_ON_STANDBY(opaque) && !scan->xactStartedInRecovery)
+	{
+		/* Seems like server was promoted some time ago,
+		 * clear the flag just for accuracy. */
+		opaque->hasho_flag &= ~LH_LP_SAFE_ON_STANDBY;
+		dirty = true;
+	}
+	else if (!H_LP_SAFE_ON_STANDBY(opaque) && scan->xactStartedInRecovery)
+	{
+		/* LP_DEAD flags were set by the primary. We need to clear them,
+		 * and allow standby to set own. */
+		mask_lp_flags(page);
+		pg_memory_barrier();
+		opaque->hasho_flag |= LH_LP_SAFE_ON_STANDBY;
+		dirty = true;
+	}
+
 	for (i = 0; i < numKilled; i++)
 	{
 		int			itemIndex = so->killedItems[i];
@@ -596,7 +624,7 @@ _hash_kill_items(IndexScanDesc scan)
 			{
 				/* found the item */
 				ItemIdMarkDead(iid);
-				killedsomething = true;
+				killedsomething = dirty = true;
 				break;			/* out of inner search loop */
 			}
 			offnum = OffsetNumberNext(offnum);
@@ -611,6 +639,9 @@ _hash_kill_items(IndexScanDesc scan)
 	if (killedsomething)
 	{
 		opaque->hasho_flag |= LH_PAGE_HAS_DEAD_TUPLES;
+	}
+	if (dirty)
+	{
 		MarkBufferDirtyHint(buf, true);
 	}
 
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index ba36da2b83..8ed4064151 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -1698,9 +1698,11 @@ heap_fetch(Relation relation,
  * the tuple here, in addition to updating *tid.  If no match is found, the
  * contents of this buffer on return are undefined.
  *
- * If all_dead is not NULL, we check non-visible tuples to see if they are
- * globally dead; *all_dead is set true if all members of the HOT chain
- * are vacuumable, false if not.
+ * If deadness is not NULL, we check non-visible tuples to see if they
+ * are globally dead; *all_dead is set true if all members of the HOT chain
+ * are vacuumable, false if not. Also, *latest_removed_xid is set to the
+ * latest removed xid in a HOT chain, if known. *page_lsn is set to current page
+ * LSN value.
  *
  * Unlike heap_fetch, the caller must already have pin and (at least) share
  * lock on the buffer; it is still pinned/locked at exit.  Also unlike
@@ -1709,7 +1711,7 @@ heap_fetch(Relation relation,
 bool
 heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
 					   Snapshot snapshot, HeapTuple heapTuple,
-					   bool *all_dead, bool first_call)
+					   TupleDeadnessData *deadness, bool first_call)
 {
 	Page		dp = (Page) BufferGetPage(buffer);
 	TransactionId prev_xmax = InvalidTransactionId;
@@ -1721,8 +1723,12 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
 	GlobalVisState *vistest = NULL;
 
 	/* If this is not the first call, previous call returned a (live!) tuple */
-	if (all_dead)
-		*all_dead = first_call;
+	if (deadness)
+	{
+		deadness->all_dead = first_call;
+		deadness->latest_removed_xid = InvalidTransactionId;
+		deadness->page_lsn = PageGetLSN(dp);
+	}
 
 	blkno = ItemPointerGetBlockNumber(tid);
 	offnum = ItemPointerGetOffsetNumber(tid);
@@ -1755,6 +1761,13 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
 				at_chain_start = false;
 				continue;
 			}
+			/*
+			 * Even if all items are dead we are not sure about latest_removed_xid
+			 * value. In theory, some newer items of the chain could be vacuumed
+			 * while older are not (pure paranoia, probably).
+			 */
+			if (deadness)
+				deadness->latest_removed_xid = InvalidTransactionId;
 			/* else must be end of chain */
 			break;
 		}
@@ -1804,8 +1817,11 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
 				ItemPointerSetOffsetNumber(tid, offnum);
 				PredicateLockTID(relation, &heapTuple->t_self, snapshot,
 								 HeapTupleHeaderGetXmin(heapTuple->t_data));
-				if (all_dead)
-					*all_dead = false;
+				if (deadness)
+				{
+					deadness->all_dead = false;
+					deadness->latest_removed_xid = InvalidTransactionId;
+				}
 				return true;
 			}
 		}
@@ -1819,13 +1835,19 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
 		 * Note: if you change the criterion here for what is "dead", fix the
 		 * planner's get_actual_variable_range() function to match.
 		 */
-		if (all_dead && *all_dead)
+		if (deadness && deadness->all_dead)
 		{
 			if (!vistest)
 				vistest = GlobalVisTestFor(relation);
 
 			if (!HeapTupleIsSurelyDead(heapTuple, vistest))
-				*all_dead = false;
+			{
+				deadness->all_dead = false;
+				deadness->latest_removed_xid = InvalidTransactionId;
+			}
+			else
+				HeapTupleHeaderAdvanceLatestRemovedXid(heapTuple->t_data,
+											&deadness->latest_removed_xid);
 		}
 
 		/*
diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index 7a9a640989..b57c40b0c9 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -115,7 +115,8 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan,
 						 ItemPointer tid,
 						 Snapshot snapshot,
 						 TupleTableSlot *slot,
-						 bool *call_again, bool *all_dead)
+						 bool *call_again,
+						 TupleDeadnessData *deadness)
 {
 	IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan;
 	BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
@@ -147,7 +148,7 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan,
 											hscan->xs_cbuf,
 											snapshot,
 											&bslot->base.tupdata,
-											all_dead,
+											deadness,
 											!*call_again);
 	bslot->base.tupdata.t_self = *tid;
 	LockBuffer(hscan->xs_cbuf, BUFFER_LOCK_UNLOCK);
diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c
index 0aa26b448b..1818901577 100644
--- a/src/backend/access/index/genam.c
+++ b/src/backend/access/index/genam.c
@@ -106,18 +106,18 @@ RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys)
 	scan->xs_want_itup = false; /* may be set later */
 
 	/*
-	 * During recovery we ignore killed tuples and don't bother to kill them
-	 * either. We do this because the xmin on the primary node could easily be
-	 * later than the xmin on the standby node, so that what the primary
-	 * thinks is killed is supposed to be visible on standby. So for correct
-	 * MVCC for queries during recovery we must ignore these hints and check
-	 * all tuples. Do *not* set ignore_killed_tuples to true when running in a
-	 * transaction that was started during recovery. xactStartedInRecovery
-	 * should not be altered by index AMs.
-	 */
+	 * For correct MVCC for queries during recovery, we could use index LP_DEAD
+	 * bits as on the primary. But index AM should consider that it is possible
+	 * to receive such bits as part of FPI. The xmin on the primary node could
+	 * easily be later than the xmin on the standby node, so that what the
+	 * primary thinks is killed is supposed to be visible on standby.
+	 *
+	 * So for correct MVCC for queries during recovery we must mask these FPI
+	 * hints and check all tuples until standby-safe hints are set.
+	*/
 	scan->kill_prior_tuple = false;
+	scan->kill_prior_tuple_min_lsn = InvalidXLogRecPtr;
 	scan->xactStartedInRecovery = TransactionStartedDuringRecovery();
-	scan->ignore_killed_tuples = !scan->xactStartedInRecovery;
 
 	scan->opaque = NULL;
 
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index 5e22479b7a..c1b1802b1a 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -309,6 +309,7 @@ index_rescan(IndexScanDesc scan,
 		table_index_fetch_reset(scan->xs_heapfetch);
 
 	scan->kill_prior_tuple = false; /* for safety */
+	scan->kill_prior_tuple_min_lsn = InvalidXLogRecPtr;
 	scan->xs_heap_continue = false;
 
 	scan->indexRelation->rd_indam->amrescan(scan, keys, nkeys,
@@ -386,6 +387,7 @@ index_restrpos(IndexScanDesc scan)
 		table_index_fetch_reset(scan->xs_heapfetch);
 
 	scan->kill_prior_tuple = false; /* for safety */
+	scan->kill_prior_tuple_min_lsn = InvalidXLogRecPtr;
 	scan->xs_heap_continue = false;
 
 	scan->indexRelation->rd_indam->amrestrpos(scan);
@@ -534,6 +536,7 @@ index_getnext_tid(IndexScanDesc scan, ScanDirection direction)
 
 	/* Reset kill flag immediately for safety */
 	scan->kill_prior_tuple = false;
+	scan->kill_prior_tuple_min_lsn = InvalidXLogRecPtr;
 	scan->xs_heap_continue = false;
 
 	/* If we're out of index entries, we're done */
@@ -574,12 +577,18 @@ index_getnext_tid(IndexScanDesc scan, ScanDirection direction)
 bool
 index_fetch_heap(IndexScanDesc scan, TupleTableSlot *slot)
 {
-	bool		all_dead = false;
-	bool		found;
+	TupleDeadnessData			deadness;
+	IndexLpDeadAllowedResult	kill_allowed;
+	bool						found;
+
+	deadness.all_dead = false;
+	deadness.latest_removed_xid = InvalidTransactionId;
+	deadness.page_lsn = InvalidXLogRecPtr;
 
 	found = table_index_fetch_tuple(scan->xs_heapfetch, &scan->xs_heaptid,
 									scan->xs_snapshot, slot,
-									&scan->xs_heap_continue, &all_dead);
+									&scan->xs_heap_continue,
+									&deadness);
 
 	if (found)
 		pgstat_count_heap_fetch(scan->indexRelation);
@@ -587,13 +596,11 @@ index_fetch_heap(IndexScanDesc scan, TupleTableSlot *slot)
 	/*
 	 * If we scanned a whole HOT chain and found only dead tuples, tell index
 	 * AM to kill its entry for that TID (this will take effect in the next
-	 * amgettuple call, in index_getnext_tid).  We do not do this when in
-	 * recovery because it may violate MVCC to do so.  See comments in
-	 * RelationGetIndexScan().
+	 * amgettuple call, in index_getnext_tid). We do this when in
+	 * recovery only in certain conditions because it may violate MVCC.
 	 */
-	if (!scan->xactStartedInRecovery)
-		scan->kill_prior_tuple = all_dead;
-
+	kill_allowed = IsIndexLpDeadAllowed(&deadness, &scan->kill_prior_tuple_min_lsn);
+	scan->kill_prior_tuple = (kill_allowed != INDEX_LP_DEAD_NOT_OK);
 	return found;
 }
 
@@ -667,6 +674,7 @@ index_getbitmap(IndexScanDesc scan, TIDBitmap *bitmap)
 
 	/* just make sure this is false... */
 	scan->kill_prior_tuple = false;
+	scan->kill_prior_tuple_min_lsn = InvalidXLogRecPtr;
 
 	/*
 	 * have the am's getbitmap proc do all the work.
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index 6ac205c98e..caea430f5d 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -15,6 +15,7 @@
 
 #include "postgres.h"
 
+#include "access/bufmask.h"
 #include "access/nbtree.h"
 #include "access/nbtxlog.h"
 #include "access/transam.h"
@@ -502,7 +503,11 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
 			if (inposting || !ItemIdIsDead(curitemid))
 			{
 				ItemPointerData htid;
-				bool		all_dead = false;
+				TupleDeadnessData deadness;
+
+				deadness.all_dead = false;
+				deadness.latest_removed_xid = InvalidTransactionId;
+				deadness.page_lsn = InvalidXLogRecPtr;
 
 				if (!inposting)
 				{
@@ -556,7 +561,7 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
 				 */
 				else if (table_index_fetch_tuple_check(heapRel, &htid,
 													   &SnapshotDirty,
-													   &all_dead))
+													   &deadness))
 				{
 					TransactionId xwait;
 
@@ -670,8 +675,8 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
 													RelationGetRelationName(rel))));
 					}
 				}
-				else if (all_dead && (!inposting ||
-									  (prevalldead &&
+				else if (deadness.all_dead && (!inposting ||
+											   (prevalldead &&
 									   curposti == BTreeTupleGetNPosting(curitup) - 1)))
 				{
 					/*
@@ -679,6 +684,13 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
 					 * all posting list TIDs) is dead to everyone, so mark the
 					 * index entry killed.
 					 */
+					Assert(!RecoveryInProgress());
+					if (P_LP_SAFE_ON_STANDBY(opaque))
+					{
+						/* Seems like server was promoted some time ago,
+						 * clear the flag just for accuracy. */
+						opaque->btpo_flags &= ~BTP_LP_SAFE_ON_STANDBY;
+					}
 					ItemIdMarkDead(curitemid);
 					opaque->btpo_flags |= BTP_HAS_GARBAGE;
 
@@ -696,7 +708,7 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
 				 * Remember if posting list tuple has even a single HOT chain
 				 * whose members are not all dead
 				 */
-				if (!all_dead && inposting)
+				if (!deadness.all_dead && inposting)
 					prevalldead = false;
 			}
 		}
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 1360ab80c1..927c62415c 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -244,7 +244,9 @@ btgettuple(IndexScanDesc scan, ScanDirection dir)
 			/*
 			 * Check to see if we should kill the previously-fetched tuple.
 			 */
-			if (scan->kill_prior_tuple)
+			if (scan->kill_prior_tuple &&
+				(XLogRecPtrIsInvalid(scan->kill_prior_tuple_min_lsn) ||
+					scan->kill_prior_tuple_min_lsn < so->currPos.lsn))
 			{
 				/*
 				 * Yes, remember it for later. (We'll deal with all such
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index d1177d8772..f7e57a6610 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -1516,6 +1516,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 	int			itemIndex;
 	bool		continuescan;
 	int			indnatts;
+	bool		ignore_killed_tuples;
 
 	/*
 	 * We must have the buffer pinned and locked, but the usual macro can't be
@@ -1569,6 +1570,15 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 	 */
 	Assert(BTScanPosIsPinned(so->currPos));
 
+	/*
+	 * Check whether is it allowed to see LP_DEAD bits - always true for primary,
+	 * on secondary we should avoid flags that were set by primary.
+	 * In case of promotion xactStartedInRecovery may still be equal
+	 * to true on primary so, old standby-safe bits are used (case of old
+	 * transaction in promoted server).
+	 */
+	ignore_killed_tuples = !scan->xactStartedInRecovery ||
+										P_LP_SAFE_ON_STANDBY(opaque);
 	if (ScanDirectionIsForward(dir))
 	{
 		/* load items[] in ascending order */
@@ -1585,7 +1595,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 			 * If the scan specifies not to return killed tuples, then we
 			 * treat a killed tuple as not passing the qual
 			 */
-			if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
+			if (ignore_killed_tuples && ItemIdIsDead(iid))
 			{
 				offnum = OffsetNumberNext(offnum);
 				continue;
@@ -1685,7 +1695,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 			 * uselessly advancing to the page to the left.  This is similar
 			 * to the high key optimization used by forward scans.
 			 */
-			if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
+			if (ignore_killed_tuples && ItemIdIsDead(iid))
 			{
 				Assert(offnum >= P_FIRSTDATAKEY(opaque));
 				if (offnum > P_FIRSTDATAKEY(opaque))
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index d524310723..df5179a9d8 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -17,6 +17,7 @@
 
 #include <time.h>
 
+#include "access/bufmask.h"
 #include "access/nbtree.h"
 #include "access/reloptions.h"
 #include "access/relscan.h"
@@ -1725,6 +1726,7 @@ _bt_killitems(IndexScanDesc scan)
 	int			i;
 	int			numKilled = so->numKilled;
 	bool		killedsomething = false;
+	bool		dirty = false;
 	bool		droppedpin PG_USED_FOR_ASSERTS_ONLY;
 
 	Assert(BTScanPosIsValid(so->currPos));
@@ -1735,6 +1737,15 @@ _bt_killitems(IndexScanDesc scan)
 	 */
 	so->numKilled = 0;
 
+	/*
+	 * Standby was promoted after start of current transaction. It is not
+	 * required for correctness, but it is better to just skip everything.
+	 */
+	if (scan->xactStartedInRecovery && !RecoveryInProgress())
+	{
+		return;
+	}
+
 	if (BTScanPosIsPinned(so->currPos))
 	{
 		/*
@@ -1771,6 +1782,23 @@ _bt_killitems(IndexScanDesc scan)
 	minoff = P_FIRSTDATAKEY(opaque);
 	maxoff = PageGetMaxOffsetNumber(page);
 
+	if (P_LP_SAFE_ON_STANDBY(opaque) && !scan->xactStartedInRecovery)
+	{
+		/* Seems like server was promoted some time ago,
+		 * clear the flag just for accuracy. */
+		opaque->btpo_flags &= ~BTP_LP_SAFE_ON_STANDBY;
+		dirty = true;
+	}
+	else if (!P_LP_SAFE_ON_STANDBY(opaque) && scan->xactStartedInRecovery)
+	{
+		/* LP_DEAD flags were set by primary. We need to clear them,
+		 * and allow standby to set own. */
+		mask_lp_flags(page);
+		pg_memory_barrier();
+		opaque->btpo_flags |= BTP_LP_SAFE_ON_STANDBY;
+		dirty = true;
+	}
+
 	for (i = 0; i < numKilled; i++)
 	{
 		int			itemIndex = so->killedItems[i];
@@ -1866,7 +1894,7 @@ _bt_killitems(IndexScanDesc scan)
 			{
 				/* found the item/all posting list items */
 				ItemIdMarkDead(iid);
-				killedsomething = true;
+				killedsomething = dirty = true;
 				break;			/* out of inner search loop */
 			}
 			offnum = OffsetNumberNext(offnum);
@@ -1883,6 +1911,9 @@ _bt_killitems(IndexScanDesc scan)
 	if (killedsomething)
 	{
 		opaque->btpo_flags |= BTP_HAS_GARBAGE;
+	}
+	if (dirty)
+	{
 		MarkBufferDirtyHint(so->currPos.buf, true);
 	}
 
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c
index 1779b6ba47..2b3601dbb1 100644
--- a/src/backend/access/nbtree/nbtxlog.c
+++ b/src/backend/access/nbtree/nbtxlog.c
@@ -1084,6 +1084,21 @@ btree_xlog_cleanup(void)
 	opCtx = NULL;
 }
 
+/*
+ * Mask a btree page that LP_DEAD bits are not safe for the standby.
+ */
+void
+btree_fpi_mask(char *pagedata, BlockNumber blkno)
+{
+	Page		page = (Page) pagedata;
+	BTPageOpaque maskopaq = (BTPageOpaque) PageGetSpecialPointer(page);
+
+	if (P_ISLEAF(maskopaq))
+	{
+		maskopaq->btpo_flags &= ~BTP_LP_SAFE_ON_STANDBY;
+	}
+}
+
 /*
  * Mask a btree page before performing consistency checks on it.
  */
@@ -1093,6 +1108,7 @@ btree_mask(char *pagedata, BlockNumber blkno)
 	Page		page = (Page) pagedata;
 	BTPageOpaque maskopaq;
 
+	btree_fpi_mask(pagedata, blkno);
 	mask_page_lsn_and_checksum(page);
 
 	mask_page_hint_bits(page);
diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c
index 5ea5bdd810..e921960a88 100644
--- a/src/backend/access/table/tableam.c
+++ b/src/backend/access/table/tableam.c
@@ -219,7 +219,7 @@ bool
 table_index_fetch_tuple_check(Relation rel,
 							  ItemPointer tid,
 							  Snapshot snapshot,
-							  bool *all_dead)
+							  TupleDeadnessData *deadness)
 {
 	IndexFetchTableData *scan;
 	TupleTableSlot *slot;
@@ -229,7 +229,7 @@ table_index_fetch_tuple_check(Relation rel,
 	slot = table_slot_create(rel, NULL);
 	scan = table_index_fetch_begin(rel);
 	found = table_index_fetch_tuple(scan, tid, snapshot, slot, &call_again,
-									all_dead);
+									deadness);
 	table_index_fetch_end(scan);
 	ExecDropSingleTupleTableSlot(slot);
 
diff --git a/src/backend/access/transam/rmgr.c b/src/backend/access/transam/rmgr.c
index 58091f6b52..f9e7733da4 100644
--- a/src/backend/access/transam/rmgr.c
+++ b/src/backend/access/transam/rmgr.c
@@ -30,8 +30,8 @@
 #include "utils/relmapper.h"
 
 /* must be kept in sync with RmgrData definition in xlog_internal.h */
-#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask) \
-	{ name, redo, desc, identify, startup, cleanup, mask },
+#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask,fpi_mask) \
+	{ name, redo, desc, identify, startup, cleanup, mask, fpi_mask },
 
 const RmgrData RmgrTable[RM_MAX_ID + 1] = {
 #include "access/rmgrlist.h"
diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c
index d17d660f46..4092e7d8a6 100644
--- a/src/backend/access/transam/xlogutils.c
+++ b/src/backend/access/transam/xlogutils.c
@@ -332,6 +332,7 @@ XLogReadBufferForRedoExtended(XLogReaderState *record,
 							  Buffer *buf)
 {
 	XLogRecPtr	lsn = record->EndRecPtr;
+	RmgrId		rmid = XLogRecGetRmid(record);
 	RelFileNode rnode;
 	ForkNumber	forknum;
 	BlockNumber blkno;
@@ -373,6 +374,11 @@ XLogReadBufferForRedoExtended(XLogReaderState *record,
 		if (!PageIsNew(page))
 		{
 			PageSetLSN(page, lsn);
+			/* If FPI apply mask function is defined - apply it to the buffer. */
+			if (RmgrTable[rmid].rm_fpi_mask)
+			{
+				RmgrTable[rmid].rm_fpi_mask(page, blkno);
+			}
 		}
 
 		MarkBufferDirty(*buf);
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 0c5b87864b..d8b00bf16a 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -3982,6 +3982,59 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
 	}
 }
 
+/*
+ * IsIndexLpDeadAllowed
+ *
+ * Checks is it allowed to set LP_DEAD hint bit for the tuple in index.
+ */
+IndexLpDeadAllowedResult
+IsIndexLpDeadAllowed(TupleDeadnessData *deadness,
+					 XLogRecPtr *minLsn)
+{
+	*minLsn = InvalidXLogRecPtr;
+	if (!deadness->all_dead)
+		return INDEX_LP_DEAD_NOT_OK;
+	/* It is always allowed on primary if *all_dead. */
+	if (!RecoveryInProgress())
+		return INDEX_LP_DEAD_OK;
+
+	if (TransactionIdIsValid(deadness->latest_removed_xid)) {
+		/*
+		 * If latest_removed_xid is known - make sure its commit record
+		 * less than minRecoveryPoint to avoid MVCC failure after crash recovery.
+		 */
+		XLogRecPtr commitLSN
+				= TransactionIdGetCommitLSN(deadness->latest_removed_xid);
+
+		if (XLogNeedsFlush(commitLSN))
+		{
+			/* LSN not flushed - allow iff index LSN is greater. */
+			*minLsn = commitLSN;
+			return INDEX_LP_DEAD_OK_MIN_LSN;
+		}
+		else return INDEX_LP_DEAD_OK;
+	} else {
+		/*
+		 * Looks like it is tuple cleared by heap_page_prune_execute,
+		 * we must be sure if LSN of XLOG_HEAP2_CLEAN (or any subsequent
+		 * updates) less than minRecoveryPoint to avoid MVCC failure
+		 * after crash recovery.
+		 *
+		 * Another possible case is transaction rollback or tuple updated
+		 * by inserting transaction. Such tuple never will be seen, so it
+		 * is safe to set LP_DEAD. It is related to the logic of
+		 * HeapTupleHeaderAdvanceLatestRemovedXid.
+		 */
+		if (XLogNeedsFlush(deadness->page_lsn))
+		{
+			/* LSN not flushed - allow iff index LSN is greater. */
+			*minLsn = deadness->page_lsn;
+			return INDEX_LP_DEAD_OK_MIN_LSN;
+		}
+		else return INDEX_LP_DEAD_OK;
+	}
+}
+
 /*
  * Release buffer content locks for shared buffers.
  *
diff --git a/src/bin/pg_rewind/parsexlog.c b/src/bin/pg_rewind/parsexlog.c
index 59ebac7d6a..2efcb887d0 100644
--- a/src/bin/pg_rewind/parsexlog.c
+++ b/src/bin/pg_rewind/parsexlog.c
@@ -28,7 +28,7 @@
  * RmgrNames is an array of resource manager names, to make error messages
  * a bit nicer.
  */
-#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask) \
+#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask,fpi_mask) \
   name,
 
 static const char *RmgrNames[RM_MAX_ID + 1] = {
diff --git a/src/bin/pg_waldump/rmgrdesc.c b/src/bin/pg_waldump/rmgrdesc.c
index 852d8ca4b1..fd3bdec530 100644
--- a/src/bin/pg_waldump/rmgrdesc.c
+++ b/src/bin/pg_waldump/rmgrdesc.c
@@ -32,7 +32,7 @@
 #include "storage/standbydefs.h"
 #include "utils/relmapper.h"
 
-#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask) \
+#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask,fpi_mask) \
 	{ name, desc, identify},
 
 const RmgrDescData RmgrDescTable[RM_MAX_ID + 1] = {
diff --git a/src/include/access/gist.h b/src/include/access/gist.h
index 4b06575d98..d3f7960f7f 100644
--- a/src/include/access/gist.h
+++ b/src/include/access/gist.h
@@ -50,6 +50,7 @@
 #define F_FOLLOW_RIGHT		(1 << 3)	/* page to the right has no downlink */
 #define F_HAS_GARBAGE		(1 << 4)	/* some tuples on the page are dead,
 										 * but not deleted yet */
+#define F_LP_SAFE_ON_STANDBY	(1 << 5) /* LP bits are safe to use on standby */
 
 /*
  * NSN (node sequence number) is a special-purpose LSN which is stored on each
@@ -179,6 +180,10 @@ typedef struct GISTENTRY
 #define GistMarkPageHasGarbage(page) ( GistPageGetOpaque(page)->flags |= F_HAS_GARBAGE)
 #define GistClearPageHasGarbage(page)	( GistPageGetOpaque(page)->flags &= ~F_HAS_GARBAGE)
 
+#define GistPageHasLpSafeOnStandby(page) ( GistPageGetOpaque(page)->flags & F_LP_SAFE_ON_STANDBY)
+#define GistMarkPageHasLpSafeOnStandby(page) ( GistPageGetOpaque(page)->flags |= F_LP_SAFE_ON_STANDBY)
+#define GistClearPageHasLpSafeOnStandby(page)	( GistPageGetOpaque(page)->flags &= ~F_LP_SAFE_ON_STANDBY)
+
 #define GistFollowRight(page) ( GistPageGetOpaque(page)->flags & F_FOLLOW_RIGHT)
 #define GistMarkFollowRight(page) ( GistPageGetOpaque(page)->flags |= F_FOLLOW_RIGHT)
 #define GistClearFollowRight(page)	( GistPageGetOpaque(page)->flags &= ~F_FOLLOW_RIGHT)
diff --git a/src/include/access/gistxlog.h b/src/include/access/gistxlog.h
index fd5144f258..fbac95999b 100644
--- a/src/include/access/gistxlog.h
+++ b/src/include/access/gistxlog.h
@@ -110,5 +110,6 @@ extern const char *gist_identify(uint8 info);
 extern void gist_xlog_startup(void);
 extern void gist_xlog_cleanup(void);
 extern void gist_mask(char *pagedata, BlockNumber blkno);
+extern void gist_fpi_mask(char *pagedata, BlockNumber blkno);
 
 #endif
diff --git a/src/include/access/hash.h b/src/include/access/hash.h
index 1cce865be2..f57401d484 100644
--- a/src/include/access/hash.h
+++ b/src/include/access/hash.h
@@ -59,6 +59,7 @@ typedef uint32 Bucket;
 #define LH_BUCKET_BEING_SPLIT	(1 << 5)
 #define LH_BUCKET_NEEDS_SPLIT_CLEANUP	(1 << 6)
 #define LH_PAGE_HAS_DEAD_TUPLES (1 << 7)
+#define LH_LP_SAFE_ON_STANDBY	(1 << 8)
 
 #define LH_PAGE_TYPE \
 	(LH_OVERFLOW_PAGE | LH_BUCKET_PAGE | LH_BITMAP_PAGE | LH_META_PAGE)
@@ -89,6 +90,7 @@ typedef HashPageOpaqueData *HashPageOpaque;
 #define H_BUCKET_BEING_SPLIT(opaque)	(((opaque)->hasho_flag & LH_BUCKET_BEING_SPLIT) != 0)
 #define H_BUCKET_BEING_POPULATED(opaque)	(((opaque)->hasho_flag & LH_BUCKET_BEING_POPULATED) != 0)
 #define H_HAS_DEAD_TUPLES(opaque)		(((opaque)->hasho_flag & LH_PAGE_HAS_DEAD_TUPLES) != 0)
+#define H_LP_SAFE_ON_STANDBY(opaque)	(((opaque)->hasho_flag & LH_LP_SAFE_ON_STANDBY) != 0)
 
 /*
  * The page ID is for the convenience of pg_filedump and similar utilities,
diff --git a/src/include/access/hash_xlog.h b/src/include/access/hash_xlog.h
index 4353a32dbb..37bc96d391 100644
--- a/src/include/access/hash_xlog.h
+++ b/src/include/access/hash_xlog.h
@@ -263,5 +263,6 @@ extern void hash_redo(XLogReaderState *record);
 extern void hash_desc(StringInfo buf, XLogReaderState *record);
 extern const char *hash_identify(uint8 info);
 extern void hash_mask(char *pagedata, BlockNumber blkno);
+extern void hash_fpi_mask(char *pagedata, BlockNumber blkno);
 
 #endif							/* HASH_XLOG_H */
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index e63b49fc38..7e70b42ef4 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -136,7 +136,7 @@ extern bool heap_fetch(Relation relation, Snapshot snapshot,
 					   HeapTuple tuple, Buffer *userbuf);
 extern bool heap_hot_search_buffer(ItemPointer tid, Relation relation,
 								   Buffer buffer, Snapshot snapshot, HeapTuple heapTuple,
-								   bool *all_dead, bool first_call);
+								   TupleDeadnessData *deadness, bool first_call);
 
 extern void heap_get_latest_tid(TableScanDesc scan, ItemPointer tid);
 
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index a645c42e68..dc274ef2f2 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -80,6 +80,7 @@ typedef BTPageOpaqueData *BTPageOpaque;
 #define BTP_HAS_GARBAGE (1 << 6)	/* page has LP_DEAD tuples (deprecated) */
 #define BTP_INCOMPLETE_SPLIT (1 << 7)	/* right sibling's downlink is missing */
 #define BTP_HAS_FULLXID	(1 << 8)	/* contains BTDeletedPageData */
+#define BTP_LP_SAFE_ON_STANDBY (1 << 9) /* LP bits are safe to use on standby */
 
 /*
  * The max allowed value of a cycle ID is a bit less than 64K.  This is
@@ -225,6 +226,7 @@ typedef struct BTMetaPageData
 #define P_HAS_GARBAGE(opaque)	(((opaque)->btpo_flags & BTP_HAS_GARBAGE) != 0)
 #define P_INCOMPLETE_SPLIT(opaque)	(((opaque)->btpo_flags & BTP_INCOMPLETE_SPLIT) != 0)
 #define P_HAS_FULLXID(opaque)	(((opaque)->btpo_flags & BTP_HAS_FULLXID) != 0)
+#define P_LP_SAFE_ON_STANDBY(opaque) (((opaque)->btpo_flags & BTP_LP_SAFE_ON_STANDBY) != 0)
 
 /*
  * BTDeletedPageData is the page contents of a deleted page
diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h
index 0f7731856b..366acc05b7 100644
--- a/src/include/access/nbtxlog.h
+++ b/src/include/access/nbtxlog.h
@@ -347,5 +347,6 @@ extern const char *btree_identify(uint8 info);
 extern void btree_xlog_startup(void);
 extern void btree_xlog_cleanup(void);
 extern void btree_mask(char *pagedata, BlockNumber blkno);
+extern void btree_fpi_mask(char *pagedata, BlockNumber blkno);
 
 #endif							/* NBTXLOG_H */
diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h
index 17a161c69a..823ee31ad0 100644
--- a/src/include/access/relscan.h
+++ b/src/include/access/relscan.h
@@ -126,9 +126,10 @@ typedef struct IndexScanDescData
 
 	/* signaling to index AM about killing index tuples */
 	bool		kill_prior_tuple;	/* last-returned tuple is dead */
-	bool		ignore_killed_tuples;	/* do not return killed entries */
-	bool		xactStartedInRecovery;	/* prevents killing/seeing killed
-										 * tuples */
+	XLogRecPtr	kill_prior_tuple_min_lsn; /* kill_prior_tuple additionally
+										   * requires index page lsn */
+	bool		xactStartedInRecovery;	/* prevents ignoring tuples
+										 * killed by primary */
 
 	/* index access method's private state */
 	void	   *opaque;			/* access-method-specific info */
@@ -188,4 +189,12 @@ typedef struct SysScanDescData
 	struct TupleTableSlot *slot;
 }			SysScanDescData;
 
+/* Struct for data about visibility of tuple */
+typedef struct TupleDeadnessData
+{
+	bool			all_dead;			/* guaranteed not visible for all backends */
+	TransactionId	latest_removed_xid;	/* latest removed xid if known */
+	XLogRecPtr		page_lsn;			/* lsn of page where dead tuple located */
+}			TupleDeadnessData;
+
 #endif							/* RELSCAN_H */
diff --git a/src/include/access/rmgr.h b/src/include/access/rmgr.h
index c9b5c56a4c..8e322b0b7f 100644
--- a/src/include/access/rmgr.h
+++ b/src/include/access/rmgr.h
@@ -19,7 +19,7 @@ typedef uint8 RmgrId;
  * Note: RM_MAX_ID must fit in RmgrId; widening that type will affect the XLOG
  * file format.
  */
-#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask) \
+#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask,fpi_mask) \
 	symname,
 
 typedef enum RmgrIds
diff --git a/src/include/access/rmgrlist.h b/src/include/access/rmgrlist.h
index f582cf535f..13440a2883 100644
--- a/src/include/access/rmgrlist.h
+++ b/src/include/access/rmgrlist.h
@@ -24,26 +24,26 @@
  * Changes to this list possibly need an XLOG_PAGE_MAGIC bump.
  */
 
-/* symbol name, textual name, redo, desc, identify, startup, cleanup */
-PG_RMGR(RM_XLOG_ID, "XLOG", xlog_redo, xlog_desc, xlog_identify, NULL, NULL, NULL)
-PG_RMGR(RM_XACT_ID, "Transaction", xact_redo, xact_desc, xact_identify, NULL, NULL, NULL)
-PG_RMGR(RM_SMGR_ID, "Storage", smgr_redo, smgr_desc, smgr_identify, NULL, NULL, NULL)
-PG_RMGR(RM_CLOG_ID, "CLOG", clog_redo, clog_desc, clog_identify, NULL, NULL, NULL)
-PG_RMGR(RM_DBASE_ID, "Database", dbase_redo, dbase_desc, dbase_identify, NULL, NULL, NULL)
-PG_RMGR(RM_TBLSPC_ID, "Tablespace", tblspc_redo, tblspc_desc, tblspc_identify, NULL, NULL, NULL)
-PG_RMGR(RM_MULTIXACT_ID, "MultiXact", multixact_redo, multixact_desc, multixact_identify, NULL, NULL, NULL)
-PG_RMGR(RM_RELMAP_ID, "RelMap", relmap_redo, relmap_desc, relmap_identify, NULL, NULL, NULL)
-PG_RMGR(RM_STANDBY_ID, "Standby", standby_redo, standby_desc, standby_identify, NULL, NULL, NULL)
-PG_RMGR(RM_HEAP2_ID, "Heap2", heap2_redo, heap2_desc, heap2_identify, NULL, NULL, heap_mask)
-PG_RMGR(RM_HEAP_ID, "Heap", heap_redo, heap_desc, heap_identify, NULL, NULL, heap_mask)
-PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_identify, btree_xlog_startup, btree_xlog_cleanup, btree_mask)
-PG_RMGR(RM_HASH_ID, "Hash", hash_redo, hash_desc, hash_identify, NULL, NULL, hash_mask)
-PG_RMGR(RM_GIN_ID, "Gin", gin_redo, gin_desc, gin_identify, gin_xlog_startup, gin_xlog_cleanup, gin_mask)
-PG_RMGR(RM_GIST_ID, "Gist", gist_redo, gist_desc, gist_identify, gist_xlog_startup, gist_xlog_cleanup, gist_mask)
-PG_RMGR(RM_SEQ_ID, "Sequence", seq_redo, seq_desc, seq_identify, NULL, NULL, seq_mask)
-PG_RMGR(RM_SPGIST_ID, "SPGist", spg_redo, spg_desc, spg_identify, spg_xlog_startup, spg_xlog_cleanup, spg_mask)
-PG_RMGR(RM_BRIN_ID, "BRIN", brin_redo, brin_desc, brin_identify, NULL, NULL, brin_mask)
-PG_RMGR(RM_COMMIT_TS_ID, "CommitTs", commit_ts_redo, commit_ts_desc, commit_ts_identify, NULL, NULL, NULL)
-PG_RMGR(RM_REPLORIGIN_ID, "ReplicationOrigin", replorigin_redo, replorigin_desc, replorigin_identify, NULL, NULL, NULL)
-PG_RMGR(RM_GENERIC_ID, "Generic", generic_redo, generic_desc, generic_identify, NULL, NULL, generic_mask)
-PG_RMGR(RM_LOGICALMSG_ID, "LogicalMessage", logicalmsg_redo, logicalmsg_desc, logicalmsg_identify, NULL, NULL, NULL)
+/* symbol name, textual name, redo, desc, identify, startup, cleanup, mask, fpi_mask */
+PG_RMGR(RM_XLOG_ID, "XLOG", xlog_redo, xlog_desc, xlog_identify, NULL, NULL, NULL, NULL)
+PG_RMGR(RM_XACT_ID, "Transaction", xact_redo, xact_desc, xact_identify, NULL, NULL, NULL, NULL)
+PG_RMGR(RM_SMGR_ID, "Storage", smgr_redo, smgr_desc, smgr_identify, NULL, NULL, NULL, NULL)
+PG_RMGR(RM_CLOG_ID, "CLOG", clog_redo, clog_desc, clog_identify, NULL, NULL, NULL, NULL)
+PG_RMGR(RM_DBASE_ID, "Database", dbase_redo, dbase_desc, dbase_identify, NULL, NULL, NULL, NULL)
+PG_RMGR(RM_TBLSPC_ID, "Tablespace", tblspc_redo, tblspc_desc, tblspc_identify, NULL, NULL, NULL, NULL)
+PG_RMGR(RM_MULTIXACT_ID, "MultiXact", multixact_redo, multixact_desc, multixact_identify, NULL, NULL, NULL, NULL)
+PG_RMGR(RM_RELMAP_ID, "RelMap", relmap_redo, relmap_desc, relmap_identify, NULL, NULL, NULL, NULL)
+PG_RMGR(RM_STANDBY_ID, "Standby", standby_redo, standby_desc, standby_identify, NULL, NULL, NULL, NULL)
+PG_RMGR(RM_HEAP2_ID, "Heap2", heap2_redo, heap2_desc, heap2_identify, NULL, NULL, heap_mask, NULL)
+PG_RMGR(RM_HEAP_ID, "Heap", heap_redo, heap_desc, heap_identify, NULL, NULL, heap_mask, NULL)
+PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_identify, btree_xlog_startup, btree_xlog_cleanup, btree_mask, btree_fpi_mask)
+PG_RMGR(RM_HASH_ID, "Hash", hash_redo, hash_desc, hash_identify, NULL, NULL, hash_mask, hash_fpi_mask)
+PG_RMGR(RM_GIN_ID, "Gin", gin_redo, gin_desc, gin_identify, gin_xlog_startup, gin_xlog_cleanup, gin_mask, NULL)
+PG_RMGR(RM_GIST_ID, "Gist", gist_redo, gist_desc, gist_identify, gist_xlog_startup, gist_xlog_cleanup, gist_mask, gist_fpi_mask)
+PG_RMGR(RM_SEQ_ID, "Sequence", seq_redo, seq_desc, seq_identify, NULL, NULL, seq_mask, NULL)
+PG_RMGR(RM_SPGIST_ID, "SPGist", spg_redo, spg_desc, spg_identify, spg_xlog_startup, spg_xlog_cleanup, spg_mask, NULL)
+PG_RMGR(RM_BRIN_ID, "BRIN", brin_redo, brin_desc, brin_identify, NULL, NULL, brin_mask, NULL)
+PG_RMGR(RM_COMMIT_TS_ID, "CommitTs", commit_ts_redo, commit_ts_desc, commit_ts_identify, NULL, NULL, NULL, NULL)
+PG_RMGR(RM_REPLORIGIN_ID, "ReplicationOrigin", replorigin_redo, replorigin_desc, replorigin_identify, NULL, NULL, NULL, NULL)
+PG_RMGR(RM_GENERIC_ID, "Generic", generic_redo, generic_desc, generic_identify, NULL, NULL, generic_mask, NULL)
+PG_RMGR(RM_LOGICALMSG_ID, "LogicalMessage", logicalmsg_redo, logicalmsg_desc, logicalmsg_identify, NULL, NULL, NULL, NULL)
diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h
index 9f1e4a1ac9..fa856da715 100644
--- a/src/include/access/tableam.h
+++ b/src/include/access/tableam.h
@@ -423,7 +423,7 @@ typedef struct TableAmRoutine
 	 * needs to be set to true by index_fetch_tuple, signaling to the caller
 	 * that index_fetch_tuple should be called again for the same tid.
 	 *
-	 * *all_dead, if all_dead is not NULL, should be set to true by
+	 * *deadness, if value is not NULL, should be filled by
 	 * index_fetch_tuple iff it is guaranteed that no backend needs to see
 	 * that tuple. Index AMs can use that to avoid returning that tid in
 	 * future searches.
@@ -432,7 +432,8 @@ typedef struct TableAmRoutine
 									  ItemPointer tid,
 									  Snapshot snapshot,
 									  TupleTableSlot *slot,
-									  bool *call_again, bool *all_dead);
+									  bool *call_again,
+									  TupleDeadnessData *deadness);
 
 
 	/* ------------------------------------------------------------------------
@@ -1194,7 +1195,7 @@ table_index_fetch_end(struct IndexFetchTableData *scan)
  * will be set to true, signaling that table_index_fetch_tuple() should be called
  * again for the same tid.
  *
- * *all_dead, if all_dead is not NULL, will be set to true by
+ * *deadness, if value is not NULL, will be filled by
  * table_index_fetch_tuple() iff it is guaranteed that no backend needs to see
  * that tuple. Index AMs can use that to avoid returning that tid in future
  * searches.
@@ -1211,7 +1212,8 @@ table_index_fetch_tuple(struct IndexFetchTableData *scan,
 						ItemPointer tid,
 						Snapshot snapshot,
 						TupleTableSlot *slot,
-						bool *call_again, bool *all_dead)
+						bool *call_again,
+						TupleDeadnessData *deadness)
 {
 	/*
 	 * We don't expect direct calls to table_index_fetch_tuple with valid
@@ -1223,7 +1225,7 @@ table_index_fetch_tuple(struct IndexFetchTableData *scan,
 
 	return scan->rel->rd_tableam->index_fetch_tuple(scan, tid, snapshot,
 													slot, call_again,
-													all_dead);
+													deadness);
 }
 
 /*
@@ -1235,7 +1237,7 @@ table_index_fetch_tuple(struct IndexFetchTableData *scan,
 extern bool table_index_fetch_tuple_check(Relation rel,
 										  ItemPointer tid,
 										  Snapshot snapshot,
-										  bool *all_dead);
+										  TupleDeadnessData *deadness);
 
 
 /* ------------------------------------------------------------------------
diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h
index 26a743b6b6..96cbc23a50 100644
--- a/src/include/access/xlog_internal.h
+++ b/src/include/access/xlog_internal.h
@@ -292,6 +292,9 @@ typedef enum
  * rm_mask takes as input a page modified by the resource manager and masks
  * out bits that shouldn't be flagged by wal_consistency_checking.
  *
+ * rm_fpi_mask takes FPI buffer and applies access specific non-logged changes,
+ * for example - marks LP_DEAD bits on index page as non-safe for standby.
+ *
  * RmgrTable[] is indexed by RmgrId values (see rmgrlist.h).
  */
 typedef struct RmgrData
@@ -303,6 +306,7 @@ typedef struct RmgrData
 	void		(*rm_startup) (void);
 	void		(*rm_cleanup) (void);
 	void		(*rm_mask) (char *pagedata, BlockNumber blkno);
+	void		(*rm_fpi_mask) (char *pagedata, BlockNumber blkno);
 } RmgrData;
 
 extern const RmgrData RmgrTable[];
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index aa64fb42ec..d1118d118e 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -46,6 +46,13 @@ typedef enum
 								 * replay; otherwise same as RBM_NORMAL */
 } ReadBufferMode;
 
+typedef enum
+{
+	INDEX_LP_DEAD_OK,			/* Index tuple could be marked as LP_DEAD */
+	INDEX_LP_DEAD_NOT_OK,		/* Not allowed to mark index tuple as dead */
+	INDEX_LP_DEAD_OK_MIN_LSN	/* Allowed if index page LSN is greater */
+} IndexLpDeadAllowedResult;
+
 /*
  * Type returned by PrefetchBuffer().
  */
@@ -61,6 +68,8 @@ struct WritebackContext;
 /* forward declared, to avoid including smgr.h here */
 struct SMgrRelationData;
 
+struct TupleDeadnessData;
+
 /* in globals.c ... this duplicates miscadmin.h */
 extern PGDLLIMPORT int NBuffers;
 
@@ -224,6 +233,8 @@ extern void BufferGetTag(Buffer buffer, RelFileNode *rnode,
 						 ForkNumber *forknum, BlockNumber *blknum);
 
 extern void MarkBufferDirtyHint(Buffer buffer, bool buffer_std);
+extern IndexLpDeadAllowedResult IsIndexLpDeadAllowed(struct TupleDeadnessData *deadness,
+													 XLogRecPtr *minLsn);
 
 extern void UnlockBuffers(void);
 extern void LockBuffer(Buffer buffer, int mode);
-- 
2.25.1

From fb939cae4ac063ea3fd9cbba708e488ce8b36b9a Mon Sep 17 00:00:00 2001
From: Michail Nikolaev <michail.nikol...@gmail.com>
Date: Wed, 12 May 2021 22:56:05 +0300
Subject: [PATCH v3 2/4] code optional

---
 src/backend/storage/ipc/standby.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c
index 1465ee44a1..adf10353c3 100644
--- a/src/backend/storage/ipc/standby.c
+++ b/src/backend/storage/ipc/standby.c
@@ -1135,6 +1135,12 @@ standby_redo(XLogReaderState *record)
 		running.xids = xlrec->xids;
 
 		ProcArrayApplyRecoveryInfo(&running);
+		if (InHotStandby)
+		{
+			/* Move minRecoveryPoint forward to allow standby set
+			 * hint bits and index-LP_DEAD more aggressively. */
+			XLogFlush(record->currRecPtr);
+		}
 	}
 	else if (info == XLOG_INVALIDATIONS)
 	{
-- 
2.25.1

From ff60a045894598e8405a073555ab8b52f2895a36 Mon Sep 17 00:00:00 2001
From: Michail Nikolaev <michail.nikol...@gmail.com>
Date: Fri, 14 May 2021 00:32:40 +0300
Subject: [PATCH v3 4/4] docs

---
 src/backend/access/nbtree/README | 35 ++++++++++++++++++++++----------
 src/backend/storage/page/README  |  8 +++++---
 2 files changed, 29 insertions(+), 14 deletions(-)

diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README
index bfe33b6b43..969d7b6928 100644
--- a/src/backend/access/nbtree/README
+++ b/src/backend/access/nbtree/README
@@ -705,17 +705,30 @@ lax about how same-level locks are acquired during recovery (most kinds
 of readers could still move right to recover if we didn't couple
 same-level locks), but we prefer to be conservative here.
 
-During recovery all index scans start with ignore_killed_tuples = false
-and we never set kill_prior_tuple. We do this because the oldest xmin
-on the standby server can be older than the oldest xmin on the primary
-server, which means tuples can be marked LP_DEAD even when they are
-still visible on the standby. We don't WAL log tuple LP_DEAD bits, but
-they can still appear in the standby because of full page writes. So
-we must always ignore them in standby, and that means it's not worth
-setting them either.  (When LP_DEAD-marked tuples are eventually deleted
-on the primary, the deletion is WAL-logged.  Queries that run on a
-standby therefore get much of the benefit of any LP_DEAD setting that
-takes place on the primary.)
+There is some complexity in using LP_DEAD bits during recovery. Generally,
+bits could be set and read by scan, but there is a possibility to meet
+the bit applied on the primary. We don't WAL log tuple LP_DEAD bits, but
+they can still appear on the standby because of the full-page writes. Such
+a cause could cause MVCC failures because the oldest xmin on the standby
+server can be older than the oldest xmin on the primary server, which means
+tuples can be marked LP_DEAD even when they are still visible on the standby.
+
+To prevent such failure, we mark pages with LP_DEAD bits set by standby with a
+special hint. In the case of FPW from primary the hint is always cleared while
+applying the full page write, so, LP_DEAD received from primary is ignored on
+standby. Also, standby clears all LP_DEAD set by primary on the page before
+setting of own bits.
+
+There are restrictions on settings LP_DEAD bits by the standby related to
+minRecoveryPoint value. In case of crash recovery standby will start to process
+queries after replaying WAL to minRecoveryPoint position (some kind of rewind to
+the previous state). A the same time setting of LP_DEAD bits are not protected
+by WAL in any way. So, to mark tuple as dead we must be sure it was "killed"
+before minRecoveryPoint (comparing the LSN of commit record). Another valid
+option is to compare "killer" LSN with index page LSN because minRecoveryPoint
+is moved forward if the index page flushed. Also, in some cases xid of "killer"
+is unknown - tuples were cleared by XLOG_HEAP2_CLEAN. In that case, we compare
+the LSN of the heap page to index page LSN.
 
 Note that we talk about scans that are started during recovery. We go to
 a little trouble to allow a scan to start during recovery and end during
diff --git a/src/backend/storage/page/README b/src/backend/storage/page/README
index e30d7ac59a..1fd0cb29cb 100644
--- a/src/backend/storage/page/README
+++ b/src/backend/storage/page/README
@@ -59,6 +59,8 @@ even if it is a very bad thing for the user.
 New WAL records cannot be written during recovery, so hint bits set during
 recovery must not dirty the page if the buffer is not already dirty, when
 checksums are enabled.  Systems in Hot-Standby mode may benefit from hint bits
-being set, but with checksums enabled, a page cannot be dirtied after setting a
-hint bit (due to the torn page risk). So, it must wait for full-page images
-containing the hint bit updates to arrive from the primary.
+being set, but with checksums enabled, a page cannot be dirtied because setting
+a hint bit (due to the torn page risk). So, it must wait for full-page images
+containing the hint bit updates to arrive from the primary. But if the page is
+already dirty or dirtied later by WAL replay - hint bits may be flushed on
+standby. Also, as result, checksums on primary and standby could differ.
-- 
2.25.1

Reply via email to