From f34aa92fa93cf492828d5931958e6f8075acfcbb Mon Sep 17 00:00:00 2001
From: Andrey Borodin <amborodin@acm.org>
Date: Sat, 19 Oct 2024 12:46:29 +0500
Subject: [PATCH v8 1/3] Prototype B-tree vacuum streamlineing

Signed-off-by: Andrey Borodin <x4mmm@yandex-team.ru>
Signed-off-by: Junwang Zhao <zhjwpku@gmail.com>
Reviewed-by: Kirill Reshke <reshkekirill@gmail.com>
---
 src/backend/access/nbtree/nbtree.c            | 70 +++++++++++-----
 src/test/modules/test_misc/meson.build        |  1 +
 .../modules/test_misc/t/007_vacuum_btree.pl   | 79 +++++++++++++++++++
 3 files changed, 129 insertions(+), 21 deletions(-)
 create mode 100644 src/test/modules/test_misc/t/007_vacuum_btree.pl

diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index dd76fe1da90..0849ad6d819 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -31,6 +31,7 @@
 #include "storage/lmgr.h"
 #include "utils/fmgrprotos.h"
 #include "utils/index_selfuncs.h"
+#include "utils/injection_point.h"
 #include "utils/memutils.h"
 
 
@@ -85,7 +86,7 @@ typedef struct BTParallelScanDescData *BTParallelScanDesc;
 static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 						 IndexBulkDeleteCallback callback, void *callback_state,
 						 BTCycleId cycleid);
-static void btvacuumpage(BTVacState *vstate, BlockNumber scanblkno);
+static BlockNumber btvacuumpage(BTVacState *vstate, Buffer buf);
 static BTVacuumPosting btreevacuumposting(BTVacState *vstate,
 										  IndexTuple posting,
 										  OffsetNumber updatedoffset,
@@ -984,8 +985,9 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	Relation	rel = info->index;
 	BTVacState	vstate;
 	BlockNumber num_pages;
-	BlockNumber scanblkno;
 	bool		needLock;
+	BlockRangeReadStreamPrivate p;
+	ReadStream *stream = NULL;
 
 	/*
 	 * Reset fields that track information about the entire index now.  This
@@ -1054,9 +1056,17 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	 */
 	needLock = !RELATION_IS_LOCAL(rel);
 
-	scanblkno = BTREE_METAPAGE + 1;
+	p.current_blocknum = BTREE_METAPAGE + 1;
+	stream = read_stream_begin_relation(READ_STREAM_FULL,
+										info->strategy,
+										rel,
+										MAIN_FORKNUM,
+										block_range_read_stream_cb,
+										&p,
+										0);
 	for (;;)
 	{
+		Buffer buf;
 		/* Get the current relation length */
 		if (needLock)
 			LockRelationForExtension(rel, ExclusiveLock);
@@ -1069,17 +1079,31 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 										 num_pages);
 
 		/* Quit if we've scanned the whole relation */
-		if (scanblkno >= num_pages)
+		if (p.current_blocknum >= num_pages)
 			break;
-		/* Iterate over pages, then loop back to recheck length */
-		for (; scanblkno < num_pages; scanblkno++)
+
+		/* In 007_vacuum_btree test we need to coordinate two distinguishable points here */
+		INJECTION_POINT("nbtree-vacuum-1");
+		INJECTION_POINT("nbtree-vacuum-2");
+
+		p.last_exclusive = num_pages;
+
+		/* Iterate over pages, then loop back to recheck relation length */
+		while(BufferIsValid(buf = read_stream_next_buffer(stream, NULL)))
 		{
-			btvacuumpage(&vstate, scanblkno);
+			BlockNumber current_block = btvacuumpage(&vstate, buf);
 			if (info->report_progress)
 				pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
-											 scanblkno);
+											 current_block);
 		}
+		Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer);
+		/*
+		 * After reaching the end we have to reset stream to use it again.
+		 * Extra restart in case of just one iteration does not cost us much.
+		 */
+		read_stream_reset(stream);
 	}
+	read_stream_end(stream);
 
 	/* Set statistics num_pages field to final size of index */
 	stats->num_pages = num_pages;
@@ -1109,9 +1133,11 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
  * after our cycleid was acquired) whose right half page happened to reuse
  * a block that we might have processed at some point before it was
  * recycled (i.e. before the page split).
+ *
+ * Returns BlockNumber of a scanned page (not backtracked).
  */
-static void
-btvacuumpage(BTVacState *vstate, BlockNumber scanblkno)
+static BlockNumber
+btvacuumpage(BTVacState *vstate, Buffer buf)
 {
 	IndexVacuumInfo *info = vstate->info;
 	IndexBulkDeleteResult *stats = vstate->stats;
@@ -1122,7 +1148,7 @@ btvacuumpage(BTVacState *vstate, BlockNumber scanblkno)
 	bool		attempt_pagedel;
 	BlockNumber blkno,
 				backtrack_to;
-	Buffer		buf;
+	BlockNumber scanblkno = BufferGetBlockNumber(buf);
 	Page		page;
 	BTPageOpaque opaque;
 
@@ -1136,14 +1162,6 @@ backtrack:
 	/* call vacuum_delay_point while not holding any buffer lock */
 	vacuum_delay_point();
 
-	/*
-	 * We can't use _bt_getbuf() here because it always applies
-	 * _bt_checkpage(), which will barf on an all-zero page. We want to
-	 * recycle all-zero pages, not fail.  Also, we want to use a nondefault
-	 * buffer access strategy.
-	 */
-	buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
-							 info->strategy);
 	_bt_lockbuf(rel, buf, BT_READ);
 	page = BufferGetPage(buf);
 	opaque = NULL;
@@ -1179,7 +1197,7 @@ backtrack:
 					 errmsg_internal("right sibling %u of scanblkno %u unexpectedly in an inconsistent state in index \"%s\"",
 									 blkno, scanblkno, RelationGetRelationName(rel))));
 			_bt_relbuf(rel, buf);
-			return;
+			return scanblkno;
 		}
 
 		/*
@@ -1199,7 +1217,7 @@ backtrack:
 		{
 			/* Done with current scanblkno (and all lower split pages) */
 			_bt_relbuf(rel, buf);
-			return;
+			return scanblkno;
 		}
 	}
 
@@ -1430,8 +1448,18 @@ backtrack:
 	if (backtrack_to != P_NONE)
 	{
 		blkno = backtrack_to;
+
+		/*
+		 * We can't use _bt_getbuf() here because it always applies
+		 * _bt_checkpage(), which will barf on an all-zero page. We want to
+		 * recycle all-zero pages, not fail.  Also, we want to use a nondefault
+		 * buffer access strategy.
+		 */
+		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
+								info->strategy);
 		goto backtrack;
 	}
+	return scanblkno;
 }
 
 /*
diff --git a/src/test/modules/test_misc/meson.build b/src/test/modules/test_misc/meson.build
index 283ffa751aa..a456a5e77f8 100644
--- a/src/test/modules/test_misc/meson.build
+++ b/src/test/modules/test_misc/meson.build
@@ -15,6 +15,7 @@ tests += {
       't/004_io_direct.pl',
       't/005_timeouts.pl',
       't/006_signal_autovacuum.pl',
+      't/007_vacuum_btree.pl',
     ],
   },
 }
diff --git a/src/test/modules/test_misc/t/007_vacuum_btree.pl b/src/test/modules/test_misc/t/007_vacuum_btree.pl
new file mode 100644
index 00000000000..2c69d2b477d
--- /dev/null
+++ b/src/test/modules/test_misc/t/007_vacuum_btree.pl
@@ -0,0 +1,79 @@
+# Copyright (c) 2024, PostgreSQL Global Development Group
+
+# This test verifies that B-tree vacuum can restart read stream.
+# To do so we need to insert some data during vacuum. So we wait in injection point
+# after first vacuum scan. During this wait we insert some data forcing page split.
+# this split will trigger relation extension and subsequent read_stream_reset().
+
+use strict;
+use warnings;
+use PostgreSQL::Test::Cluster;
+use Test::More;
+
+if ($ENV{enable_injection_points} ne 'yes')
+{
+	plan skip_all => 'Injection points not supported by this build';
+}
+
+# Initialize postgres
+my $psql_err = '';
+my $psql_out = '';
+my $node = PostgreSQL::Test::Cluster->new('node');
+$node->init;
+
+# This ensures autovacuum do not run
+$node->append_conf('postgresql.conf', 'autovacuum = off');
+$node->start;
+
+# Check if the extension injection_points is available, as it may be
+# possible that this script is run with installcheck, where the module
+# would not be installed by default.
+if (!$node->check_extension('injection_points'))
+{
+	plan skip_all => 'Extension injection_points not installed';
+}
+
+$node->safe_psql('postgres', 'CREATE EXTENSION injection_points;');
+
+# From this point, vacuum worker will wait at startup.
+$node->safe_psql('postgres',
+	"SELECT injection_points_attach('nbtree-vacuum-2', 'wait');");
+
+my $psql_session = $node->background_psql('postgres');
+
+$psql_session->query_until(
+	qr/starting_bg_psql/,
+		q(\echo starting_bg_psql
+		create table a as select random() r from generate_series(1,100) x;
+		create index on a(r);
+		delete from a;
+		vacuum (index_cleanup on) a;
+	));
+
+# Wait until an vacuum worker starts.
+$node->wait_for_event('client backend', 'nbtree-vacuum-2');
+
+$node->safe_psql('postgres',
+	"SELECT injection_points_attach('nbtree-vacuum-1', 'wait');");
+
+# Here's the key point of a test: during vacuum we add some page splits.
+# This will force vacuum into doing another scan thus reseting read stream.
+$node->safe_psql('postgres',
+	"insert into a select x from generate_series(1,3000) x;");
+
+$node->safe_psql('postgres',
+	"SELECT injection_points_detach('nbtree-vacuum-2');");
+$node->safe_psql('postgres',
+	"SELECT injection_points_wakeup('nbtree-vacuum-2');");
+
+# Observe that second scan is reached.
+$node->wait_for_event('client backend', 'nbtree-vacuum-1');
+
+$node->safe_psql('postgres',
+	"SELECT injection_points_detach('nbtree-vacuum-1');");
+$node->safe_psql('postgres',
+	"SELECT injection_points_wakeup('nbtree-vacuum-1');");
+
+ok($psql_session->quit);
+
+done_testing();
-- 
2.42.0

