From 195c9bbf595512038240c51fc2142decfef4699e Mon Sep 17 00:00:00 2001
From: Matthias van de Meent <boekewurm+postgres@gmail.com>
Date: Thu, 6 Jul 2023 21:57:53 +0200
Subject: [PATCH v1] Implement a reloption that forces updated tuples to go to
 other pages

Without this, there is no way to efficiently make a table more compact after a
hole was created. With max_local_update configured, page-local updates are only
allowed in the first max_local_update MB of the table; all other updates will go
through the free space map to find a destination page for the new tuple version.

This is intended as a debug/maintenance option so that tables will tend toward
less fragmentation if a large part of the table was updated at once and left
many pages nearly empty; e.g. when someone updated a table with fillfactor=100
using an unqualified UPDATE statement.
---
 src/backend/access/common/reloptions.c    | 13 +++++++-
 src/backend/access/heap/heapam.c          |  7 ++--
 src/include/utils/rel.h                   | 19 +++++++++++
 src/test/regress/expected/alter_table.out | 18 ++++++++++
 src/test/regress/expected/update.out      | 40 +++++++++++++++++++++++
 src/test/regress/sql/alter_table.sql      |  8 +++++
 src/test/regress/sql/update.sql           | 23 +++++++++++++
 7 files changed, 124 insertions(+), 4 deletions(-)

diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c
index 11cc431677..2cb5d90d58 100644
--- a/src/backend/access/common/reloptions.c
+++ b/src/backend/access/common/reloptions.c
@@ -382,6 +382,15 @@ static relopt_int intRelOpts[] =
 		},
 		-1, 0, 1024
 	},
+	{
+		{
+			"max_local_update",
+			"Updates to tuples not located in the first max_local_update MB of the table will always try to insert the new tuple on a different page.",
+			RELOPT_KIND_HEAP,
+			ShareUpdateExclusiveLock
+		},
+		-1, -1, (MaxBlockNumber / (1024 * 1024 / BLCKSZ))
+	},
 
 	/* list terminator */
 	{{NULL}}
@@ -1882,7 +1891,9 @@ default_reloptions(Datum reloptions, bool validate, relopt_kind kind)
 		{"vacuum_index_cleanup", RELOPT_TYPE_ENUM,
 		offsetof(StdRdOptions, vacuum_index_cleanup)},
 		{"vacuum_truncate", RELOPT_TYPE_BOOL,
-		offsetof(StdRdOptions, vacuum_truncate)}
+		offsetof(StdRdOptions, vacuum_truncate)},
+		{"max_local_update", RELOPT_TYPE_INT,
+		offsetof(StdRdOptions, max_local_update)}
 	};
 
 	return (bytea *) build_reloptions(reloptions, validate, kind,
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 7ed72abe59..9e7e852375 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -3495,7 +3495,8 @@ l2:
 
 	newtupsize = MAXALIGN(newtup->t_len);
 
-	if (need_toast || newtupsize > pagefree)
+	if (need_toast || newtupsize > pagefree ||
+		!RelationUpdateTupleOnPageLocally(relation, -1, block))
 	{
 		TransactionId xmax_lock_old_tuple;
 		uint16		infomask_lock_old_tuple,
@@ -3507,7 +3508,7 @@ l2:
 		 * temporarily mark it locked, while we release the page-level lock.
 		 *
 		 * To satisfy the rule that any xid potentially appearing in a buffer
-		 * written out to disk, we unfortunately have to WAL log this
+		 * written out to disk must first be WAL-logged, we unfortunately log this
 		 * temporary modification.  We can reuse xl_heap_lock for this
 		 * purpose.  If we crash/error before following through with the
 		 * actual update, xmax will be of an aborted transaction, allowing
@@ -3622,7 +3623,7 @@ l2:
 		 */
 		for (;;)
 		{
-			if (newtupsize > pagefree)
+			if (newtupsize > pagefree || !RelationUpdateTupleOnPageLocally(relation, -1, block))
 			{
 				/* It doesn't fit, must use RelationGetBufferForTuple. */
 				newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h
index 1426a353cd..65d183ff8c 100644
--- a/src/include/utils/rel.h
+++ b/src/include/utils/rel.h
@@ -342,6 +342,7 @@ typedef struct StdRdOptions
 	int			parallel_workers;	/* max number of parallel workers */
 	StdRdOptIndexCleanup vacuum_index_cleanup;	/* controls index vacuuming */
 	bool		vacuum_truncate;	/* enables vacuum to truncate a relation */
+	int			max_local_update;	/* Updates to pages after this block must go through the VM */
 } StdRdOptions;
 
 #define HEAP_MIN_FILLFACTOR			10
@@ -377,6 +378,24 @@ typedef struct StdRdOptions
 #define RelationGetTargetPageFreeSpace(relation, defaultff) \
 	(BLCKSZ * (100 - RelationGetFillFactor(relation, defaultff)) / 100)
 
+/*
+ * RelationGetMaxLocalUpdate
+ *		Returns the relation's local-update section size in MB (-1: no limit).
+ */
+#define RelationGetMaxLocalUpdate(relation, defaultmlu) \
+	((relation)->rd_options ? \
+	 ((StdRdOptions *) (relation)->rd_options)->max_local_update : (defaultmlu))
+
+/*
+ * RelationUpdateTupleOnPageLocally
+ *		Is an update on blockno allowed to put the new tuple on the current
+ *		page, or should we instead try to find a different page?
+ */
+#define RelationUpdateTupleOnPageLocally(relation, defaultmlu, blockno) \
+	((RelationGetMaxLocalUpdate((relation), (defaultmlu)) == -1) || \
+	 ((blockno) < (BlockNumber) RelationGetMaxLocalUpdate((relation), (defaultmlu)) * \
+	  (BlockNumber) (1024 * 1024 / BLCKSZ)))
+
 /*
  * RelationIsUsedAsCatalogTable
  *		Returns whether the relation should be treated as a catalog table
diff --git a/src/test/regress/expected/alter_table.out b/src/test/regress/expected/alter_table.out
index 3b708c7976..859a87dde5 100644
--- a/src/test/regress/expected/alter_table.out
+++ b/src/test/regress/expected/alter_table.out
@@ -2758,6 +2758,24 @@ select * from my_locks order by 1;
  pg_toast  | ShareUpdateExclusiveLock
 (2 rows)
 
+commit;
+begin; alter table alterlock set (max_local_update = 8);
+select * from my_locks order by 1;
+  relname  |       max_lockmode       
+-----------+--------------------------
+ alterlock | ShareUpdateExclusiveLock
+ pg_toast  | ShareUpdateExclusiveLock
+(2 rows)
+
+commit;
+begin; alter table alterlock reset (max_local_update);
+select * from my_locks order by 1;
+  relname  |       max_lockmode       
+-----------+--------------------------
+ alterlock | ShareUpdateExclusiveLock
+ pg_toast  | ShareUpdateExclusiveLock
+(2 rows)
+
 commit;
 begin; alter table alterlock set (toast.autovacuum_enabled = off);
 select * from my_locks order by 1;
diff --git a/src/test/regress/expected/update.out b/src/test/regress/expected/update.out
index c809f88f54..c164f86e43 100644
--- a/src/test/regress/expected/update.out
+++ b/src/test/regress/expected/update.out
@@ -1026,3 +1026,43 @@ update hash_parted set b = b + 8 where b = 1;
 drop table hash_parted;
 drop operator class custom_opclass using hash;
 drop function dummy_hashint4(a int4, seed int8);
+create table block_local_updates(id int, b int) with (fillfactor = 10, autovacuum_enabled = false);
+insert into block_local_updates select generate_series(1, 88), 0;
+select (ctid::text::point)[0], count(*) from block_local_updates group by 1 order by 1;
+ ctid | count 
+------+-------
+    0 |    22
+    1 |    22
+    2 |    22
+    3 |    22
+(4 rows)
+
+-- FF 10=>100 -> all blocks have ~ 90% space left
+alter table block_local_updates set (fillfactor = 100);
+-- vacuum to clear FULL bits on all pages
+vacuum (disable_page_skipping true) block_local_updates;
+-- 10% space of each page is updated => 20% full, ~80% space left.
+update block_local_updates set b = 1;
+-- all tuples still on same page, 22 each
+select (ctid::text::point)[0], count(*) from block_local_updates group by 1 order by 1;
+ ctid | count 
+------+-------
+    0 |    22
+    1 |    22
+    2 |    22
+    3 |    22
+(4 rows)
+
+-- max_local_update=0 -> all updates in the table must not take the easy block-local path
+alter table block_local_updates set (max_local_update=0);
+-- 80% space left, all updates would be page-local if not for max_local_update
+update block_local_updates set b = 2;
+-- all tuples moved to first page, 88 total
+select (ctid::text::point)[0], count(*) from block_local_updates group by 1 order by 1;
+ ctid | count 
+------+-------
+    0 |    88
+(1 row)
+
+-- cleanup
+drop table block_local_updates;
diff --git a/src/test/regress/sql/alter_table.sql b/src/test/regress/sql/alter_table.sql
index 58ea20ac3d..7e067f4d2a 100644
--- a/src/test/regress/sql/alter_table.sql
+++ b/src/test/regress/sql/alter_table.sql
@@ -1770,6 +1770,14 @@ begin; alter table alterlock reset (fillfactor);
 select * from my_locks order by 1;
 commit;
 
+begin; alter table alterlock set (max_local_update = 8);
+select * from my_locks order by 1;
+commit;
+
+begin; alter table alterlock reset (max_local_update);
+select * from my_locks order by 1;
+commit;
+
 begin; alter table alterlock set (toast.autovacuum_enabled = off);
 select * from my_locks order by 1;
 commit;
diff --git a/src/test/regress/sql/update.sql b/src/test/regress/sql/update.sql
index 7a7bee77b9..dcf09e9671 100644
--- a/src/test/regress/sql/update.sql
+++ b/src/test/regress/sql/update.sql
@@ -667,3 +667,26 @@ update hash_parted set b = b + 8 where b = 1;
 drop table hash_parted;
 drop operator class custom_opclass using hash;
 drop function dummy_hashint4(a int4, seed int8);
+
+create table block_local_updates(id int, b int) with (fillfactor = 10, autovacuum_enabled = false);
+insert into block_local_updates select generate_series(1, 88), 0;
+
+select (ctid::text::point)[0], count(*) from block_local_updates group by 1 order by 1;
+-- FF 10=>100 -> all blocks have ~ 90% space left
+alter table block_local_updates set (fillfactor = 100);
+-- vacuum to clear FULL bits on all pages
+vacuum (disable_page_skipping true) block_local_updates;
+-- 10% space of each page is updated => 20% full, ~80% space left.
+update block_local_updates set b = 1;
+-- all tuples still on same page, 22 each
+select (ctid::text::point)[0], count(*) from block_local_updates group by 1 order by 1;
+
+-- max_local_update=0 -> all updates in the table must not take the easy block-local path
+alter table block_local_updates set (max_local_update=0);
+-- 80% space left, all updates would be page-local if not for max_local_update
+update block_local_updates set b = 2;
+-- all tuples moved to first page, 88 total
+select (ctid::text::point)[0], count(*) from block_local_updates group by 1 order by 1;
+
+-- cleanup
+drop table block_local_updates;
-- 
2.40.1

