From 6da93c5db43d5f8c340cc45e47bc73752f16c72c Mon Sep 17 00:00:00 2001
From: Maxim Orlov <m.orlov@postgrespro.ru>
Date: Tue, 13 Aug 2024 14:44:50 +0300
Subject: [PATCH v5 3/4] Make pg_upgrade convert multixact offsets.

Author: Maxim Orlov <orlovmg@gmail.com>
---
 src/bin/pg_upgrade/Makefile     |   1 +
 src/bin/pg_upgrade/meson.build  |   1 +
 src/bin/pg_upgrade/pg_upgrade.c |  29 ++-
 src/bin/pg_upgrade/pg_upgrade.h |  13 +-
 src/bin/pg_upgrade/segresize.c  | 350 ++++++++++++++++++++++++++++++++
 5 files changed, 390 insertions(+), 4 deletions(-)
 create mode 100644 src/bin/pg_upgrade/segresize.c

diff --git a/src/bin/pg_upgrade/Makefile b/src/bin/pg_upgrade/Makefile
index f83d2b5d30..70908d63a3 100644
--- a/src/bin/pg_upgrade/Makefile
+++ b/src/bin/pg_upgrade/Makefile
@@ -21,6 +21,7 @@ OBJS = \
 	info.o \
 	option.o \
 	parallel.o \
+	segresize.o \
 	pg_upgrade.o \
 	relfilenumber.o \
 	server.o \
diff --git a/src/bin/pg_upgrade/meson.build b/src/bin/pg_upgrade/meson.build
index 3d88419674..16f898ba14 100644
--- a/src/bin/pg_upgrade/meson.build
+++ b/src/bin/pg_upgrade/meson.build
@@ -10,6 +10,7 @@ pg_upgrade_sources = files(
   'info.c',
   'option.c',
   'parallel.c',
+  'segresize.c',
   'pg_upgrade.c',
   'relfilenumber.c',
   'server.c',
diff --git a/src/bin/pg_upgrade/pg_upgrade.c b/src/bin/pg_upgrade/pg_upgrade.c
index 663235816f..d9d8d0ea78 100644
--- a/src/bin/pg_upgrade/pg_upgrade.c
+++ b/src/bin/pg_upgrade/pg_upgrade.c
@@ -750,7 +750,30 @@ copy_xact_xlog_xid(void)
 	if (old_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER &&
 		new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER)
 	{
-		copy_subdir_files("pg_multixact/offsets", "pg_multixact/offsets");
+		/*
+		 * If the old server is before the MULTIXACTOFFSET_FORMATCHANGE_CAT_VER
+		 * it must have 32-bit multixid offsets, thus it should be converted.
+		 */
+		if (old_cluster.controldata.cat_ver < MULTIXACTOFFSET_FORMATCHANGE_CAT_VER &&
+			new_cluster.controldata.cat_ver >= MULTIXACTOFFSET_FORMATCHANGE_CAT_VER)
+		{
+			uint64	oldest_offset = convert_multixact_offsets();
+
+			if (oldest_offset)
+			{
+				uint64	next_offset = old_cluster.controldata.chkpnt_nxtmxoff;
+
+				/* Handle possible wraparound. */
+				if (next_offset < oldest_offset)
+					next_offset += ((uint64) 1 << 32) - 1;
+
+				next_offset -= oldest_offset - 1;
+				old_cluster.controldata.chkpnt_nxtmxoff = next_offset;
+			}
+		}
+		else
+			copy_subdir_files("pg_multixact/offsets", "pg_multixact/offsets");
+
 		copy_subdir_files("pg_multixact/members", "pg_multixact/members");
 
 		prep_status("Setting next multixact ID and offset for new cluster");
@@ -760,9 +783,9 @@ copy_xact_xlog_xid(void)
 		 * counters here and the oldest multi present on system.
 		 */
 		exec_prog(UTILITY_LOG_FILE, NULL, true, true,
-				  "\"%s/pg_resetwal\" -O %u -m %u,%u \"%s\"",
+				  "\"%s/pg_resetwal\" -O %llu -m %u,%u \"%s\"",
 				  new_cluster.bindir,
-				  old_cluster.controldata.chkpnt_nxtmxoff,
+				  (unsigned long long) old_cluster.controldata.chkpnt_nxtmxoff,
 				  old_cluster.controldata.chkpnt_nxtmulti,
 				  old_cluster.controldata.chkpnt_oldstMulti,
 				  new_cluster.pgdata);
diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h
index 53f693c2d4..4d65e4125e 100644
--- a/src/bin/pg_upgrade/pg_upgrade.h
+++ b/src/bin/pg_upgrade/pg_upgrade.h
@@ -114,6 +114,13 @@ extern char *output_files[];
  */
 #define MULTIXACT_FORMATCHANGE_CAT_VER 201301231
 
+/*
+ * Swicth from 32-bit to 64-bit for multixid offsets.
+ *
+ * XXX: should be changed to the actual CATALOG_VERSION_NO on commit.
+ */
+#define MULTIXACTOFFSET_FORMATCHANGE_CAT_VER 202409041
+
 /*
  * large object chunk size added to pg_controldata,
  * commit 5f93c37805e7485488480916b4585e098d3cc883
@@ -230,7 +237,7 @@ typedef struct
 	uint32		chkpnt_nxtepoch;
 	uint32		chkpnt_nxtoid;
 	uint32		chkpnt_nxtmulti;
-	uint32		chkpnt_nxtmxoff;
+	uint64		chkpnt_nxtmxoff;
 	uint32		chkpnt_oldstMulti;
 	uint32		chkpnt_oldstxid;
 	uint32		align;
@@ -515,3 +522,7 @@ typedef struct
 	FILE	   *file;
 	char		path[MAXPGPATH];
 } UpgradeTaskReport;
+
+/* segresize.c */
+
+uint64		convert_multixact_offsets(void);
diff --git a/src/bin/pg_upgrade/segresize.c b/src/bin/pg_upgrade/segresize.c
new file mode 100644
index 0000000000..e47c0a2407
--- /dev/null
+++ b/src/bin/pg_upgrade/segresize.c
@@ -0,0 +1,350 @@
+/*
+ *	segresize.c
+ *
+ *	SLRU segment resize utility
+ *
+ *	Copyright (c) 2024, PostgreSQL Global Development Group
+ *	src/bin/pg_upgrade/segresize.c
+ */
+
+#include "postgres_fe.h"
+
+#include "pg_upgrade.h"
+#include "access/multixact.h"
+
+/* See slru.h */
+#define SLRU_PAGES_PER_SEGMENT		32
+
+/*
+ * Some kind of iterator associated with a particular SLRU segment.  The idea is
+ * to specify the segment and page number and then move through the pages.
+ */
+typedef struct SlruSegState
+{
+	char	   *dir;
+	char	   *fn;
+	FILE	   *file;
+	int64		segno;
+	uint64		pageno;
+	bool		leading_gap;
+	bool		long_segment_names;
+} SlruSegState;
+
+/*
+ * Get SLRU segmen file name from state.
+ *
+ * NOTE: this function should mirror SlruFileName call.
+ */
+static inline char *
+SlruFileName(SlruSegState *state)
+{
+	if (state->long_segment_names)
+	{
+		Assert(state->segno >= 0 &&
+			   state->segno <= INT64CONST(0xFFFFFFFFFFFFFFF));
+		return psprintf("%s/%015llX", state->dir, (long long) state->segno);
+	}
+	else
+	{
+		Assert(state->segno >= 0 &&
+			   state->segno <= INT64CONST(0xFFFFFF));
+		return psprintf("%s/%04X", state->dir, (unsigned int) state->segno);
+	}
+}
+
+/*
+ * Create SLRU segment file.
+ */
+static void
+create_segment(SlruSegState *state)
+{
+	Assert(state->fn == NULL);
+	Assert(state->file == NULL);
+
+	state->fn = SlruFileName(state);
+	state->file = fopen(state->fn, "wb");
+	if (!state->file)
+		pg_fatal("could not create file \"%s\": %m", state->fn);
+}
+
+/*
+ * Open existing SLRU segment file.
+ */
+static void
+open_segment(SlruSegState *state)
+{
+	Assert(state->fn == NULL);
+	Assert(state->file == NULL);
+
+	state->fn = SlruFileName(state);
+	state->file = fopen(state->fn, "rb");
+	if (!state->file)
+		pg_fatal("could not open file \"%s\": %m", state->fn);
+}
+
+/*
+ * Close SLRU segment file.
+ */
+static void
+close_segment(SlruSegState *state)
+{
+	if (state->file)
+	{
+		fclose(state->file);
+		state->file = NULL;
+	}
+
+	if (state->fn)
+	{
+		pfree(state->fn);
+		state->fn = NULL;
+	}
+}
+
+/*
+ * Read next page from the old 32-bit offset segment file.
+ */
+static int
+read_old_segment_page(SlruSegState *state, void *buf, bool *empty)
+{
+	int		len;
+
+	/* Open next segment file, if needed. */
+	if (!state->fn)
+	{
+		if (!state->segno)
+			state->leading_gap = true;
+
+		open_segment(state);
+
+		/* Set position to the needed page. */
+		if (state->pageno > 0 &&
+			fseek(state->file, state->pageno * BLCKSZ, SEEK_SET))
+		{
+			close_segment(state);
+		}
+	}
+
+	if (state->file)
+	{
+		/* Segment file do exists, read page from it. */
+		state->leading_gap = false;
+
+		len = fread(buf, sizeof(char), BLCKSZ, state->file);
+
+		/* Are we done or was there an error? */
+		if (len <= 0)
+		{
+			if (ferror(state->file))
+				pg_fatal("error reading file \"%s\": %m", state->fn);
+
+			if (feof(state->file))
+			{
+				*empty = true;
+				len = -1;
+
+				close_segment(state);
+			}
+		}
+		else
+			*empty = false;
+	}
+	else if (!state->leading_gap)
+	{
+		/* We reached the last segment. */
+		len = -1;
+		*empty = true;
+	}
+	else
+	{
+		/* Skip few first segments if they were frozen and removed. */
+		len = BLCKSZ;
+		*empty = true;
+	}
+
+	if (++state->pageno >= SLRU_PAGES_PER_SEGMENT)
+	{
+		/* Start a new segment. */
+		state->segno++;
+		state->pageno = 0;
+
+		close_segment(state);
+	}
+
+	return len;
+}
+
+/*
+ * Write next page to the new 64-bit offset segment file.
+ */
+static void
+write_new_segment_page(SlruSegState *state, void *buf)
+{
+	/*
+	 * Create a new segment file if we still didn't.  Creation is
+	 * postponed until the first non-empty page is found.  This helps
+	 * not to create completely empty segments.
+	 */
+	if (!state->file)
+	{
+		create_segment(state);
+
+		/* Write zeroes to the previously skipped prefix. */
+		if (state->pageno > 0)
+		{
+			char		zerobuf[BLCKSZ] = {0};
+
+			for (int64 i = 0; i < state->pageno; i++)
+			{
+				if (fwrite(zerobuf, sizeof(char), BLCKSZ, state->file) != BLCKSZ)
+					pg_fatal("could not write file \"%s\": %m", state->fn);
+			}
+		}
+	}
+
+	/* Write page to the new segment (if it was created). */
+	if (state->file)
+	{
+		if (fwrite(buf, sizeof(char), BLCKSZ, state->file) != BLCKSZ)
+			pg_fatal("could not write file \"%s\": %m", state->fn);
+	}
+
+	state->pageno++;
+
+	/*
+	 * Did we reach the maximum page number?  Then close segment file
+	 * and create a new one on the next iteration.
+	 */
+	if (state->pageno >= SLRU_PAGES_PER_SEGMENT)
+	{
+		state->segno++;
+		state->pageno = 0;
+		close_segment(state);
+	}
+}
+
+/*
+ * Convert pg_multixact/offsets segments and return oldest multi offset.
+ */
+uint64
+convert_multixact_offsets(void)
+{
+	/* See multixact.c */
+#define MULTIXACT_OFFSETS_PER_PAGE_OLD	(BLCKSZ / sizeof(uint32))
+#define MULTIXACT_OFFSETS_PER_PAGE		(BLCKSZ / sizeof(MultiXactOffset))
+
+	SlruSegState	oldseg = {0},
+					newseg = {0};
+	uint32			oldbuf[MULTIXACT_OFFSETS_PER_PAGE_OLD] = {0};
+	MultiXactOffset	newbuf[MULTIXACT_OFFSETS_PER_PAGE] = {0};
+	/*
+	 * It is much easier to deal with multi wraparound in 64 bitd format.  Thus
+	 * we use 64 bits for multi-transactions, although they remain 32 bits.
+	 */
+	uint64			oldest_multi = old_cluster.controldata.chkpnt_oldstMulti,
+					next_multi = old_cluster.controldata.chkpnt_nxtmulti,
+					multi,
+					old_entry,
+					new_entry;
+	bool			found = false;
+	uint64			oldest_offset = 0;
+
+	prep_status("Converting pg_multixact/offsets to 64-bit");
+
+	oldseg.pageno = oldest_multi / MULTIXACT_OFFSETS_PER_PAGE_OLD;
+	oldseg.segno = oldseg.pageno / SLRU_PAGES_PER_SEGMENT;
+	oldseg.pageno %= SLRU_PAGES_PER_SEGMENT;
+	oldseg.dir = psprintf("%s/pg_multixact/offsets", old_cluster.pgdata);
+	oldseg.long_segment_names = false;		/* old format XXXX */
+
+	newseg.pageno = oldest_multi / MULTIXACT_OFFSETS_PER_PAGE;
+	newseg.segno = newseg.pageno / SLRU_PAGES_PER_SEGMENT;
+	newseg.pageno %= SLRU_PAGES_PER_SEGMENT;
+	newseg.dir = psprintf("%s/pg_multixact/offsets", new_cluster.pgdata);
+	newseg.long_segment_names = true;
+
+	old_entry = oldest_multi % MULTIXACT_OFFSETS_PER_PAGE_OLD;
+	new_entry = oldest_multi % MULTIXACT_OFFSETS_PER_PAGE;
+
+	if (next_multi < oldest_multi)
+		next_multi += (uint64) 1 << 32;		/* wraparound */
+
+	for (multi = oldest_multi; multi < next_multi; old_entry = 0)
+	{
+		int			oldlen;
+		bool		empty;
+
+		/* Handle possible segment wraparound. */
+		if (oldseg.segno > MaxMultiXactId /
+								MULTIXACT_OFFSETS_PER_PAGE_OLD /
+								SLRU_PAGES_PER_SEGMENT)
+			oldseg.segno = 0;
+
+		/* Read old offset segment. */
+		oldlen = read_old_segment_page(&oldseg, oldbuf, &empty);
+		if (oldlen <= 0 || empty)
+			pg_fatal("cannot read page %llu from file \"%s\": %m",
+					 (unsigned long long) oldseg.pageno, oldseg.fn);
+
+		/* Fill possible gap. */
+		if (oldlen < BLCKSZ)
+			memset((char *) oldbuf + oldlen, 0, BLCKSZ - oldlen);
+
+		/* Save oldest multi offset */
+		if (!found)
+		{
+			oldest_offset = oldbuf[old_entry];
+			found = true;
+		}
+
+		/* ... skip wrapped-around invalid multi */
+		if (multi == (uint64) 1 << 32)
+		{
+			Assert(oldseg.segno == 0);
+			Assert(oldseg.pageno == 1);
+			Assert(old_entry == 0);
+
+			multi += FirstMultiXactId;
+			old_entry = FirstMultiXactId;
+		}
+
+		/* Copy entries to the new page. */
+		for (; multi < next_multi && old_entry < MULTIXACT_OFFSETS_PER_PAGE_OLD;
+			 multi++, old_entry++)
+		{
+			MultiXactOffset offset = oldbuf[old_entry];
+
+			/* Handle possible offset wraparound. */
+			if (offset < oldest_offset)
+				offset += ((uint64) 1 << 32) - 1;
+
+			/* Subtract oldest_offset, so new offsets will start from 1. */
+			newbuf[new_entry++] = offset - oldest_offset + 1;
+			if (new_entry >= MULTIXACT_OFFSETS_PER_PAGE)
+			{
+				/* Write a new page. */
+				write_new_segment_page(&newseg, newbuf);
+				new_entry = 0;
+			}
+		}
+	}
+
+	/* Write the last incomplete page. */
+	if (new_entry > 0 || oldest_multi == next_multi)
+	{
+		memset(&newbuf[new_entry], 0,
+			   sizeof(newbuf[0]) * (MULTIXACT_OFFSETS_PER_PAGE - new_entry));
+		write_new_segment_page(&newseg, newbuf);
+	}
+
+	/* Release resources. */
+	close_segment(&oldseg);
+	close_segment(&newseg);
+
+	pfree(oldseg.dir);
+	pfree(newseg.dir);
+
+	check_ok();
+
+	return oldest_offset;
+}
-- 
2.43.0

