From 4076702a39d5ab0c0cd5af860fb47d2b0742c7ee Mon Sep 17 00:00:00 2001
From: Maxim Orlov <orlovmg@gmail.com>
Date: Fri, 24 Oct 2025 10:58:37 +0300
Subject: [PATCH v20 2/5] Add pg_upgarde for 64 bit multixact offsets

Author: Maxim Orlov <orlovmg@gmail.com>
Author: Heikki Linnakangas <hlinnaka@iki.fi>
---
 src/backend/access/transam/multixact.c |  35 +--
 src/bin/pg_upgrade/Makefile            |   3 +
 src/bin/pg_upgrade/meson.build         |   3 +
 src/bin/pg_upgrade/multixact_new.c     | 227 +++++++++++++++++++
 src/bin/pg_upgrade/multixact_new.h     |  31 +++
 src/bin/pg_upgrade/multixact_old.c     | 296 +++++++++++++++++++++++++
 src/bin/pg_upgrade/multixact_old.h     |  31 +++
 src/bin/pg_upgrade/pg_upgrade.c        | 108 ++++++++-
 src/bin/pg_upgrade/pg_upgrade.h        |   5 +
 src/bin/pg_upgrade/slru_io.c           | 240 ++++++++++++++++++++
 src/bin/pg_upgrade/slru_io.h           |  30 +++
 11 files changed, 977 insertions(+), 32 deletions(-)
 create mode 100644 src/bin/pg_upgrade/multixact_new.c
 create mode 100644 src/bin/pg_upgrade/multixact_new.h
 create mode 100644 src/bin/pg_upgrade/multixact_old.c
 create mode 100644 src/bin/pg_upgrade/multixact_old.h
 create mode 100644 src/bin/pg_upgrade/slru_io.c
 create mode 100644 src/bin/pg_upgrade/slru_io.h

diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index 93a1e4cfd2a..5a13596cd86 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -1231,7 +1231,6 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
 	int			slotno;
 	MultiXactOffset offset;
 	int			length;
-	int			truelength;
 	MultiXactId oldestMXact;
 	MultiXactId nextMXact;
 	MultiXactId tmpMXact;
@@ -1330,15 +1329,6 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members,
 	 * we have just for this; the process in charge will signal the CV as soon
 	 * as it has finished writing the multixact offset.
 	 *
-	 * 3. Because GetNewMultiXactId increments offset zero to offset one to
-	 * handle case #2, there is an ambiguity near the point of offset
-	 * wraparound.  If we see next multixact's offset is one, is that our
-	 * multixact's actual endpoint, or did it end at zero with a subsequent
-	 * increment?  We handle this using the knowledge that if the zero'th
-	 * member slot wasn't filled, it'll contain zero, and zero isn't a valid
-	 * transaction ID so it can't be a multixact member.  Therefore, if we
-	 * read a zero from the members array, just ignore it.
-	 *
 	 * This is all pretty messy, but the mess occurs only in infrequent corner
 	 * cases, so it seems better than holding the MultiXactGenLock for a long
 	 * time on every multixact creation.
@@ -1422,6 +1412,9 @@ retry:
 	LWLockRelease(lock);
 	lock = NULL;
 
+	/* A multixid with zero members should not happen */
+	Assert(length > 0);
+
 	/*
 	 * If we slept above, clean up state; it's no longer needed.
 	 */
@@ -1430,7 +1423,6 @@ retry:
 
 	ptr = (MultiXactMember *) palloc(length * sizeof(MultiXactMember));
 
-	truelength = 0;
 	prev_pageno = -1;
 	for (int i = 0; i < length; i++, offset++)
 	{
@@ -1468,36 +1460,27 @@ retry:
 		xactptr = (TransactionId *)
 			(MultiXactMemberCtl->shared->page_buffer[slotno] + memberoff);
 
-		if (!TransactionIdIsValid(*xactptr))
-		{
-			/* Corner case 3: we must be looking at unused slot zero */
-			Assert(offset == 0);
-			continue;
-		}
+		Assert(TransactionIdIsValid(*xactptr));
 
 		flagsoff = MXOffsetToFlagsOffset(offset);
 		bshift = MXOffsetToFlagsBitShift(offset);
 		flagsptr = (uint32 *) (MultiXactMemberCtl->shared->page_buffer[slotno] + flagsoff);
 
-		ptr[truelength].xid = *xactptr;
-		ptr[truelength].status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK;
-		truelength++;
+		ptr[i].xid = *xactptr;
+		ptr[i].status = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK;
 	}
 
 	LWLockRelease(lock);
 
-	/* A multixid with zero members should not happen */
-	Assert(truelength > 0);
-
 	/*
 	 * Copy the result into the local cache.
 	 */
-	mXactCachePut(multi, truelength, ptr);
+	mXactCachePut(multi, length, ptr);
 
 	debug_elog3(DEBUG2, "GetMembers: no cache for %s",
-				mxid_to_string(multi, truelength, ptr));
+				mxid_to_string(multi, length, ptr));
 	*members = ptr;
-	return truelength;
+	return length;
 }
 
 /*
diff --git a/src/bin/pg_upgrade/Makefile b/src/bin/pg_upgrade/Makefile
index 69fcf593cae..42995d53b0b 100644
--- a/src/bin/pg_upgrade/Makefile
+++ b/src/bin/pg_upgrade/Makefile
@@ -18,11 +18,14 @@ OBJS = \
 	file.o \
 	function.o \
 	info.o \
+	multixact_new.o \
+	multixact_old.o \
 	option.o \
 	parallel.o \
 	pg_upgrade.o \
 	relfilenumber.o \
 	server.o \
+	slru_io.o \
 	tablespace.o \
 	task.o \
 	util.o \
diff --git a/src/bin/pg_upgrade/meson.build b/src/bin/pg_upgrade/meson.build
index ac992f0d14b..3e46c4512cf 100644
--- a/src/bin/pg_upgrade/meson.build
+++ b/src/bin/pg_upgrade/meson.build
@@ -8,11 +8,14 @@ pg_upgrade_sources = files(
   'file.c',
   'function.c',
   'info.c',
+  'multixact_new.c',
+  'multixact_old.c',
   'option.c',
   'parallel.c',
   'pg_upgrade.c',
   'relfilenumber.c',
   'server.c',
+  'slru_io.c',
   'tablespace.c',
   'task.c',
   'util.c',
diff --git a/src/bin/pg_upgrade/multixact_new.c b/src/bin/pg_upgrade/multixact_new.c
new file mode 100644
index 00000000000..d43442fb9a7
--- /dev/null
+++ b/src/bin/pg_upgrade/multixact_new.c
@@ -0,0 +1,227 @@
+/*
+ * multixact_new.c
+ *
+ * Rewrite pre-v19 multixacts to new format with 64-bit MultiXactOffsets
+ *
+ * Copyright (c) 2025, PostgreSQL Global Development Group
+ * src/bin/pg_upgrade/multixact_new.c
+ */
+
+#include "multixact_new.h"
+
+/*
+ * NOTE: Below are a bunch of definitions and simple inline functions that are
+ * copy-pasted from multixact.c
+ */
+
+/* We need four bytes per offset, 8 bytes for the base */
+#define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset))
+
+static inline int64
+MultiXactIdToOffsetPage(MultiXactId multi)
+{
+	return multi / MULTIXACT_OFFSETS_PER_PAGE;
+}
+
+static inline int
+MultiXactIdToOffsetEntry(MultiXactId multi)
+{
+	return multi % MULTIXACT_OFFSETS_PER_PAGE;
+}
+
+/* We need eight bits per xact, so one xact fits in a byte */
+#define MXACT_MEMBER_BITS_PER_XACT			8
+#define MXACT_MEMBER_FLAGS_PER_BYTE			1
+#define MXACT_MEMBER_XACT_BITMASK	((1 << MXACT_MEMBER_BITS_PER_XACT) - 1)
+
+/* how many full bytes of flags are there in a group? */
+#define MULTIXACT_FLAGBYTES_PER_GROUP		4
+#define MULTIXACT_MEMBERS_PER_MEMBERGROUP	\
+	(MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE)
+/* size in bytes of a complete group */
+#define MULTIXACT_MEMBERGROUP_SIZE \
+	(sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP)
+#define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE)
+#define MULTIXACT_MEMBERS_PER_PAGE	\
+	(MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP)
+
+/*
+ * Because the number of items per page is not a divisor of the last item
+ * number (member 0xFFFFFFFF), the last segment does not use the maximum number
+ * of pages, and moreover the last used page therein does not use the same
+ * number of items as previous pages.  (Another way to say it is that the
+ * 0xFFFFFFFF member is somewhere in the middle of the last page, so the page
+ * has some empty space after that item.)
+ *
+ * This constant is the number of members in the last page of the last segment.
+ */
+#define MAX_MEMBERS_IN_LAST_MEMBERS_PAGE \
+		((uint32) ((0xFFFFFFFF % MULTIXACT_MEMBERS_PER_PAGE) + 1))
+
+/* page in which a member is to be found */
+static inline int64
+MXOffsetToMemberPage(MultiXactOffset offset)
+{
+	return offset / MULTIXACT_MEMBERS_PER_PAGE;
+}
+
+/* Location (byte offset within page) of flag word for a given member */
+static inline int
+MXOffsetToFlagsOffset(MultiXactOffset offset)
+{
+	MultiXactOffset group = offset / MULTIXACT_MEMBERS_PER_MEMBERGROUP;
+	int			grouponpg = group % MULTIXACT_MEMBERGROUPS_PER_PAGE;
+	int			byteoff = grouponpg * MULTIXACT_MEMBERGROUP_SIZE;
+
+	return byteoff;
+}
+
+static inline int
+MXOffsetToFlagsBitShift(MultiXactOffset offset)
+{
+	int			member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP;
+	int			bshift = member_in_group * MXACT_MEMBER_BITS_PER_XACT;
+
+	return bshift;
+}
+
+/* Location (byte offset within page) of TransactionId of given member */
+static inline int
+MXOffsetToMemberOffset(MultiXactOffset offset)
+{
+	int			member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP;
+
+	return MXOffsetToFlagsOffset(offset) +
+		MULTIXACT_FLAGBYTES_PER_GROUP +
+		member_in_group * sizeof(TransactionId);
+}
+
+static inline void
+MXOffsetWrite(char *buf, int entryno, MultiXactOffset offset)
+{
+	MultiXactOffset *offptr = (MultiXactOffset *) buf;
+
+	offptr[entryno] = offset;
+}
+
+MultiXactWriter *
+AllocMultiXactWrite(char *pgdata, MultiXactId firstMulti,
+					MultiXactOffset firstOffset)
+{
+	MultiXactWriter    *state = pg_malloc(sizeof(*state));
+	char				dir[MAXPGPATH] = {0};
+
+	state->nextMXact = firstMulti;
+	state->nextOffset = firstOffset;
+
+	pg_sprintf(dir, "%s/pg_multixact/offsets", pgdata);
+	state->offset = AllocSlruWrite(dir, false);
+
+	pg_sprintf(dir, "%s/pg_multixact/members", pgdata);
+	state->members = AllocSlruWrite(dir, true /* use long segment names */);
+
+	return state;
+}
+
+/*
+ * Simplified copy of the corresponding server function
+ */
+MultiXactId
+GetNewMultiXactId(MultiXactWriter *state, int nmembers, MultiXactOffset *offset)
+{
+	MultiXactId		result;
+
+	/* Handle wraparound of the nextMXact counter */
+	if (state->nextMXact < FirstMultiXactId)
+		state->nextMXact = FirstMultiXactId;
+
+	/* Assign the MXID */
+	result = state->nextMXact;
+
+	/* Reserve the members space, similarly to above. */
+	*offset = state->nextOffset;
+
+	/*
+	 * Advance counters.  As in GetNewTransactionId(), this must not happen
+	 * until after file extension has succeeded!
+	 *
+	 * We don't care about MultiXactId wraparound here; it will be handled by
+	 * the next iteration.  But note that nextMXact may be InvalidMultiXactId
+	 * or the first value on a segment-beginning page after this routine
+	 * exits, so anyone else looking at the variable must be prepared to deal
+	 * with either case.  Similarly, nextOffset may be zero, but we won't use
+	 * that as the actual start offset of the next multixact.
+	 */
+	(state->nextMXact)++;
+
+	state->nextOffset += nmembers;
+
+	return result;
+}
+
+/*
+ * Write a new multixact with members.
+ *
+ * Simplified version of the correspoding server function, hence the name.
+ */
+void
+RecordNewMultiXact(MultiXactWriter *state, MultiXactOffset offset,
+				   MultiXactId multi, int nmembers, MultiXactMember *members)
+{
+	int64		pageno;
+	int64		prev_pageno;
+	int			entryno,
+				i;
+	char	   *buf;
+
+	pageno = MultiXactIdToOffsetPage(multi);
+	entryno = MultiXactIdToOffsetEntry(multi);
+
+	buf = SlruWriteSwitchPage(state->offset, pageno);
+	MXOffsetWrite(buf, entryno, offset);
+
+	prev_pageno = -1;
+
+	for (i = 0; i < nmembers; i++, offset++)
+	{
+		TransactionId *memberptr;
+		uint32	   *flagsptr;
+		uint32		flagsval;
+		int			bshift;
+		int			flagsoff;
+		int			memberoff;
+
+		Assert(members[i].status <= MultiXactStatusUpdate);
+
+		pageno = MXOffsetToMemberPage(offset);
+		memberoff = MXOffsetToMemberOffset(offset);
+		flagsoff = MXOffsetToFlagsOffset(offset);
+		bshift = MXOffsetToFlagsBitShift(offset);
+
+		if (pageno != prev_pageno)
+		{
+			buf = SlruWriteSwitchPage(state->members, pageno);
+			prev_pageno = pageno;
+		}
+
+		memberptr = (TransactionId *) (buf + memberoff);
+
+		*memberptr = members[i].xid;
+
+		flagsptr = (uint32 *) (buf + flagsoff);
+
+		flagsval = *flagsptr;
+		flagsval &= ~(((1 << MXACT_MEMBER_BITS_PER_XACT) - 1) << bshift);
+		flagsval |= (members[i].status << bshift);
+		*flagsptr = flagsval;
+	}
+}
+
+void
+FreeMultiXactWrite(MultiXactWriter *state)
+{
+	FreeSlruWrite(state->offset);
+	FreeSlruWrite(state->members);
+
+	pfree(state);
+}
diff --git a/src/bin/pg_upgrade/multixact_new.h b/src/bin/pg_upgrade/multixact_new.h
new file mode 100644
index 00000000000..33d5d1b8222
--- /dev/null
+++ b/src/bin/pg_upgrade/multixact_new.h
@@ -0,0 +1,31 @@
+/*
+ * multixact_new.h
+ *
+ * Copyright (c) 2025, PostgreSQL Global Development Group
+ * src/bin/pg_upgrade/multixact_new.h
+ */
+
+#include "postgres_fe.h"
+
+#include "access/multixact.h"
+
+#include "slru_io.h"
+
+typedef struct MultiXactWriter
+{
+	MultiXactId			nextMXact;
+	MultiXactOffset		nextOffset;
+
+	SlruSegState	   *offset;
+	SlruSegState	   *members;
+} MultiXactWriter;
+
+extern MultiXactWriter *AllocMultiXactWrite(char *pgdata,
+											MultiXactId firstMulti,
+											MultiXactOffset firstOffset);
+extern MultiXactId GetNewMultiXactId(MultiXactWriter *state, int nmembers,
+									 MultiXactOffset *offset);
+extern void RecordNewMultiXact(MultiXactWriter *state, MultiXactOffset offset,
+							   MultiXactId multi, int nmembers,
+							   MultiXactMember *members);
+extern void FreeMultiXactWrite(MultiXactWriter *writer);
diff --git a/src/bin/pg_upgrade/multixact_old.c b/src/bin/pg_upgrade/multixact_old.c
new file mode 100644
index 00000000000..6cc384d2cf2
--- /dev/null
+++ b/src/bin/pg_upgrade/multixact_old.c
@@ -0,0 +1,296 @@
+/*
+ * multixact_old.c
+ *
+ * Rewrite pre-v19 multixacts to new format with 64-bit MultiXactOffsets
+ *
+ * Copyright (c) 2025, PostgreSQL Global Development Group
+ * src/bin/pg_upgrade/multixact_old.c
+ */
+
+#include "multixact_old.h"
+
+#include "pg_upgrade.h"
+
+/*
+ * NOTE: below are a bunch of definitions and simple sttaic inline functions
+ * that are copy-pasted from multixact.c from version 18.  The only difference
+ * is that we use the OldMultiXactOffset type equal to uint32 instead of
+ * MultiXactOffset which became uint64.
+ */
+
+/* We need four bytes per offset and 8 bytes per base for each page. */
+#define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(OldMultiXactOffset))
+
+static inline int64
+MultiXactIdToOffsetPage(MultiXactId multi)
+{
+	return multi / MULTIXACT_OFFSETS_PER_PAGE;
+}
+
+static inline int
+MultiXactIdToOffsetEntry(MultiXactId multi)
+{
+	return multi % MULTIXACT_OFFSETS_PER_PAGE;
+}
+
+/*
+ * The situation for members is a bit more complex: we store one byte of
+ * additional flag bits for each TransactionId.  To do this without getting
+ * into alignment issues, we store four bytes of flags, and then the
+ * corresponding 4 Xids.  Each such 5-word (20-byte) set we call a "group", and
+ * are stored as a whole in pages.  Thus, with 8kB BLCKSZ, we keep 409 groups
+ * per page.  This wastes 12 bytes per page, but that's OK -- simplicity (and
+ * performance) trumps space efficiency here.
+ *
+ * Note that the "offset" macros work with byte offset, not array indexes, so
+ * arithmetic must be done using "char *" pointers.
+ */
+/* We need eight bits per xact, so one xact fits in a byte */
+#define MXACT_MEMBER_BITS_PER_XACT			8
+#define MXACT_MEMBER_FLAGS_PER_BYTE			1
+#define MXACT_MEMBER_XACT_BITMASK	((1 << MXACT_MEMBER_BITS_PER_XACT) - 1)
+
+/* how many full bytes of flags are there in a group? */
+#define MULTIXACT_FLAGBYTES_PER_GROUP		4
+#define MULTIXACT_MEMBERS_PER_MEMBERGROUP	\
+	(MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE)
+/* size in bytes of a complete group */
+#define MULTIXACT_MEMBERGROUP_SIZE \
+	(sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP)
+#define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE)
+#define MULTIXACT_MEMBERS_PER_PAGE	\
+	(MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP)
+
+/* page in which a member is to be found */
+static inline int64
+MXOffsetToMemberPage(OldMultiXactOffset offset)
+{
+	return offset / MULTIXACT_MEMBERS_PER_PAGE;
+}
+
+/* Location (byte offset within page) of flag word for a given member */
+static inline int
+MXOffsetToFlagsOffset(MultiXactOffset offset)
+{
+	OldMultiXactOffset group = offset / MULTIXACT_MEMBERS_PER_MEMBERGROUP;
+	int			grouponpg = group % MULTIXACT_MEMBERGROUPS_PER_PAGE;
+	int			byteoff = grouponpg * MULTIXACT_MEMBERGROUP_SIZE;
+
+	return byteoff;
+}
+
+/* Location (byte offset within page) of TransactionId of given member */
+static inline int
+MXOffsetToMemberOffset(OldMultiXactOffset offset)
+{
+	int			member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP;
+
+	return MXOffsetToFlagsOffset(offset) +
+		MULTIXACT_FLAGBYTES_PER_GROUP +
+		member_in_group * sizeof(TransactionId);
+}
+
+static inline int
+MXOffsetToFlagsBitShift(OldMultiXactOffset offset)
+{
+	int			member_in_group = offset % MULTIXACT_MEMBERS_PER_MEMBERGROUP;
+	int			bshift = member_in_group * MXACT_MEMBER_BITS_PER_XACT;
+
+	return bshift;
+}
+
+/*
+ * Construct reader of old multixacts.
+ *
+ * Returns the malloced memory used by the all other calls in this module.
+ */
+OldMultiXactReader *
+AllocOldMultiXactRead(char *pgdata, MultiXactId nextMulti,
+					  OldMultiXactOffset nextOffset)
+{
+	OldMultiXactReader *state = state = pg_malloc(sizeof(*state));
+	char				dir[MAXPGPATH] = {0};
+
+	state->nextMXact = nextMulti;
+	state->nextOffset = nextOffset;
+
+	pg_sprintf(dir, "%s/pg_multixact/offsets", pgdata);
+	state->offset = AllocSlruRead(dir);
+
+	pg_sprintf(dir, "%s/pg_multixact/members", pgdata);
+	state->members = AllocSlruRead(dir);
+
+	return state;
+}
+
+/*
+ * This is a simplified version of the GetMultiXactIdMembers() server function.
+ *
+ * - Only return the updating member, if any. Upgrade only cares about the
+ *   updaters. If there is no updating member, return the first locking-only
+ *   member. We don't have any way to represent "no members", but we also don't
+ *   need to preserve all the locking members.
+ *
+ * - We don't need to worry about locking and some corner cases because there's
+ *   no concurrent activity.
+ */
+void
+GetOldMultiXactIdSingleMember(OldMultiXactReader *state, MultiXactId multi,
+							  TransactionId *result, MultiXactStatus *status)
+{
+	MultiXactId		nextMXact,
+					nextOffset,
+					tmpMXact;
+	int64			pageno,
+					prev_pageno;
+	int				entryno,
+					length;
+	char		   *buf;
+	OldMultiXactOffset *offptr,
+						offset;
+	TransactionId	result_xid = InvalidTransactionId;
+	bool			result_isupdate = false;
+
+	nextMXact = state->nextMXact;
+	nextOffset = state->nextOffset;
+
+	/*
+	 * See GetMultiXactIdMembers in multixact.c
+	 *
+	 * Find out the offset at which we need to start reading MultiXactMembers
+	 * and the number of members in the multixact.  We determine the latter as
+	 * the difference between this multixact's starting offset and the next
+	 * one's.  However, there are some corner cases to worry about:
+	 *
+	 * 1. This multixact may be the latest one created, in which case there is
+	 * no next one to look at.  In this case the nextOffset value we just
+	 * saved is the correct endpoint.
+	 *
+	 * 2. The next multixact may still be in process of being filled in...
+	 * This cannot happen during upgrade.
+	 *
+	 * 3. Because GetNewMultiXactId increments offset zero to offset one to
+	 * handle case #2, there is an ambiguity near the point of offset
+	 * wraparound.  If we see next multixact's offset is one, is that our
+	 * multixact's actual endpoint, or did it end at zero with a subsequent
+	 * increment?  We handle this using the knowledge that if the zero'th
+	 * member slot wasn't filled, it'll contain zero, and zero isn't a valid
+	 * transaction ID so it can't be a multixact member.  Therefore, if we
+	 * read a zero from the members array, just ignore it.
+	 */
+
+	pageno = MultiXactIdToOffsetPage(multi);
+	entryno = MultiXactIdToOffsetEntry(multi);
+
+	buf = SlruReadSwitchPage(state->offset, pageno);
+	offptr = (OldMultiXactOffset *) buf;
+	offptr += entryno;
+	offset = *offptr;
+
+	Assert(offset != 0);
+
+	/*
+	 * Use the same increment rule as GetNewMultiXactId(), that is, don't
+	 * handle wraparound explicitly until needed.
+	 */
+	tmpMXact = multi + 1;
+
+	if (nextMXact == tmpMXact)
+	{
+		/* Corner case 1: there is no next multixact */
+		length = nextOffset - offset;
+	}
+	else
+	{
+		OldMultiXactOffset nextMXOffset;
+
+		/* handle wraparound if needed */
+		if (tmpMXact < FirstMultiXactId)
+			tmpMXact = FirstMultiXactId;
+
+		prev_pageno = pageno;
+
+		pageno = MultiXactIdToOffsetPage(tmpMXact);
+		entryno = MultiXactIdToOffsetEntry(tmpMXact);
+
+		if (pageno != prev_pageno)
+			buf = SlruReadSwitchPage(state->offset, pageno);
+
+		offptr = (OldMultiXactOffset *) buf;
+		offptr += entryno;
+		nextMXOffset = *offptr;
+
+		/*
+		 * Corner case 2: next multixact is still being filled in, this must
+		 * not happen during upgrade.
+		 */
+		Assert(nextMXOffset != 0);
+
+		length = nextMXOffset - offset;
+	}
+
+	prev_pageno = -1;
+	for (int i = 0; i < length; i++, offset++)
+	{
+		TransactionId *xactptr;
+		uint32	   *flagsptr;
+		int			flagsoff;
+		int			bshift;
+		int			memberoff;
+		MultiXactStatus st;
+
+		pageno = MXOffsetToMemberPage(offset);
+		memberoff = MXOffsetToMemberOffset(offset);
+
+		if (pageno != prev_pageno)
+		{
+			buf = SlruReadSwitchPage(state->members, pageno);
+			prev_pageno = pageno;
+		}
+
+		xactptr = (TransactionId *) (buf + memberoff);
+		if (!TransactionIdIsValid(*xactptr))
+		{
+			/* Corner case 3: we must be looking at unused slot zero */
+			Assert(offset == 0);
+			continue;
+		}
+
+		flagsoff = MXOffsetToFlagsOffset(offset);
+		bshift = MXOffsetToFlagsBitShift(offset);
+		flagsptr = (uint32 *) (buf + flagsoff);
+
+		st = (*flagsptr >> bshift) & MXACT_MEMBER_XACT_BITMASK;
+
+		/* Verify that there is a single update Xid among the given members. */
+		if (ISUPDATE_from_mxstatus(st))
+		{
+			if (result_isupdate)
+				pg_fatal("multixact %u has more than one updating member",
+						 multi);
+			result_xid = *xactptr;
+			result_isupdate = true;
+		}
+		else if (!TransactionIdIsValid(result_xid))
+			result_xid = *xactptr;
+	}
+
+	/* A multixid with zero members should not happen */
+	Assert(TransactionIdIsValid(result_xid));
+
+	*result = result_xid;
+	*status = result_isupdate ? MultiXactStatusUpdate :
+								MultiXactStatusForKeyShare;
+}
+
+/*
+ * Frees the malloced reader.
+ */
+void
+FreeOldMultiXactReader(OldMultiXactReader *state)
+{
+	FreeSlruRead(state->offset);
+	FreeSlruRead(state->members);
+
+	pfree(state);
+}
diff --git a/src/bin/pg_upgrade/multixact_old.h b/src/bin/pg_upgrade/multixact_old.h
new file mode 100644
index 00000000000..8d4659ba6a0
--- /dev/null
+++ b/src/bin/pg_upgrade/multixact_old.h
@@ -0,0 +1,31 @@
+/*
+ * multixact_old.h
+ *
+ * Copyright (c) 2025, PostgreSQL Global Development Group
+ * src/bin/pg_upgrade/multixact_old.h
+ */
+
+#include "postgres_fe.h"
+
+#include "access/multixact.h"
+#include "slru_io.h"
+
+typedef uint32 OldMultiXactOffset;
+
+typedef struct OldMultiXactReader
+{
+	MultiXactId			nextMXact;
+	OldMultiXactOffset	nextOffset;
+
+	SlruSegState	   *offset;
+	SlruSegState	   *members;
+} OldMultiXactReader;
+
+extern OldMultiXactReader *AllocOldMultiXactRead(char *pgdata,
+												 MultiXactId nextMulti,
+												 OldMultiXactOffset nextOffset);
+extern void GetOldMultiXactIdSingleMember(OldMultiXactReader *state,
+										  MultiXactId multi,
+										  TransactionId *result,
+										  MultiXactStatus *status);
+extern void FreeOldMultiXactReader(OldMultiXactReader *reader);
diff --git a/src/bin/pg_upgrade/pg_upgrade.c b/src/bin/pg_upgrade/pg_upgrade.c
index 490e98fa26f..5432c03a2b0 100644
--- a/src/bin/pg_upgrade/pg_upgrade.c
+++ b/src/bin/pg_upgrade/pg_upgrade.c
@@ -49,6 +49,8 @@
 #include "common/restricted_token.h"
 #include "fe_utils/string_utils.h"
 #include "pg_upgrade.h"
+#include "multixact_old.h"
+#include "multixact_new.h"
 
 /*
  * Maximum number of pg_restore actions (TOC entries) to process within one
@@ -769,6 +771,82 @@ copy_subdir_files(const char *old_subdir, const char *new_subdir)
 	check_ok();
 }
 
+/*
+ * Convert pg_multixact/offset and /members to new format with 64-bit offsets.
+ */
+static void
+convert_multixacts(MultiXactId *new_nxtmulti, MultiXactOffset *new_nxtmxoff)
+{
+	MultiXactId			oldest_multi,
+						next_multi;
+	OldMultiXactReader *old_reader;
+	MultiXactWriter	   *new_writer;
+
+	old_reader = AllocOldMultiXactRead(old_cluster.pgdata,
+									   old_cluster.controldata.chkpnt_nxtmulti,
+									   old_cluster.controldata.chkpnt_nxtmxoff);
+	new_writer = AllocMultiXactWrite(new_cluster.pgdata,
+									 old_cluster.controldata.chkpnt_oldstMulti,
+									 1 /* see below */);
+
+	oldest_multi = old_cluster.controldata.chkpnt_oldstMulti;
+	next_multi = old_cluster.controldata.chkpnt_nxtmulti;
+
+	/* handle wraparound */
+	if (next_multi < FirstMultiXactId)
+		next_multi = FirstMultiXactId;
+
+	/*
+	 * Read multixids from old files one by one, and write them back in the new
+	 * format.
+	 *
+	 * The locking-only XIDs that may be part of multi-xids don't matter after
+	 * upgrade, as there can be no transactions running across upgrade.  So as
+	 * a little optimization, we only read one member from each multixid: the
+	 * one updating one, or if there was no update, arbitrarily the first
+	 * locking xid.
+	 */
+	for (MultiXactId multi = oldest_multi; multi != next_multi;)
+	{
+		TransactionId		xid;
+		MultiXactStatus		status;
+		MultiXactMember		member;
+		MultiXactId			new_multi PG_USED_FOR_ASSERTS_ONLY;
+		MultiXactOffset		offset;
+
+		/* Read the old multixid */
+		GetOldMultiXactIdSingleMember(old_reader, multi, &xid, &status);
+
+		/* Write it out in new format */
+		member.xid = xid;
+		member.status = status;
+		new_multi = GetNewMultiXactId(new_writer, 1, &offset);
+
+		Assert(new_multi == multi);
+
+		RecordNewMultiXact(new_writer, offset, multi, 1, &member);
+
+		multi++;
+		/* handle wraparound */
+		if (multi < FirstMultiXactId)
+			multi = FirstMultiXactId;
+	}
+
+	/*
+	 * Update the nextMXact/Offset values in the control file to match what we
+	 * wrote.  The nextMXact should be unchanged, but because we ignored the
+	 * locking XIDs members, the nextOffset will be different.
+	 */
+	Assert(new_writer->nextMXact == next_multi);
+
+	*new_nxtmulti = next_multi;
+	*new_nxtmxoff = new_writer->nextOffset;
+
+	/* Release resources */
+	FreeMultiXactWrite(new_writer);
+	FreeOldMultiXactReader(old_reader);
+}
+
 static void
 copy_xact_xlog_xid(void)
 {
@@ -816,8 +894,28 @@ copy_xact_xlog_xid(void)
 	if (old_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER &&
 		new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER)
 	{
-		copy_subdir_files("pg_multixact/offsets", "pg_multixact/offsets");
-		copy_subdir_files("pg_multixact/members", "pg_multixact/members");
+		MultiXactId		new_nxtmulti = old_cluster.controldata.chkpnt_nxtmulti;
+		MultiXactOffset new_nxtmxoff = old_cluster.controldata.chkpnt_nxtmxoff;
+
+		/*
+		 * If the old server is before the MULTIXACTOFFSET_FORMATCHANGE_CAT_VER
+		 * it must have 32-bit multixid offsets, thus it should be converted.
+		 */
+		if (old_cluster.controldata.cat_ver < MULTIXACTOFFSET_FORMATCHANGE_CAT_VER &&
+			new_cluster.controldata.cat_ver >= MULTIXACTOFFSET_FORMATCHANGE_CAT_VER)
+		{
+			remove_new_subdir("pg_multixact/members", false);
+			remove_new_subdir("pg_multixact/offsets", false);
+
+			prep_status("Converting pg_multixact/offsets to 64-bit");
+			convert_multixacts(&new_nxtmulti, &new_nxtmxoff);
+			check_ok();
+		}
+		else
+		{
+			copy_subdir_files("pg_multixact/offsets", "pg_multixact/offsets");
+			copy_subdir_files("pg_multixact/members", "pg_multixact/members");
+		}
 
 		prep_status("Setting next multixact ID and offset for new cluster");
 
@@ -826,10 +924,8 @@ copy_xact_xlog_xid(void)
 		 * counters here and the oldest multi present on system.
 		 */
 		exec_prog(UTILITY_LOG_FILE, NULL, true, true,
-				  "\"%s/pg_resetwal\" -O %u -m %u,%u \"%s\"",
-				  new_cluster.bindir,
-				  old_cluster.controldata.chkpnt_nxtmxoff,
-				  old_cluster.controldata.chkpnt_nxtmulti,
+				  "\"%s/pg_resetwal\" -O %" PRIu64 " -m %u,%u \"%s\"",
+				  new_cluster.bindir, new_nxtmxoff, new_nxtmulti,
 				  old_cluster.controldata.chkpnt_oldstMulti,
 				  new_cluster.pgdata);
 		check_ok();
diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h
index e86336f4be9..127b2cb00fa 100644
--- a/src/bin/pg_upgrade/pg_upgrade.h
+++ b/src/bin/pg_upgrade/pg_upgrade.h
@@ -114,6 +114,11 @@ extern char *output_files[];
  */
 #define MULTIXACT_FORMATCHANGE_CAT_VER 201301231
 
+/*
+ * Swicth from 32-bit to 64-bit for multixid offsets.
+ */
+#define MULTIXACTOFFSET_FORMATCHANGE_CAT_VER 999999999
+
 /*
  * large object chunk size added to pg_controldata,
  * commit 5f93c37805e7485488480916b4585e098d3cc883
diff --git a/src/bin/pg_upgrade/slru_io.c b/src/bin/pg_upgrade/slru_io.c
new file mode 100644
index 00000000000..4e823199303
--- /dev/null
+++ b/src/bin/pg_upgrade/slru_io.c
@@ -0,0 +1,240 @@
+/*
+ * slru_io.c
+ *
+ * Routines for reading and writing SLRU files during upgrade.
+ *
+ * Copyright (c) 2025, PostgreSQL Global Development Group
+ * src/bin/pg_upgrade/slru_io.c
+ */
+
+#include "postgres_fe.h"
+
+#include <fcntl.h>
+
+#include "pg_upgrade.h"
+#include "slru_io.h"
+
+#include "common/fe_memutils.h"
+#include "common/file_perm.h"
+#include "common/file_utils.h"
+#include "port/pg_iovec.h"
+
+/*
+ * State for reading or writing an SLRU, with a one page buffer.
+ */
+typedef struct SlruSegState
+{
+	bool		writing;
+	bool		long_segment_names;
+
+	char	   *dir;
+	char	   *fn;
+	int			fd;
+	int64		segno;
+	uint64		pageno;
+
+	PGAlignedBlock buf;
+} SlruSegState;
+
+static inline SlruSegState *
+AllocSlruSegState(char *dir)
+{
+	SlruSegState *state = pg_malloc(sizeof(*state));
+
+	state->segno = -1;
+	state->pageno = 0;
+	state->dir = pstrdup(dir);
+	state->fd = -1;
+	state->fn = NULL;
+
+	return state;
+}
+
+static inline void
+SlruFlush(SlruSegState *state)
+{
+	struct iovec	iovec = {
+		.iov_base = &state->buf,
+		.iov_len = BLCKSZ,
+	};
+	off_t			offset;
+
+	if (state->segno == -1)
+		return;
+
+	offset = (state->pageno % SLRU_PAGES_PER_SEGMENT) * BLCKSZ;
+
+	if (pg_pwritev_with_retry(state->fd, &iovec, 1, offset) < 0)
+		pg_fatal("could not write file \"%s\": %m", state->fn);
+}
+
+/*
+ * Create slru reader for dir.
+ *
+ * Returns the malloced memory used by the all other read calls in this module.
+ */
+SlruSegState *
+AllocSlruRead(char *dir)
+{
+	SlruSegState *state = AllocSlruSegState(dir);
+
+	state->writing = false;
+
+	return state;
+}
+
+/*
+ * Open given page for reading.
+ *
+ * Reading can be done in random order.
+ */
+char *
+SlruReadSwitchPage(SlruSegState *state, uint64 pageno)
+{
+	int64 segno;
+
+	Assert(!state->writing);	/* read only mode */
+
+	if (state->segno != -1 && pageno == state->pageno)
+		return state->buf.data;
+
+	segno = pageno / SLRU_PAGES_PER_SEGMENT;
+	if (segno != state->segno)
+	{
+		if (state->segno != -1)
+		{
+			close(state->fd);
+			state->fd = -1;
+
+			pg_free(state->fn);
+			state->fn = NULL;
+		}
+
+		/* Open new segment */
+		state->fn = psprintf("%s/%04X", state->dir, (unsigned int) segno);
+		if ((state->fd = open(state->fn, O_RDONLY | PG_BINARY, 0)) < 0)
+			pg_fatal("could not open file \"%s\": %m", state->fn);
+	}
+
+	state->segno = segno;
+
+	{
+		struct iovec	iovec = {
+			.iov_base = &state->buf,
+			.iov_len = BLCKSZ,
+		};
+		off_t			offset = (pageno % SLRU_PAGES_PER_SEGMENT) * BLCKSZ;
+
+		if (pg_preadv(state->fd, &iovec, 1, offset) < 0)
+			pg_fatal("could not read file \"%s\": %m", state->fn);
+
+		state->pageno = pageno;
+	}
+
+	return state->buf.data;
+}
+
+/*
+ * Frees the malloced reader.
+ */
+void
+FreeSlruRead(SlruSegState *state)
+{
+	Assert(!state->writing);	/* read only mode */
+
+	close(state->fd);
+	pg_free(state);
+}
+
+/*
+ * Open the given page for writing.
+ *
+ * NOTE: This uses O_EXCL when stepping to a new segment, so this assumes that
+ * each segment is written in full before moving on to next one.  This
+ * limitation would be easy to lift if needed, but it fits the usage pattern of
+ * current callers.
+ */
+char *
+SlruWriteSwitchPage(SlruSegState *state, uint64 pageno)
+{
+	int64	segno = pageno / SLRU_PAGES_PER_SEGMENT;
+	off_t	offset;
+
+	if (state->segno != -1 && pageno == state->pageno)
+		return state->buf.data;
+
+	segno = pageno / SLRU_PAGES_PER_SEGMENT;
+	offset = (pageno % SLRU_PAGES_PER_SEGMENT) * BLCKSZ;
+
+	SlruFlush(state);
+	memset(state->buf.data, 0, BLCKSZ);
+
+	if (segno != state->segno)
+	{
+		if (state->segno != -1)
+		{
+			close(state->fd);
+			state->fd = -1;
+
+			pg_free(state->fn);
+			state->fn = NULL;
+		}
+
+		/* Create the segment */
+		if (state->long_segment_names)
+		{
+			Assert(segno >= 0 && segno <= INT64CONST(0xFFFFFFFFFFFFFFF));
+			state->fn = psprintf("%s/%015" PRIX64, state->dir, segno);
+		}
+		else
+		{
+			Assert(segno >= 0 && segno <= INT64CONST(0xFFFFFF));
+			state->fn = psprintf("%s/%04X", state->dir, (unsigned int) segno);
+		}
+
+		if ((state->fd = open(state->fn, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
+							  pg_file_create_mode)) < 0)
+		{
+			pg_fatal("could not create file \"%s\": %m", state->fn);
+		}
+
+		state->segno = segno;
+
+		if (offset > 0 && pg_pwrite_zeros(state->fd, offset, 0) < 0)
+			pg_fatal("could not write file \"%s\": %m", state->fn);
+	}
+
+	state->pageno = pageno;
+
+	return state->buf.data;
+}
+
+/*
+ * Create slru writer for dir.
+ *
+ * Returns the malloced memory used by the all other write calls in this module.
+ */
+SlruSegState *
+AllocSlruWrite(char *dir, bool long_segment_names)
+{
+	SlruSegState *state = AllocSlruSegState(dir);
+
+	state->writing = true;
+	state->long_segment_names = long_segment_names;
+
+	return state;
+}
+
+/*
+ * Frees the malloced writer.
+ */
+void
+FreeSlruWrite(SlruSegState *state)
+{
+	Assert(state->writing);
+
+	SlruFlush(state);
+
+	close(state->fd);
+	pg_free(state);
+}
diff --git a/src/bin/pg_upgrade/slru_io.h b/src/bin/pg_upgrade/slru_io.h
new file mode 100644
index 00000000000..920b8ae82e2
--- /dev/null
+++ b/src/bin/pg_upgrade/slru_io.h
@@ -0,0 +1,30 @@
+/*
+ * slru_io.h
+ *
+ * Copyright (c) 2025, PostgreSQL Global Development Group
+ * src/bin/pg_upgrade/slru_io.h
+ */
+
+/*
+ * Some kind of iterator associated with a particular SLRU segment.  The idea is
+ * to specify the segment and page number and then move through the pages.
+ */
+
+#include "postgres_fe.h"
+
+/*
+ * See access/slru.h
+ *
+ * Copy here, since slru.h could not be included in fe code.
+ */
+#define SLRU_PAGES_PER_SEGMENT 32
+
+typedef struct SlruSegState SlruSegState;
+
+extern SlruSegState *AllocSlruRead(char *dir);
+extern char *SlruReadSwitchPage(SlruSegState *state, uint64 pageno);
+extern void FreeSlruRead(SlruSegState *state);
+
+extern SlruSegState *AllocSlruWrite(char *dir, bool long_segment_names);
+extern char *SlruWriteSwitchPage(SlruSegState *state, uint64 pageno);
+extern void FreeSlruWrite(SlruSegState *state);
-- 
2.43.0

