Alvaro Herrera wrote:

> 1. slru.c doesn't consider file names longer than 4 hexadecimal chars.

> For 9.3, I propose we skip this and tweak the code to consider files
> whose names are 4 or 5 chars in length, to remain compatible with
> existing installations that have pg_multixact/member having a mixture of
> 4-char and 5-char file names.

Attached is a patch for this.

> 2. pg_multixact/members truncation requires more intelligence to avoid
> removing files that are still needed.  Right now we use modulo-2^32
> arithmetic, but this doesn't work because the useful range can span
> longer than what we can keep within that range.

> #2c At start of truncation, save end-of-range in MultiXactState.  This
> state is updated by GetNewMultiXactId as new files are created.  That
> way, before each new file is created, the truncation routine knows to
> skip it.

Attached is a patch implementing this.

I also attach a patch implementing a "burn multixact" utility, initially
coded by Andres Freund, tweaked by me.  I used it to run a bunch of
wraparound cycles and everything seems to behave as expected.  (I don't
recommend applying this patch; I'm posting merely because it's a very
useful debugging tool.)

One problem I see is length of time before freezing multis: they live
for far too long, causing the SLRU files to eat way too much disk space.
I ran burnmulti in a loop, creating multis of 3 members each, with a min
freeze age of 50 million, and this leads to ~770 files in
pg_multixact/offsets and ~2900 files in pg_multixact/members.  Each file
is 32 pages long. 256kB apiece.  Probably enough to be bothersome.

I think for computing the freezing point for multis, we should slash
min_freeze_age by 10 or something like that.  Or just set a hardcoded
one million.


> 3. New pg_multixact/members generation requires more intelligence to
> avoid stomping on files from the previous wraparound cycle.  Right now
> there is no defense against this at all.

I still have no idea how to attack this.

-- 
Álvaro Herrera                http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services
>From b264bfe61b315a5f65d72b9550592cc9f73bf0a8 Mon Sep 17 00:00:00 2001
From: Alvaro Herrera <alvhe...@alvh.no-ip.org>
Date: Tue, 31 Dec 2013 00:42:11 -0300
Subject: [PATCH 1/3] pg_burn_multixact utility

Andres Freund, minor tweaks by me
---
 contrib/pageinspect/heapfuncs.c          |   42 ++++++++++++++++++++++++++++++
 contrib/pageinspect/pageinspect--1.1.sql |    5 ++++
 src/backend/access/heap/heapam.c         |    2 +-
 src/backend/access/transam/multixact.c   |   12 +++++----
 src/include/access/multixact.h           |    3 ++-
 5 files changed, 57 insertions(+), 7 deletions(-)

diff --git a/contrib/pageinspect/heapfuncs.c b/contrib/pageinspect/heapfuncs.c
index 6d8f6f1..93a3317 100644
--- a/contrib/pageinspect/heapfuncs.c
+++ b/contrib/pageinspect/heapfuncs.c
@@ -29,6 +29,8 @@
 #include "funcapi.h"
 #include "utils/builtins.h"
 #include "miscadmin.h"
+#include "access/multixact.h"
+#include "access/transam.h"
 
 Datum		heap_page_items(PG_FUNCTION_ARGS);
 
@@ -224,3 +226,43 @@ heap_page_items(PG_FUNCTION_ARGS)
 	else
 		SRF_RETURN_DONE(fctx);
 }
+
+extern Datum
+pg_burn_multixact(PG_FUNCTION_ARGS);
+PG_FUNCTION_INFO_V1(pg_burn_multixact);
+
+Datum
+pg_burn_multixact(PG_FUNCTION_ARGS)
+{
+	int		rep = PG_GETARG_INT32(0);
+	int		size = PG_GETARG_INT32(1);
+	MultiXactMember *members;
+	MultiXactId ret;
+	TransactionId id = ReadNewTransactionId() - size;
+	int		i;
+
+	if (rep < 1)
+		elog(ERROR, "need to burn, burn, burn");
+
+	members = palloc(size * sizeof(MultiXactMember));
+	for (i = 0; i < size; i++)
+	{
+		members[i].xid = id++;
+		members[i].status = MultiXactStatusForShare;
+
+		if (!TransactionIdIsNormal(members[i].xid))
+		{
+			id = FirstNormalTransactionId;
+			members[i].xid = id++;
+		}
+	}
+
+	MultiXactIdSetOldestMember();
+
+	for (i = 0; i < rep; i++)
+	{
+		ret = MultiXactIdCreateFromMembers(size, members, true);
+	}
+
+	PG_RETURN_INT64((int64) ret);
+}
diff --git a/contrib/pageinspect/pageinspect--1.1.sql b/contrib/pageinspect/pageinspect--1.1.sql
index 22a47d5..b895246 100644
--- a/contrib/pageinspect/pageinspect--1.1.sql
+++ b/contrib/pageinspect/pageinspect--1.1.sql
@@ -105,3 +105,8 @@ CREATE FUNCTION fsm_page_contents(IN page bytea)
 RETURNS text
 AS 'MODULE_PATHNAME', 'fsm_page_contents'
 LANGUAGE C STRICT;
+
+CREATE FUNCTION pg_burn_multixact(num int4, size int4)
+RETURNS int4
+AS 'MODULE_PATHNAME', 'pg_burn_multixact'
+LANGUAGE C STRICT;
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 90e9e6f..1b8469f 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -5544,7 +5544,7 @@ FreezeMultiXactId(MultiXactId multi, uint16 t_infomask,
 		 * Create a new multixact with the surviving members of the previous
 		 * one, to set as new Xmax in the tuple.
 		 */
-		xid = MultiXactIdCreateFromMembers(nnewmembers, newmembers);
+		xid = MultiXactIdCreateFromMembers(nnewmembers, newmembers, false);
 		*flags |= FRM_RETURN_IS_MULTI;
 	}
 
diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index 6ce0eb2..2f87a1e 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -349,7 +349,7 @@ MultiXactIdCreate(TransactionId xid1, MultiXactStatus status1,
 	members[1].xid = xid2;
 	members[1].status = status2;
 
-	newMulti = MultiXactIdCreateFromMembers(2, members);
+	newMulti = MultiXactIdCreateFromMembers(2, members, false);
 
 	debug_elog3(DEBUG2, "Create: %s",
 				mxid_to_string(newMulti, 2, members));
@@ -415,7 +415,7 @@ MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status)
 		 */
 		member.xid = xid;
 		member.status = status;
-		newMulti = MultiXactIdCreateFromMembers(1, &member);
+		newMulti = MultiXactIdCreateFromMembers(1, &member, false);
 
 		debug_elog4(DEBUG2, "Expand: %u has no members, create singleton %u",
 					multi, newMulti);
@@ -467,7 +467,7 @@ MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status)
 
 	newMembers[j].xid = xid;
 	newMembers[j++].status = status;
-	newMulti = MultiXactIdCreateFromMembers(j, newMembers);
+	newMulti = MultiXactIdCreateFromMembers(j, newMembers, false);
 
 	pfree(members);
 	pfree(newMembers);
@@ -681,7 +681,7 @@ ReadNextMultiXactId(void)
  * NB: the passed members[] array will be sorted in-place.
  */
 MultiXactId
-MultiXactIdCreateFromMembers(int nmembers, MultiXactMember *members)
+MultiXactIdCreateFromMembers(int nmembers, MultiXactMember *members, bool nocache)
 {
 	MultiXactId multi;
 	MultiXactOffset offset;
@@ -701,7 +701,9 @@ MultiXactIdCreateFromMembers(int nmembers, MultiXactMember *members)
 	 * corner cases where someone else added us to a MultiXact without our
 	 * knowledge, but it's not worth checking for.)
 	 */
-	multi = mXactCacheGetBySet(nmembers, members);
+	multi = nocache ? InvalidMultiXactId :
+		mXactCacheGetBySet(nmembers, members);
+
 	if (MultiXactIdIsValid(multi))
 	{
 		debug_elog2(DEBUG2, "Create: in cache!");
diff --git a/src/include/access/multixact.h b/src/include/access/multixact.h
index 6aceaeb..1180e1f 100644
--- a/src/include/access/multixact.h
+++ b/src/include/access/multixact.h
@@ -79,10 +79,11 @@ typedef struct xl_multixact_create
 extern MultiXactId MultiXactIdCreate(TransactionId xid1,
 				  MultiXactStatus status1, TransactionId xid2,
 				  MultiXactStatus status2);
+extern MultiXactId CreateMultiXactId(int nmembers, MultiXactMember *members, bool nocache);
 extern MultiXactId MultiXactIdExpand(MultiXactId multi, TransactionId xid,
 				  MultiXactStatus status);
 extern MultiXactId MultiXactIdCreateFromMembers(int nmembers,
-							 MultiXactMember *members);
+							 MultiXactMember *members, bool nocache);
 
 extern MultiXactId ReadNextMultiXactId(void);
 extern bool MultiXactIdIsRunning(MultiXactId multi, bool isLockOnly);
-- 
1.7.10.4

>From 44ac15e12ac2af4df613087098f9be573517e257 Mon Sep 17 00:00:00 2001
From: Alvaro Herrera <alvhe...@alvh.no-ip.org>
Date: Tue, 31 Dec 2013 00:43:38 -0300
Subject: [PATCH 2/3] handle wraparound during trunc for multixact/members

---
 src/backend/access/transam/multixact.c |  124 +++++++++++++++++++++++++++++---
 src/backend/access/transam/slru.c      |    5 ++
 2 files changed, 119 insertions(+), 10 deletions(-)

diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index 2f87a1e..ba730d6 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -63,6 +63,8 @@
  */
 #include "postgres.h"
 
+#include <unistd.h>
+
 #include "access/multixact.h"
 #include "access/slru.h"
 #include "access/transam.h"
@@ -577,8 +579,13 @@ MultiXactIdSetOldestMember(void)
 		 * another someone else could compute an OldestVisibleMXactId that
 		 * would be after the value we are going to store when we get control
 		 * back.  Which would be wrong.
+		 *
+		 * Note that a shared lock is sufficient, because it's enough to stop
+		 * someone from advancing nextMXact; and nobody else could be trying to
+		 * write to our OldestMember entry, only reading (and we assume storing
+		 * it is atomic.)
 		 */
-		LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
+		LWLockAcquire(MultiXactGenLock, LW_SHARED);
 
 		/*
 		 * We have to beware of the possibility that nextMXact is in the
@@ -1546,7 +1553,7 @@ AtEOXact_MultiXact(void)
 
 /*
  * AtPrepare_MultiXact
- *		Save multixact state at 2PC tranasction prepare
+ *		Save multixact state at 2PC transaction prepare
  *
  * In this phase, we only store our OldestMemberMXactId value in the two-phase
  * state file.
@@ -2241,7 +2248,6 @@ ExtendMultiXactMember(MultiXactOffset offset, int nmembers)
 	{
 		int			flagsoff;
 		int			flagsbit;
-		int			difference;
 
 		/*
 		 * Only zero when at first entry of a page.
@@ -2262,10 +2268,25 @@ ExtendMultiXactMember(MultiXactOffset offset, int nmembers)
 			LWLockRelease(MultiXactMemberControlLock);
 		}
 
-		/* Advance to next page (OK if nmembers goes negative) */
-		difference = MULTIXACT_MEMBERS_PER_PAGE - offset % MULTIXACT_MEMBERS_PER_PAGE;
-		offset += difference;
-		nmembers -= difference;
+		/*
+		 * Advance to next page, taking care to properly handle the wraparound
+		 * case.
+		 */
+		if ((unsigned int) (offset + nmembers) < offset)
+		{
+			uint32		difference = offset + MULTIXACT_MEMBERS_PER_PAGE;
+
+			nmembers -= (unsigned int) (MULTIXACT_MEMBERS_PER_PAGE - difference);
+			offset = 0;
+		}
+		else
+		{
+			int			difference;
+
+			difference = MULTIXACT_MEMBERS_PER_PAGE - offset % MULTIXACT_MEMBERS_PER_PAGE;
+			nmembers -= difference;
+			offset += difference;
+		}
 	}
 }
 
@@ -2322,6 +2343,72 @@ GetOldestMultiXactId(void)
 	return oldestMXact;
 }
 
+/*
+ * SlruScanDirectory callback.
+ * 		This callback deletes segments that are outside the range determined by
+ * 		the given page numbers.
+ *
+ * Both range endpoints are exclusive (that is, segments containing any of
+ * those pages are kept.)
+ */
+typedef struct SlruScanDirPageRange
+{
+	int		rangeStart;
+	int		rangeEnd;
+} SlruScanDirPageRange;
+
+static bool
+SlruScanDirCbRemoveMembers(SlruCtl ctl, char *filename, int segpage,
+						   void *data)
+{
+	SlruScanDirPageRange *range = (SlruScanDirPageRange *) data;
+	MultiXactOffset	nextOffset;
+
+	if (range->rangeStart == range->rangeEnd)
+		return false;		/* easy case out */
+
+	/*
+	 * To ensure that no segment is spuriously removed, we must keep track
+	 * of new segments added since the start of the directory scan; to do this,
+	 * we update our end-of-range point as we run.
+	 *
+	 * As an optimization, we can skip looking at shared memory if we know for
+	 * certain that the current segment must be kept.  This is so because
+	 * nextOffset never decreases, and we never increase rangeStart during any
+	 * one run.
+	 */
+	if (!((range->rangeStart > range->rangeEnd &&
+		   segpage > range->rangeEnd && segpage < range->rangeStart) ||
+		  (range->rangeStart < range->rangeEnd &&
+		   (segpage < range->rangeStart || segpage > range->rangeEnd))))
+		return false;
+
+	/*
+	 * Update our idea of the end of the live range.
+	 */
+	LWLockAcquire(MultiXactGenLock, LW_SHARED);
+	nextOffset = MultiXactState->nextOffset;
+	LWLockRelease(MultiXactGenLock);
+	range->rangeEnd = MXOffsetToMemberPage(nextOffset);
+
+	/* Recheck the deletion condition.  If it still holds, perform it. */
+	if ((range->rangeStart > range->rangeEnd &&
+		 segpage > range->rangeEnd && segpage < range->rangeStart) ||
+		(range->rangeStart < range->rangeEnd &&
+		 (segpage < range->rangeStart || segpage > range->rangeEnd)))
+	{
+		char		path[MAXPGPATH];
+
+		snprintf(path, MAXPGPATH, "%s/%s", ctl->Dir, filename);
+		ereport(DEBUG2,
+				(errmsg("removing file \"%s\"", path)));
+		unlink(path);
+	}
+
+	return false;				/* keep going */
+}
+
+
 typedef struct mxtruncinfo
 {
 	int			earliestExistingPage;
@@ -2363,8 +2450,10 @@ void
 TruncateMultiXact(MultiXactId oldestMXact)
 {
 	MultiXactOffset oldestOffset;
+	MultiXactOffset	nextOffset;
 	mxtruncinfo trunc;
 	MultiXactId earliest;
+	SlruScanDirPageRange	pageRange;
 
 	/*
 	 * Note we can't just plow ahead with the truncation; it's possible that
@@ -2411,9 +2500,24 @@ TruncateMultiXact(MultiXactId oldestMXact)
 	SimpleLruTruncate(MultiXactOffsetCtl,
 					  MultiXactIdToOffsetPage(oldestMXact));
 
-	/* truncate MultiXactMembers and we're done */
-	SimpleLruTruncate(MultiXactMemberCtl,
-					  MXOffsetToMemberPage(oldestOffset));
+	/*
+	 * To truncate MultiXactMembers, we need to figure out the active page
+	 * range and delete all files outside that range.  The start point is the
+	 * start of the segment containing the oldest offset; an end point of the
+	 * segment containing the next offset to use is enough.  The end point is
+	 * updated as MultiXactMember gets extended concurrently, elsewhere.
+	 */
+	pageRange.rangeStart = MXOffsetToMemberPage(oldestOffset);
+	pageRange.rangeStart -= pageRange.rangeStart % SLRU_PAGES_PER_SEGMENT;
+
+	LWLockAcquire(MultiXactGenLock, LW_SHARED);
+	nextOffset = MultiXactState->nextOffset;
+	LWLockRelease(MultiXactGenLock);
+
+	pageRange.rangeEnd = MXOffsetToMemberPage(nextOffset);
+
+	SlruScanDirectory(MultiXactMemberCtl, SlruScanDirCbRemoveMembers,
+					  &pageRange);
 }
 
 /*
diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c
index 5e53593..a8480b0 100644
--- a/src/backend/access/transam/slru.c
+++ b/src/backend/access/transam/slru.c
@@ -1272,6 +1272,11 @@ SlruScanDirCbDeleteAll(SlruCtl ctl, char *filename, int segpage, void *data)
  * If the callback returns true, the scan is stopped.  The last return value
  * from the callback is returned.
  *
+ * The callback receives the following arguments: 1. the SlruCtl struct for the
+ * slru being truncated; 2. the filename being considered; 3. the page number
+ * for the first page of that file; 4. a pointer to the opaque data given to us
+ * by the caller.
+ *
  * Note that the ordering in which the directory is scanned is not guaranteed.
  *
  * Note that no locking is applied.
-- 
1.7.10.4

>From 2df77b55a900f576c4b2e97c9d1d2ed819b03f1c Mon Sep 17 00:00:00 2001
From: Alvaro Herrera <alvhe...@alvh.no-ip.org>
Date: Tue, 31 Dec 2013 00:44:54 -0300
Subject: [PATCH 3/3] Have 5-char filenames in SlruScanDirectory

---
 src/backend/access/transam/slru.c |    8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c
index a8480b0..317b6a5 100644
--- a/src/backend/access/transam/slru.c
+++ b/src/backend/access/transam/slru.c
@@ -1293,8 +1293,12 @@ SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data)
 	cldir = AllocateDir(ctl->Dir);
 	while ((clde = ReadDir(cldir, ctl->Dir)) != NULL)
 	{
-		if (strlen(clde->d_name) == 4 &&
-			strspn(clde->d_name, "0123456789ABCDEF") == 4)
+		size_t	len;
+
+		len = strlen(clde->d_name);
+
+		if ((len == 4 || len == 5) &&
+			strspn(clde->d_name, "0123456789ABCDEF") == len)
 		{
 			segno = (int) strtol(clde->d_name, NULL, 16);
 			segpage = segno * SLRU_PAGES_PER_SEGMENT;
-- 
1.7.10.4

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to