Alvaro Herrera wrote: > 1. slru.c doesn't consider file names longer than 4 hexadecimal chars.
> For 9.3, I propose we skip this and tweak the code to consider files > whose names are 4 or 5 chars in length, to remain compatible with > existing installations that have pg_multixact/members having a mixture of > 4-char and 5-char file names. Attached is a patch for this. > 2. pg_multixact/members truncation requires more intelligence to avoid > removing files that are still needed. Right now we use modulo-2^32 > arithmetic, but this doesn't work because the useful range can span > longer than what we can keep within that range. > #2c At start of truncation, save end-of-range in MultiXactState. This > state is updated by GetNewMultiXactId as new files are created. That > way, before each new file is created, the truncation routine knows to > skip it. Attached is a patch implementing this. I also attach a patch implementing a "burn multixact" utility, initially coded by Andres Freund, tweaked by me. I used it to run a bunch of wraparound cycles and everything seems to behave as expected. (I don't recommend applying this patch; I'm posting it merely because it's a very useful debugging tool.) One problem I see is the length of time before multis are frozen: they live for far too long, causing the SLRU files to eat way too much disk space. I ran pg_burn_multixact in a loop, creating multis of 3 members each, with a min freeze age of 50 million, and this leads to ~770 files in pg_multixact/offsets and ~2900 files in pg_multixact/members. Each file is 32 pages long, i.e. 256kB apiece. Probably enough to be bothersome. I think for computing the freezing point for multis, we should slash min_freeze_age by a factor of 10 or something like that, or just set a hardcoded limit of one million. > 3. New pg_multixact/members generation requires more intelligence to > avoid stomping on files from the previous wraparound cycle. Right now > there is no defense against this at all. I still have no idea how to attack this. 
-- Álvaro Herrera http://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Training & Services
>From b264bfe61b315a5f65d72b9550592cc9f73bf0a8 Mon Sep 17 00:00:00 2001 From: Alvaro Herrera <alvhe...@alvh.no-ip.org> Date: Tue, 31 Dec 2013 00:42:11 -0300 Subject: [PATCH 1/3] pg_burn_multixact utility Andres Freund, minor tweaks by me --- contrib/pageinspect/heapfuncs.c | 42 ++++++++++++++++++++++++++++++ contrib/pageinspect/pageinspect--1.1.sql | 5 ++++ src/backend/access/heap/heapam.c | 2 +- src/backend/access/transam/multixact.c | 12 +++++---- src/include/access/multixact.h | 3 ++- 5 files changed, 57 insertions(+), 7 deletions(-) diff --git a/contrib/pageinspect/heapfuncs.c b/contrib/pageinspect/heapfuncs.c index 6d8f6f1..93a3317 100644 --- a/contrib/pageinspect/heapfuncs.c +++ b/contrib/pageinspect/heapfuncs.c @@ -29,6 +29,8 @@ #include "funcapi.h" #include "utils/builtins.h" #include "miscadmin.h" +#include "access/multixact.h" +#include "access/transam.h" Datum heap_page_items(PG_FUNCTION_ARGS); @@ -224,3 +226,43 @@ heap_page_items(PG_FUNCTION_ARGS) else SRF_RETURN_DONE(fctx); } + +extern Datum +pg_burn_multixact(PG_FUNCTION_ARGS); +PG_FUNCTION_INFO_V1(pg_burn_multixact); + +Datum +pg_burn_multixact(PG_FUNCTION_ARGS) +{ + int rep = PG_GETARG_INT32(0); + int size = PG_GETARG_INT32(1); + MultiXactMember *members; + MultiXactId ret; + TransactionId id = ReadNewTransactionId() - size; + int i; + + if (rep < 1) + elog(ERROR, "need to burn, burn, burn"); + + members = palloc(size * sizeof(MultiXactMember)); + for (i = 0; i < size; i++) + { + members[i].xid = id++; + members[i].status = MultiXactStatusForShare; + + if (!TransactionIdIsNormal(members[i].xid)) + { + id = FirstNormalTransactionId; + members[i].xid = id++; + } + } + + MultiXactIdSetOldestMember(); + + for (i = 0; i < rep; i++) + { + ret = MultiXactIdCreateFromMembers(size, members, true); + } + + PG_RETURN_INT64((int64) ret); +} diff --git a/contrib/pageinspect/pageinspect--1.1.sql b/contrib/pageinspect/pageinspect--1.1.sql index 22a47d5..b895246 100644 --- a/contrib/pageinspect/pageinspect--1.1.sql 
+++ b/contrib/pageinspect/pageinspect--1.1.sql @@ -105,3 +105,8 @@ CREATE FUNCTION fsm_page_contents(IN page bytea) RETURNS text AS 'MODULE_PATHNAME', 'fsm_page_contents' LANGUAGE C STRICT; + +CREATE FUNCTION pg_burn_multixact(num int4, size int4) +RETURNS int4 +AS 'MODULE_PATHNAME', 'pg_burn_multixact' +LANGUAGE C STRICT; diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 90e9e6f..1b8469f 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -5544,7 +5544,7 @@ FreezeMultiXactId(MultiXactId multi, uint16 t_infomask, * Create a new multixact with the surviving members of the previous * one, to set as new Xmax in the tuple. */ - xid = MultiXactIdCreateFromMembers(nnewmembers, newmembers); + xid = MultiXactIdCreateFromMembers(nnewmembers, newmembers, false); *flags |= FRM_RETURN_IS_MULTI; } diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index 6ce0eb2..2f87a1e 100644 --- a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -349,7 +349,7 @@ MultiXactIdCreate(TransactionId xid1, MultiXactStatus status1, members[1].xid = xid2; members[1].status = status2; - newMulti = MultiXactIdCreateFromMembers(2, members); + newMulti = MultiXactIdCreateFromMembers(2, members, false); debug_elog3(DEBUG2, "Create: %s", mxid_to_string(newMulti, 2, members)); @@ -415,7 +415,7 @@ MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status) */ member.xid = xid; member.status = status; - newMulti = MultiXactIdCreateFromMembers(1, &member); + newMulti = MultiXactIdCreateFromMembers(1, &member, false); debug_elog4(DEBUG2, "Expand: %u has no members, create singleton %u", multi, newMulti); @@ -467,7 +467,7 @@ MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status) newMembers[j].xid = xid; newMembers[j++].status = status; - newMulti = MultiXactIdCreateFromMembers(j, newMembers); + newMulti = 
MultiXactIdCreateFromMembers(j, newMembers, false); pfree(members); pfree(newMembers); @@ -681,7 +681,7 @@ ReadNextMultiXactId(void) * NB: the passed members[] array will be sorted in-place. */ MultiXactId -MultiXactIdCreateFromMembers(int nmembers, MultiXactMember *members) +MultiXactIdCreateFromMembers(int nmembers, MultiXactMember *members, bool nocache) { MultiXactId multi; MultiXactOffset offset; @@ -701,7 +701,9 @@ MultiXactIdCreateFromMembers(int nmembers, MultiXactMember *members) * corner cases where someone else added us to a MultiXact without our * knowledge, but it's not worth checking for.) */ - multi = mXactCacheGetBySet(nmembers, members); + multi = nocache ? InvalidMultiXactId : + mXactCacheGetBySet(nmembers, members); + if (MultiXactIdIsValid(multi)) { debug_elog2(DEBUG2, "Create: in cache!"); diff --git a/src/include/access/multixact.h b/src/include/access/multixact.h index 6aceaeb..1180e1f 100644 --- a/src/include/access/multixact.h +++ b/src/include/access/multixact.h @@ -79,10 +79,11 @@ typedef struct xl_multixact_create extern MultiXactId MultiXactIdCreate(TransactionId xid1, MultiXactStatus status1, TransactionId xid2, MultiXactStatus status2); +extern MultiXactId CreateMultiXactId(int nmembers, MultiXactMember *members, bool nocache); extern MultiXactId MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status); extern MultiXactId MultiXactIdCreateFromMembers(int nmembers, - MultiXactMember *members); + MultiXactMember *members, bool nocache); extern MultiXactId ReadNextMultiXactId(void); extern bool MultiXactIdIsRunning(MultiXactId multi, bool isLockOnly); -- 1.7.10.4
>From 44ac15e12ac2af4df613087098f9be573517e257 Mon Sep 17 00:00:00 2001 From: Alvaro Herrera <alvhe...@alvh.no-ip.org> Date: Tue, 31 Dec 2013 00:43:38 -0300 Subject: [PATCH 2/3] handle wraparound during trunc for multixact/members --- src/backend/access/transam/multixact.c | 124 +++++++++++++++++++++++++++++--- src/backend/access/transam/slru.c | 5 ++ 2 files changed, 119 insertions(+), 10 deletions(-) diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index 2f87a1e..ba730d6 100644 --- a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -63,6 +63,8 @@ */ #include "postgres.h" +#include <unistd.h> + #include "access/multixact.h" #include "access/slru.h" #include "access/transam.h" @@ -577,8 +579,13 @@ MultiXactIdSetOldestMember(void) * another someone else could compute an OldestVisibleMXactId that * would be after the value we are going to store when we get control * back. Which would be wrong. + * + * Note that a shared lock is sufficient, because it's enough to stop + * someone from advancing nextMXact; and nobody else could be trying to + * write to our OldestMember entry, only reading (and we assume storing + * it is atomic.) */ - LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); + LWLockAcquire(MultiXactGenLock, LW_SHARED); /* * We have to beware of the possibility that nextMXact is in the @@ -1546,7 +1553,7 @@ AtEOXact_MultiXact(void) /* * AtPrepare_MultiXact - * Save multixact state at 2PC tranasction prepare + * Save multixact state at 2PC transaction prepare * * In this phase, we only store our OldestMemberMXactId value in the two-phase * state file. @@ -2241,7 +2248,6 @@ ExtendMultiXactMember(MultiXactOffset offset, int nmembers) { int flagsoff; int flagsbit; - int difference; /* * Only zero when at first entry of a page. 
@@ -2262,10 +2268,25 @@ ExtendMultiXactMember(MultiXactOffset offset, int nmembers) LWLockRelease(MultiXactMemberControlLock); } - /* Advance to next page (OK if nmembers goes negative) */ - difference = MULTIXACT_MEMBERS_PER_PAGE - offset % MULTIXACT_MEMBERS_PER_PAGE; - offset += difference; - nmembers -= difference; + /* + * Advance to next page, taking care to properly handle the wraparound + * case. + */ + if ((unsigned int) (offset + nmembers) < offset) + { + uint32 difference = offset + MULTIXACT_MEMBERS_PER_PAGE; + + nmembers -= (unsigned int) (MULTIXACT_MEMBERS_PER_PAGE - difference); + offset = 0; + } + else + { + int difference; + + difference = MULTIXACT_MEMBERS_PER_PAGE - offset % MULTIXACT_MEMBERS_PER_PAGE; + nmembers -= difference; + offset += difference; + } } } @@ -2322,6 +2343,72 @@ GetOldestMultiXactId(void) return oldestMXact; } +/* + * SlruScanDirectory callback. + * This callback deletes segments that are outside the range determined by + * the given page numbers. + * + * Both range endpoints are exclusive (that is, segments containing any of + * those pages are kept.) + */ +typedef struct SlruScanDirPageRange +{ + int rangeStart; + int rangeEnd; +} SlruScanDirPageRange; + +static bool +SlruScanDirCbRemoveMembers(SlruCtl ctl, char *filename, int segpage, + void *data) +{ + SlruScanDirPageRange *range = (SlruScanDirPageRange *) data; + MultiXactOffset nextOffset; + + if (range->rangeStart == range->rangeEnd) + return false; /* easy case out */ + + /* + * To ensure that no segment is spuriously removed, we must keep track + * of new segments added since the start of the directory scan; to do this, + * we update our end-of-range point as we run. + * + * As an optimization, we can skip looking at shared memory if we know for + * certain that the current segment must be kept. This is so because + * nextOffset never decreases, and we never increase rangeStart during any + * one run. 
+ */ + if (!((range->rangeStart > range->rangeEnd && + segpage > range->rangeEnd && segpage < range->rangeStart) || + (range->rangeStart < range->rangeEnd && + (segpage < range->rangeStart || segpage > range->rangeEnd)))) + return false; + + /* + * Update our idea of the end of the live range. + */ + LWLockAcquire(MultiXactGenLock, LW_SHARED); + nextOffset = MultiXactState->nextOffset; + LWLockRelease(MultiXactGenLock); + range->rangeEnd = MXOffsetToMemberPage(nextOffset); + + /* Recheck the deletion condition. If it still holds, perform it. */ + if ((range->rangeStart > range->rangeEnd && + segpage > range->rangeEnd && segpage < range->rangeStart) || + (range->rangeStart < range->rangeEnd && + (segpage < range->rangeStart || segpage > range->rangeEnd))) + { + char path[MAXPGPATH]; + + snprintf(path, MAXPGPATH, "%s/%s", ctl->Dir, filename); + ereport(DEBUG2, + (errmsg("removing file \"%s\"", path))); + unlink(path); + } + + return false; /* keep going */ +} + + typedef struct mxtruncinfo { int earliestExistingPage; @@ -2363,8 +2450,10 @@ void TruncateMultiXact(MultiXactId oldestMXact) { MultiXactOffset oldestOffset; + MultiXactOffset nextOffset; mxtruncinfo trunc; MultiXactId earliest; + SlruScanDirPageRange pageRange; /* * Note we can't just plow ahead with the truncation; it's possible that @@ -2411,9 +2500,24 @@ TruncateMultiXact(MultiXactId oldestMXact) SimpleLruTruncate(MultiXactOffsetCtl, MultiXactIdToOffsetPage(oldestMXact)); - /* truncate MultiXactMembers and we're done */ - SimpleLruTruncate(MultiXactMemberCtl, - MXOffsetToMemberPage(oldestOffset)); + /* + * To truncate MultiXactMembers, we need to figure out the active page + * range and delete all files outside that range. The start point is the + * start of the segment containing the oldest offset; an end point of the + * segment containing the next offset to use is enough. The end point is + * updated as MultiXactMember gets extended concurrently, elsewhere. 
+ */ + pageRange.rangeStart = MXOffsetToMemberPage(oldestOffset); + pageRange.rangeStart -= pageRange.rangeStart % SLRU_PAGES_PER_SEGMENT; + + LWLockAcquire(MultiXactGenLock, LW_SHARED); + nextOffset = MultiXactState->nextOffset; + LWLockRelease(MultiXactGenLock); + + pageRange.rangeEnd = MXOffsetToMemberPage(nextOffset); + + SlruScanDirectory(MultiXactMemberCtl, SlruScanDirCbRemoveMembers, + &pageRange); } /* diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c index 5e53593..a8480b0 100644 --- a/src/backend/access/transam/slru.c +++ b/src/backend/access/transam/slru.c @@ -1272,6 +1272,11 @@ SlruScanDirCbDeleteAll(SlruCtl ctl, char *filename, int segpage, void *data) * If the callback returns true, the scan is stopped. The last return value * from the callback is returned. * + * The callback receives the following arguments: 1. the SlruCtl struct for the + * slru being truncated; 2. the filename being considered; 3. the page number + * for the first page of that file; 4. a pointer to the opaque data given to us + * by the caller. + * * Note that the ordering in which the directory is scanned is not guaranteed. * * Note that no locking is applied. -- 1.7.10.4
>From 2df77b55a900f576c4b2e97c9d1d2ed819b03f1c Mon Sep 17 00:00:00 2001 From: Alvaro Herrera <alvhe...@alvh.no-ip.org> Date: Tue, 31 Dec 2013 00:44:54 -0300 Subject: [PATCH 3/3] Have 5-char filenames in SlruScanDirectory --- src/backend/access/transam/slru.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c index a8480b0..317b6a5 100644 --- a/src/backend/access/transam/slru.c +++ b/src/backend/access/transam/slru.c @@ -1293,8 +1293,12 @@ SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data) cldir = AllocateDir(ctl->Dir); while ((clde = ReadDir(cldir, ctl->Dir)) != NULL) { - if (strlen(clde->d_name) == 4 && - strspn(clde->d_name, "0123456789ABCDEF") == 4) + size_t len; + + len = strlen(clde->d_name); + + if ((len == 4 || len == 5) && + strspn(clde->d_name, "0123456789ABCDEF") == len) { segno = (int) strtol(clde->d_name, NULL, 16); segpage = segno * SLRU_PAGES_PER_SEGMENT; -- 1.7.10.4
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers