On Thu, May 19, 2016 at 6:57 AM, Michael Paquier <michael.paqu...@gmail.com> wrote: > I am adding that to the commit fest of September.
And a lot of activity has happened here since. Attached are refreshed patches based on da6c4f6. v11 still applies correctly but it's always better to avoid hunks when applying them. -- Michael
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 289d240..0fd2e2b 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -8448,8 +8448,12 @@ CreateCheckPoint(int flags) if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_FORCE)) == 0) { + elog(LOG, "Not a forced or shutdown checkpoint: progress_lsn %X/%X, ckpt %X/%X", + (uint32) (progress_lsn >> 32), (uint32) progress_lsn, + (uint32) (ControlFile->checkPoint >> 32), (uint32) ControlFile->checkPoint); if (progress_lsn == ControlFile->checkPoint) { + elog(LOG, "Checkpoint is skipped"); WALInsertLockRelease(); LWLockRelease(CheckpointLock); END_CRIT_SECTION(); @@ -8616,7 +8620,11 @@ CreateCheckPoint(int flags) * recovery we don't need to write running xact data. */ if (!shutdown && XLogStandbyInfoActive()) - LogStandbySnapshot(); + { + XLogRecPtr lsn = LogStandbySnapshot(); + elog(LOG, "snapshot taken by checkpoint %X/%X", + (uint32) (lsn >> 32), (uint32) lsn); + } START_CRIT_SECTION(); diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c index 3a791eb..7637a1d 100644 --- a/src/backend/postmaster/bgwriter.c +++ b/src/backend/postmaster/bgwriter.c @@ -331,7 +331,9 @@ BackgroundWriterMain(void) GetLastCheckpointRecPtr() < current_progress_lsn && last_progress_lsn < current_progress_lsn) { - (void) LogStandbySnapshot(); + XLogRecPtr lsn = LogStandbySnapshot(); + elog(LOG, "snapshot taken by bgwriter %X/%X", + (uint32) (lsn >> 32), (uint32) lsn); last_snapshot_ts = now; last_progress_lsn = current_progress_lsn; }
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index b019bc1..ac40731 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -2507,7 +2507,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, heaptup->t_len - SizeofHeapTupleHeader); /* filtering by origin on a row level is much more efficient */ - XLogIncludeOrigin(); + XLogSetFlags(XLOG_INCLUDE_ORIGIN); recptr = XLogInsert(RM_HEAP_ID, info); @@ -2846,7 +2846,7 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples, XLogRegisterBufData(0, tupledata, totaldatalen); /* filtering by origin on a row level is much more efficient */ - XLogIncludeOrigin(); + XLogSetFlags(XLOG_INCLUDE_ORIGIN); recptr = XLogInsert(RM_HEAP2_ID, info); @@ -3308,7 +3308,7 @@ l1: } /* filtering by origin on a row level is much more efficient */ - XLogIncludeOrigin(); + XLogSetFlags(XLOG_INCLUDE_ORIGIN); recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE); @@ -6035,7 +6035,7 @@ heap_finish_speculative(Relation relation, HeapTuple tuple) XLogBeginInsert(); /* We want the same filtering on this as on a plain insert */ - XLogIncludeOrigin(); + XLogSetFlags(XLOG_INCLUDE_ORIGIN); XLogRegisterData((char *) &xlrec, SizeOfHeapConfirm); XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); @@ -7703,7 +7703,7 @@ log_heap_update(Relation reln, Buffer oldbuf, } /* filtering by origin on a row level is much more efficient */ - XLogIncludeOrigin(); + XLogSetFlags(XLOG_INCLUDE_ORIGIN); recptr = XLogInsert(RM_HEAP_ID, info); diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index e11b229..9130816 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -5232,7 +5232,7 @@ XactLogCommitRecord(TimestampTz commit_time, XLogRegisterData((char *) (&xl_origin), sizeof(xl_xact_origin)); /* we allow filtering by xacts */ - XLogIncludeOrigin(); + XLogSetFlags(XLOG_INCLUDE_ORIGIN); return XLogInsert(RM_XACT_ID, info); } diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index c1b9a97..289d240 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -442,11 +442,30 @@ typedef struct XLogwrtResult * the WAL record is just copied to the page and the lock is released. But * to avoid the deadlock-scenario explained above, the indicator is always * updated before sleeping while holding an insertion lock. + * + * The progressAt values indicate the insertion progress used to determine + * WAL insertion activity since a previous checkpoint, which is aimed at + * finding out if a checkpoint should be skipped or not or if standby + * activity should be logged. Progress position is basically updated + * for all types of records, for the time being only snapshot logging + * is out of this scope to properly skip their logging on idle systems. + * Tracking the WAL activity directly in WALInsertLock has the advantage + * to not rely on taking an exclusive lock on all the WAL insertion locks, + * hence reducing the impact of the activity lookup. This takes also + * advantage to avoid 8-byte torn reads on some platforms by using the + * fact that each insert lock is located on the same cache line. + * XXX: There is still room for more improvements here, particularly + * WAL operations related to unlogged relations (INIT_FORKNUM) should not + * update the progress LSN as those relations are reset during crash + * recovery so enforcing buffers of such relations to be flushed for + * example in the case of a load only on unlogged relations is a waste + * of disk write. */ typedef struct { LWLock lock; XLogRecPtr insertingAt; + XLogRecPtr progressAt; } WALInsertLock; /* @@ -882,6 +901,9 @@ static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt); * which pages need a full-page image, and retry. If fpw_lsn is invalid, the * record is always inserted. * + * 'flags' gives more in-depth control on the record being inserted. As of + * now, this controls if the progress LSN positions are updated. + * * The first XLogRecData in the chain must be for the record header, and its * data must be MAXALIGNed. XLogInsertRecord fills in the xl_prev and * xl_crc fields in the header, the rest of the header must already be filled @@ -894,7 +916,9 @@ static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt); * WAL rule "write the log before the data".) */ XLogRecPtr -XLogInsertRecord(XLogRecData *rdata, XLogRecPtr fpw_lsn) +XLogInsertRecord(XLogRecData *rdata, + XLogRecPtr fpw_lsn, + uint8 flags) { XLogCtlInsert *Insert = &XLogCtl->Insert; pg_crc32c rdata_crc; @@ -993,6 +1017,25 @@ XLogInsertRecord(XLogRecData *rdata, XLogRecPtr fpw_lsn) inserted = true; } + /* + * Update the progress LSN positions. At least one WAL insertion lock + * is already taken appropriately before doing that, and it is just more + * simple to do that here where WAL record data and type is at hand. + * The progress is set at the start position of the record tracked that + * is being added, making easier checkpoint progress tracking as the + * control file already saves the start LSN position of the last + * checkpoint run. If an exclusive lock is taken for WAL insertion, + * there is actually no need to update all the progression fields, so + * just do it on the first one. + */ + if ((flags & XLOG_NO_PROGRESS) == 0) + { + if (holdingAllLocks) + WALInsertLocks[0].l.progressAt = StartPos; + else + WALInsertLocks[MyLockNo].l.progressAt = StartPos; + } + if (inserted) { /* @@ -4716,6 +4759,7 @@ XLOGShmemInit(void) { LWLockInitialize(&WALInsertLocks[i].l.lock, LWTRANCHE_WAL_INSERT); WALInsertLocks[i].l.insertingAt = InvalidXLogRecPtr; + WALInsertLocks[i].l.progressAt = InvalidXLogRecPtr; } /* @@ -7992,6 +8036,55 @@ GetFlushRecPtr(void) } /* + * GetProgressRecPtr -- Returns the newest WAL activity position, aimed + * at the last significant WAL activity, or in other words any activity + * not referring to standby logging as of now. Finding the last activity + * position is done by scanning each WAL insertion lock by taking directly + * the light-weight lock associated to it. + */ +XLogRecPtr +GetProgressRecPtr(void) +{ + XLogRecPtr res = InvalidXLogRecPtr; + int i; + + /* + * Look at the latest LSN position referring to the activity done by + * WAL insertion. An exclusive lock is taken because currently the + * locking logic for WAL insertion only expects such a level of locking. + * Taking a lock is as well necessary to prevent potential torn reads + * on some platforms. + */ + for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++) + { + XLogRecPtr progress_lsn; + + LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE); + progress_lsn = WALInsertLocks[i].l.progressAt; + LWLockRelease(&WALInsertLocks[i].l.lock); + + if (res < progress_lsn) + res = progress_lsn; + } + + return res; +} + +/* + * GetLastCheckpointRecPtr -- Returns the last checkpoint insert position. + */ +XLogRecPtr +GetLastCheckpointRecPtr(void) +{ + XLogRecPtr ckpt_lsn; + + LWLockAcquire(ControlFileLock, LW_SHARED); + ckpt_lsn = ControlFile->checkPoint; + LWLockRelease(ControlFileLock); + return ckpt_lsn; +} + +/* * Get the time of the last xlog segment switch */ pg_time_t @@ -8251,7 +8344,7 @@ CreateCheckPoint(int flags) uint32 freespace; XLogRecPtr PriorRedoPtr; XLogRecPtr curInsert; - XLogRecPtr prevPtr; + XLogRecPtr progress_lsn; VirtualTransactionId *vxids; int nvxids; @@ -8332,34 +8425,30 @@ CreateCheckPoint(int flags) checkPoint.oldestActiveXid = InvalidTransactionId; /* + * Get progress before acquiring insert locks to shorten the locked + * section waiting ahead. + */ + progress_lsn = GetProgressRecPtr(); + + /* * We must block concurrent insertions while examining insert state to * determine the checkpoint REDO pointer. */ WALInsertLockAcquireExclusive(); curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos); - prevPtr = XLogBytePosToRecPtr(Insert->PrevBytePos); /* - * If this isn't a shutdown or forced checkpoint, and we have not inserted - * any XLOG records since the start of the last checkpoint, skip the - * checkpoint. The idea here is to avoid inserting duplicate checkpoints - * when the system is idle. That wastes log space, and more importantly it - * exposes us to possible loss of both current and previous checkpoint - * records if the machine crashes just as we're writing the update. - * (Perhaps it'd make even more sense to checkpoint only when the previous - * checkpoint record is in a different xlog page?) - * - * If the previous checkpoint crossed a WAL segment, however, we create - * the checkpoint anyway, to have the latest checkpoint fully contained in - * the new segment. This is for a little bit of extra robustness: it's - * better if you don't need to keep two WAL segments around to recover the - * checkpoint. + * If this isn't a shutdown or forced checkpoint, and if there has been no + * WAL activity, skip the checkpoint. The idea here is to avoid inserting + * duplicate checkpoints when the system is idle. That wastes log space, + * and more importantly it exposes us to possible loss of both current and + * previous checkpoint records if the machine crashes just as we're writing + * the update. */ if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_FORCE)) == 0) { - if (prevPtr == ControlFile->checkPointCopy.redo && - prevPtr / XLOG_SEG_SIZE == curInsert / XLOG_SEG_SIZE) + if (progress_lsn == ControlFile->checkPoint) { WALInsertLockRelease(); LWLockRelease(CheckpointLock); diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index 3cd273b..23f1e67 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -73,8 +73,8 @@ static XLogRecData *mainrdata_head; static XLogRecData *mainrdata_last = (XLogRecData *) &mainrdata_head; static uint32 mainrdata_len; /* total # of bytes in chain */ -/* Should the in-progress insertion log the origin? */ -static bool include_origin = false; +/* status flags of the in-progress insertion */ +static uint8 status_flags = 0; /* * These are used to hold the record header while constructing a record. @@ -201,7 +201,7 @@ XLogResetInsertion(void) max_registered_block_id = 0; mainrdata_len = 0; mainrdata_last = (XLogRecData *) &mainrdata_head; - include_origin = false; + status_flags = 0; begininsert_called = false; } @@ -387,10 +387,10 @@ XLogRegisterBufData(uint8 block_id, char *data, int len) * Should this record include the replication origin if one is set up? */ void -XLogIncludeOrigin(void) +XLogSetFlags(uint8 flags) { Assert(begininsert_called); - include_origin = true; + status_flags = flags; } /* @@ -450,7 +450,7 @@ XLogInsert(RmgrId rmid, uint8 info) rdt = XLogRecordAssemble(rmid, info, RedoRecPtr, doPageWrites, &fpw_lsn); - EndPos = XLogInsertRecord(rdt, fpw_lsn); + EndPos = XLogInsertRecord(rdt, fpw_lsn, status_flags); } while (EndPos == InvalidXLogRecPtr); XLogResetInsertion(); @@ -701,7 +701,8 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, } /* followed by the record's origin, if any */ - if (include_origin && replorigin_session_origin != InvalidRepOriginId) + if ((status_flags & XLOG_INCLUDE_ORIGIN) != 0 && + replorigin_session_origin != InvalidRepOriginId) { *(scratch++) = XLR_BLOCK_ID_ORIGIN; memcpy(scratch, &replorigin_session_origin, sizeof(replorigin_session_origin)); diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c index 1002034..3a791eb 100644 --- a/src/backend/postmaster/bgwriter.c +++ b/src/backend/postmaster/bgwriter.c @@ -78,12 +78,12 @@ int BgWriterDelay = 200; #define LOG_SNAPSHOT_INTERVAL_MS 15000 /* - * LSN and timestamp at which we last issued a LogStandbySnapshot(), to avoid - * doing so too often or repeatedly if there has been no other write activity - * in the system. + * Last progress LSN and timestamp at which we last logged a standby + * snapshot, to avoid doing so too often or repeatedly if there has been + * no other write activity in the system. */ static TimestampTz last_snapshot_ts; -static XLogRecPtr last_snapshot_lsn = InvalidXLogRecPtr; +static XLogRecPtr last_progress_lsn = InvalidXLogRecPtr; /* * Flags set by interrupt handlers for later service in the main loop. @@ -308,7 +308,7 @@ BackgroundWriterMain(void) * check whether there has been any WAL inserted since the last time * we've logged a running xacts. * - * We do this logging in the bgwriter as its the only process that is + * We do this logging in the bgwriter as it is the only process that is * run regularly and returns to its mainloop all the time. E.g. * Checkpointer, when active, is barely ever in its mainloop and thus * makes it hard to log regularly. @@ -317,19 +317,23 @@ BackgroundWriterMain(void) { TimestampTz timeout = 0; TimestampTz now = GetCurrentTimestamp(); + XLogRecPtr current_progress_lsn = GetProgressRecPtr(); timeout = TimestampTzPlusMilliseconds(last_snapshot_ts, LOG_SNAPSHOT_INTERVAL_MS); /* - * only log if enough time has passed and some xlog record has - * been inserted. + * only log if enough time has passed, that some WAL activity + * has happened since last checkpoint, and that some xlog record + * has been inserted. */ if (now >= timeout && - last_snapshot_lsn != GetXLogInsertRecPtr()) + GetLastCheckpointRecPtr() < current_progress_lsn && + last_progress_lsn < current_progress_lsn) { - last_snapshot_lsn = LogStandbySnapshot(); + (void) LogStandbySnapshot(); last_snapshot_ts = now; + last_progress_lsn = current_progress_lsn; } } diff --git a/src/backend/replication/logical/message.c b/src/backend/replication/logical/message.c index 8f9dc2f..c2d2bd8 100644 --- a/src/backend/replication/logical/message.c +++ b/src/backend/replication/logical/message.c @@ -73,7 +73,7 @@ LogLogicalMessage(const char *prefix, const char *message, size_t size, XLogRegisterData((char *) message, size); /* allow origin filtering */ - XLogIncludeOrigin(); + XLogSetFlags(XLOG_INCLUDE_ORIGIN); return XLogInsert(RM_LOGICALMSG_ID, XLOG_LOGICAL_MESSAGE); } diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c index 547f1a8..9774155 100644 --- a/src/backend/storage/ipc/standby.c +++ b/src/backend/storage/ipc/standby.c @@ -963,7 +963,8 @@ LogStandbySnapshot(void) * The definitions of RunningTransactionsData and xl_xact_running_xacts * are similar. We keep them separate because xl_xact_running_xacts * is a contiguous chunk of memory and never exists fully until it is - * assembled in WAL. + * assembled in WAL. Progress of WAL activity is not updated when + * this record is logged. */ static XLogRecPtr LogCurrentRunningXacts(RunningTransactions CurrRunningXacts) @@ -987,6 +988,8 @@ LogCurrentRunningXacts(RunningTransactions CurrRunningXacts) XLogRegisterData((char *) CurrRunningXacts->xids, (xlrec.xcnt + xlrec.subxcnt) * sizeof(TransactionId)); + XLogSetFlags(XLOG_NO_PROGRESS); + recptr = XLogInsert(RM_STANDBY_ID, XLOG_RUNNING_XACTS); if (CurrRunningXacts->subxid_overflow) @@ -1034,6 +1037,7 @@ LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks) XLogBeginInsert(); XLogRegisterData((char *) &xlrec, offsetof(xl_standby_locks, locks)); XLogRegisterData((char *) locks, nlocks * sizeof(xl_standby_lock)); + XLogSetFlags(XLOG_NO_PROGRESS); (void) XLogInsert(RM_STANDBY_ID, XLOG_STANDBY_LOCK); } diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index c9f332c..dbd4cff 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -184,6 +184,12 @@ extern bool XLOG_DEBUG; #define CHECKPOINT_CAUSE_XLOG 0x0040 /* XLOG consumption */ #define CHECKPOINT_CAUSE_TIME 0x0080 /* Elapsed time */ +/* + * Flag bits for the record currently inserted. + */ +#define XLOG_INCLUDE_ORIGIN 0x01 /* include the origin */ +#define XLOG_NO_PROGRESS 0x02 /* do not update progress LSN */ + /* Checkpoint statistics */ typedef struct CheckpointStatsData { @@ -211,7 +217,9 @@ extern CheckpointStatsData CheckpointStats; struct XLogRecData; -extern XLogRecPtr XLogInsertRecord(struct XLogRecData *rdata, XLogRecPtr fpw_lsn); +extern XLogRecPtr XLogInsertRecord(struct XLogRecData *rdata, + XLogRecPtr fpw_lsn, + uint8 flags); extern void XLogFlush(XLogRecPtr RecPtr); extern bool XLogBackgroundFlush(void); extern bool XLogNeedsFlush(XLogRecPtr RecPtr); @@ -262,6 +270,8 @@ extern void GetFullPageWriteInfo(XLogRecPtr *RedoRecPtr_p, bool *doPageWrites_p) extern XLogRecPtr GetRedoRecPtr(void); extern XLogRecPtr GetInsertRecPtr(void); extern XLogRecPtr GetFlushRecPtr(void); +extern XLogRecPtr GetProgressRecPtr(void); +extern XLogRecPtr GetLastCheckpointRecPtr(void); extern void GetNextXidAndEpoch(TransactionId *xid, uint32 *epoch); extern void RemovePromoteSignalFiles(void); diff --git a/src/include/access/xloginsert.h b/src/include/access/xloginsert.h index cc0177e..3f10919 100644 --- a/src/include/access/xloginsert.h +++ b/src/include/access/xloginsert.h @@ -40,7 +40,7 @@ /* prototypes for public functions in xloginsert.c: */ extern void XLogBeginInsert(void); -extern void XLogIncludeOrigin(void); +extern void XLogSetFlags(uint8 flags); extern XLogRecPtr XLogInsert(RmgrId rmid, uint8 info); extern void XLogEnsureRecordSpace(int nbuffers, int ndatas); extern void XLogRegisterData(char *data, int len);
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers