Rebased. Along with rebasing, I changed the interface of XLogFsyncFile() to return a boolean instead of an error message.
regards. -- Kyotaro Horiguchi NTT Open Source Software Center
>From bed74e638643d7491bbd86fe640c33db1e16f0e5 Mon Sep 17 00:00:00 2001 From: Kyotaro Horiguchi <horikyota....@gmail.com> Date: Mon, 15 Jan 2024 15:57:53 +0900 Subject: [PATCH v33 1/3] Export wal_sync_method related functions Export several functions related to wal_sync_method for use in subsequent commits. Since PG_O_DIRECT cannot be used in those commits, the new function XLogGetSyncBit() will mask PG_O_DIRECT. --- src/backend/access/transam/xlog.c | 73 +++++++++++++++++++++---------- src/include/access/xlog.h | 2 + 2 files changed, 52 insertions(+), 23 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 330e058c5f..492ababd9c 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -8592,21 +8592,29 @@ assign_wal_sync_method(int new_wal_sync_method, void *extra) } } +/* + * Exported version of get_sync_bit() + * + * Do not expose PG_O_DIRECT for uses outside xlog.c. + */ +int +XLogGetSyncBit(void) +{ + return get_sync_bit(wal_sync_method) & ~PG_O_DIRECT; +} + /* - * Issue appropriate kind of fsync (if any) for an XLOG output file. + * Issue appropriate kind of fsync (if any) according to wal_sync_method. * - * 'fd' is a file descriptor for the XLOG file to be fsync'd. - * 'segno' is for error reporting purposes. + * 'fd' is a file descriptor for the file to be fsync'd. */ -void -issue_xlog_fsync(int fd, XLogSegNo segno, TimeLineID tli) +const char * +XLogFsyncFile(int fd) { - char *msg = NULL; + const char *msg = NULL; instr_time start; - Assert(tli != 0); - /* * Quick exit if fsync is disabled or write() has already synced the WAL * file. 
@@ -8614,7 +8622,7 @@ issue_xlog_fsync(int fd, XLogSegNo segno, TimeLineID tli) if (!enableFsync || wal_sync_method == WAL_SYNC_METHOD_OPEN || wal_sync_method == WAL_SYNC_METHOD_OPEN_DSYNC) - return; + return NULL; /* Measure I/O timing to sync the WAL file */ if (track_wal_io_timing) @@ -8651,19 +8659,6 @@ issue_xlog_fsync(int fd, XLogSegNo segno, TimeLineID tli) break; } - /* PANIC if failed to fsync */ - if (msg) - { - char xlogfname[MAXFNAMELEN]; - int save_errno = errno; - - XLogFileName(xlogfname, tli, segno, wal_segment_size); - errno = save_errno; - ereport(PANIC, - (errcode_for_file_access(), - errmsg(msg, xlogfname))); - } - pgstat_report_wait_end(); /* @@ -8677,7 +8672,39 @@ issue_xlog_fsync(int fd, XLogSegNo segno, TimeLineID tli) INSTR_TIME_ACCUM_DIFF(PendingWalStats.wal_sync_time, end, start); } - PendingWalStats.wal_sync++; + if (msg != NULL) + PendingWalStats.wal_sync++; + + return msg; +} + +/* + * Issue appropriate kind of fsync (if any) for an XLOG output file. + * + * 'fd' is a file descriptor for the XLOG file to be fsync'd. + * 'segno' is for error reporting purposes. 
+ */ +void +issue_xlog_fsync(int fd, XLogSegNo segno, TimeLineID tli) +{ + const char *msg; + + Assert(tli != 0); + + msg = XLogFsyncFile(fd); + + /* PANIC if failed to fsync */ + if (msg) + { + char xlogfname[MAXFNAMELEN]; + int save_errno = errno; + + XLogFileName(xlogfname, tli, segno, wal_segment_size); + errno = save_errno; + ereport(PANIC, + (errcode_for_file_access(), + errmsg(msg, xlogfname))); + } } /* diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 1a1f11a943..badfe4abd6 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -217,6 +217,8 @@ extern void xlog_redo(struct XLogReaderState *record); extern void xlog_desc(StringInfo buf, struct XLogReaderState *record); extern const char *xlog_identify(uint8 info); +extern int XLogGetSyncBit(void); +extern const char *XLogFsyncFile(int fd); extern void issue_xlog_fsync(int fd, XLogSegNo segno, TimeLineID tli); extern bool RecoveryInProgress(void); -- 2.43.0
>From c200b85c1311f97bdae2ed20e2746c44d5c4aadb Mon Sep 17 00:00:00 2001 From: Kyotaro Horiguchi <horikyota....@gmail.com> Date: Thu, 31 Aug 2023 11:49:10 +0900 Subject: [PATCH v33 2/3] Introduce undo log implementation This patch adds a simple implementation of UNDO log feature. --- src/backend/access/transam/Makefile | 1 + src/backend/access/transam/meson.build | 1 + src/backend/access/transam/rmgr.c | 4 +- src/backend/access/transam/simpleundolog.c | 362 +++++++++++++++++++++ src/backend/access/transam/twophase.c | 3 + src/backend/access/transam/xact.c | 24 ++ src/backend/access/transam/xlog.c | 42 ++- src/backend/catalog/storage.c | 171 ++++++++++ src/backend/storage/file/reinit.c | 78 +++++ src/backend/storage/smgr/smgr.c | 9 + src/bin/initdb/initdb.c | 17 + src/bin/pg_rewind/parsexlog.c | 2 +- src/bin/pg_waldump/rmgrdesc.c | 2 +- src/include/access/rmgr.h | 2 +- src/include/access/rmgrlist.h | 44 +-- src/include/access/simpleundolog.h | 36 ++ src/include/access/xlog.h | 2 +- src/include/catalog/storage.h | 3 + src/include/catalog/storage_ulog.h | 38 +++ src/include/catalog/storage_xlog.h | 9 + src/include/storage/reinit.h | 2 + src/include/storage/smgr.h | 1 + src/tools/pgindent/typedefs.list | 6 + 23 files changed, 818 insertions(+), 41 deletions(-) create mode 100644 src/backend/access/transam/simpleundolog.c create mode 100644 src/include/access/simpleundolog.h create mode 100644 src/include/catalog/storage_ulog.h diff --git a/src/backend/access/transam/Makefile b/src/backend/access/transam/Makefile index 661c55a9db..531505cbbd 100644 --- a/src/backend/access/transam/Makefile +++ b/src/backend/access/transam/Makefile @@ -21,6 +21,7 @@ OBJS = \ rmgr.o \ slru.o \ subtrans.o \ + simpleundolog.o \ timeline.o \ transam.o \ twophase.o \ diff --git a/src/backend/access/transam/meson.build b/src/backend/access/transam/meson.build index 8a3522557c..c1225636b5 100644 --- a/src/backend/access/transam/meson.build +++ b/src/backend/access/transam/meson.build @@ -9,6 
+9,7 @@ backend_sources += files( 'rmgr.c', 'slru.c', 'subtrans.c', + 'simpleundolog.c', 'timeline.c', 'transam.c', 'twophase.c', diff --git a/src/backend/access/transam/rmgr.c b/src/backend/access/transam/rmgr.c index 1b7499726e..8fe3e71a0c 100644 --- a/src/backend/access/transam/rmgr.c +++ b/src/backend/access/transam/rmgr.c @@ -44,8 +44,8 @@ /* must be kept in sync with RmgrData definition in xlog_internal.h */ -#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask,decode) \ - { name, redo, desc, identify, startup, cleanup, mask, decode }, +#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask,decode,undo) \ + { name, redo, desc, identify, startup, cleanup, mask, decode}, RmgrData RmgrTable[RM_MAX_ID + 1] = { #include "access/rmgrlist.h" diff --git a/src/backend/access/transam/simpleundolog.c b/src/backend/access/transam/simpleundolog.c new file mode 100644 index 0000000000..e22ed67bae --- /dev/null +++ b/src/backend/access/transam/simpleundolog.c @@ -0,0 +1,362 @@ +/*------------------------------------------------------------------------- + * + * simpleundolog.c + * Simple implementation of PostgreSQL transaction-undo-log manager + * + * In this module, procedures required during a transaction abort are + * logged. Persisting this information becomes crucial, particularly for + * ensuring reliable post-processing during the restart following a transaction + * crash. At present, in this module, logging of information is performed by + * simply appending data to a created file. 
+ * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/access/transam/simpleundolog.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/simpleundolog.h" +#include "access/twophase_rmgr.h" +#include "access/parallel.h" +#include "access/xact.h" +#include "access/xlog.h" +#include "catalog/storage_ulog.h" +#include "storage/fd.h" + +#define ULOG_FILE_MAGIC 0x12345678 + +typedef struct UndoLogFileHeader +{ + int32 magic; + bool prepared; +} UndoLogFileHeader; + +typedef struct UndoDescData +{ + const char *name; + void (*rm_undo) (SimpleUndoLogRecord *record, bool prepared); +} UndoDescData; + +/* must be kept in sync with RmgrData definition in xlog_internal.h */ +#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask,decode,undo) \ + { name, undo }, + +UndoDescData UndoRoutines[RM_MAX_ID + 1] = { +#include "access/rmgrlist.h" +}; +#undef PG_RMGR + +static char current_ulogfile_name[MAXPGPATH]; +static int current_ulogfile_fd = -1; +static int current_xid = InvalidTransactionId; +static UndoLogFileHeader current_fhdr; + +static void +undolog_check_file_header(void) +{ + if (read(current_ulogfile_fd, &current_fhdr, sizeof(current_fhdr)) < 0) + ereport(PANIC, + errcode_for_file_access(), + errmsg("could not read undolog file \"%s\": %m", + current_ulogfile_name)); + if (current_fhdr.magic != ULOG_FILE_MAGIC) + ereport(PANIC, + errcode_for_file_access(), + errmsg("invalid undolog file \"%s\": magic don't match", + current_ulogfile_name)); +} + +static void +undolog_sync_current_file(void) +{ + const char *msg; + + msg = XLogFsyncFile(current_ulogfile_fd); + + /* PANIC if failed to fsync */ + if (msg) + { + ereport(PANIC, + (errcode_for_file_access(), + errmsg(msg, current_ulogfile_name))); + } +} + +static bool +undolog_open_current_file(TransactionId xid, bool forread, bool
append) +{ + int omode; + + if (current_ulogfile_fd >= 0) + { + /* use existing open file */ + if (current_xid == xid) + { + if (append) + return true; + + if (lseek(current_ulogfile_fd, + sizeof(UndoLogFileHeader), SEEK_SET) < 0) + ereport(PANIC, + errcode_for_file_access(), + errmsg("could not seek undolog file \"%s\": %m", + current_ulogfile_name)); + } + + close(current_ulogfile_fd); + current_ulogfile_fd = -1; + ReleaseExternalFD(); + } + + current_xid = xid; + if (!TransactionIdIsValid(xid)) + return false; + + omode = PG_BINARY | XLogGetSyncBit(); + + if (forread) + omode |= O_RDONLY; + else + { + omode |= O_RDWR; + + if (!append) + omode |= O_TRUNC; + } + + snprintf(current_ulogfile_name, MAXPGPATH, "%s/%08x", + SIMPLE_UNDOLOG_DIR, xid); + current_ulogfile_fd = BasicOpenFile(current_ulogfile_name, omode); + if (current_ulogfile_fd >= 0) + undolog_check_file_header(); + else + { + if (forread) + return false; + + current_fhdr.magic = ULOG_FILE_MAGIC; + current_fhdr.prepared = false; + + omode |= O_CREAT; + current_ulogfile_fd = BasicOpenFile(current_ulogfile_name, omode); + if (current_ulogfile_fd < 0) + ereport(PANIC, + errcode_for_file_access(), + errmsg("could not create undolog file \"%s\": %m", + current_ulogfile_name)); + + if (write(current_ulogfile_fd, &current_fhdr, sizeof(current_fhdr)) < 0) + ereport(PANIC, + errcode_for_file_access(), + errmsg("could not write undolog file \"%s\": %m", + current_ulogfile_name)); + } + + /* + * Move file pointer to the end of the file. We do this without O_APPEND, + * to allow us to modify data at any location in the file. We already moved + * to the first record in the case of !append.
+ */ + if (append) + { + if (lseek(current_ulogfile_fd, 0, SEEK_END) < 0) + ereport(PANIC, + errcode_for_file_access(), + errmsg("could not seek undolog file \"%s\": %m", + current_ulogfile_name)); + } + ReserveExternalFD(); + + /* sync the file according to wal_sync_method */ + undolog_sync_current_file(); + + return true; +} + +/* + * Write an undolog record + */ +void +SimpleUndoLogWrite(RmgrId rmgr, uint8 info, + TransactionId xid, void *data, int len) +{ + int reclen = sizeof(SimpleUndoLogRecord) + len; + SimpleUndoLogRecord *rec = palloc(reclen); + pg_crc32c undodata_crc; + + Assert(!IsParallelWorker()); + Assert(xid != InvalidTransactionId); + + undolog_open_current_file(xid, false, true); + + rec->ul_tot_len = reclen; + rec->ul_rmid = rmgr; + rec->ul_info = info; + rec->ul_xid = current_xid; + + memcpy((char *)rec + sizeof(SimpleUndoLogRecord), data, len); + + /* Calculate CRC of the data */ + INIT_CRC32C(undodata_crc); + COMP_CRC32C(undodata_crc, rec, + reclen - offsetof(SimpleUndoLogRecord, ul_rmid)); + rec->ul_crc = undodata_crc; + + + if (write(current_ulogfile_fd, rec, reclen) < 0) + ereport(ERROR, + errcode_for_file_access(), + errmsg("could not write to undolog file \"%s\": %m", + current_ulogfile_name)); + + undolog_sync_current_file(); +} + +static void +SimpleUndoLogUndo(bool cleanup) +{ + int bufsize; + char *buf; + + bufsize = 1024; + buf = palloc(bufsize); + + Assert(current_ulogfile_fd >= 0); + + while (read(current_ulogfile_fd, buf, sizeof(SimpleUndoLogRecord)) == + sizeof(SimpleUndoLogRecord)) + { + SimpleUndoLogRecord *rec = (SimpleUndoLogRecord *) buf; + int readlen = rec->ul_tot_len - sizeof(SimpleUndoLogRecord); + int ret; + + if (rec->ul_tot_len > bufsize) + { + bufsize *= 2; + buf = repalloc(buf, bufsize); + } + + ret = read(current_ulogfile_fd, + buf + sizeof(SimpleUndoLogRecord), readlen); + if (ret != readlen) + { + if (ret < 0) + ereport(ERROR, + errcode_for_file_access(), + errmsg("could not read undo log file \"%s\": %m", + 
current_ulogfile_name)); + + ereport(ERROR, + errcode_for_file_access(), + errmsg("reading undo log expected %d bytes, but actually %d: %s", + readlen, ret, current_ulogfile_name)); + + } + + UndoRoutines[rec->ul_rmid].rm_undo(rec, + current_fhdr.prepared && cleanup); + } +} + +void +AtEOXact_SimpleUndoLog(bool isCommit, TransactionId xid) +{ + if (IsParallelWorker()) + return; + + if (!undolog_open_current_file(xid, true, false)) + return; + + if (!isCommit) + SimpleUndoLogUndo(false); + + if (current_ulogfile_fd > 0) + { + if (close(current_ulogfile_fd) != 0) + ereport(PANIC, errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", + current_ulogfile_name)); + + current_ulogfile_fd = -1; + ReleaseExternalFD(); + durable_unlink(current_ulogfile_name, FATAL); + } + + return; +} + +void +UndoLogCleanup(void) +{ + DIR *dirdesc; + struct dirent *de; + char **loglist; + int loglistspace = 128; + int loglistlen = 0; + int i; + + loglist = palloc(sizeof(char*) * loglistspace); + + dirdesc = AllocateDir(SIMPLE_UNDOLOG_DIR); + while ((de = ReadDir(dirdesc, SIMPLE_UNDOLOG_DIR)) != NULL) + { + if (strspn(de->d_name, "01234567890abcdef") < strlen(de->d_name)) + continue; + + if (loglistlen >= loglistspace) + { + loglistspace *= 2; + loglist = repalloc(loglist, sizeof(char*) * loglistspace); + } + loglist[loglistlen++] = pstrdup(de->d_name); + } + + for (i = 0 ; i < loglistlen ; i++) + { + snprintf(current_ulogfile_name, MAXPGPATH, "%s/%s", + SIMPLE_UNDOLOG_DIR, loglist[i]); + current_ulogfile_fd = BasicOpenFile(current_ulogfile_name, + O_RDWR | PG_BINARY | + XLogGetSyncBit()); + undolog_check_file_header(); + SimpleUndoLogUndo(true); + if (close(current_ulogfile_fd) != 0) + ereport(PANIC, errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", + current_ulogfile_name)); + current_ulogfile_fd = -1; + + /* do not remove ulog files for prepared transactions */ + if (!current_fhdr.prepared) + durable_unlink(current_ulogfile_name, FATAL); + } +} + +/* 
+ * Mark this xid as prepared + */ +void +SimpleUndoLogSetPrpared(TransactionId xid, bool prepared) +{ + Assert(xid != InvalidTransactionId); + + undolog_open_current_file(xid, false, true); + current_fhdr.prepared = prepared; + if (lseek(current_ulogfile_fd, 0, SEEK_SET) < 0) + ereport(PANIC, + errcode_for_file_access(), + errmsg("could not seek undolog file \"%s\": %m", + current_ulogfile_name)); + + if (write(current_ulogfile_fd, &current_fhdr, sizeof(current_fhdr)) < 0) + ereport(PANIC, + errcode_for_file_access(), + errmsg("could not write undolog file \"%s\": %m", + current_ulogfile_name)); + + undolog_sync_current_file(); +} diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index bf451d42ff..db3b227111 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -78,6 +78,7 @@ #include "access/commit_ts.h" #include "access/htup_details.h" +#include "access/simpleundolog.h" #include "access/subtrans.h" #include "access/transam.h" #include "access/twophase.h" @@ -1587,6 +1588,8 @@ FinishPreparedTransaction(const char *gid, bool isCommit) abortstats, gid); + AtEOXact_SimpleUndoLog(isCommit, xid); + ProcArrayRemove(proc, latestXid); /* diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 4f4ce75762..d81e51746b 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -24,6 +24,7 @@ #include "access/multixact.h" #include "access/parallel.h" #include "access/subtrans.h" +#include "access/simpleundolog.h" #include "access/transam.h" #include "access/twophase.h" #include "access/xact.h" @@ -2267,6 +2268,9 @@ CommitTransaction(void) */ smgrDoPendingSyncs(true, is_parallel_worker); + /* Likewise perform uncommitted storage file deletion.
*/ + smgrDoPendingCleanups(true); + /* close large objects before lower-level cleanup */ AtEOXact_LargeObject(true); @@ -2414,6 +2418,7 @@ CommitTransaction(void) AtEOXact_on_commit_actions(true); AtEOXact_Namespace(true, is_parallel_worker); AtEOXact_SMgr(); + AtEOXact_SimpleUndoLog(true, GetCurrentTransactionIdIfAny()); AtEOXact_Files(true); AtEOXact_ComboCid(); AtEOXact_HashTables(true); @@ -2523,6 +2528,9 @@ PrepareTransaction(void) */ smgrDoPendingSyncs(true, false); + /* Likewise perform uncommitted storage file deletion. */ + smgrDoPendingCleanups(true); + /* close large objects before lower-level cleanup */ AtEOXact_LargeObject(true); @@ -2856,6 +2864,7 @@ AbortTransaction(void) AfterTriggerEndXact(false); /* 'false' means it's abort */ AtAbort_Portals(); smgrDoPendingSyncs(false, is_parallel_worker); + smgrDoPendingCleanups(false); AtEOXact_LargeObject(false); AtAbort_Notify(); AtEOXact_RelationMap(false, is_parallel_worker); @@ -2923,6 +2932,7 @@ AbortTransaction(void) AtEOXact_on_commit_actions(false); AtEOXact_Namespace(false, is_parallel_worker); AtEOXact_SMgr(); + AtEOXact_SimpleUndoLog(false, GetCurrentTransactionIdIfAny()); AtEOXact_Files(false); AtEOXact_ComboCid(); AtEOXact_HashTables(false); @@ -5107,6 +5117,8 @@ CommitSubTransaction(void) AtEOSubXact_Inval(true); AtSubCommit_smgr(); + AtEOXact_SimpleUndoLog(true, GetCurrentTransactionIdIfAny()); + /* * The only lock we actually release here is the subtransaction XID lock. 
*/ @@ -5300,6 +5312,7 @@ AbortSubTransaction(void) RESOURCE_RELEASE_AFTER_LOCKS, false, false); AtSubAbort_smgr(); + AtEOXact_SimpleUndoLog(false, GetCurrentTransactionIdIfAny()); AtEOXact_GUC(false, s->gucNestLevel); AtEOSubXact_SPI(false, s->subTransactionId); @@ -5790,7 +5803,10 @@ XactLogCommitRecord(TimestampTz commit_time, if (!TransactionIdIsValid(twophase_xid)) info = XLOG_XACT_COMMIT; else + { + elog(LOG, "COMMIT PREPARED: %d", twophase_xid); info = XLOG_XACT_COMMIT_PREPARED; + } /* First figure out and collect all the information needed */ @@ -6190,6 +6206,8 @@ xact_redo_commit(xl_xact_parsed_commit *parsed, DropRelationFiles(parsed->xlocators, parsed->nrels, true); } + AtEOXact_SimpleUndoLog(true, xid); + if (parsed->nstats > 0) { /* see equivalent call for relations above */ @@ -6301,6 +6319,8 @@ xact_redo_abort(xl_xact_parsed_abort *parsed, TransactionId xid, DropRelationFiles(parsed->xlocators, parsed->nrels, true); } + AtEOXact_SimpleUndoLog(false, xid); + if (parsed->nstats > 0) { /* see equivalent call for relations above */ @@ -6366,6 +6386,10 @@ xact_redo(XLogReaderState *record) } else if (info == XLOG_XACT_PREPARE) { + xl_xact_prepare *xlrec = (xl_xact_prepare *) XLogRecGetData(record); + + AtEOXact_SimpleUndoLog(true, xlrec->xid); + /* * Store xid and start/end pointers of the WAL record in TwoPhaseState * gxact entry. diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 492ababd9c..ce5e299b36 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -51,6 +51,7 @@ #include "access/heaptoast.h" #include "access/multixact.h" #include "access/rewriteheap.h" +#include "access/simpleundolog.h" #include "access/subtrans.h" #include "access/timeline.h" #include "access/transam.h" @@ -5720,6 +5721,12 @@ StartupXLOG(void) /* Check that the GUCs used to generate the WAL allow recovery */ CheckRequiredParameterValues(); + /* + * Perform undo processing. 
This must be done before resetting unlogged + * relations. + */ + UndoLogCleanup(); + /* * We're in recovery, so unlogged relations may be trashed and must be * reset. This should be done BEFORE allowing Hot Standby @@ -5867,14 +5874,17 @@ StartupXLOG(void) } /* - * Reset unlogged relations to the contents of their INIT fork. This is - * done AFTER recovery is complete so as to include any unlogged relations - * created during recovery, but BEFORE recovery is marked as having - * completed successfully. Otherwise we'd not retry if any of the post - * end-of-recovery steps fail. + * Process undo logs left after recovery, then reset unlogged relations to + * the contents of their INIT fork. This is done AFTER recovery is complete + * so as to include any file creations during recovery, but BEFORE recovery + * is marked as having completed successfully. Otherwise we'd not retry if + * any of the post end-of-recovery steps fail. */ if (InRecovery) + { + UndoLogCleanup(); ResetUnloggedRelations(UNLOGGED_RELATION_INIT); + } /* * Pre-scan prepared transactions to find out the range of XIDs present. @@ -8607,10 +8617,12 @@ XLogGetSyncBit(void) /* * Issue appropriate kind of fsync (if any) according to wal_sync_method. * + * Returns true if successfully fsync'ed, otherwise returns false and sets + * errmsg if it is not NULL. * 'fd' is a file descriptor for the file to be fsync'd.
*/ -const char * -XLogFsyncFile(int fd) +bool +XLogFsyncFile(int fd, const char **errmsg) { const char *msg = NULL; instr_time start; @@ -8622,7 +8634,7 @@ XLogFsyncFile(int fd) if (!enableFsync || wal_sync_method == WAL_SYNC_METHOD_OPEN || wal_sync_method == WAL_SYNC_METHOD_OPEN_DSYNC) - return NULL; + return true; /* Measure I/O timing to sync the WAL file */ if (track_wal_io_timing) @@ -8672,10 +8684,16 @@ XLogFsyncFile(int fd) INSTR_TIME_ACCUM_DIFF(PendingWalStats.wal_sync_time, end, start); } - if (msg != NULL) + if (msg == NULL) + { PendingWalStats.wal_sync++; + if (errmsg) + *errmsg = msg; - return msg; + return false; + } + + return true; } /* @@ -8691,10 +8709,8 @@ issue_xlog_fsync(int fd, XLogSegNo segno, TimeLineID tli) Assert(tli != 0); - msg = XLogFsyncFile(fd); - /* PANIC if failed to fsync */ - if (msg) + if (!XLogFsyncFile(fd, &msg)) { char xlogfname[MAXFNAMELEN]; int save_errno = errno; diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c index f56b3cc0f2..ae1bf597bd 100644 --- a/src/backend/catalog/storage.c +++ b/src/backend/catalog/storage.c @@ -19,17 +19,21 @@ #include "postgres.h" +#include "access/amapi.h" #include "access/visibilitymap.h" #include "access/xact.h" #include "access/xlog.h" #include "access/xloginsert.h" #include "access/xlogutils.h" +#include "access/simpleundolog.h" #include "catalog/storage.h" #include "catalog/storage_xlog.h" +#include "catalog/storage_ulog.h" #include "miscadmin.h" #include "storage/bulk_write.h" #include "storage/freespace.h" #include "storage/proc.h" +#include "storage/reinit.h" #include "storage/smgr.h" #include "utils/hsearch.h" #include "utils/memutils.h" @@ -67,6 +71,19 @@ typedef struct PendingRelDelete struct PendingRelDelete *next; /* linked-list link */ } PendingRelDelete; +#define PCOP_UNLINK_FORK (1 << 0) + +typedef struct PendingCleanup +{ + RelFileLocator rlocator; /* relation that need a cleanup */ + int op; /* operation mask */ + ForkNumber unlink_forknum; /* forknum 
to unlink */ + ProcNumber procNumber; /* INVALID_PROC_NUMBER if not a temp rel */ + bool atCommit; /* T=delete at commit; F=delete at abort */ + int nestLevel; /* xact nesting level of request */ + struct PendingCleanup *next; /* linked-list link */ +} PendingCleanup; + typedef struct PendingRelSync { RelFileLocator rlocator; @@ -74,6 +91,7 @@ typedef struct PendingRelSync } PendingRelSync; static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */ +static PendingCleanup * pendingCleanups = NULL; /* head of linked list */ static HTAB *pendingSyncHash = NULL; @@ -149,6 +167,19 @@ RelationCreateStorage(RelFileLocator rlocator, char relpersistence, srel = smgropen(rlocator, procNumber); smgrcreate(srel, MAIN_FORKNUM, false); + /* Write undo log, this is required irrelevantly to needs_wal */ + if (register_delete) + { + ul_uncommitted_storage ul_storage; + + ul_storage.rlocator = rlocator; + ul_storage.forknum = MAIN_FORKNUM; + ul_storage.remove = true; + SimpleUndoLogWrite(RM_SMGR_ID, ULOG_SMGR_UNCOMMITED_STORAGE, + GetCurrentTransactionId(), + &ul_storage, sizeof(ul_storage)); + } + if (needs_wal) log_smgrcreate(&srel->smgr_rlocator.locator, MAIN_FORKNUM); @@ -192,12 +223,32 @@ log_smgrcreate(const RelFileLocator *rlocator, ForkNumber forkNum) */ xlrec.rlocator = *rlocator; xlrec.forkNum = forkNum; + xlrec.xid = GetTopTransactionId(); XLogBeginInsert(); XLogRegisterData((char *) &xlrec, sizeof(xlrec)); XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE | XLR_SPECIAL_REL_UPDATE); } +/* + * Perform XLogInsert of an XLOG_SMGR_UNLINK record to WAL. + */ +void +log_smgrunlink(const RelFileLocator *rlocator, ForkNumber forkNum) +{ + xl_smgr_unlink xlrec; + + /* + * Make an XLOG entry reporting the file unlink. 
+ */ + xlrec.rlocator = *rlocator; + xlrec.forkNum = forkNum; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xlrec)); + XLogInsert(RM_SMGR_ID, XLOG_SMGR_UNLINK | XLR_SPECIAL_REL_UPDATE); +} + /* * RelationDropStorage * Schedule unlinking of physical storage at transaction commit. @@ -693,6 +744,75 @@ smgrDoPendingDeletes(bool isCommit) } } +/* + * smgrDoPendingCleanups() -- Clean up work that emits WAL records + * + * The operations handled in this function emit WAL records, which must be + * part of the current transaction. + */ +void +smgrDoPendingCleanups(bool isCommit) +{ + int nestLevel = GetCurrentTransactionNestLevel(); + PendingCleanup *pending; + PendingCleanup *prev; + PendingCleanup *next; + + prev = NULL; + for (pending = pendingCleanups; pending != NULL; pending = next) + { + next = pending->next; + if (pending->nestLevel < nestLevel) + { + /* outer-level entries should not be processed yet */ + prev = pending; + } + else + { + /* unlink list entry first, so we don't retry on failure */ + if (prev) + prev->next = next; + else + pendingCleanups = next; + + /* do cleanup if called for */ + if (pending->atCommit == isCommit) + { + SMgrRelation srel; + + srel = smgropen(pending->rlocator, pending->procNumber); + + Assert((pending->op & ~(PCOP_UNLINK_FORK)) == 0); + + if (pending->op & PCOP_UNLINK_FORK) + { + BlockNumber firstblock = 0; + + /* + * Unlink the fork file. Currently this operation is + * applied only to init-forks. As it is not certain that + * the init-fork is not loaded on shared buffers, drop all + * buffers for it. + */ + Assert(pending->unlink_forknum == INIT_FORKNUM); + DropRelationBuffers(srel, &pending->unlink_forknum, 1, + &firstblock); + + /* Don't emit WAL during recovery.
*/ + if (!InRecovery) + log_smgrunlink(&pending->rlocator, + pending->unlink_forknum); + smgrunlink(srel, pending->unlink_forknum, false); + } + } + + /* must explicitly free the list entry */ + pfree(pending); + /* prev does not change */ + } + } +} + /* * smgrDoPendingSyncs() -- Take care of relation syncs at end of xact. */ @@ -902,6 +1022,9 @@ PostPrepare_smgr(void) /* must explicitly free the list entry */ pfree(pending); } + + /* Mark undolog as prepared */ + SimpleUndoLogSetPrpared(GetCurrentTransactionId(), true); } @@ -949,10 +1072,28 @@ smgr_redo(XLogReaderState *record) { xl_smgr_create *xlrec = (xl_smgr_create *) XLogRecGetData(record); SMgrRelation reln; + ul_uncommitted_storage ul_storage; + + /* write undo log */ + ul_storage.rlocator = xlrec->rlocator; + ul_storage.forknum = xlrec->forkNum; + ul_storage.remove = true; + SimpleUndoLogWrite(RM_SMGR_ID, ULOG_SMGR_UNCOMMITED_STORAGE, + xlrec->xid, + &ul_storage, sizeof(ul_storage)); reln = smgropen(xlrec->rlocator, INVALID_PROC_NUMBER); smgrcreate(reln, xlrec->forkNum, true); } + else if (info == XLOG_SMGR_UNLINK) + { + xl_smgr_unlink *xlrec = (xl_smgr_unlink *) XLogRecGetData(record); + SMgrRelation reln; + + reln = smgropen(xlrec->rlocator, INVALID_PROC_NUMBER); + smgrunlink(reln, xlrec->forkNum, true); + smgrclose(reln); + } else if (info == XLOG_SMGR_TRUNCATE) { xl_smgr_truncate *xlrec = (xl_smgr_truncate *) XLogRecGetData(record); @@ -1044,3 +1185,33 @@ smgr_redo(XLogReaderState *record) else elog(PANIC, "smgr_redo: unknown op code %u", info); } + +void +smgr_undo(SimpleUndoLogRecord *record, bool crash_prepared) +{ + uint8 info = record->ul_info; + + + if (info == ULOG_SMGR_UNCOMMITED_STORAGE) + { + ul_uncommitted_storage *ul_storage = + (ul_uncommitted_storage *) ULogRecGetData(record); + + if (!crash_prepared) + { + SMgrRelation reln; + + reln = smgropen(ul_storage->rlocator, INVALID_PROC_NUMBER); + smgrunlink(reln, ul_storage->forknum, true); + smgrclose(reln); + } + else + { + /* Inform reinit 
to ignore this file during cleanup */ + ResetUnloggedRelationIgnore(ul_storage->rlocator); + } + + } + else + elog(PANIC, "smgr_undo: unknown op code %u", info); +} diff --git a/src/backend/storage/file/reinit.c b/src/backend/storage/file/reinit.c index f1cd1a38d9..58ad350ec2 100644 --- a/src/backend/storage/file/reinit.c +++ b/src/backend/storage/file/reinit.c @@ -34,6 +34,39 @@ typedef struct RelFileNumber relnumber; /* hash key */ } unlogged_relation_entry; +static char **ignore_files = NULL; +static int nignore_elems = 0; +static int nignore_files = 0; + +/* + * identify the file should be ignored during resetting unlogged relations. + */ +static bool +reinit_ignore_file(const char *dirname, const char *name) +{ + char fnamebuf[MAXPGPATH]; + int len; + + if (nignore_files == 0) + return false; + + strncpy(fnamebuf, dirname, MAXPGPATH - 1); + strncat(fnamebuf, "/", MAXPGPATH - 1); + strncat(fnamebuf, name, MAXPGPATH - 1); + fnamebuf[MAXPGPATH - 1] = 0; + + for (int i = 0 ; i < nignore_files ; i++) + { + /* match ignoring fork part */ + len = strlen(ignore_files[i]); + if (strncmp(fnamebuf, ignore_files[i], len) == 0 && + (fnamebuf[len] == 0 || fnamebuf[len] == '_')) + return true; + } + + return false; +} + /* * Reset unlogged relations from before the last restart. * @@ -204,6 +237,10 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op) &forkNum, &segno)) continue; + /* Skip anything that undo log suggested to ignore */ + if (reinit_ignore_file(dbspacedirname, de->d_name)) + continue; + /* Also skip it unless this is the init fork. */ if (forkNum != INIT_FORKNUM) continue; @@ -243,6 +280,10 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op) &forkNum, &segno)) continue; + /* Skip anything that undo log suggested to ignore */ + if (reinit_ignore_file(dbspacedirname, de->d_name)) + continue; + /* We never remove the init fork. 
*/ if (forkNum == INIT_FORKNUM) continue; @@ -294,6 +335,10 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op) &forkNum, &segno)) continue; + /* Skip anything that undo log suggested to ignore */ + if (reinit_ignore_file(dbspacedirname, de->d_name)) + continue; + /* Also skip it unless this is the init fork. */ if (forkNum != INIT_FORKNUM) continue; @@ -337,6 +382,10 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op) &forkNum, &segno)) continue; + /* Skip anything that undo log suggested to ignore */ + if (reinit_ignore_file(dbspacedirname, de->d_name)) + continue; + /* Also skip it unless this is the init fork. */ if (forkNum != INIT_FORKNUM) continue; @@ -366,6 +415,35 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op) } } +/* + * Record relfilenodes that should be left alone during reinitializing unlogged + * relations. + */ +void +ResetUnloggedRelationIgnore(RelFileLocator rloc) +{ + RelFileLocatorBackend rbloc; + + if (nignore_files >= nignore_elems) + { + if (ignore_files == NULL) + { + nignore_elems = 16; + ignore_files = palloc(sizeof(char *) * nignore_elems); + } + else + { + nignore_elems *= 2; + ignore_files = repalloc(ignore_files, + sizeof(char *) * nignore_elems); + } + } + + rbloc.backend = INVALID_PROC_NUMBER; + rbloc.locator = rloc; + ignore_files[nignore_files++] = relpath(rbloc, MAIN_FORKNUM); +} + /* * Basic parsing of putative relation filenames. 
* diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index a691aed1f4..d3773a2b8a 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -791,6 +791,15 @@ smgrimmedsync(SMgrRelation reln, ForkNumber forknum) smgrsw[reln->smgr_which].smgr_immedsync(reln, forknum); } +/* + * smgrunlink() -- unlink the storage file + */ +void +smgrunlink(SMgrRelation reln, ForkNumber forknum, bool isRedo) +{ + smgrsw[reln->smgr_which].smgr_unlink(reln->smgr_rlocator, forknum, isRedo); +} + /* * AtEOXact_SMgr * diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index 12ae194067..9f9d6511e6 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -307,6 +307,7 @@ void setup_signals(void); void setup_text_search(void); void create_data_directory(void); void create_xlog_or_symlink(void); +void create_ulog(void); void warn_on_mount_point(int error); void initialize_data_directory(void); @@ -2958,6 +2959,21 @@ create_xlog_or_symlink(void) free(subdirloc); } +/* Create undo log directory */ +void +create_ulog(void) +{ + char *subdirloc; + + /* form name of the place for the subdirectory */ + subdirloc = psprintf("%s/pg_ulog", pg_data); + + if (mkdir(subdirloc, pg_dir_create_mode) < 0) + pg_fatal("could not create directory \"%s\": %m", + subdirloc); + + free(subdirloc); +} void warn_on_mount_point(int error) @@ -2992,6 +3008,7 @@ initialize_data_directory(void) create_data_directory(); create_xlog_or_symlink(); + create_ulog(); /* Create required subdirectories (other than pg_wal) */ printf(_("creating subdirectories ... ")); diff --git a/src/bin/pg_rewind/parsexlog.c b/src/bin/pg_rewind/parsexlog.c index 22f7351fdc..525b98899f 100644 --- a/src/bin/pg_rewind/parsexlog.c +++ b/src/bin/pg_rewind/parsexlog.c @@ -28,7 +28,7 @@ * RmgrNames is an array of the built-in resource manager names, to make error * messages a bit nicer. 
*/ -#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask,decode) \ +#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask,decode,undo) \ name, static const char *const RmgrNames[RM_MAX_ID + 1] = { diff --git a/src/bin/pg_waldump/rmgrdesc.c b/src/bin/pg_waldump/rmgrdesc.c index 6b8c17bb4c..a21009c5b8 100644 --- a/src/bin/pg_waldump/rmgrdesc.c +++ b/src/bin/pg_waldump/rmgrdesc.c @@ -32,7 +32,7 @@ #include "storage/standbydefs.h" #include "utils/relmapper.h" -#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask,decode) \ +#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask,decode,undo) \ { name, desc, identify}, static const RmgrDescData RmgrDescTable[RM_N_BUILTIN_IDS] = { diff --git a/src/include/access/rmgr.h b/src/include/access/rmgr.h index 3b6a497e1b..d705de9256 100644 --- a/src/include/access/rmgr.h +++ b/src/include/access/rmgr.h @@ -19,7 +19,7 @@ typedef uint8 RmgrId; * Note: RM_MAX_ID must fit in RmgrId; widening that type will affect the XLOG * file format. 
*/ -#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask,decode) \ +#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask,decode,undo) \ symname, typedef enum RmgrIds diff --git a/src/include/access/rmgrlist.h b/src/include/access/rmgrlist.h index 78e6b908c6..7f0abded93 100644 --- a/src/include/access/rmgrlist.h +++ b/src/include/access/rmgrlist.h @@ -25,25 +25,25 @@ */ /* symbol name, textual name, redo, desc, identify, startup, cleanup, mask, decode */ -PG_RMGR(RM_XLOG_ID, "XLOG", xlog_redo, xlog_desc, xlog_identify, NULL, NULL, NULL, xlog_decode) -PG_RMGR(RM_XACT_ID, "Transaction", xact_redo, xact_desc, xact_identify, NULL, NULL, NULL, xact_decode) -PG_RMGR(RM_SMGR_ID, "Storage", smgr_redo, smgr_desc, smgr_identify, NULL, NULL, NULL, NULL) -PG_RMGR(RM_CLOG_ID, "CLOG", clog_redo, clog_desc, clog_identify, NULL, NULL, NULL, NULL) -PG_RMGR(RM_DBASE_ID, "Database", dbase_redo, dbase_desc, dbase_identify, NULL, NULL, NULL, NULL) -PG_RMGR(RM_TBLSPC_ID, "Tablespace", tblspc_redo, tblspc_desc, tblspc_identify, NULL, NULL, NULL, NULL) -PG_RMGR(RM_MULTIXACT_ID, "MultiXact", multixact_redo, multixact_desc, multixact_identify, NULL, NULL, NULL, NULL) -PG_RMGR(RM_RELMAP_ID, "RelMap", relmap_redo, relmap_desc, relmap_identify, NULL, NULL, NULL, NULL) -PG_RMGR(RM_STANDBY_ID, "Standby", standby_redo, standby_desc, standby_identify, NULL, NULL, NULL, standby_decode) -PG_RMGR(RM_HEAP2_ID, "Heap2", heap2_redo, heap2_desc, heap2_identify, NULL, NULL, heap_mask, heap2_decode) -PG_RMGR(RM_HEAP_ID, "Heap", heap_redo, heap_desc, heap_identify, NULL, NULL, heap_mask, heap_decode) -PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_identify, btree_xlog_startup, btree_xlog_cleanup, btree_mask, NULL) -PG_RMGR(RM_HASH_ID, "Hash", hash_redo, hash_desc, hash_identify, NULL, NULL, hash_mask, NULL) -PG_RMGR(RM_GIN_ID, "Gin", gin_redo, gin_desc, gin_identify, gin_xlog_startup, gin_xlog_cleanup, gin_mask, NULL) -PG_RMGR(RM_GIST_ID, "Gist", gist_redo, 
gist_desc, gist_identify, gist_xlog_startup, gist_xlog_cleanup, gist_mask, NULL) -PG_RMGR(RM_SEQ_ID, "Sequence", seq_redo, seq_desc, seq_identify, NULL, NULL, seq_mask, NULL) -PG_RMGR(RM_SPGIST_ID, "SPGist", spg_redo, spg_desc, spg_identify, spg_xlog_startup, spg_xlog_cleanup, spg_mask, NULL) -PG_RMGR(RM_BRIN_ID, "BRIN", brin_redo, brin_desc, brin_identify, NULL, NULL, brin_mask, NULL) -PG_RMGR(RM_COMMIT_TS_ID, "CommitTs", commit_ts_redo, commit_ts_desc, commit_ts_identify, NULL, NULL, NULL, NULL) -PG_RMGR(RM_REPLORIGIN_ID, "ReplicationOrigin", replorigin_redo, replorigin_desc, replorigin_identify, NULL, NULL, NULL, NULL) -PG_RMGR(RM_GENERIC_ID, "Generic", generic_redo, generic_desc, generic_identify, NULL, NULL, generic_mask, NULL) -PG_RMGR(RM_LOGICALMSG_ID, "LogicalMessage", logicalmsg_redo, logicalmsg_desc, logicalmsg_identify, NULL, NULL, NULL, logicalmsg_decode) +PG_RMGR(RM_XLOG_ID, "XLOG", xlog_redo, xlog_desc, xlog_identify, NULL, NULL, NULL, xlog_decode, NULL) +PG_RMGR(RM_XACT_ID, "Transaction", xact_redo, xact_desc, xact_identify, NULL, NULL, NULL, xact_decode, NULL) +PG_RMGR(RM_SMGR_ID, "Storage", smgr_redo, smgr_desc, smgr_identify, NULL, NULL, NULL, NULL, smgr_undo) +PG_RMGR(RM_CLOG_ID, "CLOG", clog_redo, clog_desc, clog_identify, NULL, NULL, NULL, NULL, NULL) +PG_RMGR(RM_DBASE_ID, "Database", dbase_redo, dbase_desc, dbase_identify, NULL, NULL, NULL, NULL, NULL) +PG_RMGR(RM_TBLSPC_ID, "Tablespace", tblspc_redo, tblspc_desc, tblspc_identify, NULL, NULL, NULL, NULL, NULL) +PG_RMGR(RM_MULTIXACT_ID, "MultiXact", multixact_redo, multixact_desc, multixact_identify, NULL, NULL, NULL, NULL, NULL) +PG_RMGR(RM_RELMAP_ID, "RelMap", relmap_redo, relmap_desc, relmap_identify, NULL, NULL, NULL, NULL, NULL) +PG_RMGR(RM_STANDBY_ID, "Standby", standby_redo, standby_desc, standby_identify, NULL, NULL, NULL, standby_decode, NULL) +PG_RMGR(RM_HEAP2_ID, "Heap2", heap2_redo, heap2_desc, heap2_identify, NULL, NULL, heap_mask, heap2_decode, NULL) +PG_RMGR(RM_HEAP_ID, "Heap", 
heap_redo, heap_desc, heap_identify, NULL, NULL, heap_mask, heap_decode, NULL) +PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_identify, btree_xlog_startup, btree_xlog_cleanup, btree_mask, NULL, NULL) +PG_RMGR(RM_HASH_ID, "Hash", hash_redo, hash_desc, hash_identify, NULL, NULL, hash_mask, NULL, NULL) +PG_RMGR(RM_GIN_ID, "Gin", gin_redo, gin_desc, gin_identify, gin_xlog_startup, gin_xlog_cleanup, gin_mask, NULL, NULL) +PG_RMGR(RM_GIST_ID, "Gist", gist_redo, gist_desc, gist_identify, gist_xlog_startup, gist_xlog_cleanup, gist_mask, NULL, NULL) +PG_RMGR(RM_SEQ_ID, "Sequence", seq_redo, seq_desc, seq_identify, NULL, NULL, seq_mask, NULL, NULL) +PG_RMGR(RM_SPGIST_ID, "SPGist", spg_redo, spg_desc, spg_identify, spg_xlog_startup, spg_xlog_cleanup, spg_mask, NULL, NULL) +PG_RMGR(RM_BRIN_ID, "BRIN", brin_redo, brin_desc, brin_identify, NULL, NULL, brin_mask, NULL, NULL) +PG_RMGR(RM_COMMIT_TS_ID, "CommitTs", commit_ts_redo, commit_ts_desc, commit_ts_identify, NULL, NULL, NULL, NULL, NULL) +PG_RMGR(RM_REPLORIGIN_ID, "ReplicationOrigin", replorigin_redo, replorigin_desc, replorigin_identify, NULL, NULL, NULL, NULL, NULL) +PG_RMGR(RM_GENERIC_ID, "Generic", generic_redo, generic_desc, generic_identify, NULL, NULL, generic_mask, NULL, NULL) +PG_RMGR(RM_LOGICALMSG_ID, "LogicalMessage", logicalmsg_redo, logicalmsg_desc, logicalmsg_identify, NULL, NULL, NULL, logicalmsg_decode, NULL) diff --git a/src/include/access/simpleundolog.h b/src/include/access/simpleundolog.h new file mode 100644 index 0000000000..3d3bd2f7e2 --- /dev/null +++ b/src/include/access/simpleundolog.h @@ -0,0 +1,36 @@ +#ifndef SIMPLE_UNDOLOG_H +#define SIMPLE_UNDOLOG_H + +#include "access/rmgr.h" +#include "port/pg_crc32c.h" + +#define SIMPLE_UNDOLOG_DIR "pg_ulog" + +typedef struct SimpleUndoLogRecord +{ + uint32 ul_tot_len; /* total length of entire record */ + pg_crc32c ul_crc; /* CRC for this record */ + RmgrId ul_rmid; /* resource manager for this record */ + uint8 ul_info; /* record info */ + 
TransactionId ul_xid; /* transaction id */ + /* rmgr-specific data follow, no padding */ +} SimpleUndoLogRecord; + +extern void SimpleUndoLogWrite(RmgrId rmgr, uint8 info, + TransactionId xid, void *data, int len); +extern void SimpleUndoLogSetPrpared(TransactionId xid, bool prepared); +extern void AtEOXact_SimpleUndoLog(bool isCommit, TransactionId xid); +extern void UndoLogCleanup(void); + +extern void AtPrepare_UndoLog(TransactionId xid); +extern void PostPrepare_UndoLog(void); +extern void undolog_twophase_recover(TransactionId xid, uint16 info, + void *recdata, uint32 len); +extern void undolog_twophase_postcommit(TransactionId xid, uint16 info, + void *recdata, uint32 len); +extern void undolog_twophase_postabort(TransactionId xid, uint16 info, + void *recdata, uint32 len); +extern void undolog_twophase_standby_recover(TransactionId xid, uint16 info, + void *recdata, uint32 len); + +#endif /* SIMPLE_UNDOLOG_H */ diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index badfe4abd6..00ee01af68 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -218,7 +218,7 @@ extern void xlog_desc(StringInfo buf, struct XLogReaderState *record); extern const char *xlog_identify(uint8 info); extern int XLogGetSyncBit(void); -extern const char *XLogFsyncFile(int fd); +extern bool XLogFsyncFile(int fd, const char **errmsg); extern void issue_xlog_fsync(int fd, XLogSegNo segno, TimeLineID tli); extern bool RecoveryInProgress(void); diff --git a/src/include/catalog/storage.h b/src/include/catalog/storage.h index 72ef3ee92c..2a63eabcbd 100644 --- a/src/include/catalog/storage.h +++ b/src/include/catalog/storage.h @@ -25,6 +25,8 @@ extern PGDLLIMPORT int wal_skip_threshold; extern SMgrRelation RelationCreateStorage(RelFileLocator rlocator, char relpersistence, bool register_delete); +extern void RelationCreateInitFork(Relation rel); +extern void RelationDropInitFork(Relation rel); extern void RelationDropStorage(Relation rel); extern void 
RelationPreserveStorage(RelFileLocator rlocator, bool atCommit); extern void RelationPreTruncate(Relation rel); @@ -43,6 +45,7 @@ extern void RestorePendingSyncs(char *startAddress); extern void smgrDoPendingDeletes(bool isCommit); extern void smgrDoPendingSyncs(bool isCommit, bool isParallelWorker); extern int smgrGetPendingDeletes(bool forCommit, RelFileLocator **ptr); +extern void smgrDoPendingCleanups(bool isCommit); extern void AtSubCommit_smgr(void); extern void AtSubAbort_smgr(void); extern void PostPrepare_smgr(void); diff --git a/src/include/catalog/storage_ulog.h b/src/include/catalog/storage_ulog.h new file mode 100644 index 0000000000..847f0403e2 --- /dev/null +++ b/src/include/catalog/storage_ulog.h @@ -0,0 +1,38 @@ +/*------------------------------------------------------------------------- + * + * storage_ulog.h + * prototypes for Undo Log support for backend/catalog/storage.c + * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/catalog/storage_ulog.h + * + *------------------------------------------------------------------------- + */ +#ifndef STORAGE_ULOG_H +#define STORAGE_ULOG_H + +#include "access/simpleundolog.h" +#include "storage/relfilelocator.h" + +/* ULOG gives us high 4 bits (just following xlog) */ +#define ULOG_SMGR_UNCOMMITED_STORAGE 0x10 + +/* undo log entry for uncommitted storage files */ +typedef struct ul_uncommitted_storage +{ + RelFileLocator rlocator; + ForkNumber forknum; + bool remove; +} ul_uncommitted_storage; + +/* flags for xl_smgr_truncate */ +#define SMGR_TRUNCATE_HEAP 0x0001 + +void smgr_undo(SimpleUndoLogRecord *record, bool crash_prepared); + +#define ULogRecGetData(record) ((char *)record + sizeof(SimpleUndoLogRecord)) + +#endif /* STORAGE_ULOG_H */ diff --git a/src/include/catalog/storage_xlog.h b/src/include/catalog/storage_xlog.h index a490e05f88..807c0f8235 100644 --- 
a/src/include/catalog/storage_xlog.h +++ b/src/include/catalog/storage_xlog.h @@ -29,13 +29,21 @@ /* XLOG gives us high 4 bits */ #define XLOG_SMGR_CREATE 0x10 #define XLOG_SMGR_TRUNCATE 0x20 +#define XLOG_SMGR_UNLINK 0x30 typedef struct xl_smgr_create { RelFileLocator rlocator; ForkNumber forkNum; + TransactionId xid; } xl_smgr_create; +typedef struct xl_smgr_unlink +{ + RelFileLocator rlocator; + ForkNumber forkNum; +} xl_smgr_unlink; + /* flags for xl_smgr_truncate */ #define SMGR_TRUNCATE_HEAP 0x0001 #define SMGR_TRUNCATE_VM 0x0002 @@ -51,6 +59,7 @@ typedef struct xl_smgr_truncate } xl_smgr_truncate; extern void log_smgrcreate(const RelFileLocator *rlocator, ForkNumber forkNum); +extern void log_smgrunlink(const RelFileLocator *rlocator, ForkNumber forkNum); extern void smgr_redo(XLogReaderState *record); extern void smgr_desc(StringInfo buf, XLogReaderState *record); diff --git a/src/include/storage/reinit.h b/src/include/storage/reinit.h index 1373d509df..c57ae26b4c 100644 --- a/src/include/storage/reinit.h +++ b/src/include/storage/reinit.h @@ -16,9 +16,11 @@ #define REINIT_H #include "common/relpath.h" +#include "storage/relfilelocator.h" extern void ResetUnloggedRelations(int op); +extern void ResetUnloggedRelationIgnore(RelFileLocator rloc); extern bool parse_filename_for_nontemp_relation(const char *name, RelFileNumber *relnumber, ForkNumber *fork, diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index fc5f883ce1..3428e5233b 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -84,6 +84,7 @@ extern void smgrrelease(SMgrRelation reln); extern void smgrreleaseall(void); extern void smgrreleaserellocator(RelFileLocatorBackend rlocator); extern void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo); +extern void smgrunlink(SMgrRelation reln, ForkNumber forknum, bool isRedo); extern void smgrdosyncall(SMgrRelation *rels, int nrels); extern void smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo); 
extern void smgrextend(SMgrRelation reln, ForkNumber forknum, diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 46a84c5714..29a3e52dbf 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -2073,6 +2073,7 @@ PatternInfo PatternInfoArray Pattern_Prefix_Status Pattern_Type +PendingCleanup PendingFsyncEntry PendingRelDelete PendingRelSync @@ -2645,6 +2646,7 @@ SimplePtrListCell SimpleStats SimpleStringList SimpleStringListCell +SimpleUndoLogRecord SingleBoundSortItem SinglePartitionSpec Size @@ -3017,6 +3019,8 @@ ULONG ULONG_PTR UV UVersionInfo +UndoDescData +UndoLogFileHeader UnicodeNormalizationForm UnicodeNormalizationQC Unique @@ -3983,6 +3987,7 @@ uint8 uint8_t uint8x16_t uintptr_t +ul_uncommitted_storage unicodeStyleBorderFormat unicodeStyleColumnFormat unicodeStyleFormat @@ -4095,6 +4100,7 @@ xl_running_xacts xl_seq_rec xl_smgr_create xl_smgr_truncate +xl_smgr_unlink xl_standby_lock xl_standby_locks xl_tblspc_create_rec -- 2.43.0
>From cef243b95fe49cbe753731eab55508e32800ac8f Mon Sep 17 00:00:00 2001 From: Kyotaro Horiguchi <horikyota....@gmail.com> Date: Mon, 4 Sep 2023 17:23:05 +0900 Subject: [PATCH v33 3/3] In-place table persistence change Previously, the command caused a large amount of file I/O due to heap rewrites, even though ALTER TABLE SET UNLOGGED does not require any data rewrites. This patch eliminates the need for rewrites. Additionally, ALTER TABLE SET LOGGED is updated to emit XLOG_FPI records instead of numerous HEAP_INSERTs when wal_level > minimal, reducing resource consumption. --- src/backend/access/rmgrdesc/smgrdesc.c | 12 + src/backend/access/transam/simpleundolog.c | 4 +- src/backend/catalog/storage.c | 338 ++++++++++++++++++++- src/backend/commands/tablecmds.c | 268 +++++++++++++--- src/backend/storage/buffer/bufmgr.c | 84 +++++ src/bin/pg_rewind/parsexlog.c | 6 + src/include/catalog/storage_xlog.h | 10 + src/include/storage/bufmgr.h | 3 + src/include/storage/reinit.h | 2 +- src/tools/pgindent/typedefs.list | 1 + 10 files changed, 684 insertions(+), 44 deletions(-) diff --git a/src/backend/access/rmgrdesc/smgrdesc.c b/src/backend/access/rmgrdesc/smgrdesc.c index 71410e0a2d..77a8fdb045 100644 --- a/src/backend/access/rmgrdesc/smgrdesc.c +++ b/src/backend/access/rmgrdesc/smgrdesc.c @@ -40,6 +40,15 @@ smgr_desc(StringInfo buf, XLogReaderState *record) xlrec->blkno, xlrec->flags); pfree(path); } + else if (info == XLOG_SMGR_BUFPERSISTENCE) + { + xl_smgr_bufpersistence *xlrec = (xl_smgr_bufpersistence *) rec; + char *path = relpathperm(xlrec->rlocator, MAIN_FORKNUM); + + appendStringInfoString(buf, path); + appendStringInfo(buf, " persistence %d", xlrec->persistence); + pfree(path); + } } const char * @@ -55,6 +64,9 @@ smgr_identify(uint8 info) case XLOG_SMGR_TRUNCATE: id = "TRUNCATE"; break; + case XLOG_SMGR_BUFPERSISTENCE: + id = "BUFPERSISTENCE"; + break; } return id; diff --git a/src/backend/access/transam/simpleundolog.c b/src/backend/access/transam/simpleundolog.c 
index e22ed67bae..ec26c95b32 100644 --- a/src/backend/access/transam/simpleundolog.c +++ b/src/backend/access/transam/simpleundolog.c @@ -75,10 +75,8 @@ undolog_sync_current_file(void) { const char *msg; - msg = XLogFsyncFile(current_ulogfile_fd); - /* PANIC if failed to fsync */ - if (msg) + if (!XLogFsyncFile(current_ulogfile_fd, &msg)) { ereport(PANIC, (errcode_for_file_access(), diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c index ae1bf597bd..3de229b53d 100644 --- a/src/backend/catalog/storage.c +++ b/src/backend/catalog/storage.c @@ -72,11 +72,13 @@ typedef struct PendingRelDelete } PendingRelDelete; #define PCOP_UNLINK_FORK (1 << 0) +#define PCOP_SET_PERSISTENCE (1 << 1) typedef struct PendingCleanup { RelFileLocator rlocator; /* relation that need a cleanup */ int op; /* operation mask */ + bool bufpersistence; /* buffer persistence to set */ ForkNumber unlink_forknum; /* forknum to unlink */ ProcNumber procNumber; /* INVALID_PROC_NUMBER if not a temp rel */ bool atCommit; /* T=delete at commit; F=delete at abort */ @@ -210,6 +212,208 @@ RelationCreateStorage(RelFileLocator rlocator, char relpersistence, return srel; } +/* + * RelationCreateInitFork + * Create physical storage for the init fork of a relation. + * + * Create the init fork for the relation. + * + * This function is transactional. The creation is WAL-logged, and if the + * transaction aborts later on, the init fork will be removed. 
+ */ +void +RelationCreateInitFork(Relation rel) +{ + RelFileLocator rlocator = rel->rd_locator; + PendingCleanup *pending; + PendingCleanup *prev; + PendingCleanup *next; + SMgrRelation srel; + ul_uncommitted_storage ul_storage; + bool create = true; + + /* switch buffer persistence */ + SetRelationBuffersPersistence(RelationGetSmgr(rel), false, false); + + /* + * If a pending-unlink exists for this relation's init-fork, it indicates + * the init-fork's existed before the current transaction; this function + * reverts the pending-unlink by removing the entry. See + * RelationDropInitFork. + */ + prev = NULL; + for (pending = pendingCleanups; pending != NULL; pending = next) + { + next = pending->next; + + if (RelFileLocatorEquals(rlocator, pending->rlocator) && + pending->unlink_forknum == INIT_FORKNUM) + { + /* write cancel log for preceding undo log entry */ + ul_storage.rlocator = rlocator; + ul_storage.forknum = INIT_FORKNUM; + ul_storage.remove = false; + SimpleUndoLogWrite(RM_SMGR_ID, ULOG_SMGR_UNCOMMITED_STORAGE, + GetCurrentTransactionId(), + &ul_storage, sizeof(ul_storage)); + + if (prev) + prev->next = next; + else + pendingCleanups = next; + + pfree(pending); + /* prev does not change */ + + create = false; + } + else + prev = pending; + } + + if (!create) + return; + + /* create undo log entry, then the init fork */ + srel = smgropen(rlocator, INVALID_PROC_NUMBER); + + /* write undo log */ + ul_storage.rlocator = rlocator; + ul_storage.forknum = INIT_FORKNUM; + ul_storage.remove = true; + SimpleUndoLogWrite(RM_SMGR_ID, ULOG_SMGR_UNCOMMITED_STORAGE, + GetCurrentTransactionId(), + &ul_storage, sizeof(ul_storage)); + + /* We don't have existing init fork, create it. */ + smgrcreate(srel, INIT_FORKNUM, false); + + /* + * For index relations, WAL-logging and file sync are handled by + * ambuildempty. In contrast, for heap relations, these tasks are performed + * directly. 
+ */ + if (rel->rd_rel->relkind == RELKIND_INDEX) + rel->rd_indam->ambuildempty(rel); + else + { + log_smgrcreate(&rlocator, INIT_FORKNUM); + smgrimmedsync(srel, INIT_FORKNUM); + } + + /* drop the init fork, mark file then revert persistence at abort */ + pending = (PendingCleanup *) + MemoryContextAlloc(TopMemoryContext, sizeof(PendingCleanup)); + pending->rlocator = rlocator; + pending->op = PCOP_UNLINK_FORK | PCOP_SET_PERSISTENCE; + pending->unlink_forknum = INIT_FORKNUM; + pending->bufpersistence = true; + pending->procNumber = INVALID_PROC_NUMBER; + pending->atCommit = false; + pending->nestLevel = GetCurrentTransactionNestLevel(); + pending->next = pendingCleanups; + pendingCleanups = pending; +} + +/* + * RelationDropInitFork + * Delete physical storage for the init fork of a relation. + */ +void +RelationDropInitFork(Relation rel) +{ + RelFileLocator rlocator = rel->rd_locator; + PendingCleanup *pending; + PendingCleanup *prev; + PendingCleanup *next; + bool inxact_created = false; + + /* switch buffer persistence */ + SetRelationBuffersPersistence(RelationGetSmgr(rel), true, false); + + /* + * Search for a pending-unlink associated with the init-fork of the + * relation. Its presence indicates that the init-fork was created within + * the current transaction. 
+ */ + prev = NULL; + for (pending = pendingCleanups; pending != NULL; pending = next) + { + next = pending->next; + + if (RelFileLocatorEquals(rlocator, pending->rlocator) && + pending->unlink_forknum == INIT_FORKNUM) + { + ul_uncommitted_storage ul_storage; + + /* write cancel log for preceding undo log entry */ + ul_storage.rlocator = rlocator; + ul_storage.forknum = INIT_FORKNUM; + ul_storage.remove = false; + SimpleUndoLogWrite(RM_SMGR_ID, ULOG_SMGR_UNCOMMITED_STORAGE, + GetCurrentTransactionId(), + &ul_storage, sizeof(ul_storage)); + + /* unlink list entry */ + if (prev) + prev->next = next; + else + pendingCleanups = next; + + pfree(pending); + + /* prev does not change */ + + inxact_created = true; + } + else + prev = pending; + } + + /* + * If the init-fork was created in this transaction, remove the init-fork + * and cancel preceding undo log. Otherwise, register an at-commit + * pending-unlink for the existing init-fork. See RelationCreateInitFork. + */ + if (inxact_created) + { + SMgrRelation srel = smgropen(rlocator, INVALID_PROC_NUMBER); + ForkNumber forknum = INIT_FORKNUM; + BlockNumber firstblock = 0; + ul_uncommitted_storage ul_storage; + + /* + * Some AMs initialize init-fork via the buffer manager. To properly + * drop the init-fork, first drop all buffers for the init-fork, then + * unlink the init-fork and cancel preceding undo log. 
+ */ + DropRelationBuffers(srel, &forknum, 1, &firstblock); + + /* cancel existing undo log */ + ul_storage.rlocator = rlocator; + ul_storage.forknum = INIT_FORKNUM; + ul_storage.remove = false; + SimpleUndoLogWrite(RM_SMGR_ID, ULOG_SMGR_UNCOMMITED_STORAGE, + GetCurrentTransactionId(), + &ul_storage, sizeof(ul_storage)); + log_smgrunlink(&rlocator, INIT_FORKNUM); + smgrunlink(srel, INIT_FORKNUM, false); + return; + } + + /* register drop of this init fork file at commit */ + pending = (PendingCleanup *) + MemoryContextAlloc(TopMemoryContext, sizeof(PendingCleanup)); + pending->rlocator = rlocator; + pending->op = PCOP_UNLINK_FORK; + pending->unlink_forknum = INIT_FORKNUM; + pending->procNumber = INVALID_PROC_NUMBER; + pending->atCommit = true; + pending->nestLevel = GetCurrentTransactionNestLevel(); + pending->next = pendingCleanups; + pendingCleanups = pending; +} + /* * Perform XLogInsert of an XLOG_SMGR_CREATE record to WAL. */ @@ -249,6 +453,25 @@ log_smgrunlink(const RelFileLocator *rlocator, ForkNumber forkNum) XLogInsert(RM_SMGR_ID, XLOG_SMGR_UNLINK | XLR_SPECIAL_REL_UPDATE); } +/* + * Perform XLogInsert of an XLOG_SMGR_BUFPERSISTENCE record to WAL. + */ +void +log_smgrbufpersistence(const RelFileLocator rlocator, bool persistence) +{ + xl_smgr_bufpersistence xlrec; + + /* + * Make an XLOG entry reporting the change of buffer persistence. + */ + xlrec.rlocator = rlocator; + xlrec.persistence = persistence; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xlrec)); + XLogInsert(RM_SMGR_ID, XLOG_SMGR_BUFPERSISTENCE | XLR_SPECIAL_REL_UPDATE); +} + /* * RelationDropStorage * Schedule unlinking of physical storage at transaction commit. 
@@ -782,7 +1005,14 @@ smgrDoPendingCleanups(bool isCommit) srel = smgropen(pending->rlocator, pending->procNumber); - Assert((pending->op & ~(PCOP_UNLINK_FORK)) == 0); + Assert((pending->op & + ~(PCOP_UNLINK_FORK | PCOP_SET_PERSISTENCE)) == 0); + + if (pending->op & PCOP_SET_PERSISTENCE) + { + SetRelationBuffersPersistence(srel, pending->bufpersistence, + InRecovery); + } if (pending->op & PCOP_UNLINK_FORK) { @@ -1182,6 +1412,112 @@ smgr_redo(XLogReaderState *record) FreeFakeRelcacheEntry(rel); } + else if (info == XLOG_SMGR_BUFPERSISTENCE) + { + xl_smgr_bufpersistence *xlrec = + (xl_smgr_bufpersistence *) XLogRecGetData(record); + SMgrRelation reln; + PendingCleanup *pending; + PendingCleanup *prev = NULL; + + reln = smgropen(xlrec->rlocator, INVALID_PROC_NUMBER); + SetRelationBuffersPersistence(reln, xlrec->persistence, true); + + /* + * Delete any pending action for persistence change, if present. There + * should be at most one entry for this action. + */ + for (pending = pendingCleanups; pending != NULL; + pending = pending->next) + { + if (RelFileLocatorEquals(xlrec->rlocator, pending->rlocator) && + (pending->op & PCOP_SET_PERSISTENCE) != 0) + { + Assert(pending->bufpersistence == xlrec->persistence); + + if (prev) + prev->next = pending->next; + else + pendingCleanups = pending->next; + + pfree(pending); + break; + } + + prev = pending; + } + + /* + * During abort, revert any changes to buffer persistence made in + * this transaction. 
+ */ + if (!pending) + { + pending = (PendingCleanup *) + MemoryContextAlloc(TopMemoryContext, sizeof(PendingCleanup)); + pending->rlocator = xlrec->rlocator; + pending->op = PCOP_SET_PERSISTENCE; + pending->bufpersistence = !xlrec->persistence; + pending->procNumber = INVALID_PROC_NUMBER; + pending->atCommit = false; + pending->nestLevel = GetCurrentTransactionNestLevel(); + pending->next = pendingCleanups; + pendingCleanups = pending; + } + } + else if (info == XLOG_SMGR_BUFPERSISTENCE) + { + xl_smgr_bufpersistence *xlrec = + (xl_smgr_bufpersistence *) XLogRecGetData(record); + SMgrRelation reln; + PendingCleanup *pending; + PendingCleanup *prev = NULL; + + reln = smgropen(xlrec->rlocator, INVALID_PROC_NUMBER); + SetRelationBuffersPersistence(reln, xlrec->persistence, true); + + /* + * Delete any pending action for persistence change, if present. There + * should be at most one entry for this action. + */ + for (pending = pendingCleanups; pending != NULL; + pending = pending->next) + { + if (RelFileLocatorEquals(xlrec->rlocator, pending->rlocator) && + (pending->op & PCOP_SET_PERSISTENCE) != 0) + { + Assert(pending->bufpersistence == xlrec->persistence); + + if (prev) + prev->next = pending->next; + else + pendingCleanups = pending->next; + + pfree(pending); + break; + } + + prev = pending; + } + + /* + * During abort, revert any changes to buffer persistence made in + * this transaction. 
+ */ + if (!pending) + { + pending = (PendingCleanup *) + MemoryContextAlloc(TopMemoryContext, sizeof(PendingCleanup)); + pending->rlocator = xlrec->rlocator; + pending->op = PCOP_SET_PERSISTENCE; + pending->bufpersistence = !xlrec->persistence; + pending->procNumber = INVALID_PROC_NUMBER; + pending->atCommit = false; + pending->nestLevel = GetCurrentTransactionNestLevel(); + pending->next = pendingCleanups; + pendingCleanups = pending; + } + } else elog(PANIC, "smgr_redo: unknown op code %u", info); } diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 313c782cae..47c46b13cd 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -5667,6 +5667,189 @@ ATParseTransformCmd(List **wqueue, AlteredTableInfo *tab, Relation rel, return newcmd; } +/* + * RelationChangePersistence: perform in-place persistence change of a relation + */ +static void +RelationChangePersistence(AlteredTableInfo *tab, char persistence, + LOCKMODE lockmode) +{ + Relation rel; + Relation classRel; + HeapTuple tuple, + newtuple; + Datum new_val[Natts_pg_class]; + bool new_null[Natts_pg_class], + new_repl[Natts_pg_class]; + int i; + List *relids; + ListCell *lc_oid; + + Assert(tab->rewrite == AT_REWRITE_ALTER_PERSISTENCE); + Assert(lockmode == AccessExclusiveLock); + + /* + * Use ATRewriteTable instead of this function under the following + * condition. + */ + Assert(tab->constraints == NULL && tab->partition_constraint == NULL && + tab->newvals == NULL && !tab->verify_new_notnull); + + rel = table_open(tab->relid, lockmode); + + Assert(rel->rd_rel->relpersistence != persistence); + + elog(DEBUG1, "perform in-place persistence change"); + + /* + * Initially, gather all relations that require a persistence change. 
+ */ + + /* Collect OIDs of indexes and toast relations */ + relids = RelationGetIndexList(rel); + relids = lcons_oid(rel->rd_id, relids); + + /* Add toast relation if any */ + if (OidIsValid(rel->rd_rel->reltoastrelid)) + { + List *toastidx; + Relation toastrel = table_open(rel->rd_rel->reltoastrelid, lockmode); + + relids = lappend_oid(relids, rel->rd_rel->reltoastrelid); + toastidx = RelationGetIndexList(toastrel); + relids = list_concat(relids, toastidx); + pfree(toastidx); + table_close(toastrel, NoLock); + } + + table_close(rel, NoLock); + + /* Make changes in storage */ + classRel = table_open(RelationRelationId, RowExclusiveLock); + + foreach(lc_oid, relids) + { + Oid reloid = lfirst_oid(lc_oid); + Relation r = relation_open(reloid, lockmode); + + /* + * XXXX: Some access methods don't support in-place persistence + * changes. GiST uses page LSNs to figure out whether a block has been + * modified. However, UNLOGGED GiST indexes use fake LSNs, which are + * incompatible with the real LSNs used for LOGGED indexes. + * + * Potentially, if gistGetFakeLSN behaved similarly for both permanent + * and unlogged indexes, we could avoid index rebuilds by emitting + * extra WAL records while the index is unlogged. + * + * Compare relam against a positive list to ensure the hard way is + * taken for unknown AMs. + */ + if (r->rd_rel->relkind == RELKIND_INDEX && + /* GiST is excluded */ + r->rd_rel->relam != BTREE_AM_OID && + r->rd_rel->relam != HASH_AM_OID && + r->rd_rel->relam != GIN_AM_OID && + r->rd_rel->relam != SPGIST_AM_OID && + r->rd_rel->relam != BRIN_AM_OID) + { + int reindex_flags; + ReindexParams params = {0}; + + /* reindex doesn't allow concurrent use of the index */ + table_close(r, NoLock); + + reindex_flags = + REINDEX_REL_SUPPRESS_INDEX_USE | + REINDEX_REL_CHECK_CONSTRAINTS; + + /* Set the same persistence with the parent relation. 
*/ + if (persistence == RELPERSISTENCE_UNLOGGED) + reindex_flags |= REINDEX_REL_FORCE_INDEXES_UNLOGGED; + else + reindex_flags |= REINDEX_REL_FORCE_INDEXES_PERMANENT; + + /* this doesn't fire REINDEX event trigger */ + reindex_index(NULL, reloid, reindex_flags, persistence, &params); + + continue; + } + + /* Create or drop init fork */ + if (persistence == RELPERSISTENCE_UNLOGGED) + RelationCreateInitFork(r); + else + RelationDropInitFork(r); + + /* + * If this relation becomes WAL-logged, immediately sync all files + * except the init-fork to establish the initial state on storage. The + * buffers should have already been flushed out by + * RelationCreate(Drop)InitFork called just above. The init-fork should + * already be synchronized as required. + */ + if (persistence == RELPERSISTENCE_PERMANENT) + { + for (i = 0; i < INIT_FORKNUM; i++) + { + if (smgrexists(RelationGetSmgr(r), i)) + smgrimmedsync(RelationGetSmgr(r), i); + } + } + + /* Update catalog */ + tuple = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(reloid)); + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "cache lookup failed for relation %u", reloid); + + memset(new_val, 0, sizeof(new_val)); + memset(new_null, false, sizeof(new_null)); + memset(new_repl, false, sizeof(new_repl)); + + new_val[Anum_pg_class_relpersistence - 1] = CharGetDatum(persistence); + new_null[Anum_pg_class_relpersistence - 1] = false; + new_repl[Anum_pg_class_relpersistence - 1] = true; + + newtuple = heap_modify_tuple(tuple, RelationGetDescr(classRel), + new_val, new_null, new_repl); + + CatalogTupleUpdate(classRel, &newtuple->t_self, newtuple); + heap_freetuple(newtuple); + + /* + * If wal_level >= replica, switching to LOGGED necessitates WAL-logging + * the relation content for later recovery. This is not emitted when + * wal_level = minimal. 
+ */ + if (persistence == RELPERSISTENCE_PERMANENT && XLogIsNeeded()) + { + ForkNumber fork; + xl_smgr_truncate xlrec; + + xlrec.blkno = 0; + xlrec.rlocator = r->rd_locator; + xlrec.flags = SMGR_TRUNCATE_ALL; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, sizeof(xlrec)); + + XLogInsert(RM_SMGR_ID, XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE); + + for (fork = 0; fork < INIT_FORKNUM; fork++) + { + if (smgrexists(RelationGetSmgr(r), fork)) + log_newpage_range(r, fork, 0, + smgrnblocks(RelationGetSmgr(r), fork), + false); + } + } + + table_close(r, NoLock); + } + + table_close(classRel, NoLock); +} + /* * ATRewriteTables: ALTER TABLE phase 3 */ @@ -5797,48 +5980,55 @@ ATRewriteTables(AlterTableStmt *parsetree, List **wqueue, LOCKMODE lockmode, tab->relid, tab->rewrite); - /* - * Create transient table that will receive the modified data. - * - * Ensure it is marked correctly as logged or unlogged. We have - * to do this here so that buffers for the new relfilenumber will - * have the right persistence set, and at the same time ensure - * that the original filenumbers's buffers will get read in with - * the correct setting (i.e. the original one). Otherwise a - * rollback after the rewrite would possibly result with buffers - * for the original filenumbers having the wrong persistence - * setting. - * - * NB: This relies on swap_relation_files() also swapping the - * persistence. That wouldn't work for pg_class, but that can't be - * unlogged anyway. - */ - OIDNewHeap = make_new_heap(tab->relid, NewTableSpace, NewAccessMethod, - persistence, lockmode); + if (tab->rewrite == AT_REWRITE_ALTER_PERSISTENCE) + RelationChangePersistence(tab, persistence, lockmode); + else + { + /* + * Create transient table that will receive the modified data. + * + * Ensure it is marked correctly as logged or unlogged. 
We + * have to do this here so that buffers for the new + * relfilenumber will have the right persistence set, and at + * the same time ensure that the original filenumbers's buffers + * will get read in with the correct setting (i.e. the original + * one). Otherwise a rollback after the rewrite would possibly + * result with buffers for the original filenumbers having the + * wrong persistence setting. + * + * NB: This relies on swap_relation_files() also swapping the + * persistence. That wouldn't work for pg_class, but that + * can't be unlogged anyway. + */ + OIDNewHeap = make_new_heap(tab->relid, NewTableSpace, + NewAccessMethod, + persistence, lockmode); - /* - * Copy the heap data into the new table with the desired - * modifications, and test the current data within the table - * against new constraints generated by ALTER TABLE commands. - */ - ATRewriteTable(tab, OIDNewHeap, lockmode); + /* + * Copy the heap data into the new table with the desired + * modifications, and test the current data within the table + * against new constraints generated by ALTER TABLE commands. + */ + ATRewriteTable(tab, OIDNewHeap, lockmode); - /* - * Swap the physical files of the old and new heaps, then rebuild - * indexes and discard the old heap. We can use RecentXmin for - * the table's new relfrozenxid because we rewrote all the tuples - * in ATRewriteTable, so no older Xid remains in the table. Also, - * we never try to swap toast tables by content, since we have no - * interest in letting this code work on system catalogs. - */ - finish_heap_swap(tab->relid, OIDNewHeap, - false, false, true, - !OidIsValid(tab->newTableSpace), - RecentXmin, - ReadNextMultiXactId(), - persistence); + /* + * Swap the physical files of the old and new heaps, then + * rebuild indexes and discard the old heap. We can use + * RecentXmin for the table's new relfrozenxid because we + * rewrote all the tuples in ATRewriteTable, so no older Xid + * remains in the table. 
Also, we never try to swap toast + * tables by content, since we have no interest in letting + * this code work on system catalogs. + */ + finish_heap_swap(tab->relid, OIDNewHeap, + false, false, true, + !OidIsValid(tab->newTableSpace), + RecentXmin, + ReadNextMultiXactId(), + persistence); - InvokeObjectPostAlterHook(RelationRelationId, tab->relid, 0); + InvokeObjectPostAlterHook(RelationRelationId, tab->relid, 0); + } } else if (tab->rewrite > 0 && tab->relkind == RELKIND_SEQUENCE) { diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 49637284f9..ed933e7b9e 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -4085,6 +4085,90 @@ DropRelationBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum, } } +/* --------------------------------------------------------------------- + * SetRelationBuffersPersistence + * + * This function changes the persistence of all buffer pages of a relation + * then writes all dirty pages to disk (or kernel disk buffers) when + * switching to PERMANENT, ensuring the kernel has an up-to-date view of + * the relation. + * + * The caller must be holding AccessExclusiveLock on the target relation + * to ensure no other backend is busy dirtying more blocks. + * + * XXX currently it sequentially searches the buffer pool; consider + * implementing more efficient search methods. This routine isn't used in + * performance-critical code paths, so it's not worth additional overhead + * to make it go faster; see also DropRelationBuffers. 
+ * -------------------------------------------------------------------- + */ +void +SetRelationBuffersPersistence(SMgrRelation srel, bool permanent, bool isRedo) +{ + int i; + RelFileLocatorBackend rlocator = srel->smgr_rlocator; + + Assert(!RelFileLocatorBackendIsTemp(rlocator)); + + if (!isRedo) + log_smgrbufpersistence(srel->smgr_rlocator.locator, permanent); + + ResourceOwnerEnlarge(CurrentResourceOwner); + + for (i = 0; i < NBuffers; i++) + { + BufferDesc *bufHdr = GetBufferDescriptor(i); + uint32 buf_state; + + if (!RelFileLocatorEquals(BufTagGetRelFileLocator(&bufHdr->tag), + rlocator.locator)) + continue; + + ReservePrivateRefCountEntry(); + + buf_state = LockBufHdr(bufHdr); + + if (!RelFileLocatorEquals(BufTagGetRelFileLocator(&bufHdr->tag), + rlocator.locator)) + { + UnlockBufHdr(bufHdr, buf_state); + continue; + } + + if (permanent) + { + /* The init fork is being dropped, drop buffers for it. */ + if (BufTagGetForkNum(&bufHdr->tag) == INIT_FORKNUM) + { + InvalidateBuffer(bufHdr); + continue; + } + + buf_state |= BM_PERMANENT; + pg_atomic_write_u32(&bufHdr->state, buf_state); + + /* flush this buffer when switching to PERMANENT */ + if ((buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY)) + { + PinBuffer_Locked(bufHdr); + LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), + LW_SHARED); + FlushBuffer(bufHdr, srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL); + LWLockRelease(BufferDescriptorGetContentLock(bufHdr)); + UnpinBuffer(bufHdr); + } + else + UnlockBufHdr(bufHdr, buf_state); + } + else + { + /* There shouldn't be an init fork */ + Assert(BufTagGetForkNum(&bufHdr->tag) != INIT_FORKNUM); + UnlockBufHdr(bufHdr, buf_state); + } + } +} + /* --------------------------------------------------------------------- * DropRelationsAllBuffers * diff --git a/src/bin/pg_rewind/parsexlog.c b/src/bin/pg_rewind/parsexlog.c index 525b98899f..c8c9cc361f 100644 --- a/src/bin/pg_rewind/parsexlog.c +++ b/src/bin/pg_rewind/parsexlog.c @@ -418,6 +418,12 @@ 
extractPageInfo(XLogReaderState *record) * source system. */ } + else if (rmid == RM_SMGR_ID && rminfo == XLOG_SMGR_BUFPERSISTENCE) + { + /* + * We can safely ignore these. These don't make any on-disk changes. + */ + } else if (rmid == RM_XACT_ID && ((rminfo & XLOG_XACT_OPMASK) == XLOG_XACT_COMMIT || (rminfo & XLOG_XACT_OPMASK) == XLOG_XACT_COMMIT_PREPARED || diff --git a/src/include/catalog/storage_xlog.h b/src/include/catalog/storage_xlog.h index 807c0f8235..b38909ceb3 100644 --- a/src/include/catalog/storage_xlog.h +++ b/src/include/catalog/storage_xlog.h @@ -14,6 +14,7 @@ #ifndef STORAGE_XLOG_H #define STORAGE_XLOG_H +#include "access/simpleundolog.h" #include "access/xlogreader.h" #include "lib/stringinfo.h" #include "storage/block.h" @@ -30,6 +31,7 @@ #define XLOG_SMGR_CREATE 0x10 #define XLOG_SMGR_TRUNCATE 0x20 #define XLOG_SMGR_UNLINK 0x30 +#define XLOG_SMGR_BUFPERSISTENCE 0x40 typedef struct xl_smgr_create { @@ -44,6 +46,12 @@ typedef struct xl_smgr_unlink ForkNumber forkNum; } xl_smgr_unlink; +typedef struct xl_smgr_bufpersistence +{ + RelFileLocator rlocator; + bool persistence; +} xl_smgr_bufpersistence; + /* flags for xl_smgr_truncate */ #define SMGR_TRUNCATE_HEAP 0x0001 #define SMGR_TRUNCATE_VM 0x0002 @@ -60,6 +68,8 @@ typedef struct xl_smgr_truncate extern void log_smgrcreate(const RelFileLocator *rlocator, ForkNumber forkNum); extern void log_smgrunlink(const RelFileLocator *rlocator, ForkNumber forkNum); +extern void log_smgrbufpersistence(const RelFileLocator rlocator, + bool persistence); extern void smgr_redo(XLogReaderState *record); extern void smgr_desc(StringInfo buf, XLogReaderState *record); diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index 08364447c7..552fa609c2 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -272,6 +272,9 @@ extern void DropRelationBuffers(struct SMgrRelationData *smgr_reln, int nforks, BlockNumber *firstDelBlock); extern void DropRelationsAllBuffers(struct 
SMgrRelationData **smgr_reln, int nlocators); +extern void SetRelationBuffersPersistence(struct SMgrRelationData *srel, + bool permanent, bool isRedo); + extern void DropDatabaseBuffers(Oid dbid); #define RelationGetNumberOfBlocks(reln) \ diff --git a/src/include/storage/reinit.h b/src/include/storage/reinit.h index c57ae26b4c..746d3a910a 100644 --- a/src/include/storage/reinit.h +++ b/src/include/storage/reinit.h @@ -20,11 +20,11 @@ extern void ResetUnloggedRelations(int op); -extern void ResetUnloggedRelationIgnore(RelFileLocator rloc); extern bool parse_filename_for_nontemp_relation(const char *name, RelFileNumber *relnumber, ForkNumber *fork, unsigned *segno); +extern void ResetUnloggedRelationIgnore(RelFileLocator rloc); #define UNLOGGED_RELATION_CLEANUP 0x0001 #define UNLOGGED_RELATION_INIT 0x0002 diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 29a3e52dbf..c477f5fac6 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -4098,6 +4098,7 @@ xl_replorigin_set xl_restore_point xl_running_xacts xl_seq_rec +xl_smgr_bufpersistence xl_smgr_create xl_smgr_truncate xl_smgr_unlink -- 2.43.0