On Wed, Apr 6, 2016 at 3:11 PM, Michael Paquier <michael.paqu...@gmail.com> wrote: > On Wed, Mar 23, 2016 at 12:45 PM, Michael Paquier > <michael.paqu...@gmail.com> wrote: >> On Wed, Mar 23, 2016 at 11:11 AM, David Steele <da...@pgmasters.net> wrote: >>> I would prefer not to bump it to the next CF unless we decide this will >>> not get fixed for 9.6. >> >> It may make sense to add that to the list of open items for 9.6 >> instead. That's not a feature. > > So I have moved this patch to the next CF for now, and will work on > fixing it rather soonishly as an effort to stabilize 9.6 as well as > back-branches.
Well, not that soon in the end, but I am back on this... I have not completely reviewed all the code yet, and the case of an index relation referring to a relation optimized with truncate is still broken, but for now here is a rebased patch if people are interested. I am also going to put together a TAP test to ease testing. -- Michael
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 38bba16..bbc09cd 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -55,6 +55,7 @@ #include "access/xlogutils.h" #include "catalog/catalog.h" #include "catalog/namespace.h" +#include "catalog/storage.h" #include "miscadmin.h" #include "pgstat.h" #include "storage/bufmgr.h" @@ -2331,12 +2332,6 @@ FreeBulkInsertState(BulkInsertState bistate) * The new tuple is stamped with current transaction ID and the specified * command ID. * - * If the HEAP_INSERT_SKIP_WAL option is specified, the new tuple is not - * logged in WAL, even for a non-temp relation. Safe usage of this behavior - * requires that we arrange that all new tuples go into new pages not - * containing any tuples from other transactions, and that the relation gets - * fsync'd before commit. (See also heap_sync() comments) - * * The HEAP_INSERT_SKIP_FSM option is passed directly to * RelationGetBufferForTuple, which see for more info. * @@ -2440,7 +2435,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, MarkBufferDirty(buffer); /* XLOG stuff */ - if (!(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation)) + if (HeapNeedsWAL(relation, buffer)) { xl_heap_insert xlrec; xl_heap_header xlhdr; @@ -2639,12 +2634,10 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples, int ndone; char *scratch = NULL; Page page; - bool needwal; Size saveFreeSpace; bool need_tuple_data = RelationIsLogicallyLogged(relation); bool need_cids = RelationIsAccessibleInLogicalDecoding(relation); - needwal = !(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation); saveFreeSpace = RelationGetTargetPageFreeSpace(relation, HEAP_DEFAULT_FILLFACTOR); @@ -2659,7 +2652,7 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples, * palloc() within a critical section is not safe, so we allocate this * beforehand. 
*/ - if (needwal) + if (RelationNeedsWAL(relation)) scratch = palloc(BLCKSZ); /* @@ -2727,7 +2720,7 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples, * We don't use heap_multi_insert for catalog tuples yet, but * better be prepared... */ - if (needwal && need_cids) + if (HeapNeedsWAL(relation, buffer) && need_cids) log_heap_new_cid(relation, heaptup); } @@ -2747,7 +2740,7 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples, MarkBufferDirty(buffer); /* XLOG stuff */ - if (needwal) + if (HeapNeedsWAL(relation, buffer)) { XLogRecPtr recptr; xl_heap_multi_insert *xlrec; @@ -3261,7 +3254,7 @@ l1: * NB: heap_abort_speculative() uses the same xlog record and replay * routines. */ - if (RelationNeedsWAL(relation)) + if (HeapNeedsWAL(relation, buffer)) { xl_heap_delete xlrec; XLogRecPtr recptr; @@ -3982,7 +3975,7 @@ l2: MarkBufferDirty(buffer); - if (RelationNeedsWAL(relation)) + if (HeapNeedsWAL(relation, buffer)) { xl_heap_lock xlrec; XLogRecPtr recptr; @@ -4194,7 +4187,7 @@ l2: MarkBufferDirty(buffer); /* XLOG stuff */ - if (RelationNeedsWAL(relation)) + if (HeapNeedsWAL(relation, buffer)) { XLogRecPtr recptr; @@ -5148,7 +5141,7 @@ failed: * (Also, in a PITR log-shipping or 2PC environment, we have to have XLOG * entries for everything anyway.) */ - if (RelationNeedsWAL(relation)) + if (HeapNeedsWAL(relation, *buffer)) { xl_heap_lock xlrec; XLogRecPtr recptr; @@ -5825,7 +5818,7 @@ l4: MarkBufferDirty(buf); /* XLOG stuff */ - if (RelationNeedsWAL(rel)) + if (HeapNeedsWAL(rel, buf)) { xl_heap_lock_updated xlrec; XLogRecPtr recptr; @@ -5980,7 +5973,7 @@ heap_finish_speculative(Relation relation, HeapTuple tuple) htup->t_ctid = tuple->t_self; /* XLOG stuff */ - if (RelationNeedsWAL(relation)) + if (HeapNeedsWAL(relation, buffer)) { xl_heap_confirm xlrec; XLogRecPtr recptr; @@ -6112,7 +6105,7 @@ heap_abort_speculative(Relation relation, HeapTuple tuple) * The WAL records generated here match heap_delete(). 
The same recovery * routines are used. */ - if (RelationNeedsWAL(relation)) + if (HeapNeedsWAL(relation, buffer)) { xl_heap_delete xlrec; XLogRecPtr recptr; @@ -6218,7 +6211,7 @@ heap_inplace_update(Relation relation, HeapTuple tuple) MarkBufferDirty(buffer); /* XLOG stuff */ - if (RelationNeedsWAL(relation)) + if (HeapNeedsWAL(relation, buffer)) { xl_heap_inplace xlrec; XLogRecPtr recptr; @@ -9081,3 +9074,71 @@ heap_sync(Relation rel) heap_close(toastrel, AccessShareLock); } } + +/* + * heap_register_sync - register a heap to be synced to disk at commit + * + * This can be used to skip WAL-logging changes on a relation file that has + * been created in the same transaction. After calling this, any changes to + * the heap (including TOAST heap if any) in the same transaction will not be + * WAL-logged. Instead, the heap contents are flushed to disk at commit, + * like heap_sync() does. + * + * Like with heap_sync(), indexes are not touched. + */ +void +heap_register_sync(Relation rel) +{ + /* non-WAL-logged tables never need fsync */ + if (!RelationNeedsWAL(rel)) + return; + + smgrRegisterPendingSync(rel); + if (OidIsValid(rel->rd_rel->reltoastrelid)) + { + Relation toastrel; + + toastrel = heap_open(rel->rd_rel->reltoastrelid, AccessShareLock); + smgrRegisterPendingSync(toastrel); + heap_close(toastrel, AccessShareLock); + } +} + +/* + * Do changes to given heap page need to be WAL-logged? + * + * This takes into account any previous heap_register_sync() requests. + * + * Note that it is required to use this before creating any WAL records for + * heap pages - it is not merely an optimization. WAL-logging a record, + * when we have already skipped a previous WAL record for the same page could + * lead to failure at WAL replay, as the "before" state expected by the + * record might not match what's on disk (this should only a be problem + * with full_page_writes=off, though). 
+ */ +bool +HeapNeedsWAL(Relation rel, Buffer buf) +{ + /* Temporary relations never need WAL */ + if (!RelationNeedsWAL(rel)) + return false; + + /* + * If we are going to fsync() the relation at COMMIT, and we have not + * truncated the block away previously, and we have not emitted any WAL + * records for this block yet, we can skip WAL-logging it. + */ + if (smgrIsSyncPending(rel->rd_node, BufferGetBlockNumber(buf))) + { + /* + * If a pending fsync() will handle this page, its LSN should be + * invalid. If it's not, we've already emitted a WAL record for this + * block, and all subsequent changes to the block must be WAL-logged + * too. + */ + Assert(PageGetLSN(BufferGetPage(buf)) == InvalidXLogRecPtr); + return false; + } + + return true; +} diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c index 6ff9251..3207134 100644 --- a/src/backend/access/heap/pruneheap.c +++ b/src/backend/access/heap/pruneheap.c @@ -260,7 +260,7 @@ heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin, /* * Emit a WAL HEAP_CLEAN record showing what we did */ - if (RelationNeedsWAL(relation)) + if (HeapNeedsWAL(relation, buffer)) { XLogRecPtr recptr; diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c index 3ad4a9f..fb07795 100644 --- a/src/backend/access/heap/visibilitymap.c +++ b/src/backend/access/heap/visibilitymap.c @@ -307,6 +307,7 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf, map[mapByte] |= (flags << mapOffset); MarkBufferDirty(vmBuf); + /* XXX: Should we use HeapNeedsWAL here? 
*/ if (RelationNeedsWAL(rel)) { if (XLogRecPtrIsInvalid(recptr)) diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 23f36ea..f66d9ab 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -2007,6 +2007,9 @@ CommitTransaction(void) /* close large objects before lower-level cleanup */ AtEOXact_LargeObject(true); + /* Flush updates to relations that we didn't WAL-logged */ + smgrDoPendingSyncs(true); + /* * Mark serializable transaction as complete for predicate locking * purposes. This should be done as late as we can put it and still allow @@ -2237,6 +2240,9 @@ PrepareTransaction(void) /* close large objects before lower-level cleanup */ AtEOXact_LargeObject(true); + /* Flush updates to relations that we didn't WAL-logged */ + smgrDoPendingSyncs(true); + /* * Mark serializable transaction as complete for predicate locking * purposes. This should be done as late as we can put it and still allow @@ -2541,6 +2547,7 @@ AbortTransaction(void) AtAbort_Notify(); AtEOXact_RelationMap(false); AtAbort_Twophase(); + smgrDoPendingSyncs(false); /* * Advertise the fact that we aborted in pg_clog (assuming that we got as diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c index 0d8311c..54ff874 100644 --- a/src/backend/catalog/storage.c +++ b/src/backend/catalog/storage.c @@ -20,6 +20,7 @@ #include "postgres.h" #include "access/visibilitymap.h" +#include "access/transam.h" #include "access/xact.h" #include "access/xlog.h" #include "access/xloginsert.h" @@ -29,6 +30,7 @@ #include "catalog/storage_xlog.h" #include "storage/freespace.h" #include "storage/smgr.h" +#include "utils/hsearch.h" #include "utils/memutils.h" #include "utils/rel.h" @@ -64,6 +66,42 @@ typedef struct PendingRelDelete static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */ /* + * We also track relation files (RelFileNode values) that have been created + * in the same transaction, and that have been 
modified without WAL-logging + * the action. When we are about to begin a large operation on the relation, + * a PendingRelSync entry is created, and 'sync_above' is set to the current + * size of the relation. Any operations on blocks < sync_above need to be + * WAL-logged as usual, but for operations on higher blocks, WAL-logging is + * skipped. It's important that after WAL-logging has been skipped for a + * block, we don't WAL log any subsequent actions on the same block either. + * Replaying the WAl record of the subsequent action might fail otherwise, + * as the "before" state of the block might not match, as the earlier actions + * were not WAL-logged. + * + * If a relation is truncated (without creating a new relfilenode), and we + * emit a WAL record of the truncation, we cannot skip WAL-logging for that + * relation anymore, as replaying the truncation record will destroy all the + * data inserted after that. But if we have already decided to skip WAL-logging + * changes to a relation, and the relation is truncated, we don't need to + * WAL-log the truncation either. + * + * This mechanism is currently only used by heaps. Indexes are always + * WAL-logged. Also, this only applies for wal_level=minimal; with higher + * WAL levels we need the WAL for PITR/replication anyway. + */ +/* Relations that need to be fsync'd at commit */ +typedef struct PendingRelSync +{ + RelFileNode relnode; /* relation created in same xact */ + BlockNumber sync_above; /* WAL-logging skipped for blocks >= sync_above */ + bool truncated; /* truncation WAL record was written */ +} PendingRelSync; + +static HTAB *pendingSyncs = NULL; + +static void createPendingSyncsHash(void); + +/* * RelationCreateStorage * Create physical storage for a relation. 
* @@ -226,6 +264,8 @@ RelationPreserveStorage(RelFileNode rnode, bool atCommit) void RelationTruncate(Relation rel, BlockNumber nblocks) { + PendingRelSync *pending = NULL; + bool found; bool fsm; bool vm; @@ -249,6 +289,17 @@ RelationTruncate(Relation rel, BlockNumber nblocks) if (vm) visibilitymap_truncate(rel, nblocks); + if (!pendingSyncs) + createPendingSyncsHash(); + pending = (PendingRelSync *) hash_search(pendingSyncs, + (void *) &rel->rd_node, + HASH_ENTER, &found); + if (!found) + { + pending->sync_above = InvalidBlockNumber; + pending->truncated = false; + } + /* * We WAL-log the truncation before actually truncating, which means * trouble if the truncation fails. If we then crash, the WAL replay @@ -258,7 +309,8 @@ RelationTruncate(Relation rel, BlockNumber nblocks) * failure to truncate, that might spell trouble at WAL replay, into a * certain PANIC. */ - if (RelationNeedsWAL(rel)) + if (RelationNeedsWAL(rel) && + (pending->sync_above == InvalidBlockNumber || pending->sync_above < nblocks)) { /* * Make an XLOG entry reporting the file truncation. 
@@ -276,6 +328,9 @@ RelationTruncate(Relation rel, BlockNumber nblocks) lsn = XLogInsert(RM_SMGR_ID, XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE); + if (rel->rd_node.relNode >= FirstNormalObjectId) + elog(LOG, "WAL-logged truncation of rel %u to %u blocks", rel->rd_node.relNode, nblocks); + /* * Flush, because otherwise the truncation of the main relation might * hit the disk before the WAL record, and the truncation of the FSM @@ -285,6 +340,8 @@ RelationTruncate(Relation rel, BlockNumber nblocks) */ if (fsm || vm) XLogFlush(lsn); + + pending->truncated = true; } /* Do the real work */ @@ -419,6 +476,142 @@ smgrGetPendingDeletes(bool forCommit, RelFileNode **ptr) return nrels; } +/* create the hash table to track pending at-commit fsyncs */ +static void +createPendingSyncsHash(void) +{ + /* First time through: initialize the hash table */ + HASHCTL ctl; + + MemSet(&ctl, 0, sizeof(ctl)); + ctl.keysize = sizeof(RelFileNode); + ctl.entrysize = sizeof(PendingRelSync); + ctl.hash = tag_hash; + pendingSyncs = hash_create("pending relation sync table", 5, + &ctl, HASH_ELEM | HASH_FUNCTION); +} + +/* + * Remember that the given relation needs to be sync'd at commit, because + * we are going to skip WAL-logging subsequent actions to it. 
+ */ +void +smgrRegisterPendingSync(Relation rel) +{ + PendingRelSync *pending; + bool found; + BlockNumber nblocks; + + nblocks = RelationGetNumberOfBlocks(rel); + + if (!pendingSyncs) + createPendingSyncsHash(); + + /* Look up or create an entry */ + pending = (PendingRelSync *) hash_search(pendingSyncs, + (void *) &rel->rd_node, + HASH_ENTER, &found); + if (!found) + { + pending->truncated = false; + pending->sync_above = nblocks; + + if (rel->rd_node.relNode >= FirstNormalObjectId) + elog(LOG, "Registering new pending sync for rel %u at block %u", rel->rd_node.relNode, nblocks); + + } + else if (pending->sync_above == InvalidBlockNumber) + { + if (rel->rd_node.relNode >= FirstNormalObjectId) + elog(LOG, "Registering pending sync for rel %u at block %u", rel->rd_node.relNode, nblocks); + pending->sync_above = nblocks; + } + else + if (rel->rd_node.relNode >= FirstNormalObjectId) + elog(LOG, "Not updating pending sync for rel %u at block %u (was %u)", rel->rd_node.relNode, nblocks, pending->sync_above); +} + +/* + * Are we going to fsync() this relation at COMMIT, and hence don't need to + * WAL-log changes to the given block? + */ +bool +smgrIsSyncPending(RelFileNode rnode, BlockNumber blkno) +{ + PendingRelSync *pending; + bool found; + + if (!pendingSyncs) + return false; + + pending = (PendingRelSync *) hash_search(pendingSyncs, + (void *) &rnode, + HASH_FIND, &found); + if (!found) + return false; + + /* + * We have no fsync() pending for this relation, or we have (possibly) + * already emitted WAL records for this block. + */ + if (pending->sync_above == InvalidBlockNumber || + pending->sync_above > blkno) + { + if (rnode.relNode >= FirstNormalObjectId) + elog(LOG, "Not skipping WAL-logging for rel %u block %u, because sync_above is %u", rnode.relNode, blkno, pending->sync_above); + return false; + } + + /* + * We have emitted a truncation record for this block. 
+ */ + if (pending->truncated) + { + if (rnode.relNode >= FirstNormalObjectId) + elog(LOG, "Not skipping WAL-logging for rel %u block %u, because it was truncated", rnode.relNode, blkno); + return false; + } + + if (rnode.relNode >= FirstNormalObjectId) + elog(LOG, "Skipping WAL-logging for rel %u block %u", rnode.relNode, blkno); + + return true; +} + +/* + * Sync to disk any relations that we skipped WAL-logging for earlier. + */ +void +smgrDoPendingSyncs(bool isCommit) +{ + if (!pendingSyncs) + return; + + if (isCommit) + { + HASH_SEQ_STATUS status; + PendingRelSync *pending; + + hash_seq_init(&status, pendingSyncs); + + while ((pending = hash_seq_search(&status)) != NULL) + { + if (pending->sync_above != InvalidBlockNumber) + { + FlushRelFileNodeBuffers(pending->relnode, false); + /* FlushRelationBuffers will have opened rd_smgr */ + smgrimmedsync(smgropen(pending->relnode, InvalidBackendId), MAIN_FORKNUM); + + if (pending->relnode.relNode >= FirstNormalObjectId) + elog(LOG, "Syncing rel %u", pending->relnode.relNode); + } + } + } + + hash_destroy(pendingSyncs); + pendingSyncs = NULL; +} + /* * PostPrepare_smgr -- Clean up after a successful PREPARE * diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index f45b330..01486da 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -26,6 +26,7 @@ #include "access/xact.h" #include "access/xlog.h" #include "catalog/pg_type.h" +#include "catalog/storage.h" #include "commands/copy.h" #include "commands/defrem.h" #include "commands/trigger.h" @@ -2302,7 +2303,10 @@ CopyFrom(CopyState cstate) { hi_options |= HEAP_INSERT_SKIP_FSM; if (!XLogIsNeeded()) + { + heap_register_sync(cstate->rel); hi_options |= HEAP_INSERT_SKIP_WAL; + } } /* @@ -2551,11 +2555,11 @@ CopyFrom(CopyState cstate) FreeExecutorState(estate); /* - * If we skipped writing WAL, then we need to sync the heap (but not - * indexes since those use WAL anyway) + * If we skipped writing WAL, then we will sync the heap 
at the end of + * the transaction (we used to do it here, but it was later found out + * that to be safe, we must avoid WAL-logging any subsequent actions on + * the pages we skipped WAL for). Indexes always use WAL. */ - if (hi_options & HEAP_INSERT_SKIP_WAL) - heap_sync(cstate->rel); return processed; } diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c index 231e92d..1b1246f 100644 --- a/src/backend/commands/vacuumlazy.c +++ b/src/backend/commands/vacuumlazy.c @@ -1462,7 +1462,7 @@ lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer, MarkBufferDirty(buffer); /* XLOG stuff */ - if (RelationNeedsWAL(onerel)) + if (HeapNeedsWAL(onerel, buffer)) { XLogRecPtr recptr; diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 76ade37..d1e7bc8 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -452,6 +452,7 @@ static BufferDesc *BufferAlloc(SMgrRelation smgr, bool *foundPtr); static void FlushBuffer(BufferDesc *buf, SMgrRelation reln); static void AtProcExit_Buffers(int code, Datum arg); +static void FlushRelFileNodeBuffers_internal(SMgrRelation smgr, bool islocal); static void CheckForBufferLeaks(void); static int rnode_comparator(const void *p1, const void *p2); static int buffertag_comparator(const void *p1, const void *p2); @@ -3136,14 +3137,30 @@ FlushRelationBuffers(Relation rel) /* Open rel at the smgr level if not already done */ RelationOpenSmgr(rel); - if (RelationUsesLocalBuffers(rel)) + FlushRelFileNodeBuffers_internal(rel->rd_smgr, RelationUsesLocalBuffers(rel)); +} + +void +FlushRelFileNodeBuffers(RelFileNode rnode, bool islocal) +{ + FlushRelFileNodeBuffers_internal(smgropen(rnode, InvalidBackendId), islocal); +} + +static void +FlushRelFileNodeBuffers_internal(SMgrRelation smgr, bool islocal) +{ + RelFileNode rnode = smgr->smgr_rnode.node; + int i; + BufferDesc *bufHdr; + + if (islocal) { for (i = 0; i < NLocBuffer; i++) { uint32 
buf_state; bufHdr = GetLocalBufferDescriptor(i); - if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) && + if (RelFileNodeEquals(bufHdr->tag.rnode, rnode) && ((buf_state = pg_atomic_read_u32(&bufHdr->state)) & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY)) { @@ -3160,7 +3177,7 @@ FlushRelationBuffers(Relation rel) PageSetChecksumInplace(localpage, bufHdr->tag.blockNum); - smgrwrite(rel->rd_smgr, + smgrwrite(smgr, bufHdr->tag.forkNum, bufHdr->tag.blockNum, localpage, @@ -3190,18 +3207,18 @@ FlushRelationBuffers(Relation rel) * As in DropRelFileNodeBuffers, an unlocked precheck should be safe * and saves some cycles. */ - if (!RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node)) + if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode)) continue; ReservePrivateRefCountEntry(); buf_state = LockBufHdr(bufHdr); - if (RelFileNodeEquals(bufHdr->tag.rnode, rel->rd_node) && + if (RelFileNodeEquals(bufHdr->tag.rnode, rnode) && (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY)) { PinBuffer_Locked(bufHdr); LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED); - FlushBuffer(bufHdr, rel->rd_smgr); + FlushBuffer(bufHdr, smgr); LWLockRelease(BufferDescriptorGetContentLock(bufHdr)); UnpinBuffer(bufHdr, true); } @@ -3397,6 +3414,9 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std) bool dirtied = false; bool delayChkpt = false; uint32 buf_state; + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blknum; /* * If we need to protect hint bit updates from torn writes, WAL-log a @@ -3407,8 +3427,10 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std) * We don't check full_page_writes here because that logic is included * when we call XLogInsert() since the value changes dynamically. 
*/ + BufferGetTag(buffer, &rnode, &forknum, &blknum); if (XLogHintBitIsNeeded() && - (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT)) + (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) && + !smgrIsSyncPending(rnode, blknum)) { /* * If we're in recovery we cannot dirty a page because of a hint. diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index b3a595c..06082d9 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -25,7 +25,7 @@ /* "options" flag bits for heap_insert */ -#define HEAP_INSERT_SKIP_WAL 0x0001 +#define HEAP_INSERT_SKIP_WAL 0x0001 /* obsolete, not used anymore */ #define HEAP_INSERT_SKIP_FSM 0x0002 #define HEAP_INSERT_FROZEN 0x0004 #define HEAP_INSERT_SPECULATIVE 0x0008 @@ -177,6 +177,7 @@ extern void simple_heap_delete(Relation relation, ItemPointer tid); extern void simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup); +extern void heap_register_sync(Relation relation); extern void heap_sync(Relation relation); /* in heap/pruneheap.c */ diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h index 06a8242..5418d71 100644 --- a/src/include/access/heapam_xlog.h +++ b/src/include/access/heapam_xlog.h @@ -378,6 +378,8 @@ extern void heap2_desc(StringInfo buf, XLogReaderState *record); extern const char *heap2_identify(uint8 info); extern void heap_xlog_logical_rewrite(XLogReaderState *r); +extern bool HeapNeedsWAL(Relation rel, Buffer buf); + extern XLogRecPtr log_heap_cleanup_info(RelFileNode rnode, TransactionId latestRemovedXid); extern XLogRecPtr log_heap_clean(Relation reln, Buffer buffer, diff --git a/src/include/catalog/storage.h b/src/include/catalog/storage.h index ef960da..c618c78 100644 --- a/src/include/catalog/storage.h +++ b/src/include/catalog/storage.h @@ -29,6 +29,9 @@ extern void RelationTruncate(Relation rel, BlockNumber nblocks); */ extern void smgrDoPendingDeletes(bool isCommit); extern int smgrGetPendingDeletes(bool forCommit, 
RelFileNode **ptr); +extern void smgrRegisterPendingSync(Relation rel); +extern bool smgrIsSyncPending(RelFileNode rnode, BlockNumber blkno); +extern void smgrDoPendingSyncs(bool isCommit); extern void AtSubCommit_smgr(void); extern void AtSubAbort_smgr(void); extern void PostPrepare_smgr(void); diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index 3d5dea7..0622dee 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -202,6 +202,7 @@ extern BlockNumber RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum); extern void FlushOneBuffer(Buffer buffer); extern void FlushRelationBuffers(Relation rel); +extern void FlushRelFileNodeBuffers(RelFileNode rel, bool islocal); extern void FlushDatabaseBuffers(Oid dbid); extern void DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber forkNum, BlockNumber firstDelBlock);
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers