Hello. At Mon, 20 May 2019 15:54:30 +0900 (Tokyo Standard Time), Kyotaro HORIGUCHI <horiguchi.kyot...@lab.ntt.co.jp> wrote in <20190520.155430.215084510.horiguchi.kyot...@lab.ntt.co.jp> > > I suspect the design in the https://postgr.es/m/559fa0ba.3080...@iki.fi last > > paragraph will be simpler, not more complex. In the implementation I'm > > envisioning, smgrDoPendingDeletes() would change name, perhaps to > > AtEOXact_Storage(). For every relfilenode it does not delete, it would > > ensure > > durability by syncing (for large nodes) or by WAL-logging each page (for > > small > > nodes). RelationNeedsWAL() would return false whenever the applicable > > relfilenode appears in pendingDeletes. Access methods would remove their > > smgrimmedsync() calls, but they would otherwise not change. Would anyone > > like > > to try implementing that? > > Following this direction, the attached PoC works *at least for* > the wal_optimization TAP tests, but doing pending flush not in > smgr but in relcache. This is extending skip-wal feature to > indexes. And makes the old 0002 patch on nbtree useless.
This is a tidier version of the patch. - Passes regression tests including 018_wal_optimize.pl - Move the substantial work to table/index AMs. Each AM can decide whether to support WAL skip or not. Currently heap and nbtree support it. - The timing of sync is moved from AtEOXact to PreCommit. This is because heap_sync() needs xact state = INPROGRESS. - matview and cluster is broken, since swapping to new relfilenode doesn't change rd_newRelfilenodeSubid. I'll address that. regards. -- Kyotaro Horiguchi NTT Open Source Software Center
>From 680462288cb82da23c19a02239787fc1ea08cdde Mon Sep 17 00:00:00 2001 From: Kyotaro Horiguchi <horiguchi.kyot...@lab.ntt.co.jp> Date: Thu, 11 Oct 2018 10:03:21 +0900 Subject: [PATCH 1/2] TAP test for copy-truncation optimization. --- src/test/recovery/t/018_wal_optimize.pl | 291 ++++++++++++++++++++++++++++++++ 1 file changed, 291 insertions(+) create mode 100644 src/test/recovery/t/018_wal_optimize.pl diff --git a/src/test/recovery/t/018_wal_optimize.pl b/src/test/recovery/t/018_wal_optimize.pl new file mode 100644 index 0000000000..4fa8be728e --- /dev/null +++ b/src/test/recovery/t/018_wal_optimize.pl @@ -0,0 +1,291 @@ +# Test WAL replay for optimized TRUNCATE and COPY records +# +# WAL truncation is optimized in some cases with TRUNCATE and COPY queries +# which sometimes interact badly with the other optimizations in line with +# several setting values of wal_level, particularly when using "minimal" or +# "replica". The optimization may be enabled or disabled depending on the +# scenarios dealt here, and should never result in any type of failures or +# data loss. +use strict; +use warnings; + +use PostgresNode; +use TestLib; +use Test::More tests => 24; + +sub check_orphan_relfilenodes +{ + my($node, $test_name) = @_; + + my $db_oid = $node->safe_psql('postgres', + "SELECT oid FROM pg_database WHERE datname = 'postgres'"); + my $prefix = "base/$db_oid/"; + my $filepaths_referenced = $node->safe_psql('postgres', " + SELECT pg_relation_filepath(oid) FROM pg_class + WHERE reltablespace = 0 and relpersistence <> 't' and + pg_relation_filepath(oid) IS NOT NULL;"); + is_deeply([sort(map { "$prefix$_" } + grep(/^[0-9]+$/, + slurp_dir($node->data_dir . "/$prefix")))], + [sort split /\n/, $filepaths_referenced], + $test_name); + return; +} + +# Wrapper routine tunable for wal_level. +sub run_wal_optimize +{ + my $wal_level = shift; + + # Primary needs to have wal_level = minimal here + my $node = get_new_node("node_$wal_level"); + $node->init; + $node->append_conf('postgresql.conf', qq( +wal_level = $wal_level +)); + $node->start; + + # Setup + my $tablespace_dir = $node->basedir . '/tablespace_other'; + mkdir ($tablespace_dir); + $tablespace_dir = TestLib::real_dir($tablespace_dir); + $node->safe_psql('postgres', + "CREATE TABLESPACE other LOCATION '$tablespace_dir';"); + + # Test direct truncation optimization. No tuples + $node->safe_psql('postgres', " + BEGIN; + CREATE TABLE test1 (id serial PRIMARY KEY); + TRUNCATE test1; + COMMIT;"); + + $node->stop('immediate'); + $node->start; + + my $result = $node->safe_psql('postgres', "SELECT count(*) FROM test1;"); + is($result, qq(0), + "wal_level = $wal_level, optimized truncation with empty table"); + + # Test truncation with inserted tuples within the same transaction. + # Tuples inserted after the truncation should be seen. + $node->safe_psql('postgres', " + BEGIN; + CREATE TABLE test2 (id serial PRIMARY KEY); + INSERT INTO test2 VALUES (DEFAULT); + TRUNCATE test2; + INSERT INTO test2 VALUES (DEFAULT); + COMMIT;"); + + $node->stop('immediate'); + $node->start; + + $result = $node->safe_psql('postgres', "SELECT count(*) FROM test2;"); + is($result, qq(1), + "wal_level = $wal_level, optimized truncation with inserted table"); + + # Data file for COPY query in follow-up tests. + my $basedir = $node->basedir; + my $copy_file = "$basedir/copy_data.txt"; + TestLib::append_to_file($copy_file, qq(20000,30000 +20001,30001 +20002,30002)); + + # Test truncation with inserted tuples using COPY. Tuples copied after the + # truncation should be seen. + $node->safe_psql('postgres', " + BEGIN; + CREATE TABLE test3 (id serial PRIMARY KEY, id2 int); + INSERT INTO test3 (id, id2) VALUES (DEFAULT, generate_series(1,3000)); + TRUNCATE test3; + COPY test3 FROM '$copy_file' DELIMITER ','; + COMMIT;"); + $node->stop('immediate'); + $node->start; + $result = $node->safe_psql('postgres', "SELECT count(*) FROM test3;"); + is($result, qq(3), + "wal_level = $wal_level, optimized truncation with copied table"); + + # Like previous test, but rollback SET TABLESPACE in a subtransaction. + $node->safe_psql('postgres', " + BEGIN; + CREATE TABLE test3a (id serial PRIMARY KEY, id2 int); + INSERT INTO test3a (id, id2) VALUES (DEFAULT, generate_series(1,3000)); + TRUNCATE test3a; + SAVEPOINT s; ALTER TABLE test3a SET TABLESPACE other; ROLLBACK TO s; + COPY test3a FROM '$copy_file' DELIMITER ','; + COMMIT;"); + $node->stop('immediate'); + $node->start; + $result = $node->safe_psql('postgres', "SELECT count(*) FROM test3a;"); + is($result, qq(3), + "wal_level = $wal_level, SET TABLESPACE in subtransaction"); + + # in different subtransaction patterns + $node->safe_psql('postgres', " + BEGIN; + CREATE TABLE test3a2 (id serial PRIMARY KEY, id2 int); + INSERT INTO test3a2 (id, id2) VALUES (DEFAULT, generate_series(1,3000)); + TRUNCATE test3a2; + SAVEPOINT s; ALTER TABLE test3a SET TABLESPACE other; RELEASE s; + COPY test3a2 FROM '$copy_file' DELIMITER ','; + COMMIT;"); + $node->stop('immediate'); + $node->start; + $result = $node->safe_psql('postgres', "SELECT count(*) FROM test3a;"); + is($result, qq(3), + "wal_level = $wal_level, SET TABLESPACE in subtransaction"); + + $node->safe_psql('postgres', " + BEGIN; + CREATE TABLE test3a3 (id serial PRIMARY KEY, id2 int); + INSERT INTO test3a3 (id, id2) VALUES (DEFAULT, generate_series(1,3000)); + TRUNCATE test3a3; + SAVEPOINT s; + ALTER TABLE test3a3 SET TABLESPACE other; + SAVEPOINT s2; + ALTER TABLE test3a3 SET TABLESPACE pg_default; + ROLLBACK TO s2; + SAVEPOINT s2; + ALTER TABLE test3a3 SET TABLESPACE pg_default; + RELEASE s2; + ROLLBACK TO s; + COPY test3a3 FROM '$copy_file' DELIMITER ','; + COMMIT;"); + $node->stop('immediate'); + $node->start; + $result = $node->safe_psql('postgres', "SELECT count(*) FROM test3a;"); + is($result, qq(3), + "wal_level = $wal_level, SET TABLESPACE in subtransaction"); + + # UPDATE touches two buffers; one is BufferNeedsWAL(); the other is not. + $node->safe_psql('postgres', " + BEGIN; + CREATE TABLE test3b (id serial PRIMARY KEY, id2 int); + INSERT INTO test3b (id, id2) VALUES (DEFAULT, generate_series(1,10000)); + COPY test3b FROM '$copy_file' DELIMITER ','; -- set sync_above + UPDATE test3b SET id2 = id2 + 1; + DELETE FROM test3b; + COMMIT;"); + $node->stop('immediate'); + $node->start; + $result = $node->safe_psql('postgres', "SELECT count(*) FROM test3b;"); + is($result, qq(0), + "wal_level = $wal_level, UPDATE of logged page extends relation"); + + # Test truncation with inserted tuples using both INSERT and COPY. Tuples + # inserted after the truncation should be seen. + $node->safe_psql('postgres', " + BEGIN; + CREATE TABLE test4 (id serial PRIMARY KEY, id2 int); + INSERT INTO test4 (id, id2) VALUES (DEFAULT, generate_series(1,10000)); + TRUNCATE test4; + INSERT INTO test4 (id, id2) VALUES (DEFAULT, 10000); + COPY test4 FROM '$copy_file' DELIMITER ','; + INSERT INTO test4 (id, id2) VALUES (DEFAULT, 10000); + COMMIT;"); + + $node->stop('immediate'); + $node->start; + $result = $node->safe_psql('postgres', "SELECT count(*) FROM test4;"); + is($result, qq(5), + "wal_level = $wal_level, optimized truncation with inserted/copied table"); + + # Test consistency of COPY with INSERT for table created in the same + # transaction. + $node->safe_psql('postgres', " + BEGIN; + CREATE TABLE test5 (id serial PRIMARY KEY, id2 int); + INSERT INTO test5 VALUES (DEFAULT, 1); + COPY test5 FROM '$copy_file' DELIMITER ','; + COMMIT;"); + $node->stop('immediate'); + $node->start; + $result = $node->safe_psql('postgres', "SELECT count(*) FROM test5;"); + is($result, qq(4), + "wal_level = $wal_level, replay of optimized copy with inserted table"); + + # Test consistency of COPY that inserts more to the same table using + # triggers. If the INSERTS from the trigger go to the same block data + # is copied to, and the INSERTs are WAL-logged, WAL replay will fail when + # it tries to replay the WAL record but the "before" image doesn't match, + # because not all changes were WAL-logged. + $node->safe_psql('postgres', " + BEGIN; + CREATE TABLE test6 (id serial PRIMARY KEY, id2 text); + CREATE FUNCTION test6_before_row_trig() RETURNS trigger + LANGUAGE plpgsql as \$\$ + BEGIN + IF new.id2 NOT LIKE 'triggered%' THEN + INSERT INTO test6 VALUES (DEFAULT, 'triggered row before' || NEW.id2); + END IF; + RETURN NEW; + END; \$\$; + CREATE FUNCTION test6_after_row_trig() RETURNS trigger + LANGUAGE plpgsql as \$\$ + BEGIN + IF new.id2 NOT LIKE 'triggered%' THEN + INSERT INTO test6 VALUES (DEFAULT, 'triggered row after' || OLD.id2); + END IF; + RETURN NEW; + END; \$\$; + CREATE TRIGGER test6_before_row_insert + BEFORE INSERT ON test6 + FOR EACH ROW EXECUTE PROCEDURE test6_before_row_trig(); + CREATE TRIGGER test6_after_row_insert + AFTER INSERT ON test6 + FOR EACH ROW EXECUTE PROCEDURE test6_after_row_trig(); + COPY test6 FROM '$copy_file' DELIMITER ','; + COMMIT;"); + $node->stop('immediate'); + $node->start; + $result = $node->safe_psql('postgres', "SELECT count(*) FROM test6;"); + is($result, qq(9), + "wal_level = $wal_level, replay of optimized copy with before trigger"); + + # Test consistency of INSERT, COPY and TRUNCATE in same transaction block + # with TRUNCATE triggers. + $node->safe_psql('postgres', " + BEGIN; + CREATE TABLE test7 (id serial PRIMARY KEY, id2 text); + CREATE FUNCTION test7_before_stat_trig() RETURNS trigger + LANGUAGE plpgsql as \$\$ + BEGIN + INSERT INTO test7 VALUES (DEFAULT, 'triggered stat before'); + RETURN NULL; + END; \$\$; + CREATE FUNCTION test7_after_stat_trig() RETURNS trigger + LANGUAGE plpgsql as \$\$ + BEGIN + INSERT INTO test7 VALUES (DEFAULT, 'triggered stat before'); + RETURN NULL; + END; \$\$; + CREATE TRIGGER test7_before_stat_truncate + BEFORE TRUNCATE ON test7 + FOR EACH STATEMENT EXECUTE PROCEDURE test7_before_stat_trig(); + CREATE TRIGGER test7_after_stat_truncate + AFTER TRUNCATE ON test7 + FOR EACH STATEMENT EXECUTE PROCEDURE test7_after_stat_trig(); + INSERT INTO test7 VALUES (DEFAULT, 1); + TRUNCATE test7; + COPY test7 FROM '$copy_file' DELIMITER ','; + COMMIT;"); + $node->stop('immediate'); + $node->start; + $result = $node->safe_psql('postgres', "SELECT count(*) FROM test7;"); + is($result, qq(4), + "wal_level = $wal_level, replay of optimized copy with before trigger"); + + # Test redo of temp table creation. + $node->safe_psql('postgres', " + CREATE TEMP TABLE test8 (id serial PRIMARY KEY, id2 text);"); + $node->stop('immediate'); + $node->start; + + check_orphan_relfilenodes($node, "wal_level = $wal_level, no orphan relfilenode remains"); + + return; +} + +# Run same test suite for multiple wal_level values. +run_wal_optimize("minimal"); +run_wal_optimize("replica"); -- 2.16.3
>From 75b90a8020275af6ee5e6ee5a4433c5582bd9148 Mon Sep 17 00:00:00 2001 From: Kyotaro Horiguchi <horiguchi.kyot...@lab.ntt.co.jp> Date: Mon, 20 May 2019 15:38:59 +0900 Subject: [PATCH 2/2] Fix WAL skipping feature WAL-skipping operations mixed with WAL-logged operations can lead to database corruption after a crash. This patch changes the WAL-skipping feature so that no data modifcation is WAL-logged at all then sync such relations at commit. --- src/backend/access/brin/brin.c | 2 + src/backend/access/gin/ginutil.c | 2 + src/backend/access/gist/gist.c | 2 + src/backend/access/hash/hash.c | 2 + src/backend/access/heap/heapam.c | 8 +-- src/backend/access/heap/heapam_handler.c | 15 +++--- src/backend/access/heap/rewriteheap.c | 3 -- src/backend/access/index/indexam.c | 16 ++++++ src/backend/access/nbtree/nbtree.c | 13 +++++ src/backend/access/transam/xact.c | 6 +++ src/backend/commands/copy.c | 6 --- src/backend/commands/createas.c | 5 +- src/backend/commands/matview.c | 4 -- src/backend/commands/tablecmds.c | 4 -- src/backend/utils/cache/relcache.c | 87 ++++++++++++++++++++++++++++++++ src/include/access/amapi.h | 8 +++ src/include/access/genam.h | 1 + src/include/access/heapam.h | 1 - src/include/access/nbtree.h | 1 + src/include/access/tableam.h | 36 +++++++------ src/include/utils/rel.h | 21 +++++++- src/include/utils/relcache.h | 1 + 22 files changed, 188 insertions(+), 56 deletions(-) diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index aba234c0af..681520852f 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -125,6 +125,8 @@ brinhandler(PG_FUNCTION_ARGS) amroutine->aminitparallelscan = NULL; amroutine->amparallelrescan = NULL; + amroutine->amatcommitsync = NULL; + PG_RETURN_POINTER(amroutine); } diff --git a/src/backend/access/gin/ginutil.c b/src/backend/access/gin/ginutil.c index cf9699ad18..f4f0eebec5 100644 --- a/src/backend/access/gin/ginutil.c +++ b/src/backend/access/gin/ginutil.c @@ -77,6 +77,8 @@ ginhandler(PG_FUNCTION_ARGS) amroutine->aminitparallelscan = NULL; amroutine->amparallelrescan = NULL; + amroutine->amatcommitsync = NULL; + PG_RETURN_POINTER(amroutine); } diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c index d70a138f54..3a23e7c4b2 100644 --- a/src/backend/access/gist/gist.c +++ b/src/backend/access/gist/gist.c @@ -99,6 +99,8 @@ gisthandler(PG_FUNCTION_ARGS) amroutine->aminitparallelscan = NULL; amroutine->amparallelrescan = NULL; + amroutine->amatcommitsync = NULL; + PG_RETURN_POINTER(amroutine); } diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index 048e40e46f..3fa8262319 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -98,6 +98,8 @@ hashhandler(PG_FUNCTION_ARGS) amroutine->aminitparallelscan = NULL; amroutine->amparallelrescan = NULL; + amroutine->amatcommitsync = NULL; + PG_RETURN_POINTER(amroutine); } diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 19d2c529d8..7f78122b81 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -1950,7 +1950,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, MarkBufferDirty(buffer); /* XLOG stuff */ - if (!(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation)) + if (RelationNeedsWAL(relation)) { xl_heap_insert xlrec; xl_heap_header xlhdr; @@ -2133,7 +2133,7 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, /* currently not needed (thus unsupported) for heap_multi_insert() */ AssertArg(!(options & HEAP_INSERT_NO_LOGICAL)); - needwal = !(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation); + needwal = RelationNeedsWAL(relation); saveFreeSpace = RelationGetTargetPageFreeSpace(relation, HEAP_DEFAULT_FILLFACTOR); @@ -8906,10 +8906,6 @@ heap2_redo(XLogReaderState *record) void heap_sync(Relation rel) { - /* non-WAL-logged tables never need fsync */ - if (!RelationNeedsWAL(rel)) - return; - /* main heap */ FlushRelationBuffers(rel); /* FlushRelationBuffers will have opened rd_smgr */ diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 8d8161fd97..a2e1464845 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -557,15 +557,14 @@ tuple_lock_retry: return result; } +/* ------------------------------------------------------------------------ + * WAL-skipping related routine + * ------------------------------------------------------------------------ + */ static void -heapam_finish_bulk_insert(Relation relation, int options) +heapam_at_commit_sync(Relation relation) { - /* - * If we skipped writing WAL, then we need to sync the heap (but not - * indexes since those use WAL anyway / don't go through tableam) - */ - if (options & HEAP_INSERT_SKIP_WAL) - heap_sync(relation); + heap_sync(relation); } @@ -2573,7 +2572,7 @@ static const TableAmRoutine heapam_methods = { .tuple_delete = heapam_tuple_delete, .tuple_update = heapam_tuple_update, .tuple_lock = heapam_tuple_lock, - .finish_bulk_insert = heapam_finish_bulk_insert, + .at_commit_sync = heapam_at_commit_sync, .tuple_fetch_row_version = heapam_fetch_row_version, .tuple_get_latest_tid = heap_get_latest_tid, diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c index bce4274362..1ac77f7c14 100644 --- a/src/backend/access/heap/rewriteheap.c +++ b/src/backend/access/heap/rewriteheap.c @@ -654,9 +654,6 @@ raw_heap_insert(RewriteState state, HeapTuple tup) { int options = HEAP_INSERT_SKIP_FSM; - if (!state->rs_use_wal) - options |= HEAP_INSERT_SKIP_WAL; - /* * While rewriting the heap for VACUUM FULL / CLUSTER, make sure data * for the TOAST table are not logically decoded. The main heap is diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index 0fc9139bad..1d089603b7 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -33,6 +33,7 @@ * index_can_return - does index support index-only scans? * index_getprocid - get a support procedure OID * index_getprocinfo - get a support procedure's lookup info + * index_at_commit_sync - perform at_commit_sync * * NOTES * This file contains the index_ routines which used @@ -837,6 +838,21 @@ index_getprocinfo(Relation irel, return locinfo; } +/* ---------------- + * index_at_commit_sync + * + * This routine perfoms at-commit sync of index storage. This is called + * when permanent index created in the current transaction is committed. + * ---------------- + */ +void +index_at_commit_sync(Relation irel) +{ + Assert(irel->rd_indam != NULL && irel->rd_indam->amatcommitsync != NULL); + + irel->rd_indam->amatcommitsync(irel); +} + /* ---------------- * index_store_float8_orderby_distances * diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 02fb352b94..39377f35eb 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -147,6 +147,8 @@ bthandler(PG_FUNCTION_ARGS) amroutine->aminitparallelscan = btinitparallelscan; amroutine->amparallelrescan = btparallelrescan; + amroutine->amatcommitsync = btatcommitsync; + PG_RETURN_POINTER(amroutine); } @@ -1385,3 +1387,14 @@ btcanreturn(Relation index, int attno) { return true; } + +/* + * btatcommitsync() -- Perform at-commit sync of WAL-skipped index + */ +void +btatcommitsync(Relation index) +{ + FlushRelationBuffers(index); + smgrimmedsync(index->rd_smgr, MAIN_FORKNUM); +} + diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 20feeec327..bc38a53195 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -2120,6 +2120,9 @@ CommitTransaction(void) if (!is_parallel_worker) PreCommit_CheckForSerializationFailure(); + /* Sync WAL-skipped relations */ + PreCommit_RelationSync(); + /* * Insert notifications sent by NOTIFY commands into the queue. This * should be late in the pre-commit sequence to minimize time spent @@ -2395,6 +2398,9 @@ PrepareTransaction(void) (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("cannot PREPARE a transaction that has manipulated logical replication workers"))); + /* Sync WAL-skipped relations */ + PreCommit_RelationSync(); + /* Prevent cancel/die interrupt while cleaning up */ HOLD_INTERRUPTS(); diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 5f81aa57d4..a25c82438e 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -2761,11 +2761,7 @@ CopyFrom(CopyState cstate) if (RELKIND_HAS_STORAGE(cstate->rel->rd_rel->relkind) && (cstate->rel->rd_createSubid != InvalidSubTransactionId || cstate->rel->rd_newRelfilenodeSubid != InvalidSubTransactionId)) - { ti_options |= TABLE_INSERT_SKIP_FSM; - if (!XLogIsNeeded()) - ti_options |= TABLE_INSERT_SKIP_WAL; - } /* * Optimize if new relfilenode was created in this subxact or one of its @@ -3364,8 +3360,6 @@ CopyFrom(CopyState cstate) FreeExecutorState(estate); - table_finish_bulk_insert(cstate->rel, ti_options); - return processed; } diff --git a/src/backend/commands/createas.c b/src/backend/commands/createas.c index 43c2fa9124..859b869b0d 100644 --- a/src/backend/commands/createas.c +++ b/src/backend/commands/createas.c @@ -558,8 +558,7 @@ intorel_startup(DestReceiver *self, int operation, TupleDesc typeinfo) * We can skip WAL-logging the insertions, unless PITR or streaming * replication is in use. We can skip the FSM in any case. */ - myState->ti_options = TABLE_INSERT_SKIP_FSM | - (XLogIsNeeded() ? 0 : TABLE_INSERT_SKIP_WAL); + myState->ti_options = TABLE_INSERT_SKIP_FSM; myState->bistate = GetBulkInsertState(); /* Not using WAL requires smgr_targblock be initially invalid */ @@ -604,8 +603,6 @@ intorel_shutdown(DestReceiver *self) FreeBulkInsertState(myState->bistate); - table_finish_bulk_insert(myState->rel, myState->ti_options); - /* close rel, but keep lock until commit */ table_close(myState->rel, NoLock); myState->rel = NULL; diff --git a/src/backend/commands/matview.c b/src/backend/commands/matview.c index 99bf3c29f2..c84edd0db0 100644 --- a/src/backend/commands/matview.c +++ b/src/backend/commands/matview.c @@ -463,8 +463,6 @@ transientrel_startup(DestReceiver *self, int operation, TupleDesc typeinfo) * replication is in use. We can skip the FSM in any case. */ myState->ti_options = TABLE_INSERT_SKIP_FSM | TABLE_INSERT_FROZEN; - if (!XLogIsNeeded()) - myState->ti_options |= TABLE_INSERT_SKIP_WAL; myState->bistate = GetBulkInsertState(); /* Not using WAL requires smgr_targblock be initially invalid */ @@ -509,8 +507,6 @@ transientrel_shutdown(DestReceiver *self) FreeBulkInsertState(myState->bistate); - table_finish_bulk_insert(myState->transientrel, myState->ti_options); - /* close transientrel, but keep lock until commit */ table_close(myState->transientrel, NoLock); myState->transientrel = NULL; diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index bfcf9472d7..75f11a327d 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -4741,8 +4741,6 @@ ATRewriteTable(AlteredTableInfo *tab, Oid OIDNewHeap, LOCKMODE lockmode) bistate = GetBulkInsertState(); ti_options = TABLE_INSERT_SKIP_FSM; - if (!XLogIsNeeded()) - ti_options |= TABLE_INSERT_SKIP_WAL; } else { @@ -5026,8 +5024,6 @@ ATRewriteTable(AlteredTableInfo *tab, Oid OIDNewHeap, LOCKMODE lockmode) { FreeBulkInsertState(bistate); - table_finish_bulk_insert(newrel, ti_options); - table_close(newrel, NoLock); } } diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index d0f6f715e6..4bffbfff5d 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -1512,6 +1512,9 @@ RelationInitIndexAccessInfo(Relation relation) relation->rd_exclprocs = NULL; relation->rd_exclstrats = NULL; relation->rd_amcache = NULL; + + if (relation->rd_indam->amatcommitsync != NULL) + relation->rd_can_skipwal = true; } /* @@ -1781,6 +1784,9 @@ RelationInitTableAccessMethod(Relation relation) * Now we can fetch the table AM's API struct */ InitTableAmRoutine(relation); + + if (relation->rd_tableam && relation->rd_tableam->at_commit_sync) + relation->rd_can_skipwal = true; } /* @@ -2913,6 +2919,73 @@ RememberToFreeTupleDescAtEOX(TupleDesc td) EOXactTupleDescArray[NextEOXactTupleDescNum++] = td; } +/* + * PreComimt_RelationSync + * + * Sync relations that were WAL-skipped in this transaction . + * + * AMs may have skipped WAL-logging for relations created in the current + * transaction. This let such relations be synced. This operation can only be + * perfomed while transaction status is INPROGRESS so it is separated from + * AtEOXact_RelationCache. + */ +void +PreCommit_RelationSync(void) +{ + HASH_SEQ_STATUS status; + RelIdCacheEnt *idhentry; + int i; + + /* See AtEOXact_RelationCache for details on eoxact_list */ + if (eoxact_list_overflowed) + { + hash_seq_init(&status, RelationIdCache); + while ((idhentry = (RelIdCacheEnt *) hash_seq_search(&status)) != NULL) + { + Relation rel = idhentry->reldesc; + + if (!RelationNeedsAtCommitSync(rel)) + continue; + + if (rel->rd_tableam != NULL) + table_at_commit_sync(rel); + else + { + Assert (rel->rd_indam != NULL); + table_at_commit_sync(rel); + } + } + } + else + { + for (i = 0; i < eoxact_list_len; i++) + { + Relation rel; + + idhentry = (RelIdCacheEnt *) hash_search(RelationIdCache, + (void *) &eoxact_list[i], + HASH_FIND, + NULL); + + if (idhentry == NULL) + continue; + + rel = idhentry->reldesc; + + if (!RelationNeedsAtCommitSync(rel)) + continue; + + if (rel->rd_tableam != NULL) + table_at_commit_sync(rel); + else + { + Assert (rel->rd_indam != NULL); + table_at_commit_sync(rel); + } + } + } +} + /* * AtEOXact_RelationCache * @@ -3032,7 +3105,21 @@ AtEOXact_cleanup(Relation relation, bool isCommit) if (relation->rd_createSubid != InvalidSubTransactionId) { if (isCommit) + { + /* + * While wal_level=minimal, we have skipped WAL-logging on + * persistent relations created in this transaction. Sync that + * tables out before they become publicly accessible. + */ + if (!XLogIsNeeded() && relation->rd_smgr && + relation->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT) + { + FlushRelationBuffers(relation); + smgrimmedsync(relation->rd_smgr, MAIN_FORKNUM); + } + relation->rd_createSubid = InvalidSubTransactionId; + } else if (RelationHasReferenceCountZero(relation)) { RelationClearRelation(relation, false); diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h index 09a7404267..fc6981d98a 100644 --- a/src/include/access/amapi.h +++ b/src/include/access/amapi.h @@ -156,6 +156,11 @@ typedef void (*aminitparallelscan_function) (void *target); /* (re)start parallel index scan */ typedef void (*amparallelrescan_function) (IndexScanDesc scan); +/* sync relation at commit */ +typedef void (*amatcommitsync_function) (Relation indexRelation); + + /* interface function to support WAL-skipping feature */ + /* * API struct for an index AM. Note this must be stored in a single palloc'd * chunk of memory. @@ -230,6 +235,9 @@ typedef struct IndexAmRoutine amestimateparallelscan_function amestimateparallelscan; /* can be NULL */ aminitparallelscan_function aminitparallelscan; /* can be NULL */ amparallelrescan_function amparallelrescan; /* can be NULL */ + + /* interface function to support WAL-skipping feature */ + amatcommitsync_function amatcommitsync; /* can be NULL */; } IndexAmRoutine; diff --git a/src/include/access/genam.h b/src/include/access/genam.h index 9717183ef2..b225fd622e 100644 --- a/src/include/access/genam.h +++ b/src/include/access/genam.h @@ -177,6 +177,7 @@ extern RegProcedure index_getprocid(Relation irel, AttrNumber attnum, uint16 procnum); extern FmgrInfo *index_getprocinfo(Relation irel, AttrNumber attnum, uint16 procnum); +extern void index_at_commit_sync(Relation irel); extern void index_store_float8_orderby_distances(IndexScanDesc scan, Oid *orderByTypes, double *distances, bool recheckOrderBy); diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 62aaa08eff..0fb7d86bf2 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -29,7 +29,6 @@ /* "options" flag bits for heap_insert */ -#define HEAP_INSERT_SKIP_WAL TABLE_INSERT_SKIP_WAL #define HEAP_INSERT_SKIP_FSM TABLE_INSERT_SKIP_FSM #define HEAP_INSERT_FROZEN TABLE_INSERT_FROZEN #define HEAP_INSERT_NO_LOGICAL TABLE_INSERT_NO_LOGICAL diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 6c1acd4855..1d042e89b5 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -717,6 +717,7 @@ extern IndexBulkDeleteResult *btbulkdelete(IndexVacuumInfo *info, extern IndexBulkDeleteResult *btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats); extern bool btcanreturn(Relation index, int attno); +extern void btatcommitsync(Relation index); /* * prototypes for internal functions in nbtree.c diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 06eae2337a..90254cb278 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -409,19 +409,15 @@ typedef struct TableAmRoutine TM_FailureData *tmfd); /* - * Perform operations necessary to complete insertions made via - * tuple_insert and multi_insert with a BulkInsertState specified. This - * may for example be used to flush the relation, when the - * TABLE_INSERT_SKIP_WAL option was used. + * Sync relation at commit-time if needed. * - * Typically callers of tuple_insert and multi_insert will just pass all - * the flags that apply to them, and each AM has to decide which of them - * make sense for it, and then only take actions in finish_bulk_insert for - * those flags, and ignore others. + * A table AM may skip WAL-logging for relations created in the current + * transaction. This routine is called commit-time and the table AM + * must flush buffer and sync the underlying storage. * * Optional callback. */ - void (*finish_bulk_insert) (Relation rel, int options); + void (*at_commit_sync) (Relation rel); /* ------------------------------------------------------------------------ @@ -1104,8 +1100,7 @@ table_compute_xid_horizon_for_tuples(Relation rel, * * * The BulkInsertState object (if any; bistate can be NULL for default - * behavior) is also just passed through to RelationGetBufferForTuple. If - * `bistate` is provided, table_finish_bulk_insert() needs to be called. + * behavior) is also just passed through to RelationGetBufferForTuple. * * On return the slot's tts_tid and tts_tableOid are updated to reflect the * insertion. But note that any toasting of fields within the slot is NOT @@ -1300,20 +1295,23 @@ table_lock_tuple(Relation rel, ItemPointer tid, Snapshot snapshot, } /* - * Perform operations necessary to complete insertions made via - * tuple_insert and multi_insert with a BulkInsertState specified. This - * e.g. may e.g. used to flush the relation when inserting with - * TABLE_INSERT_SKIP_WAL specified. + * Sync relation at commit-time if needed. + * + * A table AM that defines this interface can allow derived objects created + * in the current transaction to skip WAL-logging. This routine is called + * commit-time and the table AM must flush buffer and sync the underlying + * storage. + * + * Optional callback. */ static inline void -table_finish_bulk_insert(Relation rel, int options) +table_at_commit_sync(Relation rel) { /* optional callback */ - if (rel->rd_tableam && rel->rd_tableam->finish_bulk_insert) - rel->rd_tableam->finish_bulk_insert(rel, options); + if (rel->rd_tableam && rel->rd_tableam->at_commit_sync) + rel->rd_tableam->at_commit_sync(rel); } - /* ------------------------------------------------------------------------ * DDL related functionality. * ------------------------------------------------------------------------ diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h index d7f33abce3..c09fd84a1c 100644 --- a/src/include/utils/rel.h +++ b/src/include/utils/rel.h @@ -64,6 +64,9 @@ typedef struct RelationData * rd_replidindex) */ bool rd_statvalid; /* is rd_statlist valid? */ + /* Some relations cane comit WAL-logging on certain condition. */ + bool rd_can_skipwal; /* can skip WAL-logging? */ + /* * rd_createSubid is the ID of the highest subtransaction the rel has * survived into; or zero if the rel was not created in the current top @@ -512,9 +515,25 @@ typedef struct ViewOptions /* * RelationNeedsWAL * True if relation needs WAL. + * + * If underlying table AM has at_commit_sync interface, returns false if + * wal_level = minimal and this relation is created in the current transaction */ #define RelationNeedsWAL(relation) \ - ((relation)->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT) + ((relation)->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT && \ + (!relation->rd_can_skipwal || \ + !(RELATION_IS_LOCAL(relation) && !XLogIsNeeded()))) + +/* + * RelationNeedAtCommitSync + * True if relation needs WAL needs on-commit sync + */ +#define RelationNeedsAtCommitSync(relation) \ + ((relation)->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT && \ + relation->rd_can_skipwal && \ + (RELATION_IS_LOCAL(relation) || \ + relation->rd_newRelfilenodeSubid != InvalidBlockNumber) \ + && !XLogIsNeeded())) /* * RelationUsesLocalBuffers diff --git a/src/include/utils/relcache.h b/src/include/utils/relcache.h index 364495a5f0..07c4cfa565 100644 --- a/src/include/utils/relcache.h +++ b/src/include/utils/relcache.h @@ -120,6 +120,7 @@ extern void RelationCacheInvalidate(void); extern void RelationCloseSmgrByOid(Oid relationId); +extern void PreCommit_RelationSync(void); extern void AtEOXact_RelationCache(bool isCommit); extern void AtEOSubXact_RelationCache(bool isCommit, SubTransactionId mySubid, SubTransactionId parentSubid); -- 2.16.3