Hello.

At Mon, 20 May 2019 15:54:30 +0900 (Tokyo Standard Time), Kyotaro HORIGUCHI 
<horiguchi.kyot...@lab.ntt.co.jp> wrote in 
<20190520.155430.215084510.horiguchi.kyot...@lab.ntt.co.jp>
> > I suspect the design in the https://postgr.es/m/559fa0ba.3080...@iki.fi last
> > paragraph will be simpler, not more complex.  In the implementation I'm
> > envisioning, smgrDoPendingDeletes() would change name, perhaps to
> > AtEOXact_Storage().  For every relfilenode it does not delete, it would 
> > ensure
> > durability by syncing (for large nodes) or by WAL-logging each page (for 
> > small
> > nodes).  RelationNeedsWAL() would return false whenever the applicable
> > relfilenode appears in pendingDeletes.  Access methods would remove their
> > smgrimmedsync() calls, but they would otherwise not change.  Would anyone 
> > like
> > to try implementing that?
> 
> Following this direction, the attached PoC works *at least for*
> the wal_optimization TAP tests, but doing pending flush not in
> smgr but in relcache. This is extending skip-wal feature to
> indexes. And makes the old 0002 patch on nbtree useless.

This is a tidier version of the patch.

- Passes regression tests including 018_wal_optimize.pl

- Move the substantial work to table/index AMs.

  Each AM can decide whether to support WAL skip or not.
  Currently heap and nbtree support it.

- The timing of sync is moved from AtEOXact to PreCommit. This is
  because heap_sync() needs xact state = INPROGRESS.

- matview and cluster is broken, since swapping to new
  relfilenode doesn't change rd_newRelfilenodeSubid. I'll address
  that.

regards.

-- 
Kyotaro Horiguchi
NTT Open Source Software Center
>From 680462288cb82da23c19a02239787fc1ea08cdde Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <horiguchi.kyot...@lab.ntt.co.jp>
Date: Thu, 11 Oct 2018 10:03:21 +0900
Subject: [PATCH 1/2] TAP test for copy-truncation optimization.

---
 src/test/recovery/t/018_wal_optimize.pl | 291 ++++++++++++++++++++++++++++++++
 1 file changed, 291 insertions(+)
 create mode 100644 src/test/recovery/t/018_wal_optimize.pl

diff --git a/src/test/recovery/t/018_wal_optimize.pl b/src/test/recovery/t/018_wal_optimize.pl
new file mode 100644
index 0000000000..4fa8be728e
--- /dev/null
+++ b/src/test/recovery/t/018_wal_optimize.pl
@@ -0,0 +1,291 @@
+# Test WAL replay for optimized TRUNCATE and COPY records
+#
+# WAL truncation is optimized in some cases with TRUNCATE and COPY queries
+# which sometimes interact badly with the other optimizations in line with
+# several setting values of wal_level, particularly when using "minimal" or
+# "replica".  The optimization may be enabled or disabled depending on the
+# scenarios dealt here, and should never result in any type of failures or
+# data loss.
+use strict;
+use warnings;
+
+use PostgresNode;
+use TestLib;
+use Test::More tests => 24;
+
+sub check_orphan_relfilenodes
+{
+	my($node, $test_name) = @_;
+
+	my $db_oid = $node->safe_psql('postgres',
+	   "SELECT oid FROM pg_database WHERE datname = 'postgres'");
+	my $prefix = "base/$db_oid/";
+	my $filepaths_referenced = $node->safe_psql('postgres', "
+	   SELECT pg_relation_filepath(oid) FROM pg_class
+	   WHERE reltablespace = 0 and relpersistence <> 't' and
+	   pg_relation_filepath(oid) IS NOT NULL;");
+	is_deeply([sort(map { "$prefix$_" }
+					grep(/^[0-9]+$/,
+						 slurp_dir($node->data_dir . "/$prefix")))],
+			  [sort split /\n/, $filepaths_referenced],
+			  $test_name);
+	return;
+}
+
+# Wrapper routine tunable for wal_level.
+sub run_wal_optimize
+{
+	my $wal_level = shift;
+
+	# Primary needs to have wal_level = minimal here
+	my $node = get_new_node("node_$wal_level");
+	$node->init;
+	$node->append_conf('postgresql.conf', qq(
+wal_level = $wal_level
+));
+	$node->start;
+
+	# Setup
+	my $tablespace_dir = $node->basedir . '/tablespace_other';
+	mkdir ($tablespace_dir);
+	$tablespace_dir = TestLib::real_dir($tablespace_dir);
+	$node->safe_psql('postgres',
+	   "CREATE TABLESPACE other LOCATION '$tablespace_dir';");
+
+	# Test direct truncation optimization.  No tuples
+	$node->safe_psql('postgres', "
+		BEGIN;
+		CREATE TABLE test1 (id serial PRIMARY KEY);
+		TRUNCATE test1;
+		COMMIT;");
+
+	$node->stop('immediate');
+	$node->start;
+
+	my $result = $node->safe_psql('postgres', "SELECT count(*) FROM test1;");
+	is($result, qq(0),
+	   "wal_level = $wal_level, optimized truncation with empty table");
+
+	# Test truncation with inserted tuples within the same transaction.
+	# Tuples inserted after the truncation should be seen.
+	$node->safe_psql('postgres', "
+		BEGIN;
+		CREATE TABLE test2 (id serial PRIMARY KEY);
+		INSERT INTO test2 VALUES (DEFAULT);
+		TRUNCATE test2;
+		INSERT INTO test2 VALUES (DEFAULT);
+		COMMIT;");
+
+	$node->stop('immediate');
+	$node->start;
+
+	$result = $node->safe_psql('postgres', "SELECT count(*) FROM test2;");
+	is($result, qq(1),
+	   "wal_level = $wal_level, optimized truncation with inserted table");
+
+	# Data file for COPY query in follow-up tests.
+	my $basedir = $node->basedir;
+	my $copy_file = "$basedir/copy_data.txt";
+	TestLib::append_to_file($copy_file, qq(20000,30000
+20001,30001
+20002,30002));
+
+	# Test truncation with inserted tuples using COPY.  Tuples copied after the
+	# truncation should be seen.
+	$node->safe_psql('postgres', "
+		BEGIN;
+		CREATE TABLE test3 (id serial PRIMARY KEY, id2 int);
+		INSERT INTO test3 (id, id2) VALUES (DEFAULT, generate_series(1,3000));
+		TRUNCATE test3;
+		COPY test3 FROM '$copy_file' DELIMITER ',';
+		COMMIT;");
+	$node->stop('immediate');
+	$node->start;
+	$result = $node->safe_psql('postgres', "SELECT count(*) FROM test3;");
+	is($result, qq(3),
+	   "wal_level = $wal_level, optimized truncation with copied table");
+
+	# Like previous test, but rollback SET TABLESPACE in a subtransaction.
+	$node->safe_psql('postgres', "
+		BEGIN;
+		CREATE TABLE test3a (id serial PRIMARY KEY, id2 int);
+		INSERT INTO test3a (id, id2) VALUES (DEFAULT, generate_series(1,3000));
+		TRUNCATE test3a;
+		SAVEPOINT s; ALTER TABLE test3a SET TABLESPACE other; ROLLBACK TO s;
+		COPY test3a FROM '$copy_file' DELIMITER ',';
+		COMMIT;");
+	$node->stop('immediate');
+	$node->start;
+	$result = $node->safe_psql('postgres', "SELECT count(*) FROM test3a;");
+	is($result, qq(3),
+	   "wal_level = $wal_level, SET TABLESPACE in subtransaction");
+
+	# in different subtransaction patterns
+	$node->safe_psql('postgres', "
+		BEGIN;
+		CREATE TABLE test3a2 (id serial PRIMARY KEY, id2 int);
+		INSERT INTO test3a2 (id, id2) VALUES (DEFAULT, generate_series(1,3000));
+		TRUNCATE test3a2;
+		SAVEPOINT s; ALTER TABLE test3a SET TABLESPACE other; RELEASE s;
+		COPY test3a2 FROM '$copy_file' DELIMITER ',';
+		COMMIT;");
+	$node->stop('immediate');
+	$node->start;
+	$result = $node->safe_psql('postgres', "SELECT count(*) FROM test3a;");
+	is($result, qq(3),
+	   "wal_level = $wal_level, SET TABLESPACE in subtransaction");
+
+	$node->safe_psql('postgres', "
+		BEGIN;
+		CREATE TABLE test3a3 (id serial PRIMARY KEY, id2 int);
+		INSERT INTO test3a3 (id, id2) VALUES (DEFAULT, generate_series(1,3000));
+		TRUNCATE test3a3;
+		SAVEPOINT s;
+			ALTER TABLE test3a3 SET TABLESPACE other;
+			SAVEPOINT s2;
+				ALTER TABLE test3a3 SET TABLESPACE pg_default;
+			ROLLBACK TO s2;
+			SAVEPOINT s2;
+				ALTER TABLE test3a3 SET TABLESPACE pg_default;
+			RELEASE s2;
+		ROLLBACK TO s;
+		COPY test3a3 FROM '$copy_file' DELIMITER ',';
+		COMMIT;");
+	$node->stop('immediate');
+	$node->start;
+	$result = $node->safe_psql('postgres', "SELECT count(*) FROM test3a;");
+	is($result, qq(3),
+	   "wal_level = $wal_level, SET TABLESPACE in subtransaction");
+
+	# UPDATE touches two buffers; one is BufferNeedsWAL(); the other is not.
+	$node->safe_psql('postgres', "
+		BEGIN;
+		CREATE TABLE test3b (id serial PRIMARY KEY, id2 int);
+		INSERT INTO test3b (id, id2) VALUES (DEFAULT, generate_series(1,10000));
+		COPY test3b FROM '$copy_file' DELIMITER ',';  -- set sync_above
+		UPDATE test3b SET id2 = id2 + 1;
+		DELETE FROM test3b;
+		COMMIT;");
+	$node->stop('immediate');
+	$node->start;
+	$result = $node->safe_psql('postgres', "SELECT count(*) FROM test3b;");
+	is($result, qq(0),
+	   "wal_level = $wal_level, UPDATE of logged page extends relation");
+
+	# Test truncation with inserted tuples using both INSERT and COPY. Tuples
+	# inserted after the truncation should be seen.
+	$node->safe_psql('postgres', "
+		BEGIN;
+		CREATE TABLE test4 (id serial PRIMARY KEY, id2 int);
+		INSERT INTO test4 (id, id2) VALUES (DEFAULT, generate_series(1,10000));
+		TRUNCATE test4;
+		INSERT INTO test4 (id, id2) VALUES (DEFAULT, 10000);
+		COPY test4 FROM '$copy_file' DELIMITER ',';
+		INSERT INTO test4 (id, id2) VALUES (DEFAULT, 10000);
+		COMMIT;");
+
+	$node->stop('immediate');
+	$node->start;
+	$result = $node->safe_psql('postgres', "SELECT count(*) FROM test4;");
+	is($result, qq(5),
+	   "wal_level = $wal_level, optimized truncation with inserted/copied table");
+
+	# Test consistency of COPY with INSERT for table created in the same
+	# transaction.
+	$node->safe_psql('postgres', "
+		BEGIN;
+		CREATE TABLE test5 (id serial PRIMARY KEY, id2 int);
+		INSERT INTO test5 VALUES (DEFAULT, 1);
+		COPY test5 FROM '$copy_file' DELIMITER ',';
+		COMMIT;");
+	$node->stop('immediate');
+	$node->start;
+	$result = $node->safe_psql('postgres', "SELECT count(*) FROM test5;");
+	is($result, qq(4),
+	   "wal_level = $wal_level, replay of optimized copy with inserted table");
+
+	# Test consistency of COPY that inserts more to the same table using
+	# triggers.  If the INSERTS from the trigger go to the same block data
+	# is copied to, and the INSERTs are WAL-logged, WAL replay will fail when
+	# it tries to replay the WAL record but the "before" image doesn't match,
+	# because not all changes were WAL-logged.
+	$node->safe_psql('postgres', "
+		BEGIN;
+		CREATE TABLE test6 (id serial PRIMARY KEY, id2 text);
+		CREATE FUNCTION test6_before_row_trig() RETURNS trigger
+		  LANGUAGE plpgsql as \$\$
+		  BEGIN
+		    IF new.id2 NOT LIKE 'triggered%' THEN
+		      INSERT INTO test6 VALUES (DEFAULT, 'triggered row before' || NEW.id2);
+		    END IF;
+		    RETURN NEW;
+		  END; \$\$;
+		CREATE FUNCTION test6_after_row_trig() RETURNS trigger
+		  LANGUAGE plpgsql as \$\$
+		  BEGIN
+		    IF new.id2 NOT LIKE 'triggered%' THEN
+		      INSERT INTO test6 VALUES (DEFAULT, 'triggered row after' || OLD.id2);
+		    END IF;
+		    RETURN NEW;
+		  END; \$\$;
+		CREATE TRIGGER test6_before_row_insert
+		  BEFORE INSERT ON test6
+		  FOR EACH ROW EXECUTE PROCEDURE test6_before_row_trig();
+		CREATE TRIGGER test6_after_row_insert
+		  AFTER INSERT ON test6
+		  FOR EACH ROW EXECUTE PROCEDURE test6_after_row_trig();
+		COPY test6 FROM '$copy_file' DELIMITER ',';
+		COMMIT;");
+	$node->stop('immediate');
+	$node->start;
+	$result = $node->safe_psql('postgres', "SELECT count(*) FROM test6;");
+	is($result, qq(9),
+	   "wal_level = $wal_level, replay of optimized copy with before trigger");
+
+	# Test consistency of INSERT, COPY and TRUNCATE in same transaction block
+	# with TRUNCATE triggers.
+	$node->safe_psql('postgres', "
+		BEGIN;
+		CREATE TABLE test7 (id serial PRIMARY KEY, id2 text);
+		CREATE FUNCTION test7_before_stat_trig() RETURNS trigger
+		  LANGUAGE plpgsql as \$\$
+		  BEGIN
+		    INSERT INTO test7 VALUES (DEFAULT, 'triggered stat before');
+		    RETURN NULL;
+		  END; \$\$;
+		CREATE FUNCTION test7_after_stat_trig() RETURNS trigger
+		  LANGUAGE plpgsql as \$\$
+		  BEGIN
+		    INSERT INTO test7 VALUES (DEFAULT, 'triggered stat before');
+		    RETURN NULL;
+		  END; \$\$;
+		CREATE TRIGGER test7_before_stat_truncate
+		  BEFORE TRUNCATE ON test7
+		  FOR EACH STATEMENT EXECUTE PROCEDURE test7_before_stat_trig();
+		CREATE TRIGGER test7_after_stat_truncate
+		  AFTER TRUNCATE ON test7
+		  FOR EACH STATEMENT EXECUTE PROCEDURE test7_after_stat_trig();
+		INSERT INTO test7 VALUES (DEFAULT, 1);
+		TRUNCATE test7;
+		COPY test7 FROM '$copy_file' DELIMITER ',';
+		COMMIT;");
+	$node->stop('immediate');
+	$node->start;
+	$result = $node->safe_psql('postgres', "SELECT count(*) FROM test7;");
+	is($result, qq(4),
+	   "wal_level = $wal_level, replay of optimized copy with before trigger");
+
+	# Test redo of temp table creation.
+	$node->safe_psql('postgres', "
+		CREATE TEMP TABLE test8 (id serial PRIMARY KEY, id2 text);");
+	$node->stop('immediate');
+	$node->start;
+
+	check_orphan_relfilenodes($node, "wal_level = $wal_level, no orphan relfilenode remains");
+
+	return;
+}
+
+# Run same test suite for multiple wal_level values.
+run_wal_optimize("minimal");
+run_wal_optimize("replica");
-- 
2.16.3

>From 75b90a8020275af6ee5e6ee5a4433c5582bd9148 Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <horiguchi.kyot...@lab.ntt.co.jp>
Date: Mon, 20 May 2019 15:38:59 +0900
Subject: [PATCH 2/2] Fix WAL skipping feature

WAL-skipping operations mixed with WAL-logged operations can lead to
database corruption after a crash. This patch changes the WAL-skipping
feature so that no data modifcation is WAL-logged at all then sync
such relations at commit.
---
 src/backend/access/brin/brin.c           |  2 +
 src/backend/access/gin/ginutil.c         |  2 +
 src/backend/access/gist/gist.c           |  2 +
 src/backend/access/hash/hash.c           |  2 +
 src/backend/access/heap/heapam.c         |  8 +--
 src/backend/access/heap/heapam_handler.c | 15 +++---
 src/backend/access/heap/rewriteheap.c    |  3 --
 src/backend/access/index/indexam.c       | 16 ++++++
 src/backend/access/nbtree/nbtree.c       | 13 +++++
 src/backend/access/transam/xact.c        |  6 +++
 src/backend/commands/copy.c              |  6 ---
 src/backend/commands/createas.c          |  5 +-
 src/backend/commands/matview.c           |  4 --
 src/backend/commands/tablecmds.c         |  4 --
 src/backend/utils/cache/relcache.c       | 87 ++++++++++++++++++++++++++++++++
 src/include/access/amapi.h               |  8 +++
 src/include/access/genam.h               |  1 +
 src/include/access/heapam.h              |  1 -
 src/include/access/nbtree.h              |  1 +
 src/include/access/tableam.h             | 36 +++++++------
 src/include/utils/rel.h                  | 21 +++++++-
 src/include/utils/relcache.h             |  1 +
 22 files changed, 188 insertions(+), 56 deletions(-)

diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c
index aba234c0af..681520852f 100644
--- a/src/backend/access/brin/brin.c
+++ b/src/backend/access/brin/brin.c
@@ -125,6 +125,8 @@ brinhandler(PG_FUNCTION_ARGS)
 	amroutine->aminitparallelscan = NULL;
 	amroutine->amparallelrescan = NULL;
 
+	amroutine->amatcommitsync = NULL;
+
 	PG_RETURN_POINTER(amroutine);
 }
 
diff --git a/src/backend/access/gin/ginutil.c b/src/backend/access/gin/ginutil.c
index cf9699ad18..f4f0eebec5 100644
--- a/src/backend/access/gin/ginutil.c
+++ b/src/backend/access/gin/ginutil.c
@@ -77,6 +77,8 @@ ginhandler(PG_FUNCTION_ARGS)
 	amroutine->aminitparallelscan = NULL;
 	amroutine->amparallelrescan = NULL;
 
+	amroutine->amatcommitsync = NULL;
+
 	PG_RETURN_POINTER(amroutine);
 }
 
diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index d70a138f54..3a23e7c4b2 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -99,6 +99,8 @@ gisthandler(PG_FUNCTION_ARGS)
 	amroutine->aminitparallelscan = NULL;
 	amroutine->amparallelrescan = NULL;
 
+	amroutine->amatcommitsync = NULL;
+
 	PG_RETURN_POINTER(amroutine);
 }
 
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
index 048e40e46f..3fa8262319 100644
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -98,6 +98,8 @@ hashhandler(PG_FUNCTION_ARGS)
 	amroutine->aminitparallelscan = NULL;
 	amroutine->amparallelrescan = NULL;
 
+	amroutine->amatcommitsync = NULL;
+
 	PG_RETURN_POINTER(amroutine);
 }
 
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 19d2c529d8..7f78122b81 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -1950,7 +1950,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid,
 	MarkBufferDirty(buffer);
 
 	/* XLOG stuff */
-	if (!(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation))
+	if (RelationNeedsWAL(relation))
 	{
 		xl_heap_insert xlrec;
 		xl_heap_header xlhdr;
@@ -2133,7 +2133,7 @@ heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples,
 	/* currently not needed (thus unsupported) for heap_multi_insert() */
 	AssertArg(!(options & HEAP_INSERT_NO_LOGICAL));
 
-	needwal = !(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation);
+	needwal = RelationNeedsWAL(relation);
 	saveFreeSpace = RelationGetTargetPageFreeSpace(relation,
 												   HEAP_DEFAULT_FILLFACTOR);
 
@@ -8906,10 +8906,6 @@ heap2_redo(XLogReaderState *record)
 void
 heap_sync(Relation rel)
 {
-	/* non-WAL-logged tables never need fsync */
-	if (!RelationNeedsWAL(rel))
-		return;
-
 	/* main heap */
 	FlushRelationBuffers(rel);
 	/* FlushRelationBuffers will have opened rd_smgr */
diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index 8d8161fd97..a2e1464845 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -557,15 +557,14 @@ tuple_lock_retry:
 	return result;
 }
 
+/* ------------------------------------------------------------------------
+ * WAL-skipping related routine
+ * ------------------------------------------------------------------------
+ */
 static void
-heapam_finish_bulk_insert(Relation relation, int options)
+heapam_at_commit_sync(Relation relation)
 {
-	/*
-	 * If we skipped writing WAL, then we need to sync the heap (but not
-	 * indexes since those use WAL anyway / don't go through tableam)
-	 */
-	if (options & HEAP_INSERT_SKIP_WAL)
-		heap_sync(relation);
+	heap_sync(relation);
 }
 
 
@@ -2573,7 +2572,7 @@ static const TableAmRoutine heapam_methods = {
 	.tuple_delete = heapam_tuple_delete,
 	.tuple_update = heapam_tuple_update,
 	.tuple_lock = heapam_tuple_lock,
-	.finish_bulk_insert = heapam_finish_bulk_insert,
+	.at_commit_sync = heapam_at_commit_sync,
 
 	.tuple_fetch_row_version = heapam_fetch_row_version,
 	.tuple_get_latest_tid = heap_get_latest_tid,
diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c
index bce4274362..1ac77f7c14 100644
--- a/src/backend/access/heap/rewriteheap.c
+++ b/src/backend/access/heap/rewriteheap.c
@@ -654,9 +654,6 @@ raw_heap_insert(RewriteState state, HeapTuple tup)
 	{
 		int options = HEAP_INSERT_SKIP_FSM;
 
-		if (!state->rs_use_wal)
-			options |= HEAP_INSERT_SKIP_WAL;
-
 		/*
 		 * While rewriting the heap for VACUUM FULL / CLUSTER, make sure data
 		 * for the TOAST table are not logically decoded.  The main heap is
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index 0fc9139bad..1d089603b7 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -33,6 +33,7 @@
  *		index_can_return	- does index support index-only scans?
  *		index_getprocid - get a support procedure OID
  *		index_getprocinfo - get a support procedure's lookup info
+ *		index_at_commit_sync - perform at_commit_sync
  *
  * NOTES
  *		This file contains the index_ routines which used
@@ -837,6 +838,21 @@ index_getprocinfo(Relation irel,
 	return locinfo;
 }
 
+/* ----------------
+ *		index_at_commit_sync
+ *
+ *		This routine perfoms at-commit sync of index storage.  This is called
+ *		when permanent index created in the current transaction is committed.
+ *		----------------
+ */
+void
+index_at_commit_sync(Relation irel)
+{
+	Assert(irel->rd_indam != NULL && irel->rd_indam->amatcommitsync != NULL);
+	
+	irel->rd_indam->amatcommitsync(irel);
+}
+
 /* ----------------
  *		index_store_float8_orderby_distances
  *
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 02fb352b94..39377f35eb 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -147,6 +147,8 @@ bthandler(PG_FUNCTION_ARGS)
 	amroutine->aminitparallelscan = btinitparallelscan;
 	amroutine->amparallelrescan = btparallelrescan;
 
+	amroutine->amatcommitsync = btatcommitsync;
+
 	PG_RETURN_POINTER(amroutine);
 }
 
@@ -1385,3 +1387,14 @@ btcanreturn(Relation index, int attno)
 {
 	return true;
 }
+
+/*
+ *	btatcommitsync() -- Perform at-commit sync of WAL-skipped index
+ */
+void
+btatcommitsync(Relation index)
+{
+	FlushRelationBuffers(index);
+	smgrimmedsync(index->rd_smgr, MAIN_FORKNUM);
+}
+
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index 20feeec327..bc38a53195 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -2120,6 +2120,9 @@ CommitTransaction(void)
 	if (!is_parallel_worker)
 		PreCommit_CheckForSerializationFailure();
 
+	/* Sync WAL-skipped relations */
+	PreCommit_RelationSync();
+
 	/*
 	 * Insert notifications sent by NOTIFY commands into the queue.  This
 	 * should be late in the pre-commit sequence to minimize time spent
@@ -2395,6 +2398,9 @@ PrepareTransaction(void)
 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 				 errmsg("cannot PREPARE a transaction that has manipulated logical replication workers")));
 
+	/* Sync WAL-skipped relations */
+	PreCommit_RelationSync();
+
 	/* Prevent cancel/die interrupt while cleaning up */
 	HOLD_INTERRUPTS();
 
diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c
index 5f81aa57d4..a25c82438e 100644
--- a/src/backend/commands/copy.c
+++ b/src/backend/commands/copy.c
@@ -2761,11 +2761,7 @@ CopyFrom(CopyState cstate)
 	if (RELKIND_HAS_STORAGE(cstate->rel->rd_rel->relkind) &&
 		(cstate->rel->rd_createSubid != InvalidSubTransactionId ||
 		 cstate->rel->rd_newRelfilenodeSubid != InvalidSubTransactionId))
-	{
 		ti_options |= TABLE_INSERT_SKIP_FSM;
-		if (!XLogIsNeeded())
-			ti_options |= TABLE_INSERT_SKIP_WAL;
-	}
 
 	/*
 	 * Optimize if new relfilenode was created in this subxact or one of its
@@ -3364,8 +3360,6 @@ CopyFrom(CopyState cstate)
 
 	FreeExecutorState(estate);
 
-	table_finish_bulk_insert(cstate->rel, ti_options);
-
 	return processed;
 }
 
diff --git a/src/backend/commands/createas.c b/src/backend/commands/createas.c
index 43c2fa9124..859b869b0d 100644
--- a/src/backend/commands/createas.c
+++ b/src/backend/commands/createas.c
@@ -558,8 +558,7 @@ intorel_startup(DestReceiver *self, int operation, TupleDesc typeinfo)
 	 * We can skip WAL-logging the insertions, unless PITR or streaming
 	 * replication is in use. We can skip the FSM in any case.
 	 */
-	myState->ti_options = TABLE_INSERT_SKIP_FSM |
-		(XLogIsNeeded() ? 0 : TABLE_INSERT_SKIP_WAL);
+	myState->ti_options = TABLE_INSERT_SKIP_FSM;
 	myState->bistate = GetBulkInsertState();
 
 	/* Not using WAL requires smgr_targblock be initially invalid */
@@ -604,8 +603,6 @@ intorel_shutdown(DestReceiver *self)
 
 	FreeBulkInsertState(myState->bistate);
 
-	table_finish_bulk_insert(myState->rel, myState->ti_options);
-
 	/* close rel, but keep lock until commit */
 	table_close(myState->rel, NoLock);
 	myState->rel = NULL;
diff --git a/src/backend/commands/matview.c b/src/backend/commands/matview.c
index 99bf3c29f2..c84edd0db0 100644
--- a/src/backend/commands/matview.c
+++ b/src/backend/commands/matview.c
@@ -463,8 +463,6 @@ transientrel_startup(DestReceiver *self, int operation, TupleDesc typeinfo)
 	 * replication is in use. We can skip the FSM in any case.
 	 */
 	myState->ti_options = TABLE_INSERT_SKIP_FSM | TABLE_INSERT_FROZEN;
-	if (!XLogIsNeeded())
-		myState->ti_options |= TABLE_INSERT_SKIP_WAL;
 	myState->bistate = GetBulkInsertState();
 
 	/* Not using WAL requires smgr_targblock be initially invalid */
@@ -509,8 +507,6 @@ transientrel_shutdown(DestReceiver *self)
 
 	FreeBulkInsertState(myState->bistate);
 
-	table_finish_bulk_insert(myState->transientrel, myState->ti_options);
-
 	/* close transientrel, but keep lock until commit */
 	table_close(myState->transientrel, NoLock);
 	myState->transientrel = NULL;
diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c
index bfcf9472d7..75f11a327d 100644
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -4741,8 +4741,6 @@ ATRewriteTable(AlteredTableInfo *tab, Oid OIDNewHeap, LOCKMODE lockmode)
 		bistate = GetBulkInsertState();
 
 		ti_options = TABLE_INSERT_SKIP_FSM;
-		if (!XLogIsNeeded())
-			ti_options |= TABLE_INSERT_SKIP_WAL;
 	}
 	else
 	{
@@ -5026,8 +5024,6 @@ ATRewriteTable(AlteredTableInfo *tab, Oid OIDNewHeap, LOCKMODE lockmode)
 	{
 		FreeBulkInsertState(bistate);
 
-		table_finish_bulk_insert(newrel, ti_options);
-
 		table_close(newrel, NoLock);
 	}
 }
diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c
index d0f6f715e6..4bffbfff5d 100644
--- a/src/backend/utils/cache/relcache.c
+++ b/src/backend/utils/cache/relcache.c
@@ -1512,6 +1512,9 @@ RelationInitIndexAccessInfo(Relation relation)
 	relation->rd_exclprocs = NULL;
 	relation->rd_exclstrats = NULL;
 	relation->rd_amcache = NULL;
+
+	if (relation->rd_indam->amatcommitsync != NULL)
+		relation->rd_can_skipwal = true;
 }
 
 /*
@@ -1781,6 +1784,9 @@ RelationInitTableAccessMethod(Relation relation)
 	 * Now we can fetch the table AM's API struct
 	 */
 	InitTableAmRoutine(relation);
+
+	if (relation->rd_tableam && relation->rd_tableam->at_commit_sync)
+		relation->rd_can_skipwal = true;
 }
 
 /*
@@ -2913,6 +2919,73 @@ RememberToFreeTupleDescAtEOX(TupleDesc td)
 	EOXactTupleDescArray[NextEOXactTupleDescNum++] = td;
 }
 
+/*
+ * PreComimt_RelationSync
+ *
+ *	Sync relations that were WAL-skipped in this transaction .
+ *
+ * AMs may have skipped WAL-logging for relations created in the current
+ * transaction. This let such relations be synced.  This operation can only be
+ * perfomed while transaction status is INPROGRESS so it is separated from
+ * AtEOXact_RelationCache.
+ */
+void
+PreCommit_RelationSync(void)
+{
+	HASH_SEQ_STATUS status;
+	RelIdCacheEnt *idhentry;
+	int			i;
+
+	/* See AtEOXact_RelationCache for details on eoxact_list */
+	if (eoxact_list_overflowed)
+	{
+		hash_seq_init(&status, RelationIdCache);
+		while ((idhentry = (RelIdCacheEnt *) hash_seq_search(&status)) != NULL)
+		{
+			Relation rel = idhentry->reldesc;
+
+			if (!RelationNeedsAtCommitSync(rel))
+				continue;
+
+			if (rel->rd_tableam != NULL)
+				table_at_commit_sync(rel);
+			else
+			{
+				Assert (rel->rd_indam != NULL);
+				table_at_commit_sync(rel);
+			}				
+		}
+	}
+	else
+	{
+		for (i = 0; i < eoxact_list_len; i++)
+		{
+			Relation rel;
+
+			idhentry = (RelIdCacheEnt *) hash_search(RelationIdCache,
+													 (void *) &eoxact_list[i],
+													 HASH_FIND,
+													 NULL);
+
+			if (idhentry == NULL)
+				continue;
+
+			rel = idhentry->reldesc;
+
+			if (!RelationNeedsAtCommitSync(rel))
+				continue;
+
+			if (rel->rd_tableam != NULL)
+				table_at_commit_sync(rel);
+			else
+			{
+				Assert (rel->rd_indam != NULL);
+				table_at_commit_sync(rel);
+			}
+		}
+	}
+}
+
 /*
  * AtEOXact_RelationCache
  *
@@ -3032,7 +3105,21 @@ AtEOXact_cleanup(Relation relation, bool isCommit)
 	if (relation->rd_createSubid != InvalidSubTransactionId)
 	{
 		if (isCommit)
+		{
+			/*
+			 * While wal_level=minimal, we have skipped WAL-logging on
+			 * persistent relations created in this transaction. Sync that
+			 * tables out before they become publicly accessible.
+			 */
+			if (!XLogIsNeeded() && relation->rd_smgr &&
+				relation->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT)
+			{
+				FlushRelationBuffers(relation);
+				smgrimmedsync(relation->rd_smgr, MAIN_FORKNUM);
+			}
+
 			relation->rd_createSubid = InvalidSubTransactionId;
+		}
 		else if (RelationHasReferenceCountZero(relation))
 		{
 			RelationClearRelation(relation, false);
diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h
index 09a7404267..fc6981d98a 100644
--- a/src/include/access/amapi.h
+++ b/src/include/access/amapi.h
@@ -156,6 +156,11 @@ typedef void (*aminitparallelscan_function) (void *target);
 /* (re)start parallel index scan */
 typedef void (*amparallelrescan_function) (IndexScanDesc scan);
 
+/* sync relation at commit */
+typedef void (*amatcommitsync_function) (Relation indexRelation);
+
+	/* interface function to support WAL-skipping feature */
+	
 /*
  * API struct for an index AM.  Note this must be stored in a single palloc'd
  * chunk of memory.
@@ -230,6 +235,9 @@ typedef struct IndexAmRoutine
 	amestimateparallelscan_function amestimateparallelscan; /* can be NULL */
 	aminitparallelscan_function aminitparallelscan; /* can be NULL */
 	amparallelrescan_function amparallelrescan; /* can be NULL */
+
+	/* interface function to support WAL-skipping feature */
+	amatcommitsync_function amatcommitsync; /* can be NULL */;
 } IndexAmRoutine;
 
 
diff --git a/src/include/access/genam.h b/src/include/access/genam.h
index 9717183ef2..b225fd622e 100644
--- a/src/include/access/genam.h
+++ b/src/include/access/genam.h
@@ -177,6 +177,7 @@ extern RegProcedure index_getprocid(Relation irel, AttrNumber attnum,
 				uint16 procnum);
 extern FmgrInfo *index_getprocinfo(Relation irel, AttrNumber attnum,
 				  uint16 procnum);
+extern void index_at_commit_sync(Relation irel);
 extern void index_store_float8_orderby_distances(IndexScanDesc scan,
 									 Oid *orderByTypes, double *distances,
 									 bool recheckOrderBy);
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index 62aaa08eff..0fb7d86bf2 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -29,7 +29,6 @@
 
 
 /* "options" flag bits for heap_insert */
-#define HEAP_INSERT_SKIP_WAL	TABLE_INSERT_SKIP_WAL
 #define HEAP_INSERT_SKIP_FSM	TABLE_INSERT_SKIP_FSM
 #define HEAP_INSERT_FROZEN		TABLE_INSERT_FROZEN
 #define HEAP_INSERT_NO_LOGICAL	TABLE_INSERT_NO_LOGICAL
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 6c1acd4855..1d042e89b5 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -717,6 +717,7 @@ extern IndexBulkDeleteResult *btbulkdelete(IndexVacuumInfo *info,
 extern IndexBulkDeleteResult *btvacuumcleanup(IndexVacuumInfo *info,
 				IndexBulkDeleteResult *stats);
 extern bool btcanreturn(Relation index, int attno);
+extern void btatcommitsync(Relation index);
 
 /*
  * prototypes for internal functions in nbtree.c
diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h
index 06eae2337a..90254cb278 100644
--- a/src/include/access/tableam.h
+++ b/src/include/access/tableam.h
@@ -409,19 +409,15 @@ typedef struct TableAmRoutine
 							   TM_FailureData *tmfd);
 
 	/*
-	 * Perform operations necessary to complete insertions made via
-	 * tuple_insert and multi_insert with a BulkInsertState specified. This
-	 * may for example be used to flush the relation, when the
-	 * TABLE_INSERT_SKIP_WAL option was used.
+	 * Sync relation at commit-time if needed.
 	 *
-	 * Typically callers of tuple_insert and multi_insert will just pass all
-	 * the flags that apply to them, and each AM has to decide which of them
-	 * make sense for it, and then only take actions in finish_bulk_insert for
-	 * those flags, and ignore others.
+	 *  A table AM may skip WAL-logging for relations created in the current
+	 *  transaction. This routine is called commit-time and the table AM
+	 *  must flush buffer and sync the underlying storage.
 	 *
 	 * Optional callback.
 	 */
-	void		(*finish_bulk_insert) (Relation rel, int options);
+	void		(*at_commit_sync) (Relation rel);
 
 
 	/* ------------------------------------------------------------------------
@@ -1104,8 +1100,7 @@ table_compute_xid_horizon_for_tuples(Relation rel,
  *
  *
  * The BulkInsertState object (if any; bistate can be NULL for default
- * behavior) is also just passed through to RelationGetBufferForTuple. If
- * `bistate` is provided, table_finish_bulk_insert() needs to be called.
+ * behavior) is also just passed through to RelationGetBufferForTuple.
  *
  * On return the slot's tts_tid and tts_tableOid are updated to reflect the
  * insertion. But note that any toasting of fields within the slot is NOT
@@ -1300,20 +1295,23 @@ table_lock_tuple(Relation rel, ItemPointer tid, Snapshot snapshot,
 }
 
 /*
- * Perform operations necessary to complete insertions made via
- * tuple_insert and multi_insert with a BulkInsertState specified. This
- * e.g. may e.g. used to flush the relation when inserting with
- * TABLE_INSERT_SKIP_WAL specified.
+ * Sync relation at commit-time if needed.
+ *
+ *  A table AM that defines this interface can allow derived objects created
+ *  in the current transaction to skip WAL-logging. This routine is called
+ *  commit-time and the table AM must flush buffer and sync the underlying
+ *  storage.
+ *
+ * Optional callback.
  */
 static inline void
-table_finish_bulk_insert(Relation rel, int options)
+table_at_commit_sync(Relation rel)
 {
 	/* optional callback */
-	if (rel->rd_tableam && rel->rd_tableam->finish_bulk_insert)
-		rel->rd_tableam->finish_bulk_insert(rel, options);
+	if (rel->rd_tableam && rel->rd_tableam->at_commit_sync)
+		rel->rd_tableam->at_commit_sync(rel);
 }
 
-
 /* ------------------------------------------------------------------------
  * DDL related functionality.
  * ------------------------------------------------------------------------
diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h
index d7f33abce3..c09fd84a1c 100644
--- a/src/include/utils/rel.h
+++ b/src/include/utils/rel.h
@@ -64,6 +64,9 @@ typedef struct RelationData
 								 * rd_replidindex) */
 	bool		rd_statvalid;	/* is rd_statlist valid? */
 
+	/* Some relations cane comit WAL-logging on certain condition. */
+	bool		rd_can_skipwal; /* can skip WAL-logging?  */
+
 	/*
 	 * rd_createSubid is the ID of the highest subtransaction the rel has
 	 * survived into; or zero if the rel was not created in the current top
@@ -512,9 +515,25 @@ typedef struct ViewOptions
 /*
  * RelationNeedsWAL
  *		True if relation needs WAL.
+ *
+ * If underlying table AM has at_commit_sync interface, returns false if
+ * wal_level = minimal and this relation is created in the current transaction
  */
 #define RelationNeedsWAL(relation) \
-	((relation)->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT)
+	((relation)->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT && \
+	 (!relation->rd_can_skipwal ||									   \
+	  !(RELATION_IS_LOCAL(relation) && !XLogIsNeeded())))
+
+/*
+ * RelationNeedAtCommitSync
+ *      True if relation needs WAL needs on-commit sync
+ */
+#define RelationNeedsAtCommitSync(relation) \
+	((relation)->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT &&	\
+	 relation->rd_can_skipwal &&										\
+	 (RELATION_IS_LOCAL(relation) ||									\
+	  relation->rd_newRelfilenodeSubid != InvalidBlockNumber)			\
+	 && !XLogIsNeeded()))
 
 /*
  * RelationUsesLocalBuffers
diff --git a/src/include/utils/relcache.h b/src/include/utils/relcache.h
index 364495a5f0..07c4cfa565 100644
--- a/src/include/utils/relcache.h
+++ b/src/include/utils/relcache.h
@@ -120,6 +120,7 @@ extern void RelationCacheInvalidate(void);
 
 extern void RelationCloseSmgrByOid(Oid relationId);
 
+extern void PreCommit_RelationSync(void);
 extern void AtEOXact_RelationCache(bool isCommit);
 extern void AtEOSubXact_RelationCache(bool isCommit, SubTransactionId mySubid,
 						  SubTransactionId parentSubid);
-- 
2.16.3

Reply via email to