From 535e9178c5acbc27a5096643207afb3bea27a756 Mon Sep 17 00:00:00 2001
From: Osumi Takamichi <osumi.takamichi@fujitsu.com>
Date: Wed, 7 Apr 2021 06:10:46 +0000
Subject: [PATCH v09] new wal_level to disable WAL logging

To improve performance, especially
for bulk data loading or pg_dumpall, this feature generates
only limited types of WAL. The speed-up is gained
at the cost of crash recovery.

While this new wal_level is in effect,
an unexpected stoppage or shutdown of the server
leaves the whole cluster corrupted and unrecoverable.
In other words, after any such accident the server will never
start up again. Therefore, taking a full backup before and after
changing to this wal_level is a must.

Author: Takamichi Osumi <osumi.takamichi@fujitsu.com>
Reviewed-by: Tsunakawa Takayuki <tsunakawa.takay@fujitsu.com>
Reviewed-by: Fujii Masao <masao.fujii@oss.nttdata.com>
Reviewed-by: Laurenz Albe <laurenz.albe@cybertec.at>
Reviewed-by: Ashutosh Bapat <ashutosh.bapat.oss@gmail.com>
Reviewed-by: Kyotaro Horiguchi <horikyota.ntt@gmail.com>
Reviewed-by: Masahiko Sawada <sawada.mshk@gmail.com>
Reviewed-by: Robert Haas <robertmhaas@gmail.com>
Discussion: https://www.postgresql.org/message-id/TYAPR01MB29901EBE5A3ACCE55BA99186FE320%40TYAPR01MB2990.jpnprd01.prod.outlook.com
---
 doc/src/sgml/config.sgml                      | 21 +++++++++++++++++++--
 doc/src/sgml/perform.sgml                     | 10 ++++++++--
 src/backend/access/gist/gistxlog.c            |  2 +-
 src/backend/access/rmgrdesc/xlogdesc.c        |  1 +
 src/backend/access/transam/twophase.c         |  2 +-
 src/backend/access/transam/varsup.c           |  2 +-
 src/backend/access/transam/xlog.c             | 27 ++++++++++++++++++++-------
 src/backend/access/transam/xlogfuncs.c        |  5 +++++
 src/backend/access/transam/xloginsert.c       | 11 +++++++++++
 src/backend/postmaster/postmaster.c           |  6 +++---
 src/backend/utils/misc/postgresql.conf.sample |  6 ++++--
 src/bin/pg_controldata/pg_controldata.c       |  2 ++
 src/include/access/xlog.h                     |  6 ++++--
 src/include/access/xlogdefs.h                 |  2 +-
 src/include/utils/rel.h                       |  3 ++-
 src/test/recovery/t/024_archive_recovery.pl   |  2 +-
 16 files changed, 84 insertions(+), 24 deletions(-)

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index e51639d..21c253a 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -2696,7 +2696,16 @@ include_dir 'conf.d'
         data to support WAL archiving and replication, including running
         read-only queries on a standby server. <literal>minimal</literal> removes all
         logging except the information required to recover from a crash or
-        immediate shutdown.  Finally,
+        immediate shutdown.  <literal>none</literal> generates only strictly limited types of WAL:
+        records for transactions, transaction resources, GiST, and the generic WAL record type.
+        This means that the amount of WAL generated with <literal>none</literal>
+        is much less than that of <literal>minimal</literal> for ordinary operations.
+        The purpose of <literal>none</literal>
+        is to accelerate bulk data loading at the expense of recovery. Accordingly, note that
+        any kind of accidental server stoppage under <literal>none</literal> leaves the whole cluster
+        corrupted and unable to start up again. Therefore, never use this level unless the operation
+        performed under it is repeatable and the cluster can be backed up before and after the operation.
+        This level requires careful planning and preparation. Finally,
         <literal>logical</literal> adds information necessary to support logical
         decoding.  Each level includes the information logged at all lower
         levels.  This parameter can only be set at server start.
@@ -2724,6 +2733,13 @@ include_dir 'conf.d'
         <literal>minimal</literal> makes any base backups taken before
         unavailable for archive recovery and standby server, which may
         lead to database loss.
+        In the same way, <literal>none</literal> generates almost no
+        WAL at all. Therefore, this <varname>wal_level</varname>
+        can be used to maximize the speed of data loading, for example, bulk data loading
+        or a version upgrade using <application>pg_dumpall</application>. On the other hand, an unexpected crash of
+        the server leaves the database cluster inconsistent and unable to restart.
+        For that reason, take a full backup of the whole cluster before and after
+        using this level.
        </para>
        <para>
         In <literal>logical</literal> level, the same information is logged as
@@ -3487,7 +3503,8 @@ include_dir 'conf.d'
         changed without leaving archiving mode.
         This parameter can only be set at server start.
         <varname>archive_mode</varname> cannot be enabled when
-        <varname>wal_level</varname> is set to <literal>minimal</literal>.
+        <varname>wal_level</varname> is set to <literal>none</literal> or
+        <literal>minimal</literal>.
        </para>
       </listitem>
      </varlistentry>
diff --git a/doc/src/sgml/perform.sgml b/doc/src/sgml/perform.sgml
index e0d3f24..3ae854a 100644
--- a/doc/src/sgml/perform.sgml
+++ b/doc/src/sgml/perform.sgml
@@ -1742,12 +1742,18 @@ SELECT * FROM x, y, a, b, c WHERE something AND somethingelse;
     new base backup after the load has completed than to process a large
     amount of incremental WAL data.  To prevent incremental WAL logging
     while loading, disable archiving and streaming replication, by setting
-    <xref linkend="guc-wal-level"/> to <literal>minimal</literal>,
+    <xref linkend="guc-wal-level"/> to either <literal>minimal</literal>
+    or <literal>none</literal>,
     <xref linkend="guc-archive-mode"/> to <literal>off</literal>, and
     <xref linkend="guc-max-wal-senders"/> to zero.
     But note that changing these settings requires a server restart,
     and makes any base backups taken before unavailable for archive
     recovery and standby server, which may lead to database loss.
+    Further, setting <literal>wal_level</literal> to <literal>none</literal>
+    is an extremely performance-oriented choice: a crash or server stoppage
+    during the data loading corrupts the whole cluster. If that happens,
+    the server will not start again, and the administrator
+    needs to rebuild the cluster from the full backup taken before the operation.
    </para>
 
    <para>
@@ -1813,7 +1819,7 @@ SELECT * FROM x, y, a, b, c WHERE something AND somethingelse;
        If using WAL archiving or streaming replication, consider disabling
        them during the restore. To do that, set <varname>archive_mode</varname>
        to <literal>off</literal>,
-       <varname>wal_level</varname> to <literal>minimal</literal>, and
+       <varname>wal_level</varname> to <literal>minimal</literal> or <literal>none</literal>, and
        <varname>max_wal_senders</varname> to zero before loading the dump.
        Afterwards, set them back to the right values and take a fresh
        base backup.
diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c
index 6464cb9..965e124 100644
--- a/src/backend/access/gist/gistxlog.c
+++ b/src/backend/access/gist/gistxlog.c
@@ -587,7 +587,7 @@ gistXLogAssignLSN(void)
 	 * follow the restriction.
 	 */
 	XLogBeginInsert();
-	XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
+	XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT | XLOG_MARK_ESSENTIAL);
 	XLogRegisterData((char *) &dummy, sizeof(dummy));
 	return XLogInsert(RM_GIST_ID, XLOG_GIST_ASSIGN_LSN);
 }
diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c
index e6090a9..9d17ed2 100644
--- a/src/backend/access/rmgrdesc/xlogdesc.c
+++ b/src/backend/access/rmgrdesc/xlogdesc.c
@@ -25,6 +25,7 @@
  * GUC support
  */
 const struct config_enum_entry wal_level_options[] = {
+	{"none", WAL_LEVEL_NONE, false},
 	{"minimal", WAL_LEVEL_MINIMAL, false},
 	{"replica", WAL_LEVEL_REPLICA, false},
 	{"archive", WAL_LEVEL_REPLICA, true},	/* deprecated */
diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c
index 89335b6..3e82050 100644
--- a/src/backend/access/transam/twophase.c
+++ b/src/backend/access/transam/twophase.c
@@ -1115,7 +1115,7 @@ EndPrepare(GlobalTransaction gxact)
 	for (record = records.head; record != NULL; record = record->next)
 		XLogRegisterData(record->data, record->len);
 
-	XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
+	XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN | XLOG_MARK_ESSENTIAL);
 
 	gxact->prepare_end_lsn = XLogInsert(RM_XACT_ID, XLOG_XACT_PREPARE);
 
diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c
index 142da4a..11a0c97 100644
--- a/src/backend/access/transam/varsup.c
+++ b/src/backend/access/transam/varsup.c
@@ -368,7 +368,7 @@ SetTransactionIdLimit(TransactionId oldest_datfrozenxid, Oid oldest_datoid)
 	 * within 3M transactions of data loss.  This leaves lots of room for the
 	 * DBA to fool around fixing things in a standalone backend, while not
 	 * being significant compared to total XID space. (VACUUM requires an XID
-	 * if it truncates at wal_level!=minimal.  "VACUUM (ANALYZE)", which a DBA
+	 * if it truncates at wal_level>minimal.  "VACUUM (ANALYZE)", which a DBA
 	 * might do by reflex, assigns an XID.  Hence, we had better be sure
 	 * there's lots of XIDs left...)  Also, at default BLCKSZ, this leaves two
 	 * completely-idle segments.  In the event of edge-case bugs involving
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index c1d4415..580503b 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -6401,11 +6401,11 @@ CheckRequiredParameterValues(void)
 	 * For archive recovery, the WAL must be generated with at least 'replica'
 	 * wal_level.
 	 */
-	if (ArchiveRecoveryRequested && ControlFile->wal_level == WAL_LEVEL_MINIMAL)
+	if (ArchiveRecoveryRequested && ControlFile->wal_level <= WAL_LEVEL_MINIMAL)
 	{
 		ereport(FATAL,
-				(errmsg("WAL was generated with wal_level=minimal, cannot continue recovering"),
-				 errdetail("This happens if you temporarily set wal_level=minimal on the server."),
+				(errmsg("WAL was generated with wal_level<=minimal, cannot continue recovering"),
+				 errdetail("This happens if you temporarily set wal_level<=minimal on the server."),
 				 errhint("Use a backup taken after setting wal_level to higher than minimal.")));
 	}
 
@@ -6529,6 +6529,15 @@ StartupXLOG(void)
 					(errmsg("control file contains invalid database cluster state")));
 	}
 
+	/*
+	 * Refuse to start after an unclean shutdown under wal_level='none'.
+	 */
+	if (ControlFile->wal_level == WAL_LEVEL_NONE &&
+		(ControlFile->state != DB_SHUTDOWNED && ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY))
+		ereport(FATAL,
+				(errmsg("detected an unexpected server shutdown when WAL logging was disabled"),
+				 errhint("It looks like you need to deploy a new cluster from your full backup again.")));
+
 	/* This is just to allow attaching to startup process with a debugger */
 #ifdef XLOG_REPLAY_DELAY
 	if (ControlFile->state != DB_SHUTDOWNED)
@@ -9173,9 +9182,13 @@ CreateCheckPoint(int flags)
 	 */
 	XLogBeginInsert();
 	XLogRegisterData((char *) (&checkPoint), sizeof(checkPoint));
-	recptr = XLogInsert(RM_XLOG_ID,
-						shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
-						XLOG_CHECKPOINT_ONLINE);
+	if (shutdown)
+	{
+		XLogSetRecordFlags(XLOG_MARK_ESSENTIAL);
+		recptr = XLogInsert(RM_XLOG_ID, XLOG_CHECKPOINT_SHUTDOWN);
+	}
+	else
+		recptr = XLogInsert(RM_XLOG_ID, XLOG_CHECKPOINT_ONLINE);
 
 	XLogFlush(recptr);
 
@@ -9943,7 +9956,7 @@ XLogReportParameters(void)
 
 			XLogBeginInsert();
 			XLogRegisterData((char *) &xlrec, sizeof(xlrec));
-
+			XLogSetRecordFlags(XLOG_MARK_ESSENTIAL);
 			recptr = XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE);
 			XLogFlush(recptr);
 		}
diff --git a/src/backend/access/transam/xlogfuncs.c b/src/backend/access/transam/xlogfuncs.c
index f363a4c..74194eb 100644
--- a/src/backend/access/transam/xlogfuncs.c
+++ b/src/backend/access/transam/xlogfuncs.c
@@ -268,6 +268,11 @@ pg_switch_wal(PG_FUNCTION_ARGS)
 {
 	XLogRecPtr	switchpoint;
 
+	if (wal_level == WAL_LEVEL_NONE)
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("cannot execute pg_switch_wal when WAL logging is turned off")));
+
 	if (RecoveryInProgress())
 		ereport(ERROR,
 				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c
index 7052dc2..7e17e1e 100644
--- a/src/backend/access/transam/xloginsert.c
+++ b/src/backend/access/transam/xloginsert.c
@@ -19,6 +19,7 @@
 
 #include "postgres.h"
 
+#include "access/gistxlog.h"
 #include "access/xact.h"
 #include "access/xlog.h"
 #include "access/xlog_internal.h"
@@ -399,6 +400,8 @@ XLogRegisterBufData(uint8 block_id, char *data, int len)
  * - XLOG_MARK_UNIMPORTANT, to signal that the record is not important for
  *	 durability, which allows to avoid triggering WAL archiving and other
  *	 background activity.
+ * - XLOG_MARK_ESSENTIAL, to issue limited types of crucial WAL even when
+ *	 WAL logging is off.
  */
 void
 XLogSetRecordFlags(uint8 flags)
@@ -449,6 +452,14 @@ XLogInsert(RmgrId rmid, uint8 info)
 		return EndPos;
 	}
 
+	/* Issues only limited types of WAL when wal logging is disabled */
+	if (wal_level == WAL_LEVEL_NONE &&
+		!(curinsert_flags & XLOG_MARK_ESSENTIAL))
+	{
+		XLogResetInsertion();
+		return GetXLogInsertRecPtr();
+	}
+
 	do
 	{
 		XLogRecPtr	RedoRecPtr;
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index 4a3ca78..6a76358 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -911,10 +911,10 @@ PostmasterMain(int argc, char *argv[])
 					 ReservedBackends, MaxConnections);
 		ExitPostmaster(1);
 	}
-	if (XLogArchiveMode > ARCHIVE_MODE_OFF && wal_level == WAL_LEVEL_MINIMAL)
+	if (XLogArchiveMode > ARCHIVE_MODE_OFF && wal_level <= WAL_LEVEL_MINIMAL)
 		ereport(ERROR,
-				(errmsg("WAL archival cannot be enabled when wal_level is \"minimal\"")));
-	if (max_wal_senders > 0 && wal_level == WAL_LEVEL_MINIMAL)
+				(errmsg("WAL archival cannot be enabled when wal_level is \"none\" or \"minimal\"")));
+	if (max_wal_senders > 0 && wal_level <= WAL_LEVEL_MINIMAL)
 		ereport(ERROR,
 				(errmsg("WAL streaming (max_wal_senders > 0) requires wal_level \"replica\" or \"logical\"")));
 
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 39da7cc..29f14b5 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -197,8 +197,10 @@
 
 # - Settings -
 
-#wal_level = replica			# minimal, replica, or logical
-					# (change requires restart)
+#wal_level = replica			# none, minimal, replica, or logical
+					# (change requires restart.
+					# choosing wal_level=none
+					# can cause unrecoverable data corruption)
 #fsync = on				# flush data to disk for crash safety
 					# (turning this off can cause
 					# unrecoverable data corruption)
diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c
index f911f98..84b70c1 100644
--- a/src/bin/pg_controldata/pg_controldata.c
+++ b/src/bin/pg_controldata/pg_controldata.c
@@ -74,6 +74,8 @@ wal_level_str(WalLevel wal_level)
 {
 	switch (wal_level)
 	{
+		case WAL_LEVEL_NONE:
+			return "none";
 		case WAL_LEVEL_MINIMAL:
 			return "minimal";
 		case WAL_LEVEL_REPLICA:
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 77187c1..031bb26 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -162,7 +162,8 @@ extern int	XLogArchiveMode;
 /* WAL levels */
 typedef enum WalLevel
 {
-	WAL_LEVEL_MINIMAL = 0,
+	WAL_LEVEL_NONE = 0,
+	WAL_LEVEL_MINIMAL,
 	WAL_LEVEL_REPLICA,
 	WAL_LEVEL_LOGICAL
 } WalLevel;
@@ -247,7 +248,8 @@ extern bool XLOG_DEBUG;
 #define XLOG_INCLUDE_ORIGIN		0x01	/* include the replication origin */
 #define XLOG_MARK_UNIMPORTANT	0x02	/* record not important for durability */
 #define XLOG_INCLUDE_XID		0x04	/* include XID of top-level xact */
-
+#define XLOG_MARK_ESSENTIAL		0x08	/* necessary even when WAL logging is
+										 * disabled */
 
 /* Checkpoint statistics */
 typedef struct CheckpointStatsData
diff --git a/src/include/access/xlogdefs.h b/src/include/access/xlogdefs.h
index 0940b64..208f535 100644
--- a/src/include/access/xlogdefs.h
+++ b/src/include/access/xlogdefs.h
@@ -66,7 +66,7 @@ typedef uint16 RepOriginId;
 
 /*
  *	Because O_DIRECT bypasses the kernel buffers, and because we never
- *	read those buffers except during crash recovery or if wal_level != minimal,
+ *	read those buffers except during crash recovery or if wal_level > minimal,
  *	it is a win to use it in all cases where we sync on each write().  We could
  *	allow O_DIRECT with fsync(), but it is unclear if fsync() could process
  *	writes not buffered in the kernel.  Also, O_DIRECT is never enough to force
diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h
index 9a3a03e..b46a964 100644
--- a/src/include/utils/rel.h
+++ b/src/include/utils/rel.h
@@ -568,7 +568,8 @@ typedef struct ViewOptions
  * RelFileNode" in src/backend/access/transam/README.
  */
 #define RelationNeedsWAL(relation)										\
-	(RelationIsPermanent(relation) && (XLogIsNeeded() ||				\
+	(wal_level != WAL_LEVEL_NONE &&										\
+	 RelationIsPermanent(relation) && (XLogIsNeeded() ||				\
 	  (relation->rd_createSubid == InvalidSubTransactionId &&			\
 	   relation->rd_firstRelfilenodeSubid == InvalidSubTransactionId)))
 
diff --git a/src/test/recovery/t/024_archive_recovery.pl b/src/test/recovery/t/024_archive_recovery.pl
index 2d8d594..cf9cbe2 100644
--- a/src/test/recovery/t/024_archive_recovery.pl
+++ b/src/test/recovery/t/024_archive_recovery.pl
@@ -84,7 +84,7 @@ sub test_recovery_wal_level_minimal
 	# Confirm that the archive recovery fails with an expected error
 	my $logfile = slurp_file($recovery_node->logfile());
 	ok( $logfile =~
-		qr/FATAL:  WAL was generated with wal_level=minimal, cannot continue recovering/,
+		qr/FATAL:  WAL was generated with wal_level<=minimal, cannot continue recovering/,
 		"$node_text ends with an error because it finds WAL generated with wal_level=minimal");
 }
 
-- 
2.2.0

