From 4d7f4765470fe0617024e9b9c2067d353e0a0c60 Mon Sep 17 00:00:00 2001
From: Osumi Takamichi <osumi.takamichi@fujitsu.com>
Date: Tue, 24 Nov 2020 01:27:31 +0000
Subject: [PATCH v04] new wal_level to disable WAL logging

To speed up bulk data loading (for example, restoring the output of
pg_dumpall), this feature adds a new wal_level that turns off
generation of all WAL except records associated with transactions and
XLOG resources. The speed-up comes at the cost of crash recovery.

While this new wal_level is in use,
an unexpected stoppage or shutdown of the server
leaves the whole cluster corrupted and unrecoverable.
Therefore, taking a full backup before and after the operation is a must.

Author: Takamichi Osumi <osumi.takamichi@fujitsu.com>
Reviewed-by: Tsunakawa Takayuki <tsunakawa.takay@fujitsu.com>
Reviewed-by: Fujii Masao <masao.fujii@oss.nttdata.com>
Reviewed-by: Laurenz Albe <laurenz.albe@cybertec.at>
Reviewed-by: Ashutosh Bapat <ashutosh.bapat.oss@gmail.com>
Discussion: https://www.postgresql.org/message-id/TYAPR01MB29901EBE5A3ACCE55BA99186FE320%40TYAPR01MB2990.jpnprd01.prod.outlook.com
---
 doc/src/sgml/config.sgml                      | 20 ++++++++++++++++++--
 doc/src/sgml/perform.sgml                     | 13 ++++++++++---
 src/backend/access/rmgrdesc/xlogdesc.c        |  1 +
 src/backend/access/transam/varsup.c           |  2 +-
 src/backend/access/transam/xlog.c             | 13 +++++++++++--
 src/backend/access/transam/xloginsert.c       |  8 ++++++++
 src/backend/postmaster/postmaster.c           |  6 +++---
 src/backend/utils/misc/postgresql.conf.sample |  6 ++++--
 src/bin/pg_controldata/pg_controldata.c       |  2 ++
 src/include/access/xlog.h                     |  3 ++-
 src/include/access/xlogdefs.h                 |  2 +-
 src/include/utils/rel.h                       |  3 ++-
 12 files changed, 63 insertions(+), 16 deletions(-)

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index f043433..97c1a05 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -2591,7 +2591,15 @@ include_dir 'conf.d'
         data to support WAL archiving and replication, including running
         read-only queries on a standby server. <literal>minimal</literal> removes all
         logging except the information required to recover from a crash or
-        immediate shutdown.  Finally,
+        immediate shutdown.  <literal>none</literal> generates no WAL
+        except for records related to transactions and WAL management, such as
+        commit records and checkpoint records.  This means that the amount of WAL generated under
+        <literal>none</literal> can be much less than under <literal>minimal</literal> in normal operation.
+        The purpose of <literal>none</literal>
+        is to accelerate bulk data loading at the expense of recoverability.
+        Note that a crash while <varname>wal_level</varname> is <literal>none</literal> leaves
+        the whole cluster corrupted and unrecoverable.  Therefore, never use this level
+        unless the operations performed under it are repeatable and the cluster is backed up.  Finally,
         <literal>logical</literal> adds information necessary to support logical
         decoding.  Each level includes the information logged at all lower
         levels.  This parameter can only be set at server start.
@@ -2615,6 +2623,13 @@ include_dir 'conf.d'
         data from a base backup and the WAL logs, so <literal>replica</literal> or
         higher must be used to enable WAL archiving
         (<xref linkend="guc-archive-mode"/>) and streaming replication.
+        In the same way, <literal>none</literal> generates almost no WAL.
+        Therefore, this <varname>wal_level</varname> can be used to maximize the speed of data loading,
+        for example, bulk data loading or a version upgrade using <application>pg_dumpall</application>.
+        On the other hand, an unexpected crash of the server leaves the database cluster
+        inconsistent and unable to restart.  For that reason, before using this level,
+        take a full backup of the cluster and make sure that all operations performed
+        while <varname>wal_level</varname> is <literal>none</literal> can be repeated.
        </para>
        <para>
         In <literal>logical</literal> level, the same information is logged as
@@ -3372,7 +3387,8 @@ include_dir 'conf.d'
         changed without leaving archiving mode.
         This parameter can only be set at server start.
         <varname>archive_mode</varname> cannot be enabled when
-        <varname>wal_level</varname> is set to <literal>minimal</literal>.
+        <varname>wal_level</varname> is set to <literal>none</literal> or
+        <literal>minimal</literal>.
        </para>
       </listitem>
      </varlistentry>
diff --git a/doc/src/sgml/perform.sgml b/doc/src/sgml/perform.sgml
index 117a1f7..07d47d4 100644
--- a/doc/src/sgml/perform.sgml
+++ b/doc/src/sgml/perform.sgml
@@ -1741,10 +1741,17 @@ SELECT * FROM x, y, a, b, c WHERE something AND somethingelse;
     new base backup after the load has completed than to process a large
     amount of incremental WAL data.  To prevent incremental WAL logging
     while loading, disable archiving and streaming replication, by setting
-    <xref linkend="guc-wal-level"/> to <literal>minimal</literal>,
+    <xref linkend="guc-wal-level"/> to either <literal>none</literal>
+    or <literal>minimal</literal>,
     <xref linkend="guc-archive-mode"/> to <literal>off</literal>, and
     <xref linkend="guc-max-wal-senders"/> to zero.
-    But note that changing these settings requires a server restart.
+    Setting <literal>wal_level</literal> to <literal>none</literal>
+    is an extremely performance-oriented option.  Be aware that
+    a crash during data loading at this level
+    corrupts the whole cluster.  If that happens,
+    the server will not start again and the administrator
+    needs to restore the cluster from the full backup.
+    Also, note that changing these settings requires a server restart.
    </para>
 
    <para>
@@ -1810,7 +1817,7 @@ SELECT * FROM x, y, a, b, c WHERE something AND somethingelse;
        If using WAL archiving or streaming replication, consider disabling
        them during the restore. To do that, set <varname>archive_mode</varname>
        to <literal>off</literal>,
-       <varname>wal_level</varname> to <literal>minimal</literal>, and
+       <varname>wal_level</varname> to <literal>minimal</literal> or <literal>none</literal>, and
        <varname>max_wal_senders</varname> to zero before loading the dump.
        Afterwards, set them back to the right values and take a fresh
        base backup.
diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c
index 3200f77..8293b3b 100644
--- a/src/backend/access/rmgrdesc/xlogdesc.c
+++ b/src/backend/access/rmgrdesc/xlogdesc.c
@@ -25,6 +25,7 @@
  * GUC support
  */
 const struct config_enum_entry wal_level_options[] = {
+	{"none", WAL_LEVEL_NONE, false},
 	{"minimal", WAL_LEVEL_MINIMAL, false},
 	{"replica", WAL_LEVEL_REPLICA, false},
 	{"archive", WAL_LEVEL_REPLICA, true},	/* deprecated */
diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c
index a4944fa..712943a 100644
--- a/src/backend/access/transam/varsup.c
+++ b/src/backend/access/transam/varsup.c
@@ -368,7 +368,7 @@ SetTransactionIdLimit(TransactionId oldest_datfrozenxid, Oid oldest_datoid)
 	 * within 3M transactions of data loss.  This leaves lots of room for the
 	 * DBA to fool around fixing things in a standalone backend, while not
 	 * being significant compared to total XID space. (VACUUM requires an XID
-	 * if it truncates at wal_level!=minimal.  "VACUUM (ANALYZE)", which a DBA
+	 * if it truncates at wal_level>minimal.  "VACUUM (ANALYZE)", which a DBA
 	 * might do by reflex, assigns an XID.  Hence, we had better be sure
 	 * there's lots of XIDs left...)  Also, at default BLCKSZ, this leaves two
 	 * completely-idle segments.  In the event of edge-case bugs involving
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index aa63f37..887767b 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -6254,10 +6254,10 @@ CheckRequiredParameterValues(void)
 	 * For archive recovery, the WAL must be generated with at least 'replica'
 	 * wal_level.
 	 */
-	if (ArchiveRecoveryRequested && ControlFile->wal_level == WAL_LEVEL_MINIMAL)
+	if (ArchiveRecoveryRequested && ControlFile->wal_level <= WAL_LEVEL_MINIMAL)
 	{
 		ereport(WARNING,
-				(errmsg("WAL was generated with wal_level=minimal, data may be missing"),
+				(errmsg("WAL was generated with wal_level<=minimal, data may be missing"),
 				 errhint("This happens if you temporarily set wal_level=minimal without taking a new base backup.")));
 	}
 
@@ -6386,6 +6386,15 @@ StartupXLOG(void)
 					(errmsg("control file contains invalid database cluster state")));
 	}
 
+	/*
+	 * Refuse to start if the server was not shut down cleanly under wal_level='none'.
+	 */
+	if (ControlFile->wal_level == WAL_LEVEL_NONE &&
+		(ControlFile->state != DB_SHUTDOWNED && ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY))
+		ereport(ERROR,
+				(errmsg("detected an unexpected server shutdown when WAL logging was disabled"),
+				 errhint("It looks like you need to deploy a new cluster from your full backup again.")));
+
 	/* This is just to allow attaching to startup process with a debugger */
 #ifdef XLOG_REPLAY_DELAY
 	if (ControlFile->state != DB_SHUTDOWNED)
diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c
index 1f0e4e0..a44fb70 100644
--- a/src/backend/access/transam/xloginsert.c
+++ b/src/backend/access/transam/xloginsert.c
@@ -449,6 +449,14 @@ XLogInsert(RmgrId rmid, uint8 info)
 		return EndPos;
 	}
 
+	/* With wal_level=none, skip every record except those of the XLOG and XACT resource managers */
+	if (wal_level == WAL_LEVEL_NONE &&
+		rmid != RM_XLOG_ID && rmid != RM_XACT_ID)
+	{
+		XLogResetInsertion();
+		return GetXLogInsertRecPtr();
+	}
+
 	do
 	{
 		XLogRecPtr	RedoRecPtr;
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index b7799ed..27e4475 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -903,10 +903,10 @@ PostmasterMain(int argc, char *argv[])
 					 ReservedBackends, MaxConnections);
 		ExitPostmaster(1);
 	}
-	if (XLogArchiveMode > ARCHIVE_MODE_OFF && wal_level == WAL_LEVEL_MINIMAL)
+	if (XLogArchiveMode > ARCHIVE_MODE_OFF && wal_level <= WAL_LEVEL_MINIMAL)
 		ereport(ERROR,
-				(errmsg("WAL archival cannot be enabled when wal_level is \"minimal\"")));
-	if (max_wal_senders > 0 && wal_level == WAL_LEVEL_MINIMAL)
+				(errmsg("WAL archival cannot be enabled when wal_level is \"none\" or \"minimal\"")));
+	if (max_wal_senders > 0 && wal_level <= WAL_LEVEL_MINIMAL)
 		ereport(ERROR,
 				(errmsg("WAL streaming (max_wal_senders > 0) requires wal_level \"replica\" or \"logical\"")));
 
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 9cb571f..9bf89e5 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -196,8 +196,10 @@
 
 # - Settings -
 
-#wal_level = replica			# minimal, replica, or logical
-					# (change requires restart)
+#wal_level = replica			# none, minimal, replica, or logical
+					# (change requires restart;
+					# setting this to none can cause
+					# unrecoverable data corruption)
 #fsync = on				# flush data to disk for crash safety
 					# (turning this off can cause
 					# unrecoverable data corruption)
diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c
index 3e00ac0..90ec0dc 100644
--- a/src/bin/pg_controldata/pg_controldata.c
+++ b/src/bin/pg_controldata/pg_controldata.c
@@ -74,6 +74,8 @@ wal_level_str(WalLevel wal_level)
 {
 	switch (wal_level)
 	{
+		case WAL_LEVEL_NONE:
+			return "none";
 		case WAL_LEVEL_MINIMAL:
 			return "minimal";
 		case WAL_LEVEL_REPLICA:
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 221af87..75f4404 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -161,7 +161,8 @@ extern int	XLogArchiveMode;
 /* WAL levels */
 typedef enum WalLevel
 {
-	WAL_LEVEL_MINIMAL = 0,
+	WAL_LEVEL_NONE = 0,
+	WAL_LEVEL_MINIMAL,
 	WAL_LEVEL_REPLICA,
 	WAL_LEVEL_LOGICAL
 } WalLevel;
diff --git a/src/include/access/xlogdefs.h b/src/include/access/xlogdefs.h
index e1f5812..197ad06 100644
--- a/src/include/access/xlogdefs.h
+++ b/src/include/access/xlogdefs.h
@@ -59,7 +59,7 @@ typedef uint16 RepOriginId;
 
 /*
  *	Because O_DIRECT bypasses the kernel buffers, and because we never
- *	read those buffers except during crash recovery or if wal_level != minimal,
+ *	read those buffers except during crash recovery or if wal_level > minimal,
  *	it is a win to use it in all cases where we sync on each write().  We could
  *	allow O_DIRECT with fsync(), but it is unclear if fsync() could process
  *	writes not buffered in the kernel.  Also, O_DIRECT is never enough to force
diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h
index c5ffea4..201afca 100644
--- a/src/include/utils/rel.h
+++ b/src/include/utils/rel.h
@@ -561,7 +561,8 @@ typedef struct ViewOptions
  * RelFileNode" in src/backend/access/transam/README.
  */
 #define RelationNeedsWAL(relation)										\
-	((relation)->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT &&	\
+	(wal_level != WAL_LEVEL_NONE &&                                     \
+	(relation)->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT &&	\
 	 (XLogIsNeeded() ||													\
 	  (relation->rd_createSubid == InvalidSubTransactionId &&			\
 	   relation->rd_firstRelfilenodeSubid == InvalidSubTransactionId)))
-- 
1.8.3.1

