From 73a24030c622605ed9b692cdd4814f558cb33f7d Mon Sep 17 00:00:00 2001
From: Osumi Takamichi <osumi.takamichi@fujitsu.com>
Date: Thu, 3 Dec 2020 02:48:25 +0000
Subject: [PATCH v05] new wal_level to disable WAL logging

To speed up operations, especially bulk data loading or
restoring the output of pg_dumpall, this feature adds a new wal_level
that generates only limited types of WAL related to transactions and XLOG resources.
This speed-up comes at the cost of crash recovery.

While this new wal_level is in effect,
an unexpected stoppage or shutdown of the server
leaves the whole cluster corrupted and unrecoverable.
In other words, any kind of accident prevents the server from ever
starting up again. Therefore, taking a full backup before and after
changing to this wal_level is a must.
                                                                                             
Author: Takamichi Osumi <osumi.takamichi@fujitsu.com>                                        
Reviewed-by: Tsunakawa Takayuki <tsunakawa.takay@fujitsu.com>                                
Reviewed-by: Fujii Masao <masao.fujii@oss.nttdata.com>                                       
Reviewed-by: Laurenz Albe <laurenz.albe@cybertec.at>                                         
Reviewed-by: Ashutosh Bapat <ashutosh.bapat.oss@gmail.com>                                   
Reviewed-by: Kyotaro Horiguchi <horikyota.ntt@gmail.com>                                     
Reviewed-by: Masahiko Sawada <sawada.mshk@gmail.com>                                         
Discussion: https://www.postgresql.org/message-id/TYAPR01MB29901EBE5A3ACCE55BA99186FE320%40TYAPR01MB2990.jpnprd01.prod.outlook.com
---
 doc/src/sgml/config.sgml                      | 20 ++++++++++++++++++--
 doc/src/sgml/perform.sgml                     | 13 ++++++++++---
 src/backend/access/rmgrdesc/xlogdesc.c        |  1 +
 src/backend/access/transam/varsup.c           |  2 +-
 src/backend/access/transam/xlog.c             | 15 ++++++++++++---
 src/backend/access/transam/xloginsert.c       | 10 ++++++++++
 src/backend/postmaster/postmaster.c           |  6 +++---
 src/backend/utils/misc/postgresql.conf.sample |  6 ++++--
 src/bin/pg_controldata/pg_controldata.c       |  2 ++
 src/include/access/xlog.h                     |  3 ++-
 src/include/access/xlogdefs.h                 |  2 +-
 src/include/utils/rel.h                       |  3 ++-
 12 files changed, 66 insertions(+), 17 deletions(-)

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 8cd3d69..58366e9 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -2616,7 +2616,15 @@ include_dir 'conf.d'
         data to support WAL archiving and replication, including running
         read-only queries on a standby server. <literal>minimal</literal> removes all
         logging except the information required to recover from a crash or
-        immediate shutdown.  Finally,
+        immediate shutdown.  <literal>none</literal> generates only strictly limited types of WAL
+        for transaction and XLOG resources, such as the records that write 2PC state
+        or the shutdown checkpoint. This means that the amount of WAL generated under <literal>none</literal>
+        is much less than that under <literal>minimal</literal> for ordinary operations.
+        The purpose of <literal>none</literal>
+        is to accelerate bulk data loading at the expense of crash recovery.
+        Accordingly, note that a crash while <literal>none</literal> is in effect leaves
+        the whole cluster corrupted and unable to start up again. Therefore, never use this mode
+        unless the operations performed under it are repeatable and the cluster is backed up. Finally,
         <literal>logical</literal> adds information necessary to support logical
         decoding.  Each level includes the information logged at all lower
         levels.  This parameter can only be set at server start.
@@ -2640,6 +2648,13 @@ include_dir 'conf.d'
         data from a base backup and the WAL logs, so <literal>replica</literal> or
         higher must be used to enable WAL archiving
         (<xref linkend="guc-archive-mode"/>) and streaming replication.
+        In the same way, <literal>none</literal> in principle generates almost no WAL at all.
+        Therefore, this <varname>wal_level</varname> can be used to maximize the speed of data loading,
+        for example, bulk data loading or a version upgrade using <application>pg_dumpall</application>.
+        On the other hand, an unexpected crash of the server leaves the database cluster
+        inconsistent and unable to restart. For that reason, before using this level,
+        take a full backup of the cluster and keep a record of all the operations
+        that are performed while <varname>wal_level</varname> is <literal>none</literal>, so that they can be repeated.
        </para>
        <para>
         In <literal>logical</literal> level, the same information is logged as
@@ -3397,7 +3412,8 @@ include_dir 'conf.d'
         changed without leaving archiving mode.
         This parameter can only be set at server start.
         <varname>archive_mode</varname> cannot be enabled when
-        <varname>wal_level</varname> is set to <literal>minimal</literal>.
+        <varname>wal_level</varname> is set to <literal>none</literal> or
+        <literal>minimal</literal>.
        </para>
       </listitem>
      </varlistentry>
diff --git a/doc/src/sgml/perform.sgml b/doc/src/sgml/perform.sgml
index 117a1f7..9b5202b 100644
--- a/doc/src/sgml/perform.sgml
+++ b/doc/src/sgml/perform.sgml
@@ -1741,10 +1741,17 @@ SELECT * FROM x, y, a, b, c WHERE something AND somethingelse;
     new base backup after the load has completed than to process a large
     amount of incremental WAL data.  To prevent incremental WAL logging
     while loading, disable archiving and streaming replication, by setting
-    <xref linkend="guc-wal-level"/> to <literal>minimal</literal>,
+    <xref linkend="guc-wal-level"/> to either <literal>none</literal>
+    or <literal>minimal</literal>,
     <xref linkend="guc-archive-mode"/> to <literal>off</literal>, and
     <xref linkend="guc-max-wal-senders"/> to zero.
-    But note that changing these settings requires a server restart.
+    Setting <literal>wal_level</literal> to <literal>none</literal>
+    is an extremely performance-oriented option. Therefore, keep
+    in mind that a crash during the data load causes
+    corruption of the whole cluster. If that happens,
+    the server will not be able to start again. Thus, the administrator
+    needs to restore the cluster from the full backup taken just before
+    the operation. Also, note that changing these settings requires a server restart.
    </para>
 
    <para>
@@ -1810,7 +1817,7 @@ SELECT * FROM x, y, a, b, c WHERE something AND somethingelse;
        If using WAL archiving or streaming replication, consider disabling
        them during the restore. To do that, set <varname>archive_mode</varname>
        to <literal>off</literal>,
-       <varname>wal_level</varname> to <literal>minimal</literal>, and
+       <varname>wal_level</varname> to <literal>minimal</literal> or <literal>none</literal>, and
        <varname>max_wal_senders</varname> to zero before loading the dump.
        Afterwards, set them back to the right values and take a fresh
        base backup.
diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c
index 3200f77..8293b3b 100644
--- a/src/backend/access/rmgrdesc/xlogdesc.c
+++ b/src/backend/access/rmgrdesc/xlogdesc.c
@@ -25,6 +25,7 @@
  * GUC support
  */
 const struct config_enum_entry wal_level_options[] = {
+	{"none", WAL_LEVEL_NONE, false},
 	{"minimal", WAL_LEVEL_MINIMAL, false},
 	{"replica", WAL_LEVEL_REPLICA, false},
 	{"archive", WAL_LEVEL_REPLICA, true},	/* deprecated */
diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c
index a4944fa..712943a 100644
--- a/src/backend/access/transam/varsup.c
+++ b/src/backend/access/transam/varsup.c
@@ -368,7 +368,7 @@ SetTransactionIdLimit(TransactionId oldest_datfrozenxid, Oid oldest_datoid)
 	 * within 3M transactions of data loss.  This leaves lots of room for the
 	 * DBA to fool around fixing things in a standalone backend, while not
 	 * being significant compared to total XID space. (VACUUM requires an XID
-	 * if it truncates at wal_level!=minimal.  "VACUUM (ANALYZE)", which a DBA
+	 * if it truncates at wal_level>minimal.  "VACUUM (ANALYZE)", which a DBA
 	 * might do by reflex, assigns an XID.  Hence, we had better be sure
 	 * there's lots of XIDs left...)  Also, at default BLCKSZ, this leaves two
 	 * completely-idle segments.  In the event of edge-case bugs involving
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 13f1d8c..4202444 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -6255,11 +6255,11 @@ CheckRequiredParameterValues(void)
 	 * For archive recovery, the WAL must be generated with at least 'replica'
 	 * wal_level.
 	 */
-	if (ArchiveRecoveryRequested && ControlFile->wal_level == WAL_LEVEL_MINIMAL)
+	if (ArchiveRecoveryRequested && ControlFile->wal_level <= WAL_LEVEL_MINIMAL)
 	{
 		ereport(WARNING,
-				(errmsg("WAL was generated with wal_level=minimal, data may be missing"),
-				 errhint("This happens if you temporarily set wal_level=minimal without taking a new base backup.")));
+				(errmsg("WAL was generated with wal_level<=minimal, data may be missing"),
+				 errhint("This happens if you temporarily set wal_level<=minimal without taking a new base backup.")));
 	}
 
 	/*
@@ -6387,6 +6387,15 @@ StartupXLOG(void)
 					(errmsg("control file contains invalid database cluster state")));
 	}
 
+	/*
+	 * Detect if the server previously crashed under wal_level='none' or not.
+	 */
+	if (ControlFile->wal_level == WAL_LEVEL_NONE &&
+		(ControlFile->state != DB_SHUTDOWNED && ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY))
+		ereport(ERROR,
+				(errmsg("detected an unexpected server shutdown when WAL logging was disabled"),
+				 errhint("It looks like you need to deploy a new cluster from your full backup again.")));
+
 	/* This is just to allow attaching to startup process with a debugger */
 #ifdef XLOG_REPLAY_DELAY
 	if (ControlFile->state != DB_SHUTDOWNED)
diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c
index 1f0e4e0..0e653ea 100644
--- a/src/backend/access/transam/xloginsert.c
+++ b/src/backend/access/transam/xloginsert.c
@@ -449,6 +449,16 @@ XLogInsert(RmgrId rmid, uint8 info)
 		return EndPos;
 	}
 
+	/* Issue only limited types of WAL records for XLOG resources and transactions */
+	if (wal_level == WAL_LEVEL_NONE &&
+		!((rmid == RM_XLOG_ID && info == XLOG_CHECKPOINT_SHUTDOWN) ||
+		  (rmid == RM_XLOG_ID && info == XLOG_PARAMETER_CHANGE) ||
+		  (rmid == RM_XACT_ID && info == XLOG_XACT_PREPARE)))
+	{
+		XLogResetInsertion();
+		return GetXLogInsertRecPtr();
+	}
+
 	do
 	{
 		XLogRecPtr	RedoRecPtr;
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index b7799ed..27e4475 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -903,10 +903,10 @@ PostmasterMain(int argc, char *argv[])
 					 ReservedBackends, MaxConnections);
 		ExitPostmaster(1);
 	}
-	if (XLogArchiveMode > ARCHIVE_MODE_OFF && wal_level == WAL_LEVEL_MINIMAL)
+	if (XLogArchiveMode > ARCHIVE_MODE_OFF && wal_level <= WAL_LEVEL_MINIMAL)
 		ereport(ERROR,
-				(errmsg("WAL archival cannot be enabled when wal_level is \"minimal\"")));
-	if (max_wal_senders > 0 && wal_level == WAL_LEVEL_MINIMAL)
+				(errmsg("WAL archival cannot be enabled when wal_level is \"none\" or \"minimal\"")));
+	if (max_wal_senders > 0 && wal_level <= WAL_LEVEL_MINIMAL)
 		ereport(ERROR,
 				(errmsg("WAL streaming (max_wal_senders > 0) requires wal_level \"replica\" or \"logical\"")));
 
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 9c9091e..d177700 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -196,8 +196,10 @@
 
 # - Settings -
 
-#wal_level = replica			# minimal, replica, or logical
-					# (change requires restart)
+#wal_level = replica			# none, minimal, replica, or logical
+					# (change requires restart.
+					# choosing the none wal_level
+					# can cause unrecoverable data corruption)
 #fsync = on				# flush data to disk for crash safety
 					# (turning this off can cause
 					# unrecoverable data corruption)
diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c
index 3e00ac0..90ec0dc 100644
--- a/src/bin/pg_controldata/pg_controldata.c
+++ b/src/bin/pg_controldata/pg_controldata.c
@@ -74,6 +74,8 @@ wal_level_str(WalLevel wal_level)
 {
 	switch (wal_level)
 	{
+		case WAL_LEVEL_NONE:
+			return "none";
 		case WAL_LEVEL_MINIMAL:
 			return "minimal";
 		case WAL_LEVEL_REPLICA:
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 221af87..75f4404 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -161,7 +161,8 @@ extern int	XLogArchiveMode;
 /* WAL levels */
 typedef enum WalLevel
 {
-	WAL_LEVEL_MINIMAL = 0,
+	WAL_LEVEL_NONE = 0,
+	WAL_LEVEL_MINIMAL,
 	WAL_LEVEL_REPLICA,
 	WAL_LEVEL_LOGICAL
 } WalLevel;
diff --git a/src/include/access/xlogdefs.h b/src/include/access/xlogdefs.h
index e1f5812..197ad06 100644
--- a/src/include/access/xlogdefs.h
+++ b/src/include/access/xlogdefs.h
@@ -59,7 +59,7 @@ typedef uint16 RepOriginId;
 
 /*
  *	Because O_DIRECT bypasses the kernel buffers, and because we never
- *	read those buffers except during crash recovery or if wal_level != minimal,
+ *	read those buffers except during crash recovery or if wal_level > minimal,
  *	it is a win to use it in all cases where we sync on each write().  We could
  *	allow O_DIRECT with fsync(), but it is unclear if fsync() could process
  *	writes not buffered in the kernel.  Also, O_DIRECT is never enough to force
diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h
index c5ffea4..201afca 100644
--- a/src/include/utils/rel.h
+++ b/src/include/utils/rel.h
@@ -561,7 +561,8 @@ typedef struct ViewOptions
  * RelFileNode" in src/backend/access/transam/README.
  */
 #define RelationNeedsWAL(relation)										\
-	((relation)->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT &&	\
+	(wal_level != WAL_LEVEL_NONE &&                                     \
+	(relation)->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT &&	\
 	 (XLogIsNeeded() ||													\
 	  (relation->rd_createSubid == InvalidSubTransactionId &&			\
 	   relation->rd_firstRelfilenodeSubid == InvalidSubTransactionId)))
-- 
1.8.3.1

