Hello,

Currently pg_rewind refuses to run if full_page_writes is off. This is to prevent it to run into a torn page during operation.

This is usually a good call, but some file systems like ZFS are naturally immune to torn page (maybe btrfs too, but I don't know for sure for this one).

Having the option to use pg_rewind without the cost associated with full_page_writes when using a system immune to torn page is beneficial: increased performance and more compact WAL.

This patch adds a new option "--no-ensure-full-page-writes" to pg_rewind for this situation, as well as patched documentation.

Regards,
Jeremie Grauer
From 4f973b7e2d4bd9c04b6b6ce2c825dfdab4dbad11 Mon Sep 17 00:00:00 2001
From: Jeremie Grauer <jeremie.gra...@cosium.com>
Date: Thu, 3 Nov 2022 16:23:49 +0100
Subject: [PATCH] adds the option --no-ensure-full-page-writes to pg_rewind

---
 doc/src/sgml/ref/pg_rewind.sgml  | 25 +++++++++++++++++-
 src/bin/pg_rewind/libpq_source.c | 44 ++++++++++++++++++--------------
 src/bin/pg_rewind/pg_rewind.c    | 43 ++++++++++++++++++-------------
 src/bin/pg_rewind/pg_rewind.h    |  1 +
 4 files changed, 75 insertions(+), 38 deletions(-)

diff --git a/doc/src/sgml/ref/pg_rewind.sgml b/doc/src/sgml/ref/pg_rewind.sgml
index 69d6924b3a..3dbdb503cd 100644
--- a/doc/src/sgml/ref/pg_rewind.sgml
+++ b/doc/src/sgml/ref/pg_rewind.sgml
@@ -99,7 +99,11 @@ PostgreSQL documentation
    in <filename>postgresql.conf</filename> or data checksums enabled when
    the cluster was initialized with <application>initdb</application>.  Neither of these
    are currently on by default.  <xref linkend="guc-full-page-writes"/>
-   must also be set to <literal>on</literal>, but is enabled by default.
+   should also be set to <literal>on</literal> (which is the default) except when the
+   underlying system is not succeptible to torn pages. ZFS filesystem for instance doesn't
+   need this parameter. If you are running such a system, you need to add the option
+   --no-enforce-full-page-writes else <application>pg_rewind</application> will refuse
+   to run.
   </para>
 
   <warning>
@@ -283,6 +287,25 @@ PostgreSQL documentation
       </listitem>
      </varlistentry>
 
+     <varlistentry>
+      <term><option>--no-ensure-full-page-writes</option></term>
+      <listitem>
+       <para>
+        <application>pg_rewind</application> by default requires that <xref linkend="guc-full-page-writes"/>
+        is set to on. This ensures that <application>pg_rewind</application> do
+        not run into torn pages while running. This option is necessary most of
+        the time since very few filesystems enforce the flush of full page to disk.
+        One such system is ZFS where the page is always flushed to disk in its
+        entirety. Other storage system may also have the same warranty. This option
+        is here to allow pg_rewind to run on such systems without enforcing that
+        <xref linkend="guc-full-page-writes"/> is on. It's important to know what
+        you are doing before setting this setting because in case your system does
+        not really warrants that the page is persisted to disk in its entirety, you
+        may run into data corruption.
+       </para>
+      </listitem>
+     </varlistentry>
+
      <varlistentry>
       <term><option>-V</option></term>
       <term><option>--version</option></term>
diff --git a/src/bin/pg_rewind/libpq_source.c b/src/bin/pg_rewind/libpq_source.c
index 011c9cce6e..7b0c1d54f2 100644
--- a/src/bin/pg_rewind/libpq_source.c
+++ b/src/bin/pg_rewind/libpq_source.c
@@ -132,26 +132,32 @@ init_libpq_conn(PGconn *conn)
 	PQclear(res);
 
 	/*
-	 * Also check that full_page_writes is enabled.  We can get torn pages if
-	 * a page is modified while we read it with pg_read_binary_file(), and we
-	 * rely on full page images to fix them.
+	 * If --no-ensure-full-page-write is false (the default), check that
+	 * full_page_writes is enabled. The goal here is to not run into a torn
+	 * pages. Torn page can happend if a page is modified while we read it
+	 * with pg_read_binary_file(). Some file systems are immune to torn page,
+	 * this is why there is an option to disable this check. Outside of those
+	 * specific systems, we rely on full_page_writes to avoid torn page.
 	 */
-	str = run_simple_query(conn, "SHOW full_page_writes");
-	if (strcmp(str, "on") != 0)
-		pg_fatal("full_page_writes must be enabled in the source server");
-	pg_free(str);
-
-	/* Prepare a statement we'll use to fetch files */
-	res = PQprepare(conn, "fetch_chunks_stmt",
-					"SELECT path, begin,\n"
-					"  pg_read_binary_file(path, begin, len, true) AS chunk\n"
-					"FROM unnest ($1::text[], $2::int8[], $3::int4[]) as x(path, begin, len)",
-					3, NULL);
-
-	if (PQresultStatus(res) != PGRES_COMMAND_OK)
-		pg_fatal("could not prepare statement to fetch file contents: %s",
-				 PQresultErrorMessage(res));
-	PQclear(res);
+	if (!no_ensure_full_page_writes)
+	{
+		str = run_simple_query(conn, "SHOW full_page_writes");
+		if (strcmp(str, "on") != 0)
+			pg_fatal("full_page_writes must be enabled in the source server");
+		pg_free(str);
+
+		/* Prepare a statement we'll use to fetch files */
+		res = PQprepare(conn, "fetch_chunks_stmt",
+						"SELECT path, begin,\n"
+						"  pg_read_binary_file(path, begin, len, true) AS chunk\n"
+						"FROM unnest ($1::text[], $2::int8[], $3::int4[]) as x(path, begin, len)",
+						3, NULL);
+
+		if (PQresultStatus(res) != PGRES_COMMAND_OK)
+			pg_fatal("could not prepare statement to fetch file contents: %s",
+					 PQresultErrorMessage(res));
+		PQclear(res);
+	}
 }
 
 /*
diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c
index 3cd77c09b1..230e48cfc3 100644
--- a/src/bin/pg_rewind/pg_rewind.c
+++ b/src/bin/pg_rewind/pg_rewind.c
@@ -68,6 +68,7 @@ bool		showprogress = false;
 bool		dry_run = false;
 bool		do_sync = true;
 bool		restore_wal = false;
+bool		no_ensure_full_page_writes = false;
 
 /* Target history */
 TimeLineHistoryEntry *targetHistory;
@@ -86,23 +87,24 @@ usage(const char *progname)
 	printf(_("%s resynchronizes a PostgreSQL cluster with another copy of the cluster.\n\n"), progname);
 	printf(_("Usage:\n  %s [OPTION]...\n\n"), progname);
 	printf(_("Options:\n"));
-	printf(_("  -c, --restore-target-wal       use restore_command in target configuration to\n"
-			 "                                 retrieve WAL files from archives\n"));
-	printf(_("  -D, --target-pgdata=DIRECTORY  existing data directory to modify\n"));
-	printf(_("      --source-pgdata=DIRECTORY  source data directory to synchronize with\n"));
-	printf(_("      --source-server=CONNSTR    source server to synchronize with\n"));
-	printf(_("  -n, --dry-run                  stop before modifying anything\n"));
-	printf(_("  -N, --no-sync                  do not wait for changes to be written\n"
-			 "                                 safely to disk\n"));
-	printf(_("  -P, --progress                 write progress messages\n"));
-	printf(_("  -R, --write-recovery-conf      write configuration for replication\n"
-			 "                                 (requires --source-server)\n"));
-	printf(_("      --config-file=FILENAME     use specified main server configuration\n"
-			 "                                 file when running target cluster\n"));
-	printf(_("      --debug                    write a lot of debug messages\n"));
-	printf(_("      --no-ensure-shutdown       do not automatically fix unclean shutdown\n"));
-	printf(_("  -V, --version                  output version information, then exit\n"));
-	printf(_("  -?, --help                     show this help, then exit\n"));
+	printf(_("  -c, --restore-target-wal               use restore_command in target configuration to\n"
+			 "                                         retrieve WAL files from archives\n"));
+	printf(_("  -D, --target-pgdata=DIRECTORY          existing data directory to modify\n"));
+	printf(_("      --source-pgdata=DIRECTORY          source data directory to synchronize with\n"));
+	printf(_("      --source-server=CONNSTR            source server to synchronize with\n"));
+	printf(_("  -n, --dry-run                          stop before modifying anything\n"));
+	printf(_("  -N, --no-sync                          do not wait for changes to be written\n"
+			 "                                         safely to disk\n"));
+	printf(_("  -P, --progress                         write progress messages\n"));
+	printf(_("  -R, --write-recovery-conf              write configuration for replication\n"
+			 "                                         (requires --source-server)\n"));
+	printf(_("      --config-file=FILENAME             use specified main server configuration\n"
+			 "                                         file when running target cluster\n"));
+	printf(_("      --debug                            write a lot of debug messages\n"));
+	printf(_("      --no-ensure-shutdown               do not automatically fix unclean shutdown\n"));
+	printf(_("      --no-ensure-full-page-writes       do not make sure that full_page_writes is on\n"));
+	printf(_("  -V, --version                          output version information, then exit\n"));
+	printf(_("  -?, --help                             show this help, then exit\n"));
 	printf(_("\nReport bugs to <%s>.\n"), PACKAGE_BUGREPORT);
 	printf(_("%s home page: <%s>\n"), PACKAGE_NAME, PACKAGE_URL);
 }
@@ -118,7 +120,8 @@ main(int argc, char **argv)
 		{"source-pgdata", required_argument, NULL, 1},
 		{"source-server", required_argument, NULL, 2},
 		{"no-ensure-shutdown", no_argument, NULL, 4},
-		{"config-file", required_argument, NULL, 5},
+		{"no-ensure-full-page-writes", no_argument, NULL, 5},
+		{"config-file", required_argument, NULL, 6},
 		{"version", no_argument, NULL, 'V'},
 		{"restore-target-wal", no_argument, NULL, 'c'},
 		{"dry-run", no_argument, NULL, 'n'},
@@ -207,6 +210,10 @@ main(int argc, char **argv)
 				break;
 
 			case 5:
+				no_ensure_full_page_writes = true;
+				break;
+
+			case 6:
 				config_file = pg_strdup(optarg);
 				break;
 
diff --git a/src/bin/pg_rewind/pg_rewind.h b/src/bin/pg_rewind/pg_rewind.h
index 80c276285a..8ddb9e34b1 100644
--- a/src/bin/pg_rewind/pg_rewind.h
+++ b/src/bin/pg_rewind/pg_rewind.h
@@ -23,6 +23,7 @@ extern char *datadir_target;
 extern bool showprogress;
 extern bool dry_run;
 extern bool do_sync;
+extern bool no_ensure_full_page_writes;
 extern int	WalSegSz;
 
 /* Target history */
-- 
2.38.1

Reply via email to