On Mon, Jul 3, 2023 at 7:47 PM Peter Eisentraut <pe...@eisentraut.org> wrote:
> When we added --clone, copy_file_range() was available, but the problem
> was that it was hard for the user to predict whether you'd get the fast
> clone behavior or the slow copy behavior.  That's the kind of thing you
> want to know when planning and testing your upgrade.  At the time, there
> were patches passed around in Linux kernel circles that would have been
> able to enforce cloning via the flags argument of copy_file_range(), but
> that didn't make it to the mainline.
>
> So, yes, being able to specify exactly which copy mechanism to use makes
> sense, so that users can choose the tradeoffs.

Thanks for looking.  Yeah, it is quite inconvenient for planning
purposes that it is hard for a user to know which internal strategy it
uses, but that's the interface we have (and clearly "flags" is
reserved for future usage so that might still evolve..).

> About your patch:
>
> I think you should have a "check" function called from
> check_new_cluster().  That check function can then also handle the "not
> supported" case, and you don't need to handle that in
> parseCommandLine().  I suggest following the clone example for these,
> since the issues there are very similar.

Done.
From 9ea1c3fc39a47f634a4fffd1ff1c9b9cf0299d65 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.mu...@gmail.com>
Date: Fri, 2 Jun 2023 13:35:54 -0400
Subject: [PATCH v2] Add --copy-file-range option to pg_upgrade.

The copy_file_range() system call is available on at least Linux and
FreeBSD, and asks the kernel to use efficient ways to copy ranges of a
file.  Options available to the kernel include sharing block ranges
(similar to --clone mode), and pushing down block copies to the storage
layer.

Reviewed-by: Peter Eisentraut <pe...@eisentraut.org>
Discussion: https://postgr.es/m/CA%2BhUKGKe7Hb0-UNih8VD5UNZy5-ojxFb3Pr3xSBBL8qj2M2%3DdQ%40mail.gmail.com
---
 configure                          |  2 +-
 configure.ac                       |  1 +
 doc/src/sgml/ref/pgupgrade.sgml    | 13 +++++
 meson.build                        |  1 +
 src/bin/pg_upgrade/check.c         |  3 ++
 src/bin/pg_upgrade/file.c          | 78 ++++++++++++++++++++++++++++++
 src/bin/pg_upgrade/option.c        |  7 ++-
 src/bin/pg_upgrade/pg_upgrade.h    |  4 ++
 src/bin/pg_upgrade/relfilenumber.c |  8 +++
 src/include/pg_config.h.in         |  3 ++
 src/tools/msvc/Solution.pm         |  1 +
 11 files changed, 119 insertions(+), 2 deletions(-)

diff --git a/configure b/configure
index d47e0f8b26..2076b19a1b 100755
--- a/configure
+++ b/configure
@@ -15578,7 +15578,7 @@ fi
 LIBS_including_readline="$LIBS"
 LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'`
 
-for ac_func in backtrace_symbols copyfile getifaddrs getpeerucred inet_pton kqueue mbstowcs_l memset_s posix_fallocate ppoll pthread_is_threaded_np setproctitle setproctitle_fast strchrnul strsignal syncfs sync_file_range uselocale wcstombs_l
+for ac_func in backtrace_symbols copyfile copy_file_range getifaddrs getpeerucred inet_pton kqueue mbstowcs_l memset_s posix_fallocate ppoll pthread_is_threaded_np setproctitle setproctitle_fast strchrnul strsignal syncfs sync_file_range uselocale wcstombs_l
 do :
   as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
 ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var"
diff --git a/configure.ac b/configure.ac
index 440b08d113..d0d31dd91e 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1767,6 +1767,7 @@ LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'`
 AC_CHECK_FUNCS(m4_normalize([
 	backtrace_symbols
 	copyfile
+	copy_file_range
 	getifaddrs
 	getpeerucred
 	inet_pton
diff --git a/doc/src/sgml/ref/pgupgrade.sgml b/doc/src/sgml/ref/pgupgrade.sgml
index f17fdb1ba5..8275cc067b 100644
--- a/doc/src/sgml/ref/pgupgrade.sgml
+++ b/doc/src/sgml/ref/pgupgrade.sgml
@@ -263,6 +263,19 @@ PostgreSQL documentation
       </listitem>
      </varlistentry>
 
+     <varlistentry>
+      <term><option>--copy-file-range</option></term>
+      <listitem>
+       <para>
+        Use the <function>copy_file_range</function> system call for efficient
+        copying.  On some file systems this gives results similar to
+        <option>--clone</option>, sharing physical disk blocks, while on others
+        it may still copy blocks, but do so via an optimized path.  At present,
+        it is supported on Linux and FreeBSD.
+       </para>
+      </listitem>
+     </varlistentry>
+
      <varlistentry>
       <term><option>-?</option></term>
       <term><option>--help</option></term>
diff --git a/meson.build b/meson.build
index 862c955453..20e7327e9e 100644
--- a/meson.build
+++ b/meson.build
@@ -2415,6 +2415,7 @@ func_checks = [
   ['backtrace_symbols', {'dependencies': [execinfo_dep]}],
   ['clock_gettime', {'dependencies': [rt_dep], 'define': false}],
   ['copyfile'],
+  ['copy_file_range'],
   # gcc/clang's sanitizer helper library provides dlopen but not dlsym, thus
   # when enabling asan the dlopen check doesn't notice that -ldl is actually
   # required. Just checking for dlsym() ought to suffice.
diff --git a/src/bin/pg_upgrade/check.c b/src/bin/pg_upgrade/check.c
index 21a0ff9e42..4a615edb62 100644
--- a/src/bin/pg_upgrade/check.c
+++ b/src/bin/pg_upgrade/check.c
@@ -213,6 +213,9 @@ check_new_cluster(void)
 			break;
 		case TRANSFER_MODE_COPY:
 			break;
+		case TRANSFER_MODE_COPY_FILE_RANGE:
+			check_copy_file_range();
+			break;
 		case TRANSFER_MODE_LINK:
 			check_hard_link();
 			break;
diff --git a/src/bin/pg_upgrade/file.c b/src/bin/pg_upgrade/file.c
index d173602882..e30d944be3 100644
--- a/src/bin/pg_upgrade/file.c
+++ b/src/bin/pg_upgrade/file.c
@@ -10,6 +10,7 @@
 #include "postgres_fe.h"
 
 #include <sys/stat.h>
+#include <limits.h>
 #include <fcntl.h>
 #ifdef HAVE_COPYFILE_H
 #include <copyfile.h>
@@ -140,6 +141,45 @@ copyFile(const char *src, const char *dst,
 }
 
 
+/*
+ * copyFileByRange()
+ *
+ * Copies a relation file from src to dst.
+ * schemaName/relName are relation's SQL name (used for error messages only).
+ */
+void
+copyFileByRange(const char *src, const char *dst,
+				const char *schemaName, const char *relName)
+{
+#ifdef HAVE_COPY_FILE_RANGE
+	int			src_fd;
+	int			dest_fd;
+	ssize_t		nbytes;
+
+	if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) < 0)
+		pg_fatal("error while copying relation \"%s.%s\": could not open file \"%s\": %s",
+				 schemaName, relName, src, strerror(errno));
+
+	if ((dest_fd = open(dst, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
+						pg_file_create_mode)) < 0)
+		pg_fatal("error while copying relation \"%s.%s\": could not create file \"%s\": %s",
+				 schemaName, relName, dst, strerror(errno));
+
+	do
+	{
+		nbytes = copy_file_range(src_fd, NULL, dest_fd, NULL, SSIZE_MAX, 0);
+		if (nbytes < 0 && errno != EINTR)
+			pg_fatal("error while copying relation \"%s.%s\": could not copy file range from \"%s\" to \"%s\": %s",
+					 schemaName, relName, src, dst, strerror(errno));
+	}
+	while (nbytes > 0);
+
+	close(src_fd);
+	close(dest_fd);
+#endif
+}
+
+
 /*
  * linkFile()
  *
@@ -358,6 +398,44 @@ check_file_clone(void)
 	unlink(new_link_file);
 }
 
+void
+check_copy_file_range(void)
+{
+	char		existing_file[MAXPGPATH];
+	char		new_link_file[MAXPGPATH];
+
+	snprintf(existing_file, sizeof(existing_file), "%s/PG_VERSION", old_cluster.pgdata);
+	snprintf(new_link_file, sizeof(new_link_file), "%s/PG_VERSION.clonetest", new_cluster.pgdata);
+	unlink(new_link_file);		/* might fail */
+
+#if defined(HAVE_COPY_FILE_RANGE)
+	{
+		int			src_fd;
+		int			dest_fd;
+
+		if ((src_fd = open(existing_file, O_RDONLY | PG_BINARY, 0)) < 0)
+			pg_fatal("could not open file \"%s\": %s",
+					 existing_file, strerror(errno));
+
+		if ((dest_fd = open(new_link_file, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
+							pg_file_create_mode)) < 0)
+			pg_fatal("could not create file \"%s\": %s",
+					 new_link_file, strerror(errno));
+
+		if (copy_file_range(src_fd, NULL, dest_fd, NULL, SSIZE_MAX, 0) < 0)
+			pg_fatal("could not copy file range between old and new data directories: %s",
+					 strerror(errno));
+
+		close(src_fd);
+		close(dest_fd);
+	}
+#else
+	pg_fatal("copy_file_range not supported on this platform");
+#endif
+
+	unlink(new_link_file);
+}
+
 void
 check_hard_link(void)
 {
diff --git a/src/bin/pg_upgrade/option.c b/src/bin/pg_upgrade/option.c
index b9d900d0db..600ba7e8eb 100644
--- a/src/bin/pg_upgrade/option.c
+++ b/src/bin/pg_upgrade/option.c
@@ -58,7 +58,8 @@ parseCommandLine(int argc, char *argv[])
 		{"verbose", no_argument, NULL, 'v'},
 		{"clone", no_argument, NULL, 1},
 		{"copy", no_argument, NULL, 2},
-		{"sync-method", required_argument, NULL, 3},
+		{"copy-file-range", no_argument, NULL, 3},
+		{"sync-method", required_argument, NULL, 4},
 
 		{NULL, 0, NULL, 0}
 	};
@@ -203,6 +204,9 @@ parseCommandLine(int argc, char *argv[])
 				break;
 
 			case 3:
+				user_opts.transfer_mode = TRANSFER_MODE_COPY_FILE_RANGE;
+				break;
+			case 4:
 				if (!parse_sync_method(optarg, &unused))
 					exit(1);
 				user_opts.sync_method = pg_strdup(optarg);
@@ -301,6 +305,7 @@ usage(void)
 	printf(_("  -V, --version                 display version information, then exit\n"));
 	printf(_("  --clone                       clone instead of copying files to new cluster\n"));
 	printf(_("  --copy                        copy files to new cluster (default)\n"));
+	printf(_("  --copy-file-range             copy files to new cluster with copy_file_range\n"));
 	printf(_("  --sync-method=METHOD          set method for syncing files to disk\n"));
 	printf(_("  -?, --help                    show this help, then exit\n"));
 	printf(_("\n"
diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h
index 842f3b6cd3..25fb7dc7ad 100644
--- a/src/bin/pg_upgrade/pg_upgrade.h
+++ b/src/bin/pg_upgrade/pg_upgrade.h
@@ -234,6 +234,7 @@ typedef enum
 {
 	TRANSFER_MODE_CLONE,
 	TRANSFER_MODE_COPY,
+	TRANSFER_MODE_COPY_FILE_RANGE,
 	TRANSFER_MODE_LINK
 } transferMode;
 
@@ -380,11 +381,14 @@ void		cloneFile(const char *src, const char *dst,
 					  const char *schemaName, const char *relName);
 void		copyFile(const char *src, const char *dst,
 					 const char *schemaName, const char *relName);
+void		copyFileByRange(const char *src, const char *dst,
+							const char *schemaName, const char *relName);
 void		linkFile(const char *src, const char *dst,
 					 const char *schemaName, const char *relName);
 void		rewriteVisibilityMap(const char *fromfile, const char *tofile,
 								 const char *schemaName, const char *relName);
 void		check_file_clone(void);
+void		check_copy_file_range(void);
 void		check_hard_link(void);
 
 /* fopen_priv() is no longer different from fopen() */
diff --git a/src/bin/pg_upgrade/relfilenumber.c b/src/bin/pg_upgrade/relfilenumber.c
index 34bc9c1504..094a4db936 100644
--- a/src/bin/pg_upgrade/relfilenumber.c
+++ b/src/bin/pg_upgrade/relfilenumber.c
@@ -37,6 +37,9 @@ transfer_all_new_tablespaces(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr,
 		case TRANSFER_MODE_COPY:
 			prep_status_progress("Copying user relation files");
 			break;
+		case TRANSFER_MODE_COPY_FILE_RANGE:
+			prep_status_progress("Copying user relation files with copy_file_range");
+			break;
 		case TRANSFER_MODE_LINK:
 			prep_status_progress("Linking user relation files");
 			break;
@@ -250,6 +253,11 @@ transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_fro
 						   old_file, new_file);
 					copyFile(old_file, new_file, map->nspname, map->relname);
 					break;
+				case TRANSFER_MODE_COPY_FILE_RANGE:
+					pg_log(PG_VERBOSE, "copying \"%s\" to \"%s\" with copy_file_range",
+						   old_file, new_file);
+					copyFileByRange(old_file, new_file, map->nspname, map->relname);
+					break;
 				case TRANSFER_MODE_LINK:
 					pg_log(PG_VERBOSE, "linking \"%s\" to \"%s\"",
 						   old_file, new_file);
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index d8a2985567..d787484259 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -81,6 +81,9 @@
 /* Define to 1 if you have the <copyfile.h> header file. */
 #undef HAVE_COPYFILE_H
 
+/* Define to 1 if you have the `copy_file_range' function. */
+#undef HAVE_COPY_FILE_RANGE
+
 /* Define to 1 if you have the <crtdefs.h> header file. */
 #undef HAVE_CRTDEFS_H
 
diff --git a/src/tools/msvc/Solution.pm b/src/tools/msvc/Solution.pm
index a50f730260..3d72a6e4aa 100644
--- a/src/tools/msvc/Solution.pm
+++ b/src/tools/msvc/Solution.pm
@@ -229,6 +229,7 @@ sub GenerateFiles
 		HAVE_COMPUTED_GOTO => undef,
 		HAVE_COPYFILE => undef,
 		HAVE_COPYFILE_H => undef,
+		HAVE_COPY_FILE_RANGE => undef,
 		HAVE_CRTDEFS_H => undef,
 		HAVE_CRYPTO_LOCK => undef,
 		HAVE_DECL_FDATASYNC => 0,
-- 
2.42.0

Reply via email to