Add file cloning as an alternative data transfer method to pg_upgrade.
Currently only btrfs is supported, although copy-on-write cloning is
also available elsewhere (at least on ZFS).  Cloning must be requested
explicitly with --clone; if the operating system or filesystem does not
support it, pg_upgrade stops with a fatal error.

This provides upgrade performance similar to link mode while allowing
the old cluster to be used even after the new one has been started.

Signed-off-by: Oskari Saarenmaa <o...@ohmu.fi>
---
 configure                        |   5 +-
 configure.in                     |   7 ++-
 contrib/pg_upgrade/check.c       |   3 +
 contrib/pg_upgrade/file.c        | 125 +++++++++++++++++++++++++++++----------
 contrib/pg_upgrade/option.c      |   7 +++
 contrib/pg_upgrade/pg_upgrade.h  |  13 ++--
 contrib/pg_upgrade/relfilenode.c |  31 ++++------
 doc/src/sgml/pgupgrade.sgml      |   7 +++
 src/include/pg_config.h.in       |   3 +
 9 files changed, 141 insertions(+), 60 deletions(-)

diff --git a/configure b/configure
index c685ca3..5087463 100755
--- a/configure
+++ b/configure
@@ -10351,7 +10351,10 @@ done
 
 
 
-for ac_header in crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h poll.h pwd.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h
+for ac_header in crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h \
+    linux/btrfs.h poll.h pwd.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h \
+    sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h \
+    sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h
 do
 as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh`
 if { as_var=$as_ac_Header; eval "test \"\${$as_var+set}\" = set"; }; then
diff --git a/configure.in b/configure.in
index 82771bd..609aa73 100644
--- a/configure.in
+++ b/configure.in
@@ -982,7 +982,12 @@ AC_SUBST(OSSP_UUID_LIBS)
 ##
 
 dnl sys/socket.h is required by AC_FUNC_ACCEPT_ARGTYPES
-AC_CHECK_HEADERS([crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h langinfo.h poll.h pwd.h sys/ioctl.h sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h sys/select.h sys/sem.h sys/shm.h sys/socket.h sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h ucred.h utime.h wchar.h wctype.h])
+AC_CHECK_HEADERS([crypt.h dld.h fp_class.h getopt.h ieeefp.h ifaddrs.h \
+                  langinfo.h linux/btrfs.h poll.h pwd.h sys/ioctl.h \
+                  sys/ipc.h sys/poll.h sys/pstat.h sys/resource.h \
+                  sys/select.h sys/sem.h sys/shm.h sys/socket.h \
+                  sys/sockio.h sys/tas.h sys/time.h sys/un.h termios.h \
+                  ucred.h utime.h wchar.h wctype.h])
 
 # On BSD, test for net/if.h will fail unless sys/socket.h
 # is included first.
diff --git a/contrib/pg_upgrade/check.c b/contrib/pg_upgrade/check.c
index 0376fcb..2a52dd8 100644
--- a/contrib/pg_upgrade/check.c
+++ b/contrib/pg_upgrade/check.c
@@ -151,6 +151,9 @@ check_new_cluster(void)
        if (user_opts.transfer_mode == TRANSFER_MODE_LINK)
                check_hard_link();
 
+       if (user_opts.transfer_mode == TRANSFER_MODE_CLONE)
+               check_clone_file();
+
        check_is_super_user(&new_cluster);
 
        /*
diff --git a/contrib/pg_upgrade/file.c b/contrib/pg_upgrade/file.c
index dfeb79f..fc935b7 100644
--- a/contrib/pg_upgrade/file.c
+++ b/contrib/pg_upgrade/file.c
@@ -8,11 +8,16 @@
  */
 
 #include "postgres_fe.h"
+#include "pg_config.h"
 
 #include "pg_upgrade.h"
 
 #include <fcntl.h>
 
+#ifdef HAVE_LINUX_BTRFS_H
+# include <sys/ioctl.h>
+# include <linux/btrfs.h>
+#endif
 
 
 #ifndef WIN32
@@ -23,21 +28,42 @@ static int  win32_pghardlink(const char *src, const char *dst);
 
 
 /*
- * copyAndUpdateFile()
+ * upgradeFile()
  *
- *     Copies a relation file from src to dst.  If pageConverter is non-NULL, this function
- *     uses that pageConverter to do a page-by-page conversion.
+ * Transfer a relation file from src to dst using one of the supported
+ * methods.  If the on-disk format of the new cluster is bit-for-bit
+ * compatible with the on-disk format of the old cluster we can simply link
+ * each relation to perform a true in-place upgrade.  Otherwise we must copy
+ * (either block-by-block or using a copy-on-write clone) the data from old
+ * cluster to new cluster and then perform the conversion.
  */
 const char *
-copyAndUpdateFile(pageCnvCtx *pageConverter,
-                                 const char *src, const char *dst, bool force)
+upgradeFile(transferMode transfer_mode, const char *src,
+               const char *dst, pageCnvCtx *pageConverter)
 {
        if (pageConverter == NULL)
        {
-               if (pg_copy_file(src, dst, force) == -1)
-                       return getErrorText(errno);
-               else
-                       return NULL;
+               int rc = -1;
+
+               switch (transfer_mode)
+               {
+                       case TRANSFER_MODE_COPY:
+                               rc = pg_copy_file(src, dst, true);
+                               break;
+                       case TRANSFER_MODE_CLONE:
+                               rc = upg_clone_file(src, dst);
+                               break;
+                       case TRANSFER_MODE_LINK:
+                               rc = pg_link_file(src, dst);
+                               break;
+               }
+
+               return (rc < 0) ? getErrorText(errno) : NULL;
+       }
+       else if (transfer_mode != TRANSFER_MODE_COPY)
+       {
+               return "Cannot in-place update this cluster, "
+                       "page-by-page (copy-mode) conversion is required";
        }
        else
        {
@@ -100,29 +126,6 @@ copyAndUpdateFile(pageCnvCtx *pageConverter,
 }
 
 
-/*
- * linkAndUpdateFile()
- *
- * Creates a hard link between the given relation files. We use
- * this function to perform a true in-place update. If the on-disk
- * format of the new cluster is bit-for-bit compatible with the on-disk
- * format of the old cluster, we can simply link each relation
- * instead of copying the data from the old cluster to the new cluster.
- */
-const char *
-linkAndUpdateFile(pageCnvCtx *pageConverter,
-                                 const char *src, const char *dst)
-{
-       if (pageConverter != NULL)
-               return "Cannot in-place update this cluster, page-by-page conversion is required";
-
-       if (pg_link_file(src, dst) == -1)
-               return getErrorText(errno);
-       else
-               return NULL;
-}
-
-
 #ifndef WIN32
 static int
 copy_file(const char *srcfile, const char *dstfile, bool force)
@@ -228,6 +231,64 @@ win32_pghardlink(const char *src, const char *dst)
 #endif
 
 
+int
+upg_clone_file(const char *existing_file, const char *new_file)
+{
+#ifdef BTRFS_IOC_CLONE
+       int rc, res_errno = 0, src_fd = -1, dest_fd = -1;
+
+       src_fd = open(existing_file, O_RDONLY);
+       if (src_fd < 0)
+               return -1;
+
+       dest_fd = open(new_file, O_RDWR | O_CREAT | O_EXCL, S_IRUSR | S_IWUSR);
+       if (dest_fd < 0)
+       {
+               close(src_fd);
+               return -1;
+       }
+
+       rc = ioctl(dest_fd, BTRFS_IOC_CLONE, src_fd);
+       if (rc < 0)
+       {
+               pg_log(PG_REPORT, "btrfs clone: %s\n", strerror(errno));
+               res_errno = errno;  /* save errno for caller */
+               unlink(new_file);
+       }
+
+       close(dest_fd);
+       close(src_fd);
+
+       errno = res_errno;  /* restore errno after close() calls */
+       return rc;
+#else
+       /* TODO: add support for zfs clones */
+       pg_log(PG_REPORT, "system does not support file cloning\n");
+       errno = ENOSYS;
+       return -1;
+#endif
+}
+
+void
+check_clone_file(void)
+{
+       char            existing_file[MAXPGPATH];
+       char            cloned_file[MAXPGPATH];
+
+       snprintf(existing_file, sizeof(existing_file), "%s/PG_VERSION", old_cluster.pgdata);
+       snprintf(cloned_file, sizeof(cloned_file), "%s/PG_VERSION.linktest", new_cluster.pgdata);
+       unlink(cloned_file);            /* might fail */
+
+       if (upg_clone_file(existing_file, cloned_file) == -1)
+       {
+               pg_log(PG_FATAL,
+                          "Could not clone a file between old and new data directories: %s\n"
+                          "File cloning is currently only supported on btrfs.\n",
+                          getErrorText(errno));
+       }
+       unlink(cloned_file);
+}
+
 /* fopen() file with no group/other permissions */
 FILE *
 fopen_priv(const char *path, const char *mode)
diff --git a/contrib/pg_upgrade/option.c b/contrib/pg_upgrade/option.c
index 2774b1e..fdf9f5c 100644
--- a/contrib/pg_upgrade/option.c
+++ b/contrib/pg_upgrade/option.c
@@ -54,6 +54,7 @@ parseCommandLine(int argc, char *argv[])
                {"retain", no_argument, NULL, 'r'},
                {"jobs", required_argument, NULL, 'j'},
                {"verbose", no_argument, NULL, 'v'},
+               {"clone", no_argument, NULL, 1},
                {NULL, 0, NULL, 0}
        };
        int                     option;                 /* Command line option */
@@ -186,6 +187,10 @@ parseCommandLine(int argc, char *argv[])
                                log_opts.verbose = true;
                                break;
 
+                       case 1:
+                               user_opts.transfer_mode = TRANSFER_MODE_CLONE;
+                               break;
+
                        default:
                                pg_log(PG_FATAL,
                                           "Try \"%s --help\" for more information.\n",
@@ -236,6 +241,8 @@ Options:\n\
   -D, --new-datadir=DATADIR     new cluster data directory\n\
  -j, --jobs                    number of simultaneous processes or threads to use\n\
  -k, --link                    link instead of copying files to new cluster\n\
+      --clone                   use copy-on-write cloning instead of copying\n\
+                                files to new cluster (only supported on btrfs)\n\
   -o, --old-options=OPTIONS     old cluster options to pass to the server\n\
   -O, --new-options=OPTIONS     new cluster options to pass to the server\n\
   -p, --old-port=PORT           old cluster port number (default %d)\n\
diff --git a/contrib/pg_upgrade/pg_upgrade.h b/contrib/pg_upgrade/pg_upgrade.h
index 0b3ad20..6932bd6 100644
--- a/contrib/pg_upgrade/pg_upgrade.h
+++ b/contrib/pg_upgrade/pg_upgrade.h
@@ -209,12 +209,13 @@ typedef struct
 } ControlData;
 
 /*
- * Enumeration to denote link modes
+ * Enumeration to denote transfer mode
  */
 typedef enum
 {
        TRANSFER_MODE_COPY,
-       TRANSFER_MODE_LINK
+       TRANSFER_MODE_LINK,
+       TRANSFER_MODE_CLONE
 } transferMode;
 
 /*
@@ -381,12 +382,12 @@ const pageCnvCtx *setupPageConverter(void);
 typedef void *pageCnvCtx;
 #endif
 
-const char *copyAndUpdateFile(pageCnvCtx *pageConverter, const char *src,
-                                 const char *dst, bool force);
-const char *linkAndUpdateFile(pageCnvCtx *pageConverter, const char *src,
-                                 const char *dst);
+const char *upgradeFile(transferMode transfer_mode, const char *src,
+                               const char *dst, pageCnvCtx *pageConverter);
 
 void           check_hard_link(void);
+void           check_clone_file(void);
+int            upg_clone_file(const char *old_file, const char *new_file);
 FILE      *fopen_priv(const char *path, const char *mode);
 
 /* function.c */
diff --git a/contrib/pg_upgrade/relfilenode.c b/contrib/pg_upgrade/relfilenode.c
index a951fc9..c808313 100644
--- a/contrib/pg_upgrade/relfilenode.c
+++ b/contrib/pg_upgrade/relfilenode.c
@@ -32,7 +32,10 @@ transfer_all_new_tablespaces(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr,
                                                         char *old_pgdata, char *new_pgdata)
 {
        pg_log(PG_REPORT, "%s user relation files\n",
-         user_opts.transfer_mode == TRANSFER_MODE_LINK ? "Linking" : "Copying");
+               user_opts.transfer_mode == TRANSFER_MODE_COPY ? "Copying" :
+               user_opts.transfer_mode == TRANSFER_MODE_LINK ? "Linking" :
+               user_opts.transfer_mode == TRANSFER_MODE_CLONE ? "Cloning" :
+               "FAIL");
 
        /*
         * Transfering files by tablespace is tricky because a single database can
@@ -270,26 +273,14 @@ transfer_relfile(pageCnvCtx *pageConverter, FileNameMap *map,
                /* Copying files might take some time, so give feedback. */
                pg_log(PG_STATUS, "%s", old_file);
 
-               if ((user_opts.transfer_mode == TRANSFER_MODE_LINK) && (pageConverter != NULL))
-                       pg_log(PG_FATAL, "This upgrade requires page-by-page conversion, "
-                                  "you must use copy mode instead of link mode.\n");
-
-               if (user_opts.transfer_mode == TRANSFER_MODE_COPY)
-               {
-                       pg_log(PG_VERBOSE, "copying \"%s\" to \"%s\"\n", old_file, new_file);
-
-                       if ((msg = copyAndUpdateFile(pageConverter, old_file, new_file, true)) != NULL)
-                               pg_log(PG_FATAL, "error while copying relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n",
-                                          map->nspname, map->relname, old_file, new_file, msg);
-               }
-               else
+               msg = upgradeFile(user_opts.transfer_mode, old_file, new_file, pageConverter);
+               if (msg != NULL)
                {
-                       pg_log(PG_VERBOSE, "linking \"%s\" to \"%s\"\n", old_file, new_file);
-
-                       if ((msg = linkAndUpdateFile(pageConverter, old_file, new_file)) != NULL)
-                               pg_log(PG_FATAL,
-                                          "error while creating link for relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n",
-                                          map->nspname, map->relname, old_file, new_file, msg);
+                       pg_log(PG_FATAL, "error while %s relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n",
+                               (user_opts.transfer_mode == TRANSFER_MODE_COPY ? "copying" :
+                                user_opts.transfer_mode == TRANSFER_MODE_LINK ? "linking" :
+                                user_opts.transfer_mode == TRANSFER_MODE_CLONE ? "cloning" :
+                                "FAIL"), map->nspname, map->relname, old_file, new_file, msg);
                }
        }
 
diff --git a/doc/src/sgml/pgupgrade.sgml b/doc/src/sgml/pgupgrade.sgml
index f6cd9f0..fc9b89d 100644
--- a/doc/src/sgml/pgupgrade.sgml
+++ b/doc/src/sgml/pgupgrade.sgml
@@ -126,6 +126,13 @@
      </varlistentry>
 
      <varlistentry>
+      <term><option>--clone</option></term>
+      <listitem><para>use copy-on-write clones instead of copying or hard linking files to
+      the new cluster; this option can only be used when the old and new clusters reside
+      on the same btrfs filesystem</para></listitem>
+     </varlistentry>
+
+     <varlistentry>
       <term><option>-o</option> <replaceable class="parameter">options</replaceable></term>
       <term><option>--old-options</option> <replaceable class="parameter">options</replaceable></term>
       <listitem><para>options to be passed directly to the
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 8aabf3c..7cd5a8d 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -317,6 +317,9 @@
 /* Define to 1 if you have the `z' library (-lz). */
 #undef HAVE_LIBZ
 
+/* Define to 1 if you have the <linux/btrfs.h> header file. */
+#undef HAVE_LINUX_BTRFS_H
+
 /* Define to 1 if constants of type 'long long int' should have the suffix LL.
    */
 #undef HAVE_LL_CONSTANTS
-- 
1.8.3.1



-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to