This patch against PostgreSQL 9.1.8 takes advantage of efficient file
cloning on Linux Btrfs file systems to make CREATE DATABASE operations
extremely fast regardless of the size of the database used as a
template. On my system, I can create a database from a multi-gibibyte
template in a second or less. This is very useful for automated testing
as well as in a development environment where reverting to a baseline
database is frequently required. As an added bonus, newly created
databases require very little additional disk storage until they diverge
from the template.

The efficient cloning is accomplished by a Btrfs-specific ioctl() call.
On non-Linux systems, or if the ioctl() call fails, file contents are
copied in the conventional way, so no configuration is needed. This has
been tested on a Linux system with both Btrfs and XFS file systems, as
well as on an OS X system.

The clone_file() function was originally copied from GNU coreutils, which
is licensed under GPL v3. The function is currently only about ten lines
long and contains little essential information beyond the magic values
needed for the ioctl() call, so I'm not sure whether the license is a
problem.
-- 
Jonathan Ross Rogers
diff --git a/src/backend/storage/file/copydir.c b/src/backend/storage/file/copydir.c
index 6cfb816..719a5c1 100644
--- a/src/backend/storage/file/copydir.c
+++ b/src/backend/storage/file/copydir.c
@@ -22,6 +22,10 @@
 #include <unistd.h>
 #include <sys/stat.h>
 
+#ifdef HAVE_SYS_IOCTL_H
+#include <sys/ioctl.h>
+#endif
+
 #include "storage/copydir.h"
 #include "storage/fd.h"
 #include "miscadmin.h"
@@ -139,6 +143,24 @@ copydir(char *fromdir, char *todir, bool recurse)
 }
 
 /*
+ * Try the O(1) btrfs clone ioctl from src_fd into dest_fd (Linux only).
+ * Returns 0 on success, -1 on failure; always -1 on non-Linux builds.
+ */
+static inline int
+clone_file (int dest_fd, int src_fd)
+{
+#ifdef __linux__
+# define BTRFS_IOCTL_MAGIC 0x94
+# define BTRFS_IOC_CLONE _IOW (BTRFS_IOCTL_MAGIC, 9, int)
+	return ioctl (dest_fd, BTRFS_IOC_CLONE, src_fd);
+#else
+	(void) dest_fd;		/* parameters are unused on this code path */
+	(void) src_fd;
+	return -1;			/* no clone attempted; copy_file() copies the data itself */
+#endif
+}
+
+/*
  * copy one file
  */
 void
@@ -150,11 +172,6 @@ copy_file(char *fromfile, char *tofile)
 	int			nbytes;
 	off_t		offset;
 
-	/* Use palloc to ensure we get a maxaligned buffer */
-#define COPY_BUF_SIZE (8 * BLCKSZ)
-
-	buffer = palloc(COPY_BUF_SIZE);
-
 	/*
 	 * Open the files
 	 */
@@ -171,38 +188,54 @@ copy_file(char *fromfile, char *tofile)
 				(errcode_for_file_access(),
 				 errmsg("could not create file \"%s\": %m", tofile)));
 
-	/*
-	 * Do the data copying.
-	 */
-	for (offset = 0;; offset += nbytes)
+	if (clone_file (dstfd, srcfd) == 0)
+		ereport(DEBUG1, (errmsg("Cloned \"%s\" to \"%s\".", fromfile, tofile)));
+
+	else
 	{
-		/* If we got a cancel signal during the copy of the file, quit */
-		CHECK_FOR_INTERRUPTS();
+		/*
+		 * Do the data copying.
+		 */
 
-		nbytes = read(srcfd, buffer, COPY_BUF_SIZE);
-		if (nbytes < 0)
-			ereport(ERROR,
-					(errcode_for_file_access(),
-					 errmsg("could not read file \"%s\": %m", fromfile)));
-		if (nbytes == 0)
-			break;
-		errno = 0;
-		if ((int) write(dstfd, buffer, nbytes) != nbytes)
+		/* Use palloc to ensure we get a maxaligned buffer */
+#define COPY_BUF_SIZE (8 * BLCKSZ)
+
+		buffer = palloc(COPY_BUF_SIZE);
+
+		ereport(DEBUG1, (errmsg("Copying \"%s\" to \"%s\" in userspace.",
+								fromfile, tofile)));
+		for (offset = 0;; offset += nbytes)
 		{
-			/* if write didn't set errno, assume problem is no disk space */
-			if (errno == 0)
-				errno = ENOSPC;
-			ereport(ERROR,
-					(errcode_for_file_access(),
-					 errmsg("could not write to file \"%s\": %m", tofile)));
+			/* If we got a cancel signal during the copy of the file, quit */
+			CHECK_FOR_INTERRUPTS();
+
+			nbytes = read(srcfd, buffer, COPY_BUF_SIZE);
+			if (nbytes < 0)
+				ereport(ERROR,
+						(errcode_for_file_access(),
+						 errmsg("could not read file \"%s\": %m", fromfile)));
+			if (nbytes == 0)
+				break;
+			errno = 0;
+			if ((int) write(dstfd, buffer, nbytes) != nbytes)
+			{
+				/* if write didn't set errno, assume problem is no disk space */
+				if (errno == 0)
+					errno = ENOSPC;
+				ereport(ERROR,
+						(errcode_for_file_access(),
+						 errmsg("could not write to file \"%s\": %m", tofile)));
+			}
+
+			/*
+			 * We fsync the files later but first flush them to avoid spamming the
+			 * cache and hopefully get the kernel to start writing them out before
+			 * the fsync comes.
+			 */
+			pg_flush_data(dstfd, offset, nbytes);
 		}
 
-		/*
-		 * We fsync the files later but first flush them to avoid spamming the
-		 * cache and hopefully get the kernel to start writing them out before
-		 * the fsync comes.
-		 */
-		pg_flush_data(dstfd, offset, nbytes);
+		pfree(buffer);
 	}
 
 	if (close(dstfd))
@@ -212,7 +245,6 @@ copy_file(char *fromfile, char *tofile)
 
 	close(srcfd);
 
-	pfree(buffer);
 }
 
 
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to