Re: CREATE DATABASE with filesystem cloning

2024-08-08 Thread Nazir Bilal Yavuz
Hi,

Rebased version of the patch is attached.

-- 
Regards,
Nazir Bilal Yavuz
Microsoft
From a419004a26410c9ad9348e2f1420695db8ca35b6 Mon Sep 17 00:00:00 2001
From: Nazir Bilal Yavuz 
Date: Thu, 8 Aug 2024 15:01:48 +0300
Subject: [PATCH v9] Introduce file_copy_method GUC

This GUC can be set to either COPY (default) or CLONE (if system
supports it).

If CLONE method is chosen, similar to COPY; but attempting to use
efficient file copying system calls.  The kernel has the opportunity to
share block ranges in copy-on-write file systems, or maybe push down
the copy to network file systems and storage devices.

Currently works on Linux, FreeBSD and macOS.  More systems could be
supported.

Author: Thomas Munro 
Author: Nazir Bilal Yavuz 
Reviewed-by: Robert Haas 
Reviewed-by: Ranier Vilela 
Discussion: https://postgr.es/m/CA%2BhUKGLM%2Bt%2BSwBU-cHeMUXJCOgBxSHLGZutV5zCwY4qrCcE02w%40mail.gmail.com
---
 src/include/storage/copydir.h |  9 ++
 src/backend/storage/file/copydir.c| 86 ++-
 .../utils/activity/wait_event_names.txt   |  1 +
 src/backend/utils/misc/guc_tables.c   | 19 
 src/backend/utils/misc/postgresql.conf.sample |  4 +
 doc/src/sgml/config.sgml  | 46 ++
 doc/src/sgml/ref/alter_database.sgml  |  3 +-
 doc/src/sgml/ref/create_database.sgml |  4 +-
 src/tools/pgindent/typedefs.list  |  1 +
 9 files changed, 170 insertions(+), 3 deletions(-)

diff --git a/src/include/storage/copydir.h b/src/include/storage/copydir.h
index a25e258f479..6edc3ea4f69 100644
--- a/src/include/storage/copydir.h
+++ b/src/include/storage/copydir.h
@@ -13,6 +13,15 @@
 #ifndef COPYDIR_H
 #define COPYDIR_H
 
+typedef enum FileCopyMethod
+{
+	FILE_COPY_METHOD_COPY,
+	FILE_COPY_METHOD_CLONE,
+} FileCopyMethod;
+
+/* GUC parameters */
+extern PGDLLIMPORT int file_copy_method;
+
 extern void copydir(const char *fromdir, const char *todir, bool recurse);
 extern void copy_file(const char *fromfile, const char *tofile);
 
diff --git a/src/backend/storage/file/copydir.c b/src/backend/storage/file/copydir.c
index d4fbe542077..0a9c6fb62c0 100644
--- a/src/backend/storage/file/copydir.c
+++ b/src/backend/storage/file/copydir.c
@@ -21,17 +21,30 @@
 #include 
 #include 
 
+#ifdef HAVE_COPYFILE_H
+#include 
+#endif
+
 #include "common/file_utils.h"
 #include "miscadmin.h"
 #include "pgstat.h"
 #include "storage/copydir.h"
 #include "storage/fd.h"
 
+/* GUCs */
+int			file_copy_method = FILE_COPY_METHOD_COPY;
+
+static void clone_file(const char *fromfile, const char *tofile);
+
 /*
  * copydir: copy a directory
  *
  * If recurse is false, subdirectories are ignored.  Anything that's not
  * a directory or a regular file is ignored.
+ *
+ * This function uses a file_copy_method GUC to determine copy method.
+ * Uses of this function must be documented in the list of places
+ * affected by this GUC.
  */
 void
 copydir(const char *fromdir, const char *todir, bool recurse)
@@ -71,7 +84,12 @@ copydir(const char *fromdir, const char *todir, bool recurse)
 copydir(fromfile, tofile, true);
 		}
 		else if (xlde_type == PGFILETYPE_REG)
-			copy_file(fromfile, tofile);
+		{
+			if (file_copy_method == FILE_COPY_METHOD_CLONE)
+clone_file(fromfile, tofile);
+			else
+copy_file(fromfile, tofile);
+		}
 	}
 	FreeDir(xldir);
 
@@ -214,3 +232,69 @@ copy_file(const char *fromfile, const char *tofile)
 
 	pfree(buffer);
 }
+
+/*
+ * clone one file
+ */
+static void
+clone_file(const char *fromfile, const char *tofile)
+{
+#if defined(HAVE_COPYFILE) && defined(COPYFILE_CLONE_FORCE)
+	if (copyfile(fromfile, tofile, NULL, COPYFILE_CLONE_FORCE) < 0)
+		ereport(ERROR,
+(errcode_for_file_access(),
+ errmsg("could not clone file \"%s\" to \"%s\": %m",
+		fromfile, tofile)));
+#elif defined(HAVE_COPY_FILE_RANGE)
+	int			srcfd;
+	int			dstfd;
+	ssize_t		nbytes;
+
+	srcfd = OpenTransientFile(fromfile, O_RDONLY | PG_BINARY);
+	if (srcfd < 0)
+		ereport(ERROR,
+(errcode_for_file_access(),
+ errmsg("could not open file \"%s\": %m", fromfile)));
+
+	dstfd = OpenTransientFile(tofile, O_WRONLY | O_CREAT | O_EXCL | PG_BINARY);
+	if (dstfd < 0)
+		ereport(ERROR,
+(errcode_for_file_access(),
+ errmsg("could not create file \"%s\": %m", tofile)));
+
+	do
+	{
+		/* If we got a cancel signal during the copy of the file, quit */
+		CHECK_FOR_INTERRUPTS();
+
+		/*
+		 * Don't copy too much at once, so we can check for interrupts from
+		 * time to time if this falls back to a slow copy.
+		 */
+		pgstat_report_wait_start(WAIT_EVENT_COPY_FILE_COPY);
+		nbytes = copy_file_range(srcfd, NULL, dstfd, NULL, 1024 * 1024, 0);
+		if (nbytes < 0 && errno != EINTR)
+			ereport(ERROR,
+	(errcode_for_file_access(),
+	 errmsg("could not clone file \"%s\" to \"%s\": %m",
+			fromfile, tofile)));
+		pgstat_report_wait_end();
+	}
+	while (nbytes != 0);
+
+	if (CloseTransientFile(dstfd) != 0)
+		ereport(ERROR,
+(errcode_for_file_acc

Re: CREATE DATABASE with filesystem cloning

2024-06-03 Thread Nazir Bilal Yavuz
Hi,

On Tue, 21 May 2024 at 15:08, Ranier Vilela  wrote:
>
> Em ter., 21 de mai. de 2024 às 05:18, Nazir Bilal Yavuz  
> escreveu:
>>
>> Hi,
>>
>> On Thu, 16 May 2024 at 17:47, Robert Haas  wrote:
>> >
>> > On Thu, May 16, 2024 at 9:43 AM Nazir Bilal Yavuz  
>> > wrote:
>> > > Actually, the documentation for the file_copy_method was mentioning
>> > > the things it controls; I converted it to an itemized list now. Also,
>> > > changed the comment to: 'Further uses of this function requires
>> > > updates to the list that GUC controls in its documentation.'. v7 is
>> > > attached.
>> >
>> > I think the comments need some wordsmithing.
>>
>> I changed it to 'Uses of this function must be documented in the list
>> of places affected by this GUC.', I am open to any suggestions.
>>
>> > I don't see why this parameter should be PGC_POSTMASTER.
>>
>> I changed it to 'PGC_USERSET' now. My initial point was the database
>> or tablespace to be copied with the same method. I thought copying
>> some portion of the database with the copy and rest with the clone
>> could cause potential problems. After a bit of searching, I could not
>> find any problems related to that.
>>
>> v8 is attached.
>
> Hi,
> I did some research on the subject and despite being an improvement,
> I believe that some terminologies should be changed in this patch.
> Although the new function is called *clone_file*, I'm not sure if it really 
> is "clone".
> Why MacOS added an API called clonefile [1] and Linux exists
> another called FICLONE.[2]
> So perhaps it should be treated here as a copy and not a clone?
> Leaving it open, is the possibility of implementing a true clone api?
> Thoughts?

Thank you for mentioning this.

1- I do not know where to look for MacOS' function definitions but I
followed the same links you shared. It says copyfile(...,
COPYFILE_CLONE_FORCE) means 'Clone the file instead. This is a force
flag i.e. if cloning fails, an error is returned' [1]. It does the
cloning but I still do not know whether there is a difference between
'copyfile() function with COPYFILE_CLONE_FORCE flag' and 'clonefile()'
function.

2- I read a couple of discussions about copy_file_range() and FICLONE.
It seems that the copy_file_range() function is a slightly upgraded
version of FICLONE but less available since it is newer. It still does
the clone [2]. Also, FICLONE is already used in pg_upgrade and
pg_combinebackup.

Both of these copyfile(..., COPYFILE_CLONE_FORCE) and
copy_file_range() functions do the cloning, so I think using clone
terminology is correct. But, using FICLONE instead of
copy_file_range() could be better since it is more widely available.

[1] https://www.unix.com/man-page/mojave/3/copyfile/
[2] https://elixir.bootlin.com/linux/v5.15-rc5/source/fs/read_write.c#L1495

-- 
Regards,
Nazir Bilal Yavuz
Microsoft




Re: CREATE DATABASE with filesystem cloning

2024-05-21 Thread Ranier Vilela
Em ter., 21 de mai. de 2024 às 05:18, Nazir Bilal Yavuz 
escreveu:

> Hi,
>
> On Thu, 16 May 2024 at 17:47, Robert Haas  wrote:
> >
> > On Thu, May 16, 2024 at 9:43 AM Nazir Bilal Yavuz 
> wrote:
> > > Actually, the documentation for the file_copy_method was mentioning
> > > the things it controls; I converted it to an itemized list now. Also,
> > > changed the comment to: 'Further uses of this function requires
> > > updates to the list that GUC controls in its documentation.'. v7 is
> > > attached.
> >
> > I think the comments need some wordsmithing.
>
> I changed it to 'Uses of this function must be documented in the list
> of places affected by this GUC.', I am open to any suggestions.
>
> > I don't see why this parameter should be PGC_POSTMASTER.
>
> I changed it to 'PGC_USERSET' now. My initial point was the database
> or tablespace to be copied with the same method. I thought copying
> some portion of the database with the copy and rest with the clone
> could cause potential problems. After a bit of searching, I could not
> find any problems related to that.
>
> v8 is attached.
>
Hi,
I did some research on the subject and despite being an improvement,
I believe that some terminologies should be changed in this patch.
Although the new function is called *clone_file*, I'm not sure if it really
is "clone".
Why MacOS added an API called clonefile [1] and Linux exists
another called FICLONE.[2]
So perhaps it should be treated here as a copy and not a clone?
Leaving it open, is the possibility of implementing a true clone api?
Thoughts?

best regards,
Ranier Vilela

[1] clonefile 
[2] ioctl_ficlonerange



>
> --
> Regards,
> Nazir Bilal Yavuz
> Microsoft
>


Re: CREATE DATABASE with filesystem cloning

2024-05-21 Thread Nazir Bilal Yavuz
Hi,

On Thu, 16 May 2024 at 17:47, Robert Haas  wrote:
>
> On Thu, May 16, 2024 at 9:43 AM Nazir Bilal Yavuz  wrote:
> > Actually, the documentation for the file_copy_method was mentioning
> > the things it controls; I converted it to an itemized list now. Also,
> > changed the comment to: 'Further uses of this function requires
> > updates to the list that GUC controls in its documentation.'. v7 is
> > attached.
>
> I think the comments need some wordsmithing.

I changed it to 'Uses of this function must be documented in the list
of places affected by this GUC.', I am open to any suggestions.

> I don't see why this parameter should be PGC_POSTMASTER.

I changed it to 'PGC_USERSET' now. My initial point was the database
or tablespace to be copied with the same method. I thought copying
some portion of the database with the copy and rest with the clone
could cause potential problems. After a bit of searching, I could not
find any problems related to that.

v8 is attached.

--
Regards,
Nazir Bilal Yavuz
Microsoft
From 54dac96e493f482581c09ec14e67196e73e371ca Mon Sep 17 00:00:00 2001
From: Nazir Bilal Yavuz 
Date: Tue, 21 May 2024 10:19:29 +0300
Subject: [PATCH v8] Introduce file_copy_method GUC

This GUC can be set to either COPY (default) or CLONE (if system
supports it).

If CLONE method is chosen, similar to COPY; but attempting to use
efficient file copying system calls.  The kernel has the opportunity to
share block ranges in copy-on-write file systems, or maybe push down
the copy to network file systems and storage devices.

Currently works on Linux, FreeBSD and macOS.  More systems could be
supported.

Author: Thomas Munro 
Author: Nazir Bilal Yavuz 
Reviewed-by: Robert Haas 
Reviewed-by: Ranier Vilela 
Discussion: https://postgr.es/m/CA%2BhUKGLM%2Bt%2BSwBU-cHeMUXJCOgBxSHLGZutV5zCwY4qrCcE02w%40mail.gmail.com
---
 src/include/storage/copydir.h |  9 ++
 src/backend/storage/file/copydir.c| 98 ++-
 .../utils/activity/wait_event_names.txt   |  1 +
 src/backend/utils/misc/guc_tables.c   | 12 +++
 src/backend/utils/misc/postgresql.conf.sample |  4 +
 doc/src/sgml/config.sgml  | 46 +
 doc/src/sgml/ref/alter_database.sgml  |  3 +-
 doc/src/sgml/ref/create_database.sgml |  4 +-
 src/tools/pgindent/typedefs.list  |  1 +
 9 files changed, 175 insertions(+), 3 deletions(-)

diff --git a/src/include/storage/copydir.h b/src/include/storage/copydir.h
index a25e258f479..6edc3ea4f69 100644
--- a/src/include/storage/copydir.h
+++ b/src/include/storage/copydir.h
@@ -13,6 +13,15 @@
 #ifndef COPYDIR_H
 #define COPYDIR_H
 
+typedef enum FileCopyMethod
+{
+	FILE_COPY_METHOD_COPY,
+	FILE_COPY_METHOD_CLONE,
+} FileCopyMethod;
+
+/* GUC parameters */
+extern PGDLLIMPORT int file_copy_method;
+
 extern void copydir(const char *fromdir, const char *todir, bool recurse);
 extern void copy_file(const char *fromfile, const char *tofile);
 
diff --git a/src/backend/storage/file/copydir.c b/src/backend/storage/file/copydir.c
index d4fbe542077..6d8f05199ca 100644
--- a/src/backend/storage/file/copydir.c
+++ b/src/backend/storage/file/copydir.c
@@ -21,17 +21,42 @@
 #include 
 #include 
 
+#ifdef HAVE_COPYFILE_H
+#include 
+#endif
+
 #include "common/file_utils.h"
 #include "miscadmin.h"
 #include "pgstat.h"
 #include "storage/copydir.h"
 #include "storage/fd.h"
+#include "utils/guc.h"
+
+/* GUCs */
+int			file_copy_method = FILE_COPY_METHOD_COPY;
+
+/*
+ * GUC support
+ */
+const struct config_enum_entry file_copy_method_options[] = {
+	{"copy", FILE_COPY_METHOD_COPY, false},
+#if defined(HAVE_COPYFILE) && defined(COPYFILE_CLONE_FORCE) || defined(HAVE_COPY_FILE_RANGE)
+	{"clone", FILE_COPY_METHOD_CLONE, false},
+#endif
+	{NULL, 0, false}
+};
+
+static void clone_file(const char *fromfile, const char *tofile);
 
 /*
  * copydir: copy a directory
  *
  * If recurse is false, subdirectories are ignored.  Anything that's not
  * a directory or a regular file is ignored.
+ *
+ * This function uses a file_copy_method GUC to determine copy method.
+ * Uses of this function must be documented in the list of places
+ * affected by this GUC.
  */
 void
 copydir(const char *fromdir, const char *todir, bool recurse)
@@ -71,7 +96,12 @@ copydir(const char *fromdir, const char *todir, bool recurse)
 copydir(fromfile, tofile, true);
 		}
 		else if (xlde_type == PGFILETYPE_REG)
-			copy_file(fromfile, tofile);
+		{
+			if (file_copy_method == FILE_COPY_METHOD_CLONE)
+clone_file(fromfile, tofile);
+			else
+copy_file(fromfile, tofile);
+		}
 	}
 	FreeDir(xldir);
 
@@ -214,3 +244,69 @@ copy_file(const char *fromfile, const char *tofile)
 
 	pfree(buffer);
 }
+
+/*
+ * clone one file
+ */
+static void
+clone_file(const char *fromfile, const char *tofile)
+{
+#if defined(HAVE_COPYFILE) && defined(COPYFILE_CLONE_FORCE)
+	if (copyfile(fromfile, tofile, NULL, COPYFILE_CLONE_FORCE) < 0)
+		ereport(ERROR,
+(errcode_for_file_access(),

Re: CREATE DATABASE with filesystem cloning

2024-05-16 Thread Robert Haas
On Thu, May 16, 2024 at 9:43 AM Nazir Bilal Yavuz  wrote:
> Actually, the documentation for the file_copy_method was mentioning
> the things it controls; I converted it to an itemized list now. Also,
> changed the comment to: 'Further uses of this function requires
> updates to the list that GUC controls in its documentation.'. v7 is
> attached.

I think the comments need some wordsmithing.

I don't see why this parameter should be PGC_POSTMASTER.

-- 
Robert Haas
EDB: http://www.enterprisedb.com




Re: CREATE DATABASE with filesystem cloning

2024-05-16 Thread Nazir Bilal Yavuz
Hi,

On Thu, 16 May 2024 at 15:40, Robert Haas  wrote:
>
> On Thu, May 16, 2024 at 8:35 AM Nazir Bilal Yavuz  wrote:
> > I updated the documentation and put a comment on top of the copydir()
> > function to inform that further changes and uses of this function may
> > require documentation updates.
>
> I was assuming that the documentation for the file_copy_method was
> going to list the things that it controlled, and that the comment was
> going to say that you should update that list specifically. Just
> saying that you may need to update some part of the documentation in
> some way is fairly useless, IMHO.

Actually, the documentation for the file_copy_method was mentioning
the things it controls; I converted it to an itemized list now. Also,
changed the comment to: 'Further uses of this function requires
updates to the list that GUC controls in its documentation.'. v7 is
attached.

--
Regards,
Nazir Bilal Yavuz
Microsoft
From 197ce921c20d49b524e306bf082701434b71b7af Mon Sep 17 00:00:00 2001
From: Nazir Bilal Yavuz 
Date: Thu, 16 May 2024 16:21:46 +0300
Subject: [PATCH v7] Introduce file_copy_method GUC

This GUC can be set to either COPY (default) or CLONE (if system
supports it).

If CLONE option is chosen, similar to COPY; but attempting to use
efficient file copying system calls.  The kernel has the opportunity to
share block ranges in copy-on-write file systems, or maybe push down
the copy to network file systems and storage devices.

Currently works on Linux, FreeBSD and macOS.  More systems could be
supported.

Author: Thomas Munro 
Author: Nazir Bilal Yavuz 
Discussion: https://postgr.es/m/CA%2BhUKGLM%2Bt%2BSwBU-cHeMUXJCOgBxSHLGZutV5zCwY4qrCcE02w%40mail.gmail.com
---
 src/include/storage/copydir.h |  9 ++
 src/backend/storage/file/copydir.c| 98 ++-
 .../utils/activity/wait_event_names.txt   |  1 +
 src/backend/utils/misc/guc_tables.c   | 12 +++
 src/backend/utils/misc/postgresql.conf.sample |  5 +
 doc/src/sgml/config.sgml  | 48 +
 doc/src/sgml/ref/alter_database.sgml  |  3 +-
 doc/src/sgml/ref/create_database.sgml |  4 +-
 src/tools/pgindent/typedefs.list  |  1 +
 9 files changed, 178 insertions(+), 3 deletions(-)

diff --git a/src/include/storage/copydir.h b/src/include/storage/copydir.h
index a25e258f479..6edc3ea4f69 100644
--- a/src/include/storage/copydir.h
+++ b/src/include/storage/copydir.h
@@ -13,6 +13,15 @@
 #ifndef COPYDIR_H
 #define COPYDIR_H
 
+typedef enum FileCopyMethod
+{
+	FILE_COPY_METHOD_COPY,
+	FILE_COPY_METHOD_CLONE,
+} FileCopyMethod;
+
+/* GUC parameters */
+extern PGDLLIMPORT int file_copy_method;
+
 extern void copydir(const char *fromdir, const char *todir, bool recurse);
 extern void copy_file(const char *fromfile, const char *tofile);
 
diff --git a/src/backend/storage/file/copydir.c b/src/backend/storage/file/copydir.c
index d4fbe542077..651c74672e0 100644
--- a/src/backend/storage/file/copydir.c
+++ b/src/backend/storage/file/copydir.c
@@ -21,17 +21,42 @@
 #include 
 #include 
 
+#ifdef HAVE_COPYFILE_H
+#include 
+#endif
+
 #include "common/file_utils.h"
 #include "miscadmin.h"
 #include "pgstat.h"
 #include "storage/copydir.h"
 #include "storage/fd.h"
+#include "utils/guc.h"
+
+/* GUCs */
+int			file_copy_method = FILE_COPY_METHOD_COPY;
+
+/*
+ * GUC support
+ */
+const struct config_enum_entry file_copy_method_options[] = {
+	{"copy", FILE_COPY_METHOD_COPY, false},
+#if (defined(HAVE_COPYFILE) && defined(COPYFILE_CLONE_FORCE)) || defined(HAVE_COPY_FILE_RANGE)
+	{"clone", FILE_COPY_METHOD_CLONE, false},
+#endif
+	{NULL, 0, false}
+};
+
+static void clone_file(const char *fromfile, const char *tofile);
 
 /*
  * copydir: copy a directory
  *
  * If recurse is false, subdirectories are ignored.  Anything that's not
  * a directory or a regular file is ignored.
+ *
+ * This function uses a file_copy_method GUC to determine copy method.
+ * Further uses of this function requires updates to the list that GUC controls
+ * in its documentation.
  */
 void
 copydir(const char *fromdir, const char *todir, bool recurse)
@@ -71,7 +96,12 @@ copydir(const char *fromdir, const char *todir, bool recurse)
 copydir(fromfile, tofile, true);
 		}
 		else if (xlde_type == PGFILETYPE_REG)
-			copy_file(fromfile, tofile);
+		{
+			if (file_copy_method == FILE_COPY_METHOD_CLONE)
+clone_file(fromfile, tofile);
+			else
+copy_file(fromfile, tofile);
+		}
 	}
 	FreeDir(xldir);
 
@@ -214,3 +244,69 @@ copy_file(const char *fromfile, const char *tofile)
 
 	pfree(buffer);
 }
+
+/*
+ * clone one file
+ */
+static void
+clone_file(const char *fromfile, const char *tofile)
+{
+#if defined(HAVE_COPYFILE) && defined(COPYFILE_CLONE_FORCE)
+	if (copyfile(fromfile, tofile, NULL, COPYFILE_CLONE_FORCE) < 0)
+		ereport(ERROR,
+(errcode_for_file_access(),
+ errmsg("could not clone file \"%s\" to \"%s\": %m",
+		fromfile, tofile)));
+#elif defined(HAVE_COPY_FILE_RANGE)
+	in

Re: CREATE DATABASE with filesystem cloning

2024-05-16 Thread Robert Haas
On Thu, May 16, 2024 at 8:35 AM Nazir Bilal Yavuz  wrote:
> I updated the documentation and put a comment on top of the copydir()
> function to inform that further changes and uses of this function may
> require documentation updates.

I was assuming that the documentation for the file_copy_method was
going to list the things that it controlled, and that the comment was
going to say that you should update that list specifically. Just
saying that you may need to update some part of the documentation in
some way is fairly useless, IMHO.

-- 
Robert Haas
EDB: http://www.enterprisedb.com




Re: CREATE DATABASE with filesystem cloning

2024-05-16 Thread Nazir Bilal Yavuz
Hi,

On Thu, 9 May 2024 at 19:29, Robert Haas  wrote:
>
> On Wed, May 8, 2024 at 10:34 AM Nazir Bilal Yavuz  wrote:
> > > I'm not so sure about the GUC name. On the one hand, it feels like
> > > createdb should be spelled out as create_database, but on the other
> > > hand, the GUC name is quite long already. Then again, why would we
> > > make this specific to CREATE DATABASE in the first place? Would we
> > > also want alter_tablespace_file_copy_method and
> > > basic_archive.file_copy_method?
> >
> > I agree that it is already quite long, because of that I chose the
> > createdb as a prefix. I did not think that file cloning was planned to
> > be used in other places. If that is the case, does something like
> > 'preferred_copy_method' work? Then, we would mention which places will
> > be affected with this GUC in the docs.
>
> I would go with file_copy_method rather than preferred_copy_method,
> because (1) there's nothing "preferred" about it if we're using it
> unconditionally and (2) it's nice to clarify that we're talking about
> copying files rather than anything else.

Done.

>
> My personal enthusiasm for making platform-specific file copy methods
> usable all over PostgreSQL is quite limited. However, it is my
> observation that other people seem far more enthusiastic about it than
> I am. For example, consider how quickly it got added to
> pg_combinebackup. So, I suspect it's smart to plan on anything we add
> in this area getting used in a bunch of places. And perhaps it is even
> best to think about making it work in all of those places right from
> the start. If we build support into copydir and copy_file, then we
> just get everything that uses those, and all that remains is to
> document was is covered (and add comments so that future patch authors
> know they should further update the documentation).

I updated the documentation and put a comment on top of the copydir()
function to inform that further changes and uses of this function may
require documentation updates.

I also changed O_RDWR to O_WRONLY flag in the clone_file() function
based on Raniers' feedback. Also, does this feature need tests? I
thought about possible test cases but since this feature requires
specific file systems to run, I could not find any.

v6 is attached.

--
Regards,
Nazir Bilal Yavuz
Microsoft
From 33fcb21730683c628f81451e21a0cf2455fd41be Mon Sep 17 00:00:00 2001
From: Nazir Bilal Yavuz 
Date: Thu, 16 May 2024 15:22:46 +0300
Subject: [PATCH v6] Introduce file_copy_method GUC

This GUC can be set to either COPY (default) or CLONE (if system
supports it).

If CLONE option is chosen, similar to COPY; but attempting to use
efficient file copying system calls.  The kernel has the opportunity to
share block ranges in copy-on-write file systems, or maybe push down
the copy to network file systems and storage devices.

Currently works on Linux, FreeBSD and macOS.  More systems could be
supported.

Author: Thomas Munro 
Author: Nazir Bilal Yavuz 
Discussion: https://postgr.es/m/CA%2BhUKGLM%2Bt%2BSwBU-cHeMUXJCOgBxSHLGZutV5zCwY4qrCcE02w%40mail.gmail.com
---
 src/include/storage/copydir.h |  9 ++
 src/backend/storage/file/copydir.c| 98 ++-
 .../utils/activity/wait_event_names.txt   |  1 +
 src/backend/utils/misc/guc_tables.c   | 12 +++
 src/backend/utils/misc/postgresql.conf.sample |  5 +
 doc/src/sgml/config.sgml  | 35 +++
 doc/src/sgml/ref/alter_database.sgml  |  3 +-
 doc/src/sgml/ref/create_database.sgml |  4 +-
 src/tools/pgindent/typedefs.list  |  1 +
 9 files changed, 165 insertions(+), 3 deletions(-)

diff --git a/src/include/storage/copydir.h b/src/include/storage/copydir.h
index a25e258f479..6edc3ea4f69 100644
--- a/src/include/storage/copydir.h
+++ b/src/include/storage/copydir.h
@@ -13,6 +13,15 @@
 #ifndef COPYDIR_H
 #define COPYDIR_H
 
+typedef enum FileCopyMethod
+{
+	FILE_COPY_METHOD_COPY,
+	FILE_COPY_METHOD_CLONE,
+} FileCopyMethod;
+
+/* GUC parameters */
+extern PGDLLIMPORT int file_copy_method;
+
 extern void copydir(const char *fromdir, const char *todir, bool recurse);
 extern void copy_file(const char *fromfile, const char *tofile);
 
diff --git a/src/backend/storage/file/copydir.c b/src/backend/storage/file/copydir.c
index d4fbe542077..48e756ce284 100644
--- a/src/backend/storage/file/copydir.c
+++ b/src/backend/storage/file/copydir.c
@@ -21,17 +21,42 @@
 #include 
 #include 
 
+#ifdef HAVE_COPYFILE_H
+#include 
+#endif
+
 #include "common/file_utils.h"
 #include "miscadmin.h"
 #include "pgstat.h"
 #include "storage/copydir.h"
 #include "storage/fd.h"
+#include "utils/guc.h"
+
+/* GUCs */
+int			file_copy_method = FILE_COPY_METHOD_COPY;
+
+/*
+ * GUC support
+ */
+const struct config_enum_entry file_copy_method_options[] = {
+	{"copy", FILE_COPY_METHOD_COPY, false},
+#if (defined(HAVE_COPYFILE) && defined(COPYFILE_CLONE_FORCE)) || defined(HAVE_COPY_FILE_RANGE)
+	{"clone", FILE_COPY_METHOD_C

Re: CREATE DATABASE with filesystem cloning

2024-05-09 Thread Robert Haas
On Wed, May 8, 2024 at 10:34 AM Nazir Bilal Yavuz  wrote:
> > I'm not so sure about the GUC name. On the one hand, it feels like
> > createdb should be spelled out as create_database, but on the other
> > hand, the GUC name is quite long already. Then again, why would we
> > make this specific to CREATE DATABASE in the first place? Would we
> > also want alter_tablespace_file_copy_method and
> > basic_archive.file_copy_method?
>
> I agree that it is already quite long, because of that I chose the
> createdb as a prefix. I did not think that file cloning was planned to
> be used in other places. If that is the case, does something like
> 'preferred_copy_method' work? Then, we would mention which places will
> be affected with this GUC in the docs.

I would go with file_copy_method rather than preferred_copy_method,
because (1) there's nothing "preferred" about it if we're using it
unconditionally and (2) it's nice to clarify that we're talking about
copying files rather than anything else.

My personal enthusiasm for making platform-specific file copy methods
usable all over PostgreSQL is quite limited. However, it is my
observation that other people seem far more enthusiastic about it than
I am. For example, consider how quickly it got added to
pg_combinebackup. So, I suspect it's smart to plan on anything we add
in this area getting used in a bunch of places. And perhaps it is even
best to think about making it work in all of those places right from
the start. If we build support into copydir and copy_file, then we
just get everything that uses those, and all that remains is to
document was is covered (and add comments so that future patch authors
know they should further update the documentation).

-- 
Robert Haas
EDB: http://www.enterprisedb.com




Re: CREATE DATABASE with filesystem cloning

2024-05-08 Thread Nazir Bilal Yavuz
Hi,

On Wed, 8 May 2024 at 16:58, Robert Haas  wrote:
>
> On Tue, May 7, 2024 at 8:00 AM Nazir Bilal Yavuz  wrote:
> > We had an off-list talk with Thomas and we thought making this option
> > GUC instead of SQL command level could solve this problem.
> >
> > I am posting a new rebased version of the patch with some important changes:
> >
> > * 'createdb_file_copy_method' GUC is created. Possible values are
> > 'copy' and 'clone'. Copy is the default option. Clone option can be
> > chosen if the system supports it, otherwise it gives error at the
> > startup.
>
> This seems like a smart idea, because the type of file copying that we
> do during redo need not be the same as what was done when the
> operation was originally performed.
>
> I'm not so sure about the GUC name. On the one hand, it feels like
> createdb should be spelled out as create_database, but on the other
> hand, the GUC name is quite long already. Then again, why would we
> make this specific to CREATE DATABASE in the first place? Would we
> also want alter_tablespace_file_copy_method and
> basic_archive.file_copy_method?

I agree that it is already quite long, because of that I chose the
createdb as a prefix. I did not think that file cloning was planned to
be used in other places. If that is the case, does something like
'preferred_copy_method' work? Then, we would mention which places will
be affected with this GUC in the docs.

-- 
Regards,
Nazir Bilal Yavuz
Microsoft




Re: CREATE DATABASE with filesystem cloning

2024-05-08 Thread Robert Haas
On Tue, May 7, 2024 at 8:00 AM Nazir Bilal Yavuz  wrote:
> We had an off-list talk with Thomas and we thought making this option
> GUC instead of SQL command level could solve this problem.
>
> I am posting a new rebased version of the patch with some important changes:
>
> * 'createdb_file_copy_method' GUC is created. Possible values are
> 'copy' and 'clone'. Copy is the default option. Clone option can be
> chosen if the system supports it, otherwise it gives error at the
> startup.

This seems like a smart idea, because the type of file copying that we
do during redo need not be the same as what was done when the
operation was originally performed.

I'm not so sure about the GUC name. On the one hand, it feels like
createdb should be spelled out as create_database, but on the other
hand, the GUC name is quite long already. Then again, why would we
make this specific to CREATE DATABASE in the first place? Would we
also want alter_tablespace_file_copy_method and
basic_archive.file_copy_method?

-- 
Robert Haas
EDB: http://www.enterprisedb.com




Re: CREATE DATABASE with filesystem cloning

2024-05-08 Thread Ranier Vilela
Em qua., 8 de mai. de 2024 às 10:06, Nazir Bilal Yavuz 
escreveu:

> Hi,
>
> On Wed, 8 May 2024 at 15:23, Ranier Vilela  wrote:
> >
> > Em qua., 8 de mai. de 2024 às 08:42, Nazir Bilal Yavuz <
> byavu...@gmail.com> escreveu:
> >>
> >> Hi,
> >>
> >> On Wed, 8 May 2024 at 14:16, Ranier Vilela  wrote:
> >> >
> >> >
> >> > Em qua., 8 de mai. de 2024 às 04:37, Nazir Bilal Yavuz <
> byavu...@gmail.com> escreveu:
> >> >>
> >> >> Hi Ranier,
> >> >>
> >> >> Thanks for looking into this!
> >> >>
> >> >> I am not sure why but your reply does not show up in the thread, so I
> >> >> copied your reply and answered it in the thread for visibility.
> >> >>
> >> >> On Tue, 7 May 2024 at 16:28, Ranier Vilela 
> wrote:
> >> >> >
> >> >> > I know it's coming from copy-and-paste, but
> >> >> > I believe the flags could be:
> >> >> > - dstfd = OpenTransientFile(tofile, O_RDWR | O_CREAT | O_EXCL |
> PG_BINARY);
> >> >> > + dstfd = OpenTransientFile(tofile, O_CREAT | O_WRONLY | O_TRUNC |
> O_EXCL | PG_BINARY);
> >> >> >
> >> >> > The flags:
> >> >> > O_WRONLY | O_TRUNC
> >> >> >
> >> >> > Allow the OS to make some optimizations, if you deem it possible.
> >> >> >
> >> >> > The flag O_RDWR indicates that the file can be read, which is not
> true in this case.
> >> >> > The destination file will just be written.
> >> >>
> >> >> You may be right about the O_WRONLY flag but I am not sure about the
> >> >> O_TRUNC flag.
> >> >>
> >> >> O_TRUNC from the linux man page [1]: If the file already exists and
> is
> >> >> a regular file and the access mode allows writing (i.e., is O_RDWR or
> >> >> O_WRONLY) it will be truncated to length 0.  If the file is a FIFO
> or
> >> >> terminal device file, the O_TRUNC flag is ignored. Otherwise, the
> >> >> effect of O_TRUNC is unspecified.
> >> >
> >> > O_TRUNC is usually used in conjunction with O_WRONLY.
> >> > See the example at:
> >> > open.html
> >> >
> >> > O_TRUNC signals the OS to forget the current contents of the file, if
> it happens to exist.
> >> > In other words, there will be no seeks, only and exclusively writes.
> >>
> >> You are right; the O_TRUNC flag truncates the file, if it happens to
> >> exist. But it should not exist in the first place because the code at
> >> [2] should check this before the OpenTransientFile() call. Even if we
> >> assume somehow the check at [2] does not work, I do not think the
> >> correct operation is to truncate the contents of the existing file.
> >
> > I don't know if there is a communication problem here.
> > But what I'm trying to suggest refers to the destination file,
> > which doesn't matter if it exists or not?
>
> I do not think there is a communication problem. Actually it matters
> because the destination file should not exist, there is a code [2]
> which already checks and confirms that it does not exist.
>
I got it.


>
> >
> > If the destination file does not exist, O_TRUNC is ignored.
> > If the destination file exists, O_TRUNC truncates the current contents
> of the file.
> > I don't see why you think it's a problem to truncate the current content
> if the destination file exists.
> > Isn't he going to be replaced anyway?
>
> 'If the destination file does not exist' means the code at [2] works
> well and we do not need the O_TRUNC flag.
>
True, the O_TRUNC is ignored in this case.


>
> 'If the destination file already exists' means the code at [2] is
> broken somehow and there is a high chance that we could truncate
> something that we do not want to. For example, there is a foo db and
> we want to create bar db, Postgres chose the foo db's location as the
> destination of the bar db (which should not happen but let's assume
> somehow checks at [2] failed), then we could wrongly truncate the foo
> db's contents.
>
Of course, truncating the wrong file would be pretty bad.


>
> Hence, if Postgres works successfully I think the O_TRUNC flag is
> unnecessary but if Postgres does not work successfully, the O_TRUNC
> flag could cause harm.
>
The disaster will happen anyway, but of course we can help in some way.
Even without truncating, the wrong file will be destroyed anyway.


> >
> > Unless we want to preserve the current content (destination file), in
> case the copy/clone fails?
>
> Like I said above, Postgres should not choose the existing file as the
> destination file.
>
> Also, we have O_CREAT | O_EXCL flags together, from the link [3] you
> shared before: If O_CREAT and O_EXCL are set, open() shall fail if the
> file exists. So, overwriting the already existing file is already
> prevented.
>
That said, I agree that not using O_TRUNC helps in some way.

best regards,
Ranier Vilela


Re: CREATE DATABASE with filesystem cloning

2024-05-08 Thread Nazir Bilal Yavuz
Hi,

On Wed, 8 May 2024 at 15:23, Ranier Vilela  wrote:
>
> Em qua., 8 de mai. de 2024 às 08:42, Nazir Bilal Yavuz  
> escreveu:
>>
>> Hi,
>>
>> On Wed, 8 May 2024 at 14:16, Ranier Vilela  wrote:
>> >
>> >
>> > Em qua., 8 de mai. de 2024 às 04:37, Nazir Bilal Yavuz 
>> >  escreveu:
>> >>
>> >> Hi Ranier,
>> >>
>> >> Thanks for looking into this!
>> >>
>> >> I am not sure why but your reply does not show up in the thread, so I
>> >> copied your reply and answered it in the thread for visibility.
>> >>
>> >> On Tue, 7 May 2024 at 16:28, Ranier Vilela  wrote:
>> >> >
>> >> > I know it's coming from copy-and-paste, but
>> >> > I believe the flags could be:
>> >> > - dstfd = OpenTransientFile(tofile, O_RDWR | O_CREAT | O_EXCL | 
>> >> > PG_BINARY);
>> >> > + dstfd = OpenTransientFile(tofile, O_CREAT | O_WRONLY | O_TRUNC | 
>> >> > O_EXCL | PG_BINARY);
>> >> >
>> >> > The flags:
>> >> > O_WRONLY | O_TRUNC
>> >> >
>> >> > Allow the OS to make some optimizations, if you deem it possible.
>> >> >
>> >> > The flag O_RDWR indicates that the file can be read, which is not true 
>> >> > in this case.
>> >> > The destination file will just be written.
>> >>
>> >> You may be right about the O_WRONLY flag but I am not sure about the
>> >> O_TRUNC flag.
>> >>
>> >> O_TRUNC from the linux man page [1]: If the file already exists and is
>> >> a regular file and the access mode allows writing (i.e., is O_RDWR or
>> >> O_WRONLY) it will be truncated to length 0.  If the file is a FIFO  or
>> >> terminal device file, the O_TRUNC flag is ignored. Otherwise, the
>> >> effect of O_TRUNC is unspecified.
>> >
>> > O_TRUNC is usually used in conjunction with O_WRONLY.
>> > See the example at:
>> > open.html
>> >
>> > O_TRUNC signals the OS to forget the current contents of the file, if it 
>> > happens to exist.
>> > In other words, there will be no seeks, only and exclusively writes.
>>
>> You are right; the O_TRUNC flag truncates the file, if it happens to
>> exist. But it should not exist in the first place because the code at
>> [2] should check this before the OpenTransientFile() call. Even if we
>> assume somehow the check at [2] does not work, I do not think the
>> correct operation is to truncate the contents of the existing file.
>
> I don't know if there is a communication problem here.
> But what I'm trying to suggest refers to the destination file,
> which doesn't matter if it exists or not?

I do not think there is a communication problem. Actually it matters
because the destination file should not exist, there is a code [2]
which already checks and confirms that it does not exist.

>
> If the destination file does not exist, O_TRUNC is ignored.
> If the destination file exists, O_TRUNC truncates the current contents of the 
> file.
> I don't see why you think it's a problem to truncate the current content if 
> the destination file exists.
> Isn't he going to be replaced anyway?

'If the destination file does not exist' means the code at [2] works
well and we do not need the O_TRUNC flag.

'If the destination file already exists' means the code at [2] is
broken somehow and there is a high chance that we could truncate
something that we do not want to. For example, there is a foo db and
we want to create bar db, Postgres chose the foo db's location as the
destination of the bar db (which should not happen but let's assume
somehow checks at [2] failed), then we could wrongly truncate the foo
db's contents.

Hence, if Postgres works successfully I think the O_TRUNC flag is
unnecessary but if Postgres does not work successfully, the O_TRUNC
flag could cause harm.

>
> Unless we want to preserve the current content (destination file), in case 
> the copy/clone fails?

Like I said above, Postgres should not choose the existing file as the
destination file.

Also, we have O_CREAT | O_EXCL flags together, from the link [3] you
shared before: If O_CREAT and O_EXCL are set, open() shall fail if the
file exists. So, overwriting the already existing file is already
prevented.

[3] https://pubs.opengroup.org/onlinepubs/009696699/functions/open.html

-- 
Regards,
Nazir Bilal Yavuz
Microsoft




Re: CREATE DATABASE with filesystem cloning

2024-05-08 Thread Ranier Vilela
Em qua., 8 de mai. de 2024 às 08:42, Nazir Bilal Yavuz 
escreveu:

> Hi,
>
> On Wed, 8 May 2024 at 14:16, Ranier Vilela  wrote:
> >
> >
> > Em qua., 8 de mai. de 2024 às 04:37, Nazir Bilal Yavuz <
> byavu...@gmail.com> escreveu:
> >>
> >> Hi Ranier,
> >>
> >> Thanks for looking into this!
> >>
> >> I am not sure why but your reply does not show up in the thread, so I
> >> copied your reply and answered it in the thread for visibility.
> >>
> >> On Tue, 7 May 2024 at 16:28, Ranier Vilela  wrote:
> >> >
> >> > I know it's coming from copy-and-paste, but
> >> > I believe the flags could be:
> >> > - dstfd = OpenTransientFile(tofile, O_RDWR | O_CREAT | O_EXCL |
> PG_BINARY);
> >> > + dstfd = OpenTransientFile(tofile, O_CREAT | O_WRONLY | O_TRUNC |
> O_EXCL | PG_BINARY);
> >> >
> >> > The flags:
> >> > O_WRONLY | O_TRUNC
> >> >
> >> > Allow the OS to make some optimizations, if you deem it possible.
> >> >
> >> > The flag O_RDWR indicates that the file can be read, which is not
> true in this case.
> >> > The destination file will just be written.
> >>
> >> You may be right about the O_WRONLY flag but I am not sure about the
> >> O_TRUNC flag.
> >>
> >> O_TRUNC from the linux man page [1]: If the file already exists and is
> >> a regular file and the access mode allows writing (i.e., is O_RDWR or
> >> O_WRONLY) it will be truncated to length 0.  If the file is a FIFO  or
> >> terminal device file, the O_TRUNC flag is ignored. Otherwise, the
> >> effect of O_TRUNC is unspecified.
> >
> > O_TRUNC is usually used in conjunction with O_WRONLY.
> > See the example at:
> > open.html
> >
> > O_TRUNC signals the OS to forget the current contents of the file, if it
> happens to exist.
> > In other words, there will be no seeks, only and exclusively writes.
>
> You are right; the O_TRUNC flag truncates the file, if it happens to
> exist. But it should not exist in the first place because the code at
> [2] should check this before the OpenTransientFile() call. Even if we
> assume somehow the check at [2] does not work, I do not think the
> correct operation is to truncate the contents of the existing file.
>
I don't know if there is a communication problem here.
But what I'm trying to suggest refers to the destination file,
which doesn't matter if it exists or not?

If the destination file does not exist, O_TRUNC is ignored.
If the destination file exists, O_TRUNC truncates the current contents of
the file.
I don't see why you think it's a problem to truncate the current content if
the destination file exists.
Isn't he going to be replaced anyway?

Unless we want to preserve the current content (destination file), in case
the copy/clone fails?

best regards,
Ranier Vilela


Re: CREATE DATABASE with filesystem cloning

2024-05-08 Thread Nazir Bilal Yavuz
Hi,

On Wed, 8 May 2024 at 14:16, Ranier Vilela  wrote:
>
>
> Em qua., 8 de mai. de 2024 às 04:37, Nazir Bilal Yavuz  
> escreveu:
>>
>> Hi Ranier,
>>
>> Thanks for looking into this!
>>
>> I am not sure why but your reply does not show up in the thread, so I
>> copied your reply and answered it in the thread for visibility.
>>
>> On Tue, 7 May 2024 at 16:28, Ranier Vilela  wrote:
>> >
>> > I know it's coming from copy-and-paste, but
>> > I believe the flags could be:
>> > - dstfd = OpenTransientFile(tofile, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
>> > + dstfd = OpenTransientFile(tofile, O_CREAT | O_WRONLY | O_TRUNC | O_EXCL 
>> > | PG_BINARY);
>> >
>> > The flags:
>> > O_WRONLY | O_TRUNC
>> >
>> > Allow the OS to make some optimizations, if you deem it possible.
>> >
>> > The flag O_RDWR indicates that the file can be read, which is not true in 
>> > this case.
>> > The destination file will just be written.
>>
>> You may be right about the O_WRONLY flag but I am not sure about the
>> O_TRUNC flag.
>>
>> O_TRUNC from the linux man page [1]: If the file already exists and is
>> a regular file and the access mode allows writing (i.e., is O_RDWR or
>> O_WRONLY) it will be truncated to length 0.  If the file is a FIFO  or
>> terminal device file, the O_TRUNC flag is ignored. Otherwise, the
>> effect of O_TRUNC is unspecified.
>
> O_TRUNC is usually used in conjunction with O_WRONLY.
> See the example at:
> open.html
>
> O_TRUNC signals the OS to forget the current contents of the file, if it 
> happens to exist.
> In other words, there will be no seeks, only and exclusively writes.

You are right; the O_TRUNC flag truncates the file, if it happens to
exist. But it should not exist in the first place because the code at
[2] should check this before the OpenTransientFile() call. Even if we
assume somehow the check at [2] does not work, I do not think the
correct operation is to truncate the contents of the existing file.

>>
>> But it should be already checked if the destination directory already
>> exists in dbcommands.c file in createdb() function [2], so this should
>> not happen.
>
> I'm not sure what you're referring to here.

Sorry, I meant that the destination directory / file should not exist
because the code at [2] confirms it does not exist, otherwise it
errors out.

-- 
Regards,
Nazir Bilal Yavuz
Microsoft




Re: CREATE DATABASE with filesystem cloning

2024-05-08 Thread Ranier Vilela
Em qua., 8 de mai. de 2024 às 04:37, Nazir Bilal Yavuz 
escreveu:

> Hi Ranier,
>
> Thanks for looking into this!
>
> I am not sure why but your reply does not show up in the thread, so I
> copied your reply and answered it in the thread for visibility.
>
> On Tue, 7 May 2024 at 16:28, Ranier Vilela  wrote:
> >
> > I know it's coming from copy-and-paste, but
> > I believe the flags could be:
> > - dstfd = OpenTransientFile(tofile, O_RDWR | O_CREAT | O_EXCL |
> PG_BINARY);
> > + dstfd = OpenTransientFile(tofile, O_CREAT | O_WRONLY | O_TRUNC |
> O_EXCL | PG_BINARY);
> >
> > The flags:
> > O_WRONLY | O_TRUNC
> >
> > Allow the OS to make some optimizations, if you deem it possible.
> >
> > The flag O_RDWR indicates that the file can be read, which is not true
> in this case.
> > The destination file will just be written.
>
> You may be right about the O_WRONLY flag but I am not sure about the
> O_TRUNC flag.
>
> O_TRUNC from the linux man page [1]: If the file already exists and is
> a regular file and the access mode allows writing (i.e., is O_RDWR or
> O_WRONLY) it will be truncated to length 0.  If the file is a FIFO  or
> terminal device file, the O_TRUNC flag is ignored. Otherwise, the
> effect of O_TRUNC is unspecified.
>
O_TRUNC is usually used in conjunction with O_WRONLY.
See the example at:
open.html


O_TRUNC signals the OS to forget the current contents of the file, if it
happens to exist.
In other words, there will be no seeks, only and exclusively writes.


> But it should be already checked if the destination directory already
> exists in dbcommands.c file in createdb() function [2], so this should
> not happen.
>
I'm not sure what you're referring to here.

best regards,
Ranier Vilela


Re: CREATE DATABASE with filesystem cloning

2024-05-08 Thread Nazir Bilal Yavuz
Hi Ranier,

Thanks for looking into this!

I am not sure why but your reply does not show up in the thread, so I
copied your reply and answered it in the thread for visibility.

On Tue, 7 May 2024 at 16:28, Ranier Vilela  wrote:
>
> I know it's coming from copy-and-paste, but
> I believe the flags could be:
> - dstfd = OpenTransientFile(tofile, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
> + dstfd = OpenTransientFile(tofile, O_CREAT | O_WRONLY | O_TRUNC | O_EXCL | 
> PG_BINARY);
>
> The flags:
> O_WRONLY | O_TRUNC
>
> Allow the OS to make some optimizations, if you deem it possible.
>
> The flag O_RDWR indicates that the file can be read, which is not true in 
> this case.
> The destination file will just be written.

You may be right about the O_WRONLY flag but I am not sure about the
O_TRUNC flag.

O_TRUNC from the linux man page [1]: If the file already exists and is
a regular file and the access mode allows writing (i.e., is O_RDWR or
O_WRONLY) it will be truncated to length 0.  If the file is a FIFO  or
terminal device file, the O_TRUNC flag is ignored. Otherwise, the
effect of O_TRUNC is unspecified.

But it should be already checked if the destination directory already
exists in dbcommands.c file in createdb() function [2], so this should
not happen.

[1] https://man7.org/linux/man-pages/man2/open.2.html

[2]
/*
 * If database OID is configured, check if the OID is already in use or
 * data directory already exists.
 */
if (OidIsValid(dboid))
{
char   *existing_dbname = get_database_name(dboid);

if (existing_dbname != NULL)
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE)),
errmsg("database OID %u is already in use by
database \"%s\"",
   dboid, existing_dbname));

if (check_db_file_conflict(dboid))
ereport(ERROR,
(errcode(ERRCODE_INVALID_PARAMETER_VALUE)),
errmsg("data directory with the specified OID %u
already exists", dboid));
}
else
{
/* Select an OID for the new database if is not explicitly
configured. */
do
{
dboid = GetNewOidWithIndex(pg_database_rel, DatabaseOidIndexId,
   Anum_pg_database_oid);
} while (check_db_file_conflict(dboid));
}

-- 
Regards,
Nazir Bilal Yavuz
Microsoft




Re: CREATE DATABASE with filesystem cloning

2024-05-07 Thread Ranier Vilela
Hi,

Nazir Bilal Yavuz  wrote:

>Any kind of feedback would be appreciated.

I know it's coming from copy-and-paste, but
I believe the flags could be:
- dstfd = OpenTransientFile(tofile, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
+ dstfd = OpenTransientFile(tofile, O_CREAT | O_WRONLY | O_TRUNC | O_EXCL |
PG_BINARY);

The flags:
O_WRONLY | O_TRUNC

Allow the OS to make some optimizations, if you deem it possible.

The flag O_RDWR indicates that the file can be read, which is not true in
this case.
The destination file will just be written.

best regards,
Ranier Vilela


Re: CREATE DATABASE with filesystem cloning

2024-05-07 Thread Nazir Bilal Yavuz
Hi,

On Wed, 6 Mar 2024 at 05:17, Thomas Munro  wrote:
>
> The main thing that is missing is support for redo.  It's mostly
> trivial I think, probably just a record type for "try cloning first"
> and then teaching that clone function to fall back to the regular copy
> path if it fails in recovery, do you agree with that idea?  Another
> approach would be to let it fail if it doesn't work on the replica, so
> you don't finish up using dramatically different amounts of disk
> space, but that seems terrible because now your replica is broken.  So
> probably fallback with logged warning (?), though I'm not sure exactly
> which errnos to give that treatment to.

We had an off-list talk with Thomas and we thought making this option
GUC instead of SQL command level could solve this problem.

I am posting a new rebased version of the patch with some important changes:

* 'createdb_file_copy_method' GUC is created. Possible values are
'copy' and 'clone'. Copy is the default option. Clone option can be
chosen if the system supports it, otherwise it gives error at the
startup.

* #else part of the clone_file() function calls pg_unreachable()
instead of ereport().

* Documentation updates.

Also, what should happen when the kernel has clone support but the
file system does not?

- I tested this on Linux and copy_file_range() silently uses normal
copy when this happens. I assume the same thing will happen for
FreeBSD because it uses the copy_file_range() function as well.

- I am not sure about MacOS since the force flag
(COPYFILE_CLONE_FORCE) is used. I do not have MacOS so I can not test
it but I assume it will error out when this happens. If that is the
case, is this a problem? I am asking that since this is a GUC now, the
user will have the full responsibility.

Any kind of feedback would be appreciated.

-- 
Regards,
Nazir Bilal Yavuz
Microsoft
From 404e301dbdb252c23ea9d451b817cf6e372d0d9a Mon Sep 17 00:00:00 2001
From: Nazir Bilal Yavuz 
Date: Tue, 7 May 2024 14:16:09 +0300
Subject: [PATCH v5] Use CLONE method with GUC on CREATE DATABASE ...
 STRATEGY=FILE_COPY.

createdb_file_copy_method GUC option is introduced. It can be set to
either COPY (default) or CLONE (if system supports it).

If CLONE option is chosen, similar to STRATEGY=FILE_COPY; but attempting
to use efficient file copying system calls.  The kernel has the
opportunity to share block ranges in copy-on-write file systems, or
maybe push down the copy to network file systems and storage devices.

Currently works on Linux, FreeBSD and macOS.  More systems could be
supported.

Author: Thomas Munro 
Author: Nazir Bilal Yavuz 
Discussion: https://postgr.es/m/CA%2BhUKGLM%2Bt%2BSwBU-cHeMUXJCOgBxSHLGZutV5zCwY4qrCcE02w%40mail.gmail.com
---
 src/include/commands/dbcommands.h |  9 ++
 src/include/storage/copydir.h |  3 +-
 src/backend/commands/dbcommands.c | 21 -
 src/backend/storage/file/copydir.c| 83 ++-
 .../utils/activity/wait_event_names.txt   |  1 +
 src/backend/utils/misc/guc_tables.c   | 13 +++
 src/backend/utils/misc/postgresql.conf.sample |  5 ++
 doc/src/sgml/config.sgml  | 17 
 doc/src/sgml/ref/create_database.sgml | 24 --
 src/tools/pgindent/typedefs.list  |  1 +
 10 files changed, 162 insertions(+), 15 deletions(-)

diff --git a/src/include/commands/dbcommands.h b/src/include/commands/dbcommands.h
index 92e17c71158..2e1d3565edc 100644
--- a/src/include/commands/dbcommands.h
+++ b/src/include/commands/dbcommands.h
@@ -14,6 +14,15 @@
 #ifndef DBCOMMANDS_H
 #define DBCOMMANDS_H
 
+typedef enum CreateDBFileCopyMethod
+{
+	CREATEDB_FILE_COPY_METHOD_COPY,
+	CREATEDB_FILE_COPY_METHOD_CLONE,
+} CreateDBFileCopyMethod;
+
+/* GUC parameters */
+extern PGDLLIMPORT int createdb_file_copy_method;
+
 #include "access/xlogreader.h"
 #include "catalog/objectaddress.h"
 #include "lib/stringinfo.h"
diff --git a/src/include/storage/copydir.h b/src/include/storage/copydir.h
index a25e258f479..9ff28f2eec9 100644
--- a/src/include/storage/copydir.h
+++ b/src/include/storage/copydir.h
@@ -13,7 +13,8 @@
 #ifndef COPYDIR_H
 #define COPYDIR_H
 
-extern void copydir(const char *fromdir, const char *todir, bool recurse);
+extern void copydir(const char *fromdir, const char *todir, bool recurse,
+	bool clone_files);
 extern void copy_file(const char *fromfile, const char *tofile);
 
 #endif			/* COPYDIR_H */
diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c
index be629ea92cf..cf0dff65e69 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -69,6 +69,20 @@
 #include "utils/snapmgr.h"
 #include "utils/syscache.h"
 
+/* GUCs */
+int			createdb_file_copy_method = CREATEDB_FILE_COPY_METHOD_COPY;
+
+/*
+ * GUC support
+ */
+const struct config_enum_entry createdb_file_copy_method_options[] = {
+	{"copy", CREATEDB_FILE_COPY_METHOD_COPY, false},
+#if (defined(HAVE_COPYFILE) && 

Re: CREATE DATABASE with filesystem cloning

2024-03-05 Thread Thomas Munro
On Wed, Mar 6, 2024 at 3:16 PM Thomas Munro  wrote:
> Here's a rebase.

Now with a wait event and a paragraph of documentation.
From 9d5a60e9a9cc4a4312de3081be99c254a8876e42 Mon Sep 17 00:00:00 2001
From: Thomas Munro 
Date: Sat, 2 Sep 2023 22:21:49 +1200
Subject: [PATCH v4] CREATE DATABASE ... STRATEGY=FILE_CLONE.

Similar to STRATEGY=FILE_COPY, but attempting to use efficient file
copying system calls.  The kernel has the opportunity to share block
ranges in copy-on-write file systems, or maybe push down the copy to
network file systems and storage devices.

Currently works on Linux, FreeBSD and macOS.  More systems could be
supported.

XXX need redo -- what to do if unsupported during redo, fall back to plain copy?

Discussion: https://postgr.es/m/CA%2BhUKGLM%2Bt%2BSwBU-cHeMUXJCOgBxSHLGZutV5zCwY4qrCcE02w%40mail.gmail.com
---
 doc/src/sgml/ref/create_database.sgml |  6 ++
 src/backend/commands/dbcommands.c | 19 +++--
 src/backend/storage/file/copydir.c| 82 ++-
 .../utils/activity/wait_event_names.txt   |  1 +
 src/include/storage/copydir.h |  3 +-
 5 files changed, 101 insertions(+), 10 deletions(-)

diff --git a/doc/src/sgml/ref/create_database.sgml b/doc/src/sgml/ref/create_database.sgml
index 72927960ebb..6ed82ee98dd 100644
--- a/doc/src/sgml/ref/create_database.sgml
+++ b/doc/src/sgml/ref/create_database.sgml
@@ -138,6 +138,12 @@ CREATE DATABASE name
 it also forces the system to perform a checkpoint both before and
 after the creation of the new database. In some situations, this may
 have a noticeable negative impact on overall system performance.
+On some platforms and file systems, the FILE_CLONE
+strategy is available.  This works the same way as
+FILE_COPY, except that it uses fast file cloning
+or copying system calls that might push down the work of copying to the
+storage, or use copy-on-write techniques.  The effect on disk space
+usage and execution time is file system-dependent.

   
  
diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c
index b256d6d0f7d..8ccfd18b4c9 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -78,11 +78,14 @@
  * CREATEDB_FILE_COPY will simply perform a file system level copy of the
  * database and log a single record for each tablespace copied. To make this
  * safe, it also triggers checkpoints before and after the operation.
+ *
+ * CREATEDB_FILE_CLONE is the same, but uses faster file cloning system calls.
  */
 typedef enum CreateDBStrategy
 {
 	CREATEDB_WAL_LOG,
 	CREATEDB_FILE_COPY,
+	CREATEDB_FILE_CLONE,
 } CreateDBStrategy;
 
 typedef struct
@@ -136,7 +139,8 @@ static CreateDBRelInfo *ScanSourceDatabasePgClassTuple(HeapTupleData *tuple,
 static void CreateDirAndVersionFile(char *dbpath, Oid dbid, Oid tsid,
 	bool isRedo);
 static void CreateDatabaseUsingFileCopy(Oid src_dboid, Oid dst_dboid,
-		Oid src_tsid, Oid dst_tsid);
+		Oid src_tsid, Oid dst_tsid,
+		bool clone_files);
 static void recovery_create_dbdir(char *path, bool only_tblspc);
 
 /*
@@ -548,7 +552,7 @@ CreateDirAndVersionFile(char *dbpath, Oid dbid, Oid tsid, bool isRedo)
  */
 static void
 CreateDatabaseUsingFileCopy(Oid src_dboid, Oid dst_dboid, Oid src_tsid,
-			Oid dst_tsid)
+			Oid dst_tsid, bool clone_files)
 {
 	TableScanDesc scan;
 	Relation	rel;
@@ -608,7 +612,7 @@ CreateDatabaseUsingFileCopy(Oid src_dboid, Oid dst_dboid, Oid src_tsid,
 		 *
 		 * We don't need to copy subdirectories
 		 */
-		copydir(srcpath, dstpath, false);
+		copydir(srcpath, dstpath, false, clone_files);
 
 		/* Record the filesystem change in XLOG */
 		{
@@ -1010,6 +1014,8 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt)
 			dbstrategy = CREATEDB_WAL_LOG;
 		else if (strcmp(strategy, "file_copy") == 0)
 			dbstrategy = CREATEDB_FILE_COPY;
+		else if (strcmp(strategy, "file_clone") == 0)
+			dbstrategy = CREATEDB_FILE_CLONE;
 		else
 			ereport(ERROR,
 	(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
@@ -1460,7 +1466,8 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt)
 	  dst_deftablespace);
 		else
 			CreateDatabaseUsingFileCopy(src_dboid, dboid, src_deftablespace,
-		dst_deftablespace);
+		dst_deftablespace,
+		dbstrategy == CREATEDB_FILE_CLONE);
 
 		/*
 		 * Close pg_database, but keep lock till commit.
@@ -2096,7 +2103,7 @@ movedb(const char *dbname, const char *tblspcname)
 		/*
 		 * Copy files from the old tablespace to the new one
 		 */
-		copydir(src_dbpath, dst_dbpath, false);
+		copydir(src_dbpath, dst_dbpath, false, false);
 
 		/*
 		 * Record the filesystem change in XLOG
@@ -3255,7 +3262,7 @@ dbase_redo(XLogReaderState *record)
 		 *
 		 * We don't need to copy subdirectories
 		 */
-		copydir(src_path, dst_path, false);
+		copydir(src_path, dst_path, false, false);
 
 		pfree(src_pa

Re: CREATE DATABASE with filesystem cloning

2024-03-05 Thread Thomas Munro
On Wed, Oct 11, 2023 at 7:40 PM Peter Eisentraut  wrote:
> On 07.10.23 07:51, Thomas Munro wrote:
> > Here is an experimental POC of fast/cheap database cloning.
>
> Here are some previous discussions of this:
>
> https://www.postgresql.org/message-id/flat/20131001223108.GG23410%40saarenmaa.fi
>
> https://www.postgresql.org/message-id/flat/511B5D11.4040507%40socialserve.com
>
> https://www.postgresql.org/message-id/flat/bc9ca382-b98d-0446-f699-8c5de2307ca7%402ndquadrant.com
>
> (I don't see any clear conclusions in any of these threads, but it might
> be good to check them in any case.)

Thanks.  Wow, quite a lot of people have written an experimental patch
like this.  I would say the things that changed since those ones are:

* copy_file_range() became the preferred way to do this on Linux AFAIK
(instead of various raw ioctls)
* FreeBSD adopted Linux's copy_file_range()
* Open ZFS 2.2 implemented range-based cloning
* XFS enabled reflink support by default
* Apple invented ApFS with cloning
* Several OSes adopted XFS, BTRFS, ZFS, ApFS by default
* copy_file_range() went in the direction of not revealing how the
copying is done (no flags to force behaviour)

Here's a rebase.

The main thing that is missing is support for redo.  It's mostly
trivial I think, probably just a record type for "try cloning first"
and then teaching that clone function to fall back to the regular copy
path if it fails in recovery, do you agree with that idea?  Another
approach would be to let it fail if it doesn't work on the replica, so
you don't finish up using dramatically different amounts of disk
space, but that seems terrible because now your replica is broken.  So
probably fallback with logged warning (?), though I'm not sure exactly
which errnos to give that treatment to.

One thing to highlight about COW file system semantics: PostgreSQL
behaves differently when space runs out.  When writing relation data,
eg ZFS sometimes fails like bullet point 2 in this ENOSPC article[1],
while XFS usually fails like bullet point 1.  A database on XFS that
has been cloned in this way might presumably start to fail like bullet
point 2, eg when checkpointing dirty pages, instead of its usual
extension-time-only ENOSPC-rolls-back-your-transaction behaviour.

[1] https://wiki.postgresql.org/wiki/ENOSPC
From 49b7077e24da9d8140de3f303d3f90cb3c0dd0f8 Mon Sep 17 00:00:00 2001
From: Thomas Munro 
Date: Sat, 2 Sep 2023 22:21:49 +1200
Subject: [PATCH v3] CREATE DATABASE ... STRATEGY=FILE_CLONE.

Similar to STRATEGY=FILE_COPY, but attempting to use efficient file
copying system calls.  The kernel has the opportunity to share block
ranges in copy-on-write file systems, or maybe push down the copy to
network file systems and storage devices.

Currently works on Linux, FreeBSD and macOS.  More systems could be
supported.

XXX need redo -- what to do if unsupported during redo, fall back to plain copy?

Discussion: https://postgr.es/m/CA%2BhUKGLM%2Bt%2BSwBU-cHeMUXJCOgBxSHLGZutV5zCwY4qrCcE02w%40mail.gmail.com
---
 src/backend/commands/dbcommands.c  | 19 ---
 src/backend/storage/file/copydir.c | 80 --
 src/include/storage/copydir.h  |  3 +-
 3 files changed, 92 insertions(+), 10 deletions(-)

diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c
index b256d6d0f7d..8ccfd18b4c9 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -78,11 +78,14 @@
  * CREATEDB_FILE_COPY will simply perform a file system level copy of the
  * database and log a single record for each tablespace copied. To make this
  * safe, it also triggers checkpoints before and after the operation.
+ *
+ * CREATEDB_FILE_CLONE is the same, but uses faster file cloning system calls.
  */
 typedef enum CreateDBStrategy
 {
 	CREATEDB_WAL_LOG,
 	CREATEDB_FILE_COPY,
+	CREATEDB_FILE_CLONE,
 } CreateDBStrategy;
 
 typedef struct
@@ -136,7 +139,8 @@ static CreateDBRelInfo *ScanSourceDatabasePgClassTuple(HeapTupleData *tuple,
 static void CreateDirAndVersionFile(char *dbpath, Oid dbid, Oid tsid,
 	bool isRedo);
 static void CreateDatabaseUsingFileCopy(Oid src_dboid, Oid dst_dboid,
-		Oid src_tsid, Oid dst_tsid);
+		Oid src_tsid, Oid dst_tsid,
+		bool clone_files);
 static void recovery_create_dbdir(char *path, bool only_tblspc);
 
 /*
@@ -548,7 +552,7 @@ CreateDirAndVersionFile(char *dbpath, Oid dbid, Oid tsid, bool isRedo)
  */
 static void
 CreateDatabaseUsingFileCopy(Oid src_dboid, Oid dst_dboid, Oid src_tsid,
-			Oid dst_tsid)
+			Oid dst_tsid, bool clone_files)
 {
 	TableScanDesc scan;
 	Relation	rel;
@@ -608,7 +612,7 @@ CreateDatabaseUsingFileCopy(Oid src_dboid, Oid dst_dboid, Oid src_tsid,
 		 *
 		 * We don't need to copy subdirectories
 		 */
-		copydir(srcpath, dstpath, false);
+		copydir(srcpath, dstpath, false, clone_files);
 
 		/* Record the filesystem change in XLOG */
 		{
@@ -1010,6 +1014,8 @@ createdb(ParseState *pstate, const CreatedbStmt 

Re: CREATE DATABASE with filesystem cloning

2023-10-16 Thread Dan Langille
On Sat, Oct 7, 2023, at 1:51 AM, Thomas Munro wrote:

> I tested on bleeding edge FreeBSD/ZFS, where you need to set sysctl
> vfs.zfs.bclone_enabled=1 to enable the optimisation, as it's still a
> very new feature that is still being rolled out.  The system call
> succeeds either way, but that controls whether the new database
> initially shares blocks on disk, or get new copies.  I also tested on
> a Mac.  In both cases I could clone large databases in a fraction of a
> second.

WOOT! Looking forward to this. It would greatly improve times needed
when deploying test environments.

Thank you.

-- 
  Dan Langille
  d...@langille.org




Re: CREATE DATABASE with filesystem cloning

2023-10-11 Thread Robert Haas
On Mon, Oct 9, 2023 at 7:49 PM Andres Freund  wrote:
> If we do this, I think we should consider creating template0, template1 with
> the new strategy, so that a new initdb cluster ends up with deduplicated data.

Seems a little questionable given the reports earlier in the thread
about some filesystems still having bugs in this functionality.

-- 
Robert Haas
EDB: http://www.enterprisedb.com




Re: CREATE DATABASE with filesystem cloning

2023-10-10 Thread Peter Eisentraut

On 07.10.23 07:51, Thomas Munro wrote:

Here is an experimental POC of fast/cheap database cloning.


Here are some previous discussions of this:

https://www.postgresql.org/message-id/flat/20131001223108.GG23410%40saarenmaa.fi

https://www.postgresql.org/message-id/flat/511B5D11.4040507%40socialserve.com

https://www.postgresql.org/message-id/flat/bc9ca382-b98d-0446-f699-8c5de2307ca7%402ndquadrant.com

(I don't see any clear conclusions in any of these threads, but it might 
be good to check them in any case.)






Re: CREATE DATABASE with filesystem cloning

2023-10-09 Thread Andres Freund
Hi,

On 2023-10-07 18:51:45 +1300, Thomas Munro wrote:
> It should be a lot faster, and use less physical disk, than the two
> existing strategies on recent-ish XFS, BTRFS, very recent OpenZFS,
> APFS (= macOS), and it could in theory be extended to other systems
> that invented different system calls for this with more work (Solaris,
> Windows).  Then extra physical disk space will be consumed only as the
> two clones diverge.

> It's just like the old strategy=file_copy, except it asks the OS to do
> its best copying trick.  If you try it on a system that doesn't
> support copy-on-write, then copy_file_range() should fall back to
> plain old copy, but it might still be better than we could do, as it
> can push copy commands to network storage or physical storage.
> 
> Therefore, the usual caveats from strategy=file_copy also apply here.
> Namely that it has to perform checkpoints which could be very
> expensive, and there are some quirks/brokenness about concurrent
> backups and PITR.  Which makes me wonder if it's worth pursuing this
> idea.  Thoughts?

I think it'd be interesting to have. For the regression tests we do end up
spending a lot of disk throughput on contents duplicated between
template0/template1/postgres.  And I've plenty of time spent time copying huge
template databases, to have a reproducible starting point for some benchmark
that's expensive to initialize.

If we do this, I think we should consider creating template0, template1 with
the new strategy, so that a new initdb cluster ends up with deduplicated data.


FWIW, I experimented with using cp -c on macos for the initdb template, and
that provided some further gain. I suspect that that gain would increase if
template0/template1/postgres were deduplicated.


> diff --git a/src/backend/storage/file/copydir.c 
> b/src/backend/storage/file/copydir.c
> index e04bc3941a..8c963ff548 100644
> --- a/src/backend/storage/file/copydir.c
> +++ b/src/backend/storage/file/copydir.c
> @@ -19,14 +19,21 @@
>  #include "postgres.h"
>  
>  #include 
> +#include 
>  #include 
>  
> +#ifdef HAVE_COPYFILE_H
> +#include 
> +#endif

We already have code around this in src/bin/pg_upgrade/file.c, seems we ought
to move it somewhere in src/port?

Greetings,

Andres Freund




Re: CREATE DATABASE with filesystem cloning

2023-10-08 Thread Thomas Munro
On Mon, Oct 9, 2023 at 2:20 AM Andrew Dunstan  wrote:
> I've had to disable COW on my BTRFS-resident buildfarm animals (see
> previous discussion re Direct I/O).

Right, because it is still buggy[1].  I don't see any sign that a fix
has been committed yet, assuming that is the right thing (and it sure
sounds like it).  It means you still have to disable COW to run the
004_io_direct.pl test for now, but that's an independent thing due
hopefully to be fixed soon, and you can still run PostgreSQL just fine
with COW enabled as it is by default as long as you don't turn on
debug_io_direct (which isn't for users yet anyway).

Since I hadn't actually tried out this cloning stuff out on
Linux/btrfs before and was claiming that it should work, I took it for
a quick unscientific spin (literally, this is on a spinning SATA disk
for extra crunchy slowness...).  I created a scale 500 pgbench
database, saw that du -h showed 7.4G, and got these times:

postgres=# create database foodb_copy template=foodb strategy=file_copy;
CREATE DATABASE
Time: 124019.885 ms (02:04.020)
postgres=# create database foodb_clone template=foodb strategy=file_clone;
CREATE DATABASE
Time: 8618.195 ms (00:08.618)

That's something, but not as good as I was expecting, so let's also
try Linux/XFS for reference on the same spinny rust...  One thing I
learned is that if you have an existing XFS partition, it might have
been created without reflinks enabled (see output of xfs_info) as that
was the default not very long ago and it's not changeable later, so on
the box I'm writing from I had to do a fresh mkfs.xfs to see any
benefit from this.

postgres=# create database foodb_copy template=foodb strategy=file_copy;
CREATE DATABASE
Time: 49157.876 ms (00:49.158)
postgres=# create database foodb_clone template=foodb strategy=file_clone;
CREATE DATABASE
Time: 1026.455 ms (00:01.026)

Not bad.  To understand what that did, we can check which physical
blocks on disk hold the first segment of the pgbench_accounts table in
foodb and foodb_clone:

$ sudo xfs_bmap /mnt/xfs/pgdata/base/16384/16400
/mnt/xfs/pgdata/base/16384/16400:
0: [0..1637439]: 977586048..979223487
1: [1637440..2097151]: 1464966136..1465425847
$ sudo xfs_bmap /mnt/xfs/pgdata/base/16419/16400
/mnt/xfs/pgdata/base/16419/16400:
0: [0..1637439]: 977586048..979223487
1: [1637440..2097151]: 1464966136..1465425847

The same blocks.  Now let's update a tuple on the second page of
pgbench_accounts in the clone:

foodb=# update pgbench_accounts set bid = bid + 1 where ctid = '(1, 1)';
UPDATE 1
foodb=# checkpoint;
CHECKPOINT

Now some new physical disk blocks have been allocated just for that
page, but the rest are still clones:

$ sudo xfs_bmap /mnt/xfs/pgdata/base/16419/16400
/mnt/xfs/pgdata/base/16419/16400:
0: [0..15]: 977586048..977586063
1: [16..31]: 977586064..977586079
2: [32..1637439]: 977586080..979223487
3: [1637440..2097151]: 1464966136..1465425847

I tried changing it to work in 1MB chunks and add the CFI() (v2
attached), and it didn't affect the time measurably and also didn't
generate any extra extents as displayed by xfs_bmap, so the end result
is the same.  I haven't looked into the chunked version on the other
file systems yet.

I don't have the numbers to hand (different machines far from me right
now) but FreeBSD/ZFS and macOS/APFS were on the order of a few hundred
milliseconds for the same scale of pgbench on laptop storage (so not
comparable with the above).

I also tried a -s 5000 database, and saw that XFS could clone a 74GB
database just as fast as the 7.4GB database (still ~1s).  At a guess,
this is going to scale not so much by total data size, but more by
things like number of relations, segment size and internal (invisible)
fragmentation due to previous cloning/update history in
filesystem-dependent ways, since those are the things that generate
extents (contiguous ranges of physical blocks to be referenced by the
new file).

[1] 
https://lore.kernel.org/linux-btrfs/ae81e48b0e954bae1c3451c0da1a24ae7146606c.1676684984.git.bo...@bur.io/T/#u
From dd5c07d873e90a6feac371d2879015a5e6154632 Mon Sep 17 00:00:00 2001
From: Thomas Munro 
Date: Sat, 2 Sep 2023 22:21:49 +1200
Subject: [PATCH v2] WIP: CREATE DATABASE ... STRATEGY=FILE_CLONE.

Similar to STRATEGY=FILE_COPY, but using facilities that tell the OS
explicitly that we're copying, so that it has the opportunity to share
block ranges in copy-on-write file systems, or maybe push down the copy
to network file systems and storage devices.

Currently works on Linux, FreeBSD and macOS.  More systems could be
supported.

XXX need docs
XXX need to think more about chunk size and interruptibility
XXX need redo -- what to do if unsupported during redo, plain copy?

Discussion: https://postgr.es/m/CA%2BhUKGLM%2Bt%2BSwBU-cHeMUXJCOgBxSHLGZutV5zCwY4qrCcE02w%40mail.gmail.com
---
 configure  |  2 +-
 configure.ac   |  1 +
 meson.build|  1 +
 

Re: CREATE DATABASE with filesystem cloning

2023-10-08 Thread Andrew Dunstan



On 2023-10-07 Sa 01:51, Thomas Munro wrote:

Hello hackers,

Here is an experimental POC of fast/cheap database cloning.  For
clones from little template databases, no one cares much, but it might
be useful to be able to create a snapshot or fork of very large
database for testing/experimentation like this:

   create database foodb_snapshot20231007 template=foodb strategy=file_clone

It should be a lot faster, and use less physical disk, than the two
existing strategies on recent-ish XFS, BTRFS, very recent OpenZFS,
APFS (= macOS), and it could in theory be extended to other systems
that invented different system calls for this with more work (Solaris,
Windows).  Then extra physical disk space will be consumed only as the
two clones diverge.

It's just like the old strategy=file_copy, except it asks the OS to do
its best copying trick.  If you try it on a system that doesn't
support copy-on-write, then copy_file_range() should fall back to
plain old copy, but it might still be better than we could do, as it
can push copy commands to network storage or physical storage.

Therefore, the usual caveats from strategy=file_copy also apply here.
Namely that it has to perform checkpoints which could be very
expensive, and there are some quirks/brokenness about concurrent
backups and PITR.  Which makes me wonder if it's worth pursuing this
idea.  Thoughts?

I tested on bleeding edge FreeBSD/ZFS, where you need to set sysctl
vfs.zfs.bclone_enabled=1 to enable the optimisation, as it's still a
very new feature that is still being rolled out.  The system call
succeeds either way, but that controls whether the new database
initially shares blocks on disk, or get new copies.  I also tested on
a Mac.  In both cases I could clone large databases in a fraction of a
second.



I've had to disable COW on my BTRFS-resident buildfarm animals (see 
previous discussion re Direct I/O).



cheers


andrew

--
Andrew Dunstan
EDB: https://www.enterprisedb.com





Re: CREATE DATABASE with filesystem cloning

2023-10-07 Thread Thomas Munro
One thing I forgot to mention about this experiment: when used in a
backend I think this probably needs a chunk size (what size?) and a
CFI().  Which is a bit annoying, because for best results we want
SSIZE_MAX, but in the case that copy_file_range() falls back to raw
copy, that'd do I/O work bounded only by segment size :-/  (That's not
a problem that comes up in the similar patch for pg_upgrade.)




CREATE DATABASE with filesystem cloning

2023-10-06 Thread Thomas Munro
Hello hackers,

Here is an experimental POC of fast/cheap database cloning.  For
clones from little template databases, no one cares much, but it might
be useful to be able to create a snapshot or fork of very large
database for testing/experimentation like this:

  create database foodb_snapshot20231007 template=foodb strategy=file_clone

It should be a lot faster, and use less physical disk, than the two
existing strategies on recent-ish XFS, BTRFS, very recent OpenZFS,
APFS (= macOS), and it could in theory be extended to other systems
that invented different system calls for this with more work (Solaris,
Windows).  Then extra physical disk space will be consumed only as the
two clones diverge.

It's just like the old strategy=file_copy, except it asks the OS to do
its best copying trick.  If you try it on a system that doesn't
support copy-on-write, then copy_file_range() should fall back to
plain old copy, but it might still be better than we could do, as it
can push copy commands to network storage or physical storage.

Therefore, the usual caveats from strategy=file_copy also apply here.
Namely that it has to perform checkpoints which could be very
expensive, and there are some quirks/brokenness about concurrent
backups and PITR.  Which makes me wonder if it's worth pursuing this
idea.  Thoughts?

I tested on bleeding edge FreeBSD/ZFS, where you need to set sysctl
vfs.zfs.bclone_enabled=1 to enable the optimisation, as it's still a
very new feature that is still being rolled out.  The system call
succeeds either way, but that controls whether the new database
initially shares blocks on disk, or get new copies.  I also tested on
a Mac.  In both cases I could clone large databases in a fraction of a
second.
From 341c0287237d59b0723bc88244dc1c48ed06c5a5 Mon Sep 17 00:00:00 2001
From: Thomas Munro 
Date: Sat, 2 Sep 2023 22:21:49 +1200
Subject: [PATCH] WIP: CREATE DATABASE ... STRATEGY=FILE_CLONE.

Similar to STRATEGY=FILE_COPY, but using facilities that tell the OS
explicitly that we're copying, so that it has the opportunity to share
block ranges in copy-on-write file systems, or maybe push down the copy
to network file systems and storage devices.

Currently works on Linux, FreeBSD and macOS.  More systems could be
supported.

XXX need docs
XXX need a test
XXX need redo -- what to do if unsupported during redo, plain copy?

diff --git a/configure b/configure
index d47e0f8b26..2076b19a1b 100755
--- a/configure
+++ b/configure
@@ -15578,7 +15578,7 @@ fi
 LIBS_including_readline="$LIBS"
 LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'`
 
-for ac_func in backtrace_symbols copyfile getifaddrs getpeerucred inet_pton kqueue mbstowcs_l memset_s posix_fallocate ppoll pthread_is_threaded_np setproctitle setproctitle_fast strchrnul strsignal syncfs sync_file_range uselocale wcstombs_l
+for ac_func in backtrace_symbols copyfile copy_file_range getifaddrs getpeerucred inet_pton kqueue mbstowcs_l memset_s posix_fallocate ppoll pthread_is_threaded_np setproctitle setproctitle_fast strchrnul strsignal syncfs sync_file_range uselocale wcstombs_l
 do :
   as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
 ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var"
diff --git a/configure.ac b/configure.ac
index 440b08d113..d0d31dd91e 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1767,6 +1767,7 @@ LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'`
 AC_CHECK_FUNCS(m4_normalize([
 	backtrace_symbols
 	copyfile
+	copy_file_range
 	getifaddrs
 	getpeerucred
 	inet_pton
diff --git a/meson.build b/meson.build
index 862c955453..20e7327e9e 100644
--- a/meson.build
+++ b/meson.build
@@ -2415,6 +2415,7 @@ func_checks = [
   ['backtrace_symbols', {'dependencies': [execinfo_dep]}],
   ['clock_gettime', {'dependencies': [rt_dep], 'define': false}],
   ['copyfile'],
+  ['copy_file_range'],
   # gcc/clang's sanitizer helper library provides dlopen but not dlsym, thus
   # when enabling asan the dlopen check doesn't notice that -ldl is actually
   # required. Just checking for dlsym() ought to suffice.
diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c
index 307729ab7e..9bbcabbb6f 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -79,11 +79,14 @@
  * CREATEDB_FILE_COPY will simply perform a file system level copy of the
  * database and log a single record for each tablespace copied. To make this
  * safe, it also triggers checkpoints before and after the operation.
+ *
+ * CREATEDB_FILE_CLONE is the same, but uses faster file cloning system calls.
  */
 typedef enum CreateDBStrategy
 {
 	CREATEDB_WAL_LOG,
-	CREATEDB_FILE_COPY
+	CREATEDB_FILE_COPY,
+	CREATEDB_FILE_CLONE
 } CreateDBStrategy;
 
 typedef struct
@@ -137,7 +140,8 @@ static CreateDBRelInfo *ScanSourceDatabasePgClassTuple(HeapTupleData *tuple,
 static void CreateDirAndVersionFile(char *dbpath, Oid dbid, Oid tsid,
 	bool isRedo);
 static void CreateDatabaseUsingFileCopy(