On Thu, Mar 10, 2016 at 8:51 AM, Masahiko Sawada <sawada.m...@gmail.com> wrote:
> After some further thought, I thought that it's better to add check
> logic for result of rewriting visibility map to upgrading logic rather
> than regression test in order to ensure that rewriting visibility map
> has been successfully done.
> As a draft, attached patch checks the result of rewriting visibility
> map after rewrote for each relation as a routine of pg_upgrade.
> The disadvantage point of this is that we need to scan each visibility
> map page for 2 times.
> But since visibility map size would not be so large, it would not bad.
> Thoughts?

I think that's kind of pointless.  We need to test that this
conversion code works, but once it does, I don't think we should make
everybody pay the overhead of retesting that.  Anyway, the test code
could have bugs, too.

Here's an updated version of your patch with that code removed and
some cosmetic cleanups like fixing typos and stuff like that.  I think
this is mostly ready to commit, but I noticed one problem: your
conversion code always produces two output pages for each input page
even if one of them would be empty.  In particular, if you have a
large number of small relations and run pg_upgrade, all of their
visibility maps will go from 8kB to 16kB.  That isn't the end of the
world, maybe, but I think you should see if you can't fix it
somehow....

-- 
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
diff --git a/src/bin/pg_upgrade/file.c b/src/bin/pg_upgrade/file.c
index 2a99a28..34e1451 100644
--- a/src/bin/pg_upgrade/file.c
+++ b/src/bin/pg_upgrade/file.c
@@ -9,10 +9,15 @@
 
 #include "postgres_fe.h"
 
+#include "access/visibilitymap.h"
 #include "pg_upgrade.h"
+#include "storage/bufpage.h"
+#include "storage/checksum.h"
+#include "storage/checksum_impl.h"
 
 #include <fcntl.h>
 
+#define BITS_PER_HEAPBLOCK_OLD 1
 
 
 #ifndef WIN32
@@ -138,6 +143,130 @@ copy_file(const char *srcfile, const char *dstfile, bool force)
 #endif
 
 
+/*
+ * rewriteVisibilityMap()
+ *
+ * In versions of PostgreSQL prior to catversion 201603011, PostgreSQL's
+ * visibility map included one bit per heap page; it now includes two.
+ * When upgrading a cluster from before that time to a current PostgreSQL
+ * version, we could refuse to copy visibility maps from the old cluster
+ * to the new cluster; the next VACUUM would recreate them, but at the
+ * price of scanning the entire table.  So, instead, we rewrite the old
+ * visibility maps in the new format.  That way, the all-visible bit
+ * remains set for the pages for which it was set previously.  The
+ * all-frozen bit is never set by this conversion; we leave that to
+ * VACUUM.
+ */
+const char *
+rewriteVisibilityMap(const char *fromfile, const char *tofile, bool force)
+{
+	int			src_fd = 0;
+	int			dst_fd = 0;
+	char		buffer[BLCKSZ];
+	ssize_t		bytesRead;
+	int			rewriteVmBytesPerPage;
+	BlockNumber blkno = 0;
+
+	/* Compute we need how many old page bytes to rewrite a new page */
+	rewriteVmBytesPerPage = (BLCKSZ - SizeOfPageHeaderData) / 2;
+
+	if ((fromfile == NULL) || (tofile == NULL))
+		return "Invalid old file or new file";
+
+	if ((src_fd = open(fromfile, O_RDONLY, 0)) < 0)
+		return getErrorText();
+
+	if ((dst_fd = open(tofile, O_RDWR | O_CREAT | (force ? 0 : O_EXCL), S_IRUSR | S_IWUSR)) < 0)
+	{
+		close(src_fd);
+		return getErrorText();
+	}
+
+	/*
+	 * Turn each visibility map page into 2 pages one by one. Each new page
+	 * has the same page header as the old one.
+	 */
+	while ((bytesRead = read(src_fd, buffer, BLCKSZ)) == BLCKSZ)
+	{
+		char	   *old_cur,
+				   *old_break,
+				   *old_blkend;
+		PageHeaderData pageheader;
+
+		/* Save the page header data */
+		memcpy(&pageheader, buffer, SizeOfPageHeaderData);
+
+		/*
+		 * These old_* variables point to old visibility map page. old_cur
+		 * points to current position on old page. old_blkend points to end of
+		 * old block. old_break points to old page break position for rewriting
+		 * a new page. After wrote a new page, old_break proceeds
+		 * rewriteVmBytesPerPage bytes.
+		 */
+		old_cur = buffer + SizeOfPageHeaderData;
+		old_blkend = buffer + bytesRead;
+		old_break = old_cur + rewriteVmBytesPerPage;
+
+		while (old_blkend >= old_break)
+		{
+			char		vmbuf[BLCKSZ];
+			char	   *new_cur = vmbuf;
+
+			/* Copy page header in advance */
+			memcpy(vmbuf, &pageheader, SizeOfPageHeaderData);
+
+			new_cur += SizeOfPageHeaderData;
+
+			/*
+			 * Process old page bytes one by one, and turn it into new page.
+			 */
+			while (old_break > old_cur)
+			{
+				uint16		new_vmbits = 0;
+				int			i;
+
+				/* Generate new format bits while keeping old information */
+				for (i = 0; i < BITS_PER_BYTE; i++)
+				{
+					uint8	byte = * (uint8 *) old_cur;
+
+					if (((byte & (1 << (BITS_PER_HEAPBLOCK_OLD * i)))) != 0)
+						new_vmbits |= 1 << (BITS_PER_HEAPBLOCK * i);
+				}
+
+				/* Copy new visibility map bit to new format page */
+				memcpy(new_cur, &new_vmbits, BITS_PER_HEAPBLOCK);
+
+				old_cur += BITS_PER_HEAPBLOCK_OLD;
+				new_cur += BITS_PER_HEAPBLOCK;
+			}
+
+			/* Set new checksum for a visibility map page (if enabled) */
+			if (old_cluster.controldata.data_checksum_version != 0 &&
+				new_cluster.controldata.data_checksum_version != 0)
+				((PageHeader) vmbuf)->pd_checksum =
+					pg_checksum_page(vmbuf, blkno);
+
+			if (write(dst_fd, vmbuf, BLCKSZ) != BLCKSZ)
+			{
+				close(dst_fd);
+				close(src_fd);
+				return getErrorText();
+			}
+
+			old_break += rewriteVmBytesPerPage;
+			blkno++;
+		}
+	}
+
+	/* Close files */
+	close(dst_fd);
+	close(src_fd);
+
+	return NULL;
+
+}
+
 void
 check_hard_link(void)
 {
diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h
index 6122878..89beb73 100644
--- a/src/bin/pg_upgrade/pg_upgrade.h
+++ b/src/bin/pg_upgrade/pg_upgrade.h
@@ -110,6 +110,10 @@ extern char *output_files[];
 #define VISIBILITY_MAP_CRASHSAFE_CAT_VER 201107031
 
 /*
+ * The format of visibility map is changed with this 9.6 commit,
+ */
+#define VISIBILITY_MAP_FROZEN_BIT_CAT_VER 201603011
+/*
  * pg_multixact format changed in 9.3 commit 0ac5ad5134f2769ccbaefec73844f85,
  * ("Improve concurrency of foreign key locking") which also updated catalog
  * version to this value.  pg_upgrade behavior depends on whether old and new
@@ -365,6 +369,8 @@ bool		pid_lock_file_exists(const char *datadir);
 
 const char *copyFile(const char *src, const char *dst, bool force);
 const char *linkFile(const char *src, const char *dst);
+const char *rewriteVisibilityMap(const char *fromfile, const char *tofile,
+								 bool force);
 
 void		check_hard_link(void);
 FILE	   *fopen_priv(const char *path, const char *mode);
diff --git a/src/bin/pg_upgrade/relfilenode.c b/src/bin/pg_upgrade/relfilenode.c
index b20f073..103651a 100644
--- a/src/bin/pg_upgrade/relfilenode.c
+++ b/src/bin/pg_upgrade/relfilenode.c
@@ -16,7 +16,7 @@
 
 
 static void transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace);
-static void transfer_relfile(FileNameMap *map, const char *suffix);
+static void transfer_relfile(FileNameMap *map, const char *suffix, bool vm_must_add_frozenbit);
 
 
 /*
@@ -132,6 +132,7 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace)
 {
 	int			mapnum;
 	bool		vm_crashsafe_match = true;
+	bool		vm_must_add_frozenbit = false;
 
 	/*
 	 * Do the old and new cluster disagree on the crash-safetiness of the vm
@@ -141,13 +142,20 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace)
 		new_cluster.controldata.cat_ver >= VISIBILITY_MAP_CRASHSAFE_CAT_VER)
 		vm_crashsafe_match = false;
 
+	/*
+	 * Do we need to rewrite visibilitymap?
+	 */
+	if (old_cluster.controldata.cat_ver < VISIBILITY_MAP_FROZEN_BIT_CAT_VER &&
+		new_cluster.controldata.cat_ver >= VISIBILITY_MAP_FROZEN_BIT_CAT_VER)
+		vm_must_add_frozenbit = true;
+
 	for (mapnum = 0; mapnum < size; mapnum++)
 	{
 		if (old_tablespace == NULL ||
 			strcmp(maps[mapnum].old_tablespace, old_tablespace) == 0)
 		{
 			/* transfer primary file */
-			transfer_relfile(&maps[mapnum], "");
+			transfer_relfile(&maps[mapnum], "", vm_must_add_frozenbit);
 
 			/* fsm/vm files added in PG 8.4 */
 			if (GET_MAJOR_VERSION(old_cluster.major_version) >= 804)
@@ -155,9 +163,9 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace)
 				/*
 				 * Copy/link any fsm and vm files, if they exist
 				 */
-				transfer_relfile(&maps[mapnum], "_fsm");
+				transfer_relfile(&maps[mapnum], "_fsm", vm_must_add_frozenbit);
 				if (vm_crashsafe_match)
-					transfer_relfile(&maps[mapnum], "_vm");
+					transfer_relfile(&maps[mapnum], "_vm", vm_must_add_frozenbit);
 			}
 		}
 	}
@@ -167,10 +175,12 @@ transfer_single_new_db(FileNameMap *maps, int size, char *old_tablespace)
 /*
  * transfer_relfile()
  *
- * Copy or link file from old cluster to new one.
+ * Copy or link file from old cluster to new one.  If vm_must_add_frozenbit
+ * is true, visibility map forks are converted and rewritten, even in link
+ * mode.
  */
 static void
-transfer_relfile(FileNameMap *map, const char *type_suffix)
+transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_frozenbit)
 {
 	const char *msg;
 	char		old_file[MAXPGPATH];
@@ -232,7 +242,13 @@ transfer_relfile(FileNameMap *map, const char *type_suffix)
 		{
 			pg_log(PG_VERBOSE, "copying \"%s\" to \"%s\"\n", old_file, new_file);
 
-			if ((msg = copyFile(old_file, new_file, true)) != NULL)
+			/* Rewrite visibility map if needed */
+			if (vm_must_add_frozenbit && (strcmp(type_suffix, "_vm") == 0))
+				msg = rewriteVisibilityMap(old_file, new_file, true);
+			else
+				msg = copyFile(old_file, new_file, true);
+
+			if (msg)
 				pg_fatal("error while copying relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n",
 						 map->nspname, map->relname, old_file, new_file, msg);
 		}
@@ -240,7 +256,13 @@ transfer_relfile(FileNameMap *map, const char *type_suffix)
 		{
 			pg_log(PG_VERBOSE, "linking \"%s\" to \"%s\"\n", old_file, new_file);
 
-			if ((msg = linkFile(old_file, new_file)) != NULL)
+			/* Rewrite visibility map if needed */
+			if (vm_must_add_frozenbit && (strcmp(type_suffix, "_vm") == 0))
+				msg = rewriteVisibilityMap(old_file, new_file, true);
+			else
+				msg = linkFile(old_file, new_file);
+
+			if (msg)
 				pg_fatal("error while creating link for relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n",
 						 map->nspname, map->relname, old_file, new_file, msg);
 		}
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to