Alvaro Herrera escribió:

> As it turns out, I have a patched slru.c that adds a new function to
> verify whether a page exists on disk.  I created this for the commit
> timestamp module, for the BDR branch, but I think it's what we need
> here.

Here's a patch that should fix the problem.  Jesse, if you're able to
test it, please give it a run and let me know if it works for you.  I
was able to upgrade an installation exhibiting a problem that should be
the same as yours.

-- 
Álvaro Herrera                http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services
*** a/src/backend/access/transam/multixact.c
--- b/src/backend/access/transam/multixact.c
***************
*** 1719,1724 **** ZeroMultiXactMemberPage(int pageno, bool writeXlog)
--- 1719,1756 ----
  }
  
  /*
+  * After a binary upgrade from <= 9.2, the pg_multixact/offset SLRU area might
+  * contain files that are shorter than necessary; this would occur if the old
+  * installation had used multixacts beyond the first page (files cannot be
+  * copied, because the on-disk representation is different).  pg_upgrade would
+  * update pg_control to set the next offset value to be at that position, so
+  * that tuples marked as locked by such MultiXacts would be seen as visible
+  * without having to consult multixact.  However, trying to create or use a new
+  * MultiXactId would result in an error because the page on which the new value
+  * would reside does not exist.  This routine is in charge of creating such
+  * pages.
+  */
+ static void
+ MaybeExtendOffsetSlru(void)
+ {
+ 	int			pageno;
+ 
+ 	pageno = MultiXactIdToOffsetPage(MultiXactState->nextMXact);
+ 
+ 	LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE);
+ 
+ 	if (!SimpleLruDoesPhysicalPageExist(MultiXactOffsetCtl, pageno))
+ 	{
+ 		int		slotno;
+ 
+ 		slotno = ZeroMultiXactOffsetPage(pageno, false);
+ 		SimpleLruWritePage(MultiXactOffsetCtl, slotno);
+ 	}
+ 
+ 	LWLockRelease(MultiXactOffsetControlLock);
+ }
+ 
+ /*
   * This must be called ONCE during postmaster or standalone-backend startup.
   *
   * StartupXLOG has already established nextMXact/nextOffset by calling
***************
*** 1738,1743 **** StartupMultiXact(void)
--- 1770,1782 ----
  	int			entryno;
  	int			flagsoff;
  
+ 	/*
+ 	 * During a binary upgrade, make sure that the offsets SLRU is large
+ 	 * enough to contain the next value that would be created.
+ 	 */
+ 	if (IsBinaryUpgrade)
+ 		MaybeExtendOffsetSlru();
+ 
  	/* Clean up offsets state */
  	LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE);
  
*** a/src/backend/access/transam/slru.c
--- b/src/backend/access/transam/slru.c
***************
*** 563,568 **** SimpleLruWritePage(SlruCtl ctl, int slotno)
--- 563,612 ----
  	SlruInternalWritePage(ctl, slotno, NULL);
  }
  
+ /*
+  * Return whether the given page exists on disk.
+  *
+  * A false return means that either the file does not exist, or that it's not
+  * large enough to contain the given page.
+  */
+ bool
+ SimpleLruDoesPhysicalPageExist(SlruCtl ctl, int pageno)
+ {
+ 	int			segno = pageno / SLRU_PAGES_PER_SEGMENT;
+ 	int			rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
+ 	int			offset = rpageno * BLCKSZ;
+ 	char		path[MAXPGPATH];
+ 	int			fd;
+ 	bool		result;
+ 	off_t		endpos;
+ 
+ 	SlruFileName(ctl, path, segno);
+ 
+ 	fd = OpenTransientFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
+ 	if (fd < 0)
+ 	{
+ 		/* expected: file doesn't exist */
+ 		if (errno == ENOENT)
+ 			return false;
+ 
+ 		/* report error normally */
+ 		slru_errcause = SLRU_OPEN_FAILED;
+ 		slru_errno = errno;
+ 		SlruReportIOError(ctl, pageno, 0);
+ 	}
+ 
+ 	if ((endpos = lseek(fd, 0, SEEK_END)) < 0)
+ 	{
+ 		slru_errcause = SLRU_OPEN_FAILED;
+ 		slru_errno = errno;
+ 		SlruReportIOError(ctl, pageno, 0);
+ 	}
+ 
+ 	result = endpos >= (off_t) (offset + BLCKSZ);
+ 
+ 	CloseTransientFile(fd);
+ 	return result;
+ }
  
  /*
   * Physical read of a (previously existing) page into a buffer slot
*** a/src/include/access/slru.h
--- b/src/include/access/slru.h
***************
*** 145,150 **** extern int SimpleLruReadPage_ReadOnly(SlruCtl ctl, int pageno,
--- 145,151 ----
  extern void SimpleLruWritePage(SlruCtl ctl, int slotno);
  extern void SimpleLruFlush(SlruCtl ctl, bool checkpoint);
  extern void SimpleLruTruncate(SlruCtl ctl, int cutoffPage);
+ extern bool SimpleLruDoesPhysicalPageExist(SlruCtl ctl, int pageno);
  
  typedef bool (*SlruScanCallback) (SlruCtl ctl, char *filename, int segpage,
  											  void *data);
-- 
Sent via pgsql-bugs mailing list (pgsql-bugs@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-bugs

Reply via email to