On Tue, 2010-04-06 at 10:22 +0100, Simon Riggs wrote:

> Initial patch. I will be testing over next day. No commit before at
> least midday on Wed 7 Apr.

Various previous discussions sidelined a very important point: what
exactly does it mean to "start recovery from a shutdown checkpoint"?

If standby_mode is enabled and there is no source of WAL, then we get a
stream of messages saying

LOG:  record with zero length at 0/C000088
...

but most importantly we never get to the main recovery loop, so Hot
Standby never gets to start at all. We can't keep retrying the request
for WAL and at the same time enter the main redo loop, which executes
lots of things that expect non-NULL pointers, using a NULL xlog pointer.

What we are asking for here is a completely new state: the database is
not "in recovery" - by definition there is nothing at all to recover. 

The following patch adds "Snapshot Mode", a very simple variation on the
existing code - emphasis on the "simple":

LOG:  entering snapshot mode
LOG:  record with zero length at 0/C000088
LOG:  consistent recovery state reached at 0/C000088
LOG:  database system is ready to accept read only connections

This mode does *not* continually check to see if new WAL files have been
added. The Startup process just sits and waits, with backends allowed to
connect. If a trigger file is specified, then we can leave recovery once
it appears. Otherwise the Startup process just sits doing nothing.

There's possibly an argument for inventing some more special modes where
we do allow read only connections but don't start the bgwriter. I don't
personally wish to do this at this stage of the release cycle. The
attached patch is non-invasive and safe and I want to leave it at that.

I will be committing later today, unless there are major objections, but
I ask you to read the patch before you sharpen your pen. It's simple.

-- 
 Simon Riggs           www.2ndQuadrant.com
diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c
index e2566a4..365cd17 100644
--- a/src/backend/access/transam/twophase.c
+++ b/src/backend/access/transam/twophase.c
@@ -1719,6 +1719,101 @@ PrescanPreparedTransactions(TransactionId **xids_p, int *nxids_p)
 }
 
 /*
+ * StandbyRecoverPreparedTransactions
+ *
+ * Scan the pg_twophase directory and set up all the required information to
+ * allow standby queries to treat prepared transactions as still active.
+ * This is never called at the end of recovery - we use
+ * RecoverPreparedTransactions() at that point.
+ *
+ * Currently we simply call SubTransSetParent() for any subxids of prepared
+ * transactions, recording the prepared transaction's XID as the parent of
+ * each of its subxids.  can_overwrite is passed through unchanged to
+ * SubTransSetParent().
+ *
+ * State files for transactions that have already committed or aborted, and
+ * files that cannot be read or whose header XID does not match the file
+ * name, are removed with a WARNING.
+ */
+void
+StandbyRecoverPreparedTransactions(bool can_overwrite)
+{
+	DIR		   *cldir;
+	struct dirent *clde;
+
+	cldir = AllocateDir(TWOPHASE_DIR);
+	while ((clde = ReadDir(cldir, TWOPHASE_DIR)) != NULL)
+	{
+		/* State-file names are the XID as 8 upper-case hex digits */
+		if (strlen(clde->d_name) == 8 &&
+			strspn(clde->d_name, "0123456789ABCDEF") == 8)
+		{
+			TransactionId xid;
+			char	   *buf;
+			TwoPhaseFileHeader *hdr;
+			TransactionId *subxids;
+			int			i;
+
+			xid = (TransactionId) strtoul(clde->d_name, NULL, 16);
+
+			/* Already processed? */
+			if (TransactionIdDidCommit(xid) || TransactionIdDidAbort(xid))
+			{
+				ereport(WARNING,
+						(errmsg("removing stale two-phase state file \"%s\"",
+								clde->d_name)));
+				RemoveTwoPhaseFile(xid, true);
+				continue;
+			}
+
+			/* Read and validate file */
+			buf = ReadTwoPhaseFile(xid, true);
+			if (buf == NULL)
+			{
+				ereport(WARNING,
+					  (errmsg("removing corrupt two-phase state file \"%s\"",
+							  clde->d_name)));
+				RemoveTwoPhaseFile(xid, true);
+				continue;
+			}
+
+			/* Deconstruct header */
+			hdr = (TwoPhaseFileHeader *) buf;
+			if (!TransactionIdEquals(hdr->xid, xid))
+			{
+				ereport(WARNING,
+					  (errmsg("removing corrupt two-phase state file \"%s\"",
+							  clde->d_name)));
+				RemoveTwoPhaseFile(xid, true);
+				pfree(buf);
+				continue;
+			}
+
+			/*
+			 * Examine subtransaction XIDs ... they should all follow main
+			 * XID.  The subxid array starts at the first MAXALIGN'd offset
+			 * after the header.
+			 */
+			subxids = (TransactionId *)
+				(buf + MAXALIGN(sizeof(TwoPhaseFileHeader)));
+			for (i = 0; i < hdr->nsubxacts; i++)
+			{
+				TransactionId subxid = subxids[i];
+
+				Assert(TransactionIdFollows(subxid, xid));
+
+				/* First argument is the child XID, second is its parent */
+				SubTransSetParent(subxid, xid, can_overwrite);
+			}
+
+			/* Done with this state file; release it before the next entry */
+			pfree(buf);
+		}
+	}
+	FreeDir(cldir);
+}
+
+/*
  * RecoverPreparedTransactions
  *
  * Scan the pg_twophase directory and reload shared-memory state for each
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 3000ab7..d26b369 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -180,6 +180,7 @@ static TimestampTz recoveryLastXTime = 0;
 
 /* options taken from recovery.conf for XLOG streaming */
 static bool StandbyMode = false;
+static bool SnapshotMode = false;
 static char *PrimaryConnInfo = NULL;
 char	   *TriggerFile = NULL;
 
@@ -3845,7 +3846,7 @@ next_record_is_invalid:
 	}
 
 	/* In standby-mode, keep trying */
-	if (StandbyMode)
+	if (StandbyMode && !SnapshotMode)
 		goto retry;
 	else
 		return NULL;
@@ -5163,10 +5164,15 @@ readRecoveryCommandFile(void)
 	if (StandbyMode)
 	{
 		if (PrimaryConnInfo == NULL && recoveryRestoreCommand == NULL)
-			ereport(WARNING,
+		{
+			if (XLogRequestRecoveryConnections)
+				SnapshotMode = true;
+			else
+				ereport(WARNING,
 					(errmsg("recovery command file \"%s\" specified neither primary_conninfo nor restore_command",
 							RECOVERY_COMMAND_FILE),
 					 errhint("The database server will regularly poll the pg_xlog subdirectory to check for files placed there.")));
+		}
 	}
 	else
 	{
@@ -5643,12 +5649,15 @@ StartupXLOG(void)
 
 	if (InArchiveRecovery)
 	{
-		if (StandbyMode)
+		if (SnapshotMode)
+			ereport(LOG,
+					(errmsg("entering snapshot mode")));
+		else if (StandbyMode)
 			ereport(LOG,
 					(errmsg("entering standby mode")));
 		else if (recoveryTarget == RECOVERY_TARGET_XID)
 			ereport(LOG,
-					 (errmsg("starting point-in-time recovery to XID %u",
+					(errmsg("starting point-in-time recovery to XID %u",
 						 recoveryTargetXid)));
 		else if (recoveryTarget == RECOVERY_TARGET_TIME)
 			ereport(LOG,
@@ -5880,6 +5889,33 @@ StartupXLOG(void)
 			StartupMultiXact();
 
 			ProcArrayInitRecoveryInfo(oldestActiveXID);
+
+			/*
+			 * If we're beginning at a shutdown checkpoint, we know that
+			 * nothing was running on the master at this point. So fake-up
+			 * an empty running-xacts record and use that here and now.
+			 * Recover additional standby state for prepared transactions.
+			 */
+			if (wasShutdown)
+			{
+				RunningTransactionsData running;
+
+				/*
+				 * Construct a RunningTransactions snapshot representing a shut
+				 * down server, with only prepared transactions still alive.
+				 * We're never overflowed at this point because all subxids
+				 * are listed with their parent prepared transactions.
+				 */
+				running.xcnt = nxids;
+				running.subxid_overflow = false;
+				running.nextXid = checkPoint.nextXid;
+				running.oldestRunningXid = oldestActiveXID;
+				running.xids = xids;
+
+				ProcArrayApplyRecoveryInfo(&running);
+
+				StandbyRecoverPreparedTransactions(false);
+			}
 		}
 
 		/* Initialize resource managers */
@@ -6070,6 +6106,40 @@ StartupXLOG(void)
 							 timestamptz_to_str(recoveryLastXTime))));
 			InRedo = false;
 		}
+		else if (SnapshotMode)
+		{
+			if (standbyState == STANDBY_SNAPSHOT_READY &&
+				IsUnderPostmaster &&
+				XLByteLE(minRecoveryPoint, EndRecPtr) &&
+				XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
+			{
+				SetForwardFsyncRequests();
+				SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
+
+				ereport(LOG,
+						(errmsg("consistent recovery state reached at %X/%X",
+							EndRecPtr.xlogid, EndRecPtr.xrecoff)));
+
+				pg_usleep(2000000L); 	//wait for Postmaster to absorb signal
+
+				SendPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT);
+
+				/*
+				 * Cue the elevator music...
+				 */
+				while (true)
+				{
+					pg_usleep(5000000L);
+					if (CheckForStandbyTrigger())
+						break;
+
+				}
+			}
+			else
+				ereport(LOG,
+						(errmsg("unable to reach consistent recovery state at %X/%X",
+									EndRecPtr.xlogid, EndRecPtr.xrecoff)));
+		}
 		else
 		{
 			/* there are no WAL records following the checkpoint */
@@ -7592,13 +7662,34 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
 		if (standbyState != STANDBY_DISABLED)
 			CheckRequiredParameterValues(checkPoint);
 
+		/*
+		 * If we're beginning at a shutdown checkpoint, we know that
+		 * nothing was running on the master at this point. So fake-up
+		 * an empty running-xacts record and use that here and now.
+		 * Recover additional standby state for prepared transactions.
+		 */
 		if (standbyState >= STANDBY_INITIALIZED)
 		{
+			TransactionId *xids;
+			int			nxids;
+			TransactionId oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
+			RunningTransactionsData running;
+
 			/*
-			 * Remove stale transactions, if any.
+			 * Construct a RunningTransactions snapshot representing a shut
+			 * down server, with only prepared transactions still alive.
+			 * We're never overflowed at this point because all subxids
+			 * are listed with their parent prepared transactions.
 			 */
-			ExpireOldKnownAssignedTransactionIds(checkPoint.nextXid);
-			StandbyReleaseOldLocks(checkPoint.nextXid);
+			running.xcnt = nxids;
+			running.subxid_overflow = false;
+			running.nextXid = checkPoint.nextXid;
+			running.oldestRunningXid = oldestActiveXID;
+			running.xids = xids;
+
+			ProcArrayApplyRecoveryInfo(&running);
+
+			StandbyRecoverPreparedTransactions(true);
 		}
 
 		/* ControlFile->checkPointCopy always tracks the latest ckpt XID */
diff --git a/src/include/access/twophase.h b/src/include/access/twophase.h
index 5d9d6cf..46f5fca 100644
--- a/src/include/access/twophase.h
+++ b/src/include/access/twophase.h
@@ -44,6 +44,7 @@ extern bool StandbyTransactionIdIsPrepared(TransactionId xid);
 
 extern TransactionId PrescanPreparedTransactions(TransactionId **xids_p,
 							int *nxids_p);
+extern void StandbyRecoverPreparedTransactions(bool can_overwrite);
 extern void RecoverPreparedTransactions(void);
 
 extern void RecreateTwoPhaseFile(TransactionId xid, void *content, int len);
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to