Once more, with feeling.

On Sun, 2007-04-01 at 12:11 +0100, Simon Riggs wrote:
> Resending...
> 
> -------- Forwarded Message --------
> From: Simon Riggs <[EMAIL PROTECTED]>
> To: pgsql-patches@postgresql.org
> Cc: pgsql-hackers@postgresql.org
> Subject: Deferred Transactions, Transaction Guarantee and COMMIT without
> waiting
> Date: Sat, 31 Mar 2007 22:09:23 +0100
> 
> Here's the next version (v10) of the patch, ready for review.
> 
> I've struggled with what to call all of the new concepts inherent in
> this patch, but I think I've got something now. COMMIT NOWAIT doesn't
> describe this feature, since there is no command of that name in the
> implementation that we've agreed. So what's it called?
> 
> This patch implements a feature called Deferred Fsync Transactions, or
> Deferred Transactions for short. The idea is we don't fsync at commit,
> but we defer that briefly, letting a new WAL Writer process perform the
> fsync at regular intervals of 50-250 ms. It's a much safer version of
> fsync = off, yet retaining most of the speed *and* it can be used for
> some transactions and not others.
> 
> Deferred Transactions provide considerable additional performance in a
> range of circumstances, but at the cost that a handful of committed
> transactions will definitely be lost if the server crashes.
> 
> To remind everybody of the risks, this feature is enabled using a
> parameter named transaction_guarantee. The default mode is "on"
> reminding us that PostgreSQL provides a strong default guarantee that if
> a transaction is committed, it stays committed. If you prefer
> performance at the risk of data loss, then you can opt to relax the
> standard level of protection and request transaction_guarantee = off
> 
> The data loss isn't random, nor is it indeterminate, but it is certain.
> We will say that a transaction is committed, but it isn't until it has
> reached disk. So all transactions that have reached the commit point,
> but not yet reached disk, will certainly be lost - probably best to use a
> guideline figure of 1000 transactions when assessing the business
> impact of such loss. The risk is very similar to normal transactions
> waiting to write to disk, but the important difference is we will have
> replied to the client that the transaction is safely on disk, when it is
> not.
> 
> Relaxing the transaction guarantee in this way is completely
> controllable by users. Guaranteed and Unguaranteed transactions can
> co-exist safely without increased risk for more important data.
> 
> v10 fixes a number of lurking bugs present in v9. There are no
> outstanding bugs, after a range of tests, though more are needed.
> 
> wal_writer_delay > 0 ms enables this feature at server start (the default, 0, leaves it disabled).
> Once enabled, individual sessions or transactions may request
> transaction_guarantee = off, or it may be set for the whole server.
> 
> It also provides additional instrumentation, with new parameters:
> trace_commit = on will show details of each commit (high volume)
> trace_bg_flush = on will give more frequent summaries of monitoring data
> 
> The patch needs a reviewer's guide, which I'll write next week.
> 
> patch -p0 < transaction_guarantee.v10.patch
> with additional files:
> src/backend/postmaster/walwriter.c
> src/include/postmaster/walwriter.h

-- 
  Simon Riggs             
  EnterpriseDB   http://www.enterprisedb.com

Attachment: transaction_guarantee.v10.patch.gz
Description: GNU Zip compressed data

/*-------------------------------------------------------------------------
 *
 * walwriter.c
 *
 *	PostgreSQL WAL Writer
 *
 *	Initial author: Simon Riggs		[EMAIL PROTECTED]
 *
 * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/backend/postmaster/WALWriter.c,v 1.29 2007/02/10 14:58:54 petere Exp $
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <fcntl.h>
#include <signal.h>
#include <time.h>
#include <sys/time.h>
#include <sys/wait.h>
#include <unistd.h>

#include "access/xact.h"
#include "access/xlog.h"
#include "libpq/pqsignal.h"
#include "miscadmin.h"
#include "postmaster/fork_process.h"
#include "postmaster/postmaster.h"
#include "postmaster/walwriter.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/pg_shmem.h"
#include "storage/pmsignal.h"
#include "storage/proc.h"
#include "storage/procarray.h"
#include "storage/sinval.h"
#include "utils/guc.h"
#include "utils/memutils.h"
#include "utils/ps_status.h"


/* ----------
 * Timer definitions.
 * ----------
 */
#define WALWRITER_RESTART_INTERVAL 2		/* Minimum number of seconds between
											 * two launch attempts made by
											 * StartWALWriter(). */
/* ----------
 * Local data
 * ----------
 */
/* time() of the most recent launch attempt recorded by StartWALWriter() */
static time_t last_WALWriter_start_time;

/*
 * Working memory context, created in WALWriterMain() and reset there during
 * error recovery.
 */
static MemoryContext walwriter_cxt;

/*
 * Flags set by interrupt handlers for later service in the main loop.
 *
 * got_SIGHUP is set by WALWriterSigHupHandler() and makes the main loop
 * re-read the configuration file; wakened is set by WALWriter_waken()
 * (SIGUSR1) and makes the main loop skip its next sleep.
 */
static volatile sig_atomic_t got_SIGHUP = false;
static volatile sig_atomic_t wakened = false;

/* ----------
 * Local function forward declarations
 * ----------
 */
#ifdef EXEC_BACKEND
static pid_t WALWriter_forkexec(void);
#endif

/*
 * Pause between flush cycles of the main loop (presumably the value of the
 * wal_writer_delay setting, in milliseconds -- TODO confirm against the GUC
 * definition).  Also acts as the on/off switch: WALWriterActive() in
 * walwriter.h is true only when this is greater than zero.
 */
int WALWriterDelay = 0;

NON_EXEC_STATIC void WALWriterMain(int argc, char *argv[]);
static void WALWriter_exit(SIGNAL_ARGS);
static void WALWriterSigHupHandler(SIGNAL_ARGS);
static void WALWriter_waken(SIGNAL_ARGS);
static void WALWriter_MainLoop(void);

/* ------------------------------------------------------------
 * Public functions called from postmaster follow
 * ------------------------------------------------------------
 */

/*
 * StartWALWriter
 *
 *	Invoked by the postmaster, both at startup and whenever a previous
 *	WALWriter has died, to try to launch a new WALWriter process.
 *
 *	Returns the child's PID on success, or 0 when nothing was started
 *	(WALWriter not wanted, launched too recently, or fork failure).
 *
 *	Note: a 0 result is not final; the postmaster main loop calls us again.
 */
int
StartWALWriter(void)
{
	time_t		now;
	pid_t		child_pid;

	/* Nothing to launch when the WALWriter is switched off. */
	if (!WALWriterActive())
		return 0;

	/*
	 * Rate-limit launch attempts so that a WALWriter which dies right after
	 * starting cannot cause a tight respawn loop.  Giving up here is
	 * harmless because the postmaster main loop will retry later.
	 */
	now = time(NULL);
	if ((unsigned int) (now - last_WALWriter_start_time) <
		(unsigned int) WALWRITER_RESTART_INTERVAL)
		return 0;
	last_WALWriter_start_time = now;

#ifdef EXEC_BACKEND
	child_pid = WALWriter_forkexec();
#else
	child_pid = fork_process();
#endif

	if (child_pid == -1)
	{
		ereport(LOG,
				(errmsg("could not fork WALWriter: %m")));
		return 0;
	}

#ifndef EXEC_BACKEND
	if (child_pid == 0)
	{
		/* We are the child: drop what we inherited from the postmaster. */
		ClosePostmasterPorts(false);	/* its sockets */
		on_exit_reset();				/* its on-exit routines */

		WALWriterMain(0, NULL);

		/* shouldn't get here */
		return 0;
	}
#endif

	/* Parent: hand the new child's PID back to the caller. */
	return (int) child_pid;
}

/* ------------------------------------------------------------
 * Local functions called by WALWriter follow
 * ------------------------------------------------------------
 */


#ifdef EXEC_BACKEND

/*
 * WALWriter_forkexec() -
 *
 * Build the argument vector for a WALWriter subprocess and hand it to
 * postmaster_forkexec(), returning whatever that call returns.
 */
static pid_t
WALWriter_forkexec(void)
{
	char	   *args[10];
	int			nargs;

	args[0] = "postgres";
	args[1] = "--forkwalwriter";
	args[2] = NULL;				/* filled in by postmaster_forkexec */
	nargs = 3;

	/* Terminate the vector; the terminator is not counted in nargs. */
	args[nargs] = NULL;
	Assert(nargs < lengthof(args));

	return postmaster_forkexec(nargs, args);
}
#endif   /* EXEC_BACKEND */


/*
 * WALWriterMain
 *
 *	Entry point of the WAL writer process.  Sets up signal handling, the
 *	ps display, a private memory context and an error-recovery point, then
 *	runs WALWriter_MainLoop() and exits with status 0 once the loop returns.
 *
 *	The argc/argv parameters are valid only in EXEC_BACKEND case.  However,
 *	since we don't use 'em, it hardly matters...
 */
NON_EXEC_STATIC void
WALWriterMain(int argc, char *argv[])
{
	sigjmp_buf	local_sigjmp_buf;
	IsUnderPostmaster = true;	/* we are a postmaster subprocess now */

	MyProcPid = getpid();		/* reset MyProcPid */

	/*
	 * If possible, make this process a group leader, so that the postmaster
	 * can signal any child processes too.
	 */
#ifdef HAVE_SETSID
	if (setsid() < 0)
		elog(FATAL, "setsid() failed: %m");
#endif

	/*
	 * Install our signal dispositions.  Only SIGHUP, SIGUSR1 and SIGQUIT get
	 * handlers of our own; the rest are either ignored or left at their
	 * defaults.
	 */
	pqsignal(SIGHUP, WALWriterSigHupHandler);	/* re-read config file */
	pqsignal(SIGINT, SIG_IGN);
	pqsignal(SIGTERM, SIG_IGN);				/* Not executing transactions */
	pqsignal(SIGQUIT, WALWriter_exit);		/* exit immediately */
	pqsignal(SIGALRM, SIG_IGN);
	pqsignal(SIGPIPE, SIG_IGN);
	pqsignal(SIGUSR1, WALWriter_waken);		/* makes main loop skip its sleep */
	pqsignal(SIGUSR2, SIG_IGN);
	pqsignal(SIGCHLD, SIG_DFL);
	pqsignal(SIGTTIN, SIG_DFL);
	pqsignal(SIGTTOU, SIG_DFL);
	pqsignal(SIGCONT, SIG_DFL);
	pqsignal(SIGWINCH, SIG_DFL);

	/*
	 * Identify myself via ps
	 */
	init_ps_display("WAL writer process", "", "", "");

	SetProcessingMode(InitProcessing);

	/* Early initialization */
	BaseInit();

	/*
	 * Create a per-backend PGPROC struct in shared memory, except in the
	 * EXEC_BACKEND case where this was done in SubPostmasterMain. We must do
	 * this before we can use LWLocks (and in the EXEC_BACKEND case we already
	 * had to do some stuff with LWLocks).
	 */
#ifndef EXEC_BACKEND
	InitAuxiliaryProcess();
#endif

	/*
	 * Create a memory context that we will do all our work in.  We do this so
	 * that we can reset the context during error recovery and thereby avoid
	 * possible memory leaks.
	 */
	walwriter_cxt = AllocSetContextCreate(TopMemoryContext,
										   "WAL Writer",
										   ALLOCSET_DEFAULT_MINSIZE,
										   ALLOCSET_DEFAULT_INITSIZE,
										   ALLOCSET_DEFAULT_MAXSIZE);
	MemoryContextSwitchTo(walwriter_cxt);

	/*
	 * If an exception is encountered, processing resumes here.
	 *
	 * sigsetjmp() returns 0 on the initial call, so the recovery block below
	 * is skipped the first time through and runs only after a jump back to
	 * local_sigjmp_buf.
	 *
	 * This code is heavily based on bgwriter.c, q.v.
	 */
	if (sigsetjmp(local_sigjmp_buf, 1) != 0)
	{
		/* since not using PG_TRY, must reset error stack by hand */
		error_context_stack = NULL;

		/* Prevents interrupts while cleaning up */
		HOLD_INTERRUPTS();

		/* Report the error to the server log */
		EmitErrorReport();

		/*
		 * These operations are really just a minimal subset of
		 * AbortTransaction().  We don't have very many resources to worry
		 * about, but we do have LWLocks.
		 */
		LWLockReleaseAll();

		/*
		 * Now return to normal top-level context and clear ErrorContext for
		 * next time.
		 */
		MemoryContextSwitchTo(walwriter_cxt);
		FlushErrorState();

		/* Flush any leaked data in the top-level context */
		MemoryContextResetAndDeleteChildren(walwriter_cxt);

		/*
		 * NOTE(review): a comment about resetting pgstat data used to sit
		 * here with no code under it; it looks like a leftover from the code
		 * this was modelled on.  No statistics cleanup is performed.
		 */

		/* Now we can allow interrupts again */
		RESUME_INTERRUPTS();

		/*
		 * Sleep at least 1 second after any error.  We don't want to be
		 * filling the error logs as fast as we can.
		 */
		pg_usleep(1000000L);
	}

	/* We can now handle ereport(ERROR) */
	PG_exception_stack = &local_sigjmp_buf;

	/*
	 * NOTE(review): everything from here down is also reached after each
	 * pass through the recovery block above, so InitXLOGAccess() and the
	 * "WAL writer started" message are repeated after every recovered
	 * error -- confirm that this is intended.
	 */
	InitXLOGAccess();

	ereport(LOG,
			(errmsg("WAL writer started")));

	PG_SETMASK(&UnBlockSig);

	WALWriter_MainLoop();

	/* The main loop only returns when it is time for this process to stop. */
	ereport(LOG,
			(errmsg("WAL writer shutting down")));

	exit(0);
}

/*
 * SIGQUIT signal handler for WALWriter process.
 *
 * Blocks further signals and terminates the process at once with exit
 * status 0; no attempt is made to flush anything first.
 *
 * NOTE(review): exit() is not on the POSIX list of async-signal-safe
 * functions (_exit() is) -- confirm that calling it from a handler is
 * acceptable here.
 */
static void
WALWriter_exit(SIGNAL_ARGS)
{
	PG_SETMASK(&BlockSig);

	/*
	 * For now, we just nail the doors shut and get out of town. 
	 */
	exit(0);
}

/*
 * SIGHUP: set flag to re-read config file at next convenient time.
 *
 * The handler only records the request; WALWriter_MainLoop() notices
 * got_SIGHUP and calls ProcessConfigFile() itself.
 */
static void
WALWriterSigHupHandler(SIGNAL_ARGS)
{
	got_SIGHUP = true;
}

/*
 * SIGUSR1 signal handler for WALWriter process.
 *
 * Sets the wakened flag, which WALWriter_MainLoop() checks before sleeping:
 * when the flag is set the loop skips its sleep and starts the next flush
 * cycle straight away.
 */
static void
WALWriter_waken(SIGNAL_ARGS)
{
	wakened = true;
}

/*
 * WALWriter_MainLoop
 *
 *	Body of the WAL writer process.  Each cycle:
 *	  1. re-reads the configuration file if a SIGHUP has arrived, and leaves
 *		 the loop if the new settings switch the WAL writer off;
 *	  2. calls FlushAnyDeferredFsyncTransactions();
 *	  3. sleeps for WALWriterDelay milliseconds, unless a SIGHUP or SIGUSR1
 *		 arrived while step 2 was running.
 *
 *	The loop also ends once PostmasterIsAlive() reports that the postmaster
 *	has gone away.  Returning from here makes the caller shut the process
 *	down.
 */
static void
WALWriter_MainLoop(void)
{
	do
	{
		/* Check for config update */
		if (got_SIGHUP)
		{
			got_SIGHUP = false;
			ProcessConfigFile(PGC_SIGHUP);
			if (!WALWriterActive())
				break;			/* user wants us to shut down */
		}

		/*
		 * Clear the wakeup flag before doing the work, so that a SIGUSR1
		 * arriving during the flush is still visible in the test below.
		 */
		wakened = false;

		/*
		 * Do what we're here for and keep doing it as long as we're busy
		 */
		FlushAnyDeferredFsyncTransactions(true, false);

		/*
		 * Nap until the next cycle.  WALWriterDelay is expressed in
		 * milliseconds whereas pg_usleep() takes microseconds, so the value
		 * must be scaled; passing it unscaled made the loop spin a thousand
		 * times faster than configured.
		 */
		if (!got_SIGHUP && !wakened)
			pg_usleep(WALWriterDelay * 1000L);

	} while (PostmasterIsAlive(true));
}

/*-------------------------------------------------------------------------
 *
 * walwriter.h
 *		WALWriter definitions
 *
 * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/backend/postmaster/WALWriter.c,v 1.29 2007/02/10 14:58:54 petere Exp $
 *
 *-------------------------------------------------------------------------
 */

#ifndef WALWRITER_H
#define WALWRITER_H

/*
 * Pause between WAL writer flush cycles; defined in walwriter.c.  A value of
 * zero or less means that no WAL writer process is wanted.
 */
extern int WALWriterDelay;

/* True when the current setting asks for a WAL writer process to run. */
#define WALWriterActive() (WALWriterDelay > 0)

/*
 * Called by the postmaster to launch the WAL writer.  Returns the PID of the
 * new child process, or 0 if none was started.
 */
extern int StartWALWriter(void);

#endif   /* WALWRITER_H */
---------------------------(end of broadcast)---------------------------
TIP 7: You can help support the PostgreSQL project by donating at

                http://www.postgresql.org/about/donate

Reply via email to