Magnus Hagander wrote:
> 2010/2/18 Fujii Masao <masao.fu...@gmail.com>:
>> On Thu, Feb 18, 2010 at 7:04 PM, Heikki Linnakangas
>> <heikki.linnakan...@enterprisedb.com> wrote:
>>> Magnus Hagander wrote:
>>>> O_DIRECT helps us when we're not going to read the file again, because
>>>> we don't waste cache on it. If we are, which is the case here, it
>>>> should be really bad for performance, since we actually have to do a
>>>> physical read.
>>>>
>>>> Incidentally, that should also apply to general WAL when archive_mdoe
>>>> is on. Do we optimize for that?
>>> Hmm, no we don't. We do take that into account so that we refrain from
>>> issuing posix_fadvice(DONTNEED) if archive_mode is on, but we don't
>>> disable O_DIRECT. Maybe we should..
>> Since the performance of WAL write is more important than that of WAL
>> archiving in general, that optimization might offer little benefit.
> 
> Well, it's going to make the process that reads the WAL cause actual
> physical I/O... That'll take a chunk out of your total available I/O,
> which is likely to push you to the limit of your I/O capacity much
> quicker.

Right, doesn't seem sensible, though it would be nice to see a benchmark
on that.

Here's a patch to disable O_DIRECT when archiving or streaming is
enabled. This is pretty hard to test, so any extra eyeballs would be nice..

-- 
  Heikki Linnakangas
  EnterpriseDB   http://www.enterprisedb.com
? GNUmakefile
? a.patch
? b
? backend
? config.log
? config.status
? config.status.lineno
? configure.lineno
? gin-splay-1.patch
? gin-splay-2.patch
? gin-splay-3.patch
? include
? libpq
? log_unlogged_op_0115.patch
? md-1.c
? md-1.patch
? replication
? temp-file-resowner-2.patch
? contrib/pgbench/.deps
? contrib/pgbench/fsynctest
? contrib/pgbench/fsynctest.c
? contrib/pgbench/fsynctestfile
? contrib/pgbench/pgbench
? contrib/spi/.deps
? doc/src/sgml/HTML.index
? doc/src/sgml/bookindex.sgml
? doc/src/sgml/features-supported.sgml
? doc/src/sgml/features-unsupported.sgml
? doc/src/sgml/version.sgml
? src/Makefile.global
? src/backend/aaa.patch
? src/backend/postgres
? src/backend/access/common/.deps
? src/backend/access/gin/.deps
? src/backend/access/gist/.deps
? src/backend/access/hash/.deps
? src/backend/access/heap/.deps
? src/backend/access/index/.deps
? src/backend/access/nbtree/.deps
? src/backend/access/transam/.deps
? src/backend/bootstrap/.deps
? src/backend/catalog/.deps
? src/backend/commands/.deps
? src/backend/executor/.deps
? src/backend/foreign/.deps
? src/backend/foreign/dummy/.deps
? src/backend/foreign/postgresql/.deps
? src/backend/lib/.deps
? src/backend/libpq/.deps
? src/backend/main/.deps
? src/backend/nodes/.deps
? src/backend/optimizer/geqo/.deps
? src/backend/optimizer/path/.deps
? src/backend/optimizer/plan/.deps
? src/backend/optimizer/prep/.deps
? src/backend/optimizer/util/.deps
? src/backend/parser/.deps
? src/backend/po/af.mo
? src/backend/po/cs.mo
? src/backend/po/hr.mo
? src/backend/po/hu.mo
? src/backend/po/it.mo
? src/backend/po/ko.mo
? src/backend/po/nb.mo
? src/backend/po/nl.mo
? src/backend/po/pl.mo
? src/backend/po/ro.mo
? src/backend/po/ru.mo
? src/backend/po/sk.mo
? src/backend/po/sl.mo
? src/backend/po/sv.mo
? src/backend/po/zh_CN.mo
? src/backend/po/zh_TW.mo
? src/backend/port/.deps
? src/backend/postmaster/.deps
? src/backend/regex/.deps
? src/backend/replication/.deps
? src/backend/replication/libpqwalreceiver/.deps
? src/backend/rewrite/.deps
? src/backend/snowball/.deps
? src/backend/snowball/snowball_create.sql
? src/backend/storage/buffer/.deps
? src/backend/storage/file/.deps
? src/backend/storage/freespace/.deps
? src/backend/storage/ipc/.deps
? src/backend/storage/large_object/.deps
? src/backend/storage/lmgr/.deps
? src/backend/storage/page/.deps
? src/backend/storage/smgr/.deps
? src/backend/tcop/.deps
? src/backend/tsearch/.deps
? src/backend/utils/.deps
? src/backend/utils/probes.h
? src/backend/utils/adt/.deps
? src/backend/utils/cache/.deps
? src/backend/utils/error/.deps
? src/backend/utils/fmgr/.deps
? src/backend/utils/hash/.deps
? src/backend/utils/init/.deps
? src/backend/utils/mb/.deps
? src/backend/utils/mb/Unicode/BIG5.TXT
? src/backend/utils/mb/Unicode/CP950.TXT
? src/backend/utils/mb/conversion_procs/conversion_create.sql
? src/backend/utils/mb/conversion_procs/ascii_and_mic/.deps
? src/backend/utils/mb/conversion_procs/cyrillic_and_mic/.deps
? src/backend/utils/mb/conversion_procs/euc2004_sjis2004/.deps
? src/backend/utils/mb/conversion_procs/euc_cn_and_mic/.deps
? src/backend/utils/mb/conversion_procs/euc_jis_2004_and_shift_jis_2004/.deps
? src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/.deps
? src/backend/utils/mb/conversion_procs/euc_kr_and_mic/.deps
? src/backend/utils/mb/conversion_procs/euc_tw_and_big5/.deps
? src/backend/utils/mb/conversion_procs/latin2_and_win1250/.deps
? src/backend/utils/mb/conversion_procs/latin_and_mic/.deps
? src/backend/utils/mb/conversion_procs/utf8_and_ascii/.deps
? src/backend/utils/mb/conversion_procs/utf8_and_big5/.deps
? src/backend/utils/mb/conversion_procs/utf8_and_cyrillic/.deps
? src/backend/utils/mb/conversion_procs/utf8_and_euc2004/.deps
? src/backend/utils/mb/conversion_procs/utf8_and_euc_cn/.deps
? src/backend/utils/mb/conversion_procs/utf8_and_euc_jis_2004/.deps
? src/backend/utils/mb/conversion_procs/utf8_and_euc_jp/.deps
? src/backend/utils/mb/conversion_procs/utf8_and_euc_kr/.deps
? src/backend/utils/mb/conversion_procs/utf8_and_euc_tw/.deps
? src/backend/utils/mb/conversion_procs/utf8_and_gb18030/.deps
? src/backend/utils/mb/conversion_procs/utf8_and_gbk/.deps
? src/backend/utils/mb/conversion_procs/utf8_and_iso8859/.deps
? src/backend/utils/mb/conversion_procs/utf8_and_iso8859_1/.deps
? src/backend/utils/mb/conversion_procs/utf8_and_johab/.deps
? src/backend/utils/mb/conversion_procs/utf8_and_shift_jis_2004/.deps
? src/backend/utils/mb/conversion_procs/utf8_and_sjis/.deps
? src/backend/utils/mb/conversion_procs/utf8_and_sjis2004/.deps
? src/backend/utils/mb/conversion_procs/utf8_and_uhc/.deps
? src/backend/utils/mb/conversion_procs/utf8_and_win/.deps
? src/backend/utils/misc/.deps
? src/backend/utils/mmgr/.deps
? src/backend/utils/resowner/.deps
? src/backend/utils/sort/.deps
? src/backend/utils/time/.deps
? src/bin/initdb/.deps
? src/bin/initdb/initdb
? src/bin/initdb/po/ko.mo
? src/bin/initdb/po/pl.mo
? src/bin/initdb/po/ro.mo
? src/bin/initdb/po/sk.mo
? src/bin/initdb/po/sl.mo
? src/bin/initdb/po/zh_CN.mo
? src/bin/initdb/po/zh_TW.mo
? src/bin/pg_config/.deps
? src/bin/pg_config/pg_config
? src/bin/pg_config/po/cs.mo
? src/bin/pg_config/po/pl.mo
? src/bin/pg_config/po/sl.mo
? src/bin/pg_config/po/zh_CN.mo
? src/bin/pg_config/po/zh_TW.mo
? src/bin/pg_controldata/.deps
? src/bin/pg_controldata/pg_controldata
? src/bin/pg_controldata/po/cs.mo
? src/bin/pg_controldata/po/fa.mo
? src/bin/pg_controldata/po/hu.mo
? src/bin/pg_controldata/po/nb.mo
? src/bin/pg_controldata/po/pl.mo
? src/bin/pg_controldata/po/ro.mo
? src/bin/pg_controldata/po/ru.mo
? src/bin/pg_controldata/po/sk.mo
? src/bin/pg_controldata/po/sl.mo
? src/bin/pg_controldata/po/zh_CN.mo
? src/bin/pg_controldata/po/zh_TW.mo
? src/bin/pg_ctl/.deps
? src/bin/pg_ctl/pg_ctl
? src/bin/pg_ctl/po/cs.mo
? src/bin/pg_ctl/po/ro.mo
? src/bin/pg_ctl/po/sk.mo
? src/bin/pg_ctl/po/sl.mo
? src/bin/pg_ctl/po/zh_CN.mo
? src/bin/pg_ctl/po/zh_TW.mo
? src/bin/pg_dump/.deps
? src/bin/pg_dump/pg_dump
? src/bin/pg_dump/pg_dumpall
? src/bin/pg_dump/pg_restore
? src/bin/pg_dump/po/cs.mo
? src/bin/pg_dump/po/ko.mo
? src/bin/pg_dump/po/nb.mo
? src/bin/pg_dump/po/ro.mo
? src/bin/pg_dump/po/ru.mo
? src/bin/pg_dump/po/sk.mo
? src/bin/pg_dump/po/sl.mo
? src/bin/pg_dump/po/zh_CN.mo
? src/bin/pg_dump/po/zh_TW.mo
? src/bin/pg_resetxlog/.deps
? src/bin/pg_resetxlog/pg_resetxlog
? src/bin/pg_resetxlog/po/cs.mo
? src/bin/pg_resetxlog/po/hu.mo
? src/bin/pg_resetxlog/po/nb.mo
? src/bin/pg_resetxlog/po/sk.mo
? src/bin/pg_resetxlog/po/sl.mo
? src/bin/pg_resetxlog/po/zh_CN.mo
? src/bin/pg_resetxlog/po/zh_TW.mo
? src/bin/psql/.deps
? src/bin/psql/psql
? src/bin/psql/po/fa.mo
? src/bin/psql/po/hu.mo
? src/bin/psql/po/it.mo
? src/bin/psql/po/ko.mo
? src/bin/psql/po/nb.mo
? src/bin/psql/po/ro.mo
? src/bin/psql/po/ru.mo
? src/bin/psql/po/sk.mo
? src/bin/psql/po/sl.mo
? src/bin/psql/po/zh_CN.mo
? src/bin/psql/po/zh_TW.mo
? src/bin/scripts/.deps
? src/bin/scripts/clusterdb
? src/bin/scripts/createdb
? src/bin/scripts/createlang
? src/bin/scripts/createuser
? src/bin/scripts/dropdb
? src/bin/scripts/droplang
? src/bin/scripts/dropuser
? src/bin/scripts/foo
? src/bin/scripts/reindexdb
? src/bin/scripts/vacuumdb
? src/bin/scripts/po/ru.mo
? src/bin/scripts/po/sk.mo
? src/bin/scripts/po/sl.mo
? src/bin/scripts/po/zh_CN.mo
? src/bin/scripts/po/zh_TW.mo
? src/include/pg_config.h
? src/include/stamp-h
? src/interfaces/ecpg/compatlib/.deps
? src/interfaces/ecpg/compatlib/exports.list
? src/interfaces/ecpg/compatlib/libecpg_compat.so.3.1
? src/interfaces/ecpg/compatlib/libecpg_compat.so.3.2
? src/interfaces/ecpg/ecpglib/.deps
? src/interfaces/ecpg/ecpglib/exports.list
? src/interfaces/ecpg/ecpglib/libecpg.so.6.1
? src/interfaces/ecpg/ecpglib/libecpg.so.6.2
? src/interfaces/ecpg/include/ecpg_config.h
? src/interfaces/ecpg/include/stamp-h
? src/interfaces/ecpg/pgtypeslib/.deps
? src/interfaces/ecpg/pgtypeslib/exports.list
? src/interfaces/ecpg/pgtypeslib/libpgtypes.so.3.1
? src/interfaces/ecpg/pgtypeslib/libpgtypes.so.3.2
? src/interfaces/ecpg/preproc/.deps
? src/interfaces/ecpg/preproc/ecpg
? src/interfaces/libpq/.deps
? src/interfaces/libpq/exports.list
? src/interfaces/libpq/libpq.so.5.3
? src/interfaces/libpq/po/af.mo
? src/interfaces/libpq/po/hr.mo
? src/interfaces/libpq/po/nb.mo
? src/interfaces/libpq/po/pl.mo
? src/interfaces/libpq/po/sk.mo
? src/interfaces/libpq/po/sl.mo
? src/interfaces/libpq/po/zh_CN.mo
? src/interfaces/libpq/po/zh_TW.mo
? src/pl/plpgsql/src/.deps
? src/pl/plpgsql/src/pl_scan.c
? src/pl/plpgsql/src/po/tr.mo
? src/port/.deps
? src/port/pg_config_paths.h
? src/test/regress/.deps
? src/test/regress/log
? src/test/regress/pg_regress
? src/test/regress/results
? src/test/regress/testtablespace
? src/test/regress/expected/constraints.out
? src/test/regress/expected/copy.out
? src/test/regress/expected/create_function_1.out
? src/test/regress/expected/create_function_2.out
? src/test/regress/expected/largeobject.out
? src/test/regress/expected/largeobject_1.out
? src/test/regress/expected/misc.out
? src/test/regress/expected/tablespace.out
? src/test/regress/sql/constraints.sql
? src/test/regress/sql/copy.sql
? src/test/regress/sql/create_function_1.sql
? src/test/regress/sql/create_function_2.sql
? src/test/regress/sql/largeobject.sql
? src/test/regress/sql/misc.sql
? src/test/regress/sql/tablespace.sql
? src/timezone/.deps
? src/timezone/zic
Index: src/backend/access/transam/xlog.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/access/transam/xlog.c,v
retrieving revision 1.375
diff -u -r1.375 xlog.c
--- src/backend/access/transam/xlog.c	17 Feb 2010 04:19:39 -0000	1.375
+++ src/backend/access/transam/xlog.c	18 Feb 2010 12:40:05 -0000
@@ -2686,13 +2686,10 @@
 	 * WAL segment files will not be re-read in normal operation, so we advise
 	 * the OS to release any cached pages.	But do not do so if WAL archiving
 	 * or streaming is active, because archiver and walsender process could use
-	 * the cache to read the WAL segment.  Also, don't bother with it if we
-	 * are using O_DIRECT, since the kernel is presumably not caching in that
-	 * case.
+	 * the cache to read the WAL segment.
 	 */
 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
-	if (!XLogIsNeeded() &&
-		(get_sync_bit(sync_method) & PG_O_DIRECT) == 0)
+	if (!XLogIsNeeded())
 		(void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
 #endif
 
@@ -7652,10 +7649,29 @@
 static int
 get_sync_bit(int method)
 {
+	int o_direct_flag = 0;
+
 	/* If fsync is disabled, never open in sync mode */
 	if (!enableFsync)
 		return 0;
 
+	/*
+	 * Optimize writes by bypassing kernel cache with O_DIRECT when using
+	 * O_SYNC, O_DSYNC or O_FSYNC. But only if archiving and streaming are
+	 * disabled, otherwise the archive command or walsender process will
+	 * read the WAL soon after writing it, which is guaranteed to cause a
+	 * physical read if we bypassed the kernel cache. We also skip the
+	 * posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the
+	 * same reason.
+	 *
+	 * Never use O_DIRECT in walsender process for similar reasons; the WAL
+	 * written by walreceiver is normally read by the startup process soon
+	 * after its written. Also, walreceiver performs unaligned writes, which
+	 * don't work with O_DIRECT, so it is required for correctness too.
+	 */
+	if (!XLogIsNeeded() && !am_walreceiver)
+		o_direct_flag = PG_O_DIRECT;
+
 	switch (method)
 	{
 			/*
@@ -7670,11 +7686,11 @@
 			return 0;
 #ifdef OPEN_SYNC_FLAG
 		case SYNC_METHOD_OPEN:
-			return OPEN_SYNC_FLAG;
+			return OPEN_SYNC_FLAG | o_direct_flag;
 #endif
 #ifdef OPEN_DATASYNC_FLAG
 		case SYNC_METHOD_OPEN_DSYNC:
-			return OPEN_DATASYNC_FLAG;
+			return OPEN_DATASYNC_FLAG | o_direct_flag;
 #endif
 		default:
 			/* can't happen (unless we are out of sync with option array) */
Index: src/backend/replication/walreceiver.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/replication/walreceiver.c,v
retrieving revision 1.4
diff -u -r1.4 walreceiver.c
--- src/backend/replication/walreceiver.c	17 Feb 2010 04:19:39 -0000	1.4
+++ src/backend/replication/walreceiver.c	18 Feb 2010 12:40:05 -0000
@@ -50,6 +50,9 @@
 #include "utils/ps_status.h"
 #include "utils/resowner.h"
 
+/* Global variable to indicate if this process is a walreceiver process */
+bool am_walreceiver;
+
 /* libpqreceiver hooks to these when loaded */
 walrcv_connect_type walrcv_connect = NULL;
 walrcv_receive_type walrcv_receive = NULL;
@@ -158,6 +161,8 @@
 	/* use volatile pointer to prevent code rearrangement */
 	volatile WalRcvData *walrcv = WalRcv;
 
+	am_walreceiver = true;
+
 	/*
 	 * WalRcv should be set up already (if we are a backend, we inherit
 	 * this by fork() or EXEC_BACKEND mechanism from the postmaster).
@@ -424,16 +429,18 @@
 			bool	use_existent;
 
 			/*
-			 * XLOG segment files will be re-read in recovery operation soon,
-			 * so we don't need to advise the OS to release any cache page.
+			 * fsync() and close current file before we switch to next one.
+			 * We would otherwise have to reopen this file to fsync it later
 			 */
 			if (recvFile >= 0)
 			{
+				XLogWalRcvFlush();
+
 				/*
-				 * fsync() before we switch to next file. We would otherwise
-				 * have to reopen this file to fsync it later
+				 * XLOG segment files will be re-read by recovery in startup
+				 * process soon, so we don't advise the OS to release cache
+				 * pages associated with the file like XLogFileClose() does.
 				 */
-				XLogWalRcvFlush();
 				if (close(recvFile) != 0)
 					ereport(PANIC,
 							(errcode_for_file_access(),
@@ -445,8 +452,7 @@
 			/* Create/use new log file */
 			XLByteToSeg(recptr, recvId, recvSeg);
 			use_existent = true;
-			recvFile = XLogFileInit(recvId, recvSeg,
-									&use_existent, true);
+			recvFile = XLogFileInit(recvId, recvSeg, &use_existent, true);
 			recvOff = 0;
 		}
 
Index: src/include/access/xlogdefs.h
===================================================================
RCS file: /cvsroot/pgsql/src/include/access/xlogdefs.h,v
retrieving revision 1.25
diff -u -r1.25 xlogdefs.h
--- src/include/access/xlogdefs.h	15 Jan 2010 09:19:06 -0000	1.25
+++ src/include/access/xlogdefs.h	18 Feb 2010 12:40:05 -0000
@@ -106,23 +106,20 @@
  * configure determined whether fdatasync() is.
  */
 #if defined(O_SYNC)
-#define BARE_OPEN_SYNC_FLAG		O_SYNC
+#define OPEN_SYNC_FLAG		O_SYNC
 #elif defined(O_FSYNC)
-#define BARE_OPEN_SYNC_FLAG		O_FSYNC
-#endif
-#ifdef BARE_OPEN_SYNC_FLAG
-#define OPEN_SYNC_FLAG			(BARE_OPEN_SYNC_FLAG | PG_O_DIRECT)
+#define OPEN_SYNC_FLAG		O_FSYNC
 #endif
 
 #if defined(O_DSYNC)
 #if defined(OPEN_SYNC_FLAG)
 /* O_DSYNC is distinct? */
-#if O_DSYNC != BARE_OPEN_SYNC_FLAG
-#define OPEN_DATASYNC_FLAG		(O_DSYNC | PG_O_DIRECT)
+#if O_DSYNC != OPEN_SYNC_FLAG
+#define OPEN_DATASYNC_FLAG		O_DSYNC
 #endif
 #else							/* !defined(OPEN_SYNC_FLAG) */
 /* Win32 only has O_DSYNC */
-#define OPEN_DATASYNC_FLAG		(O_DSYNC | PG_O_DIRECT)
+#define OPEN_DATASYNC_FLAG		O_DSYNC
 #endif
 #endif
 
Index: src/include/replication/walreceiver.h
===================================================================
RCS file: /cvsroot/pgsql/src/include/replication/walreceiver.h,v
retrieving revision 1.6
diff -u -r1.6 walreceiver.h
--- src/include/replication/walreceiver.h	3 Feb 2010 09:47:19 -0000	1.6
+++ src/include/replication/walreceiver.h	18 Feb 2010 12:40:05 -0000
@@ -15,6 +15,8 @@
 #include "access/xlogdefs.h"
 #include "storage/spin.h"
 
+extern bool am_walreceiver;
+
 /*
  * MAXCONNINFO: maximum size of a connection string.
  *
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to