I have applied the following patch to CVS HEAD and 8.0.X that changes the Win32 O_SYNC flag to O_DSYNC, because this is the actual behavior of the flag. This is now the default wal sync method on Win32 because we prefer O_DSYNC to fsync().
And second, it changes Win32 fsync to a new wal sync method called fsync_writethrough which is the old Win32 fsync behavior, which uses _commit(). --------------------------------------------------------------------------- Magnus Hagander wrote: > > > > > * Win32, with fsync, write-cache disabled: no data corruption > > > > > * Win32, with fsync, write-cache enabled: no data corruption > > > > > * Win32, with osync, write cache disabled: no data corruption > > > > > * Win32, with osync, write cache enabled: no data > > corruption. Once > > > > > I > > > > > got: > > > > > 2005-02-24 12:19:54 LOG: could not open file "C:/Program > > > > > Files/PostgreSQL/8.0/data/pg_xlog/000000010000000000000010" > > > > (log file > > > > > 0, segment 16): No such file or directory > > > > > but the data in the database was consistent. > > > > > > > > It disturbs me that you couldn't produce data corruption in the > > > > cases where it theoretically should occur. Seems like this is an > > > > indication that your test was insufficiently severe, or > > that there > > > > is something going on we don't understand. > > > > > > The Windows driver knows about the write cache, and at > > least fsync() > > > pushes through the write cache even if it's there. This seems to > > > indicate that O_SYNC at least partially does this as well. This is > > > why there is no performance difference at all on fsync() with write > > > cache on or off. > > > > > > I don't know if this is true for all IDE disks. Could be > > that my disk > > > is particularly well-behaved. > > > > This indicated to me that open_sync did not require any > > additional changes than our current fsync. > > fsync and open_sync both write through the write cache in the operating > system. Only fsync=off turns this off. > > fsync also writes through the hardware write cache. o_sync does not. 
> This is what causes the large slowdown with write cache enabled, > *including* most battery backed write cache systems (pretty much making > the write-cache a waste of money). This may be a good thing on IDE > systems (for admins that don't know how to remove the little check in > the box for "enable write caching on the disk" that MS provides, which > *explicitly* warns that you may lose data if you enable it), but it's a > very bad thing for anything higher end. > > fsync also syncs the directory metadata. o_sync only cares about the > file's contents. (This is what causes the large slowdown with write cache > *disabled*, because it requires multiple writes on multiple disk > locations for each fsync). > > > Basically, fsync hurts people who configure their box correctly, or who > use things like SCSI disks. o_sync hurts people who configure their > machine in an unsafe way. > > //Magnus > > ---------------------------(end of broadcast)--------------------------- > TIP 1: subscribe and unsubscribe commands go to [EMAIL PROTECTED] > -- Bruce Momjian | http://candle.pha.pa.us pgman@candle.pha.pa.us | (610) 359-1001 + If your life is a hard drive, | 13 Roberts Road + Christ can be your backup. | Newtown Square, Pennsylvania 19073
Index: doc/src/sgml/runtime.sgml =================================================================== RCS file: /cvsroot/pgsql/doc/src/sgml/runtime.sgml,v retrieving revision 1.310 diff -c -c -r1.310 runtime.sgml *** doc/src/sgml/runtime.sgml 19 Mar 2005 23:27:04 -0000 1.310 --- doc/src/sgml/runtime.sgml 24 Mar 2005 04:27:11 -0000 *************** *** 1587,1592 **** --- 1587,1593 ---- values are <literal>fsync</> (call <function>fsync()</> at each commit), <literal>fdatasync</> (call <function>fdatasync()</> at each commit), + <literal>fsync_writethrough</> (call <function>_commit()</> at each commit on Windows), <literal>open_sync</> (write WAL files with <function>open()</> option <symbol>O_SYNC</>), and <literal>open_datasync</> (write WAL files with <function>open()</> option <symbol>O_DSYNC</>). Not all of these choices are available on all platforms. Index: src/backend/access/transam/xlog.c =================================================================== RCS file: /cvsroot/pgsql/src/backend/access/transam/xlog.c,v retrieving revision 1.181 diff -c -c -r1.181 xlog.c *** src/backend/access/transam/xlog.c 12 Feb 2005 23:53:37 -0000 1.181 --- src/backend/access/transam/xlog.c 24 Mar 2005 04:27:15 -0000 *************** *** 63,70 **** #endif #endif #if defined(OPEN_SYNC_FLAG) ! #if defined(O_DSYNC) && (O_DSYNC != OPEN_SYNC_FLAG) #define OPEN_DATASYNC_FLAG O_DSYNC #endif #endif --- 63,75 ---- #endif #endif + #if defined(O_DSYNC) #if defined(OPEN_SYNC_FLAG) ! #if O_DSYNC != OPEN_SYNC_FLAG ! #define OPEN_DATASYNC_FLAG O_DSYNC ! #endif ! #else /* !defined(OPEN_SYNC_FLAG) */ ! 
/* Win32 only has O_DSYNC */ #define OPEN_DATASYNC_FLAG O_DSYNC #endif #endif *************** *** 79,85 **** --- 84,94 ---- #define DEFAULT_SYNC_METHOD SYNC_METHOD_FDATASYNC #define DEFAULT_SYNC_FLAGBIT 0 #else + #ifndef FSYNC_IS_WRITE_THROUGH #define DEFAULT_SYNC_METHOD_STR "fsync" + #else + #define DEFAULT_SYNC_METHOD_STR "fsync_writethrough" + #endif #define DEFAULT_SYNC_METHOD SYNC_METHOD_FSYNC #define DEFAULT_SYNC_FLAGBIT 0 #endif *************** *** 5154,5160 **** --- 5163,5174 ---- int new_sync_method; int new_sync_bit; + #ifndef FSYNC_IS_WRITE_THROUGH if (pg_strcasecmp(method, "fsync") == 0) + #else + /* Win32 fsync() == _commit(0, which writes through a write cache */ + if (pg_strcasecmp(method, "fsync_writethrough") == 0) + #endif { new_sync_method = SYNC_METHOD_FSYNC; new_sync_bit = 0; Index: src/backend/utils/misc/postgresql.conf.sample =================================================================== RCS file: /cvsroot/pgsql/src/backend/utils/misc/postgresql.conf.sample,v retrieving revision 1.137 diff -c -c -r1.137 postgresql.conf.sample *** src/backend/utils/misc/postgresql.conf.sample 19 Mar 2005 23:27:07 -0000 1.137 --- src/backend/utils/misc/postgresql.conf.sample 24 Mar 2005 04:27:18 -0000 *************** *** 114,120 **** #fsync = true # turns forced synchronization on or off #wal_sync_method = fsync # the default varies across platforms: ! # fsync, fdatasync, open_sync, or open_datasync #wal_buffers = 8 # min 4, 8KB each #commit_delay = 0 # range 0-100000, in microseconds #commit_siblings = 5 # range 1-1000 --- 114,121 ---- #fsync = true # turns forced synchronization on or off #wal_sync_method = fsync # the default varies across platforms: ! # fsync, fdatasync, fsync_writethrough, ! 
# open_sync, open_datasync #wal_buffers = 8 # min 4, 8KB each #commit_delay = 0 # range 0-100000, in microseconds #commit_siblings = 5 # range 1-1000 Index: src/include/port/win32.h =================================================================== RCS file: /cvsroot/pgsql/src/include/port/win32.h,v retrieving revision 1.43 diff -c -c -r1.43 win32.h *** src/include/port/win32.h 27 Feb 2005 00:53:29 -0000 1.43 --- src/include/port/win32.h 24 Mar 2005 04:27:19 -0000 *************** *** 17,22 **** --- 17,23 ---- #define fsync(a) _commit(a) + #define FSYNC_IS_WRITE_THROUGH #define ftruncate(a,b) chsize(a,b) #define USES_WINSOCK *************** *** 189,195 **** * to ensure that we don't collide with a future definition. It means * we cannot use _O_NOINHERIT ourselves. */ ! #define O_SYNC 0x0080 /* * Supplement to <errno.h>. --- 190,196 ---- * to ensure that we don't collide with a future definition. It means * we cannot use _O_NOINHERIT ourselves. */ ! #define O_DSYNC 0x0080 /* * Supplement to <errno.h>. Index: src/port/open.c =================================================================== RCS file: /cvsroot/pgsql/src/port/open.c,v retrieving revision 1.8 diff -c -c -r1.8 open.c *** src/port/open.c 27 Feb 2005 00:53:29 -0000 1.8 --- src/port/open.c 24 Mar 2005 04:27:19 -0000 *************** *** 63,69 **** /* Check that we can handle the request */ assert((fileFlags & ((O_RDONLY | O_WRONLY | O_RDWR) | O_APPEND | (O_RANDOM | O_SEQUENTIAL | O_TEMPORARY) | ! _O_SHORT_LIVED | O_SYNC | (O_CREAT | O_TRUNC | O_EXCL) | (O_TEXT | O_BINARY))) == fileFlags); sa.nLength = sizeof(sa); --- 63,69 ---- /* Check that we can handle the request */ assert((fileFlags & ((O_RDONLY | O_WRONLY | O_RDWR) | O_APPEND | (O_RANDOM | O_SEQUENTIAL | O_TEMPORARY) | ! _O_SHORT_LIVED | O_DSYNC | (O_CREAT | O_TRUNC | O_EXCL) | (O_TEXT | O_BINARY))) == fileFlags); sa.nLength = sizeof(sa); *************** *** 83,89 **** ((fileFlags & O_SEQUENTIAL) ? 
FILE_FLAG_SEQUENTIAL_SCAN : 0) | ((fileFlags & _O_SHORT_LIVED) ? FILE_ATTRIBUTE_TEMPORARY : 0) | ((fileFlags & O_TEMPORARY) ? FILE_FLAG_DELETE_ON_CLOSE : 0)| ! ((fileFlags & O_SYNC) ? FILE_FLAG_WRITE_THROUGH : 0), NULL)) == INVALID_HANDLE_VALUE) { switch (GetLastError()) --- 83,89 ---- ((fileFlags & O_SEQUENTIAL) ? FILE_FLAG_SEQUENTIAL_SCAN : 0) | ((fileFlags & _O_SHORT_LIVED) ? FILE_ATTRIBUTE_TEMPORARY : 0) | ((fileFlags & O_TEMPORARY) ? FILE_FLAG_DELETE_ON_CLOSE : 0)| ! ((fileFlags & O_DSYNC) ? FILE_FLAG_WRITE_THROUGH : 0), NULL)) == INVALID_HANDLE_VALUE) { switch (GetLastError())
---------------------------(end of broadcast)--------------------------- TIP 5: Have you checked our extensive FAQ? http://www.postgresql.org/docs/faq