Hi all, (Added Kevin in CC) There have been over the ages discussions about getting better O_DIRECT support to close the gap with other players in the database market, but I have not actually seen on those lists a patch which makes use of O_DIRECT for relations and SLRUs (perhaps I missed it, anyway that would most likely conflict now).
Attached is a toy patch that I have begun using for tests in this area. That's nothing really serious at this stage, but you can use that if you would like to see the impact of O_DIRECT. Of course, things get significantly slower. The patch is able to compile, pass regression tests, and looks stable. So that's usable for experiments. The patch uses a GUC called direct_io, enabled to true to ease regression testing when applying it. Note that pg_attribute_aligned() cannot be used as that's not an option with clang and a couple of other comilers as far as I know, so the patch uses a simple set of placeholder buffers large enough to be aligned with the OS pages, which should be 4k for Linux by the way, and not set to BLCKSZ, but for WAL's O_DIRECT we don't really care much with such details. If there is interest for such things, perhaps we could get a patch sorted out, with some angles of attack like: - Move to use of page-aligned buffers for relations and SLRUs. - Split use of O_DIRECT for SLRU and relations into separate GUCs. - Perhaps other things. However this is a large and very controversial topic, and of course more complex than the experiment attached, still this prototype is fun to play with. Thanks for reading! -- Michael
diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c index 3623352b9c..a3ba8cddba 100644 --- a/src/backend/access/transam/slru.c +++ b/src/backend/access/transam/slru.c @@ -120,6 +120,13 @@ typedef enum SLRU_CLOSE_FAILED } SlruErrorCause; +/* + * Static area used as placeholder for O_DIRECT operations. Make sure + * to keep its size at twice the block size so as it would be aligned + * with the OS pages. + */ +static char direct_buf[BLCKSZ + BLCKSZ]; + static SlruErrorCause slru_errcause; static int slru_errno; @@ -644,6 +651,12 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno) int offset = rpageno * BLCKSZ; char path[MAXPGPATH]; int fd; + int direct_flag = 0; + char *aligned_buf = (char *) TYPEALIGN(BLCKSZ, direct_buf); + + /* Use direct I/O if configured as such */ + if (direct_io) + direct_flag |= O_DIRECT | O_SYNC; SlruFileName(ctl, path, segno); @@ -654,7 +667,7 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno) * SlruPhysicalWritePage). Hence, if we are InRecovery, allow the case * where the file doesn't exist, and return zeroes instead. */ - fd = OpenTransientFile(path, O_RDWR | PG_BINARY); + fd = OpenTransientFile(path, O_RDWR | PG_BINARY | direct_flag); if (fd < 0) { if (errno != ENOENT || !InRecovery) @@ -681,7 +694,7 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno) errno = 0; pgstat_report_wait_start(WAIT_EVENT_SLRU_READ); - if (read(fd, shared->page_buffer[slotno], BLCKSZ) != BLCKSZ) + if (read(fd, aligned_buf, BLCKSZ) != BLCKSZ) { pgstat_report_wait_end(); slru_errcause = SLRU_READ_FAILED; @@ -698,6 +711,9 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno) return false; } + /* copy contents back to the slot */ + memcpy(shared->page_buffer[slotno], aligned_buf, BLCKSZ); + return true; } @@ -724,6 +740,12 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata) int offset = rpageno * BLCKSZ; char path[MAXPGPATH]; int fd = -1; + int direct_flag = 0; + char *aligned_buf = (char *) TYPEALIGN(BLCKSZ, direct_buf); + + /* Use direct I/O if configured as such */ + if (direct_io) + direct_flag |= O_DIRECT | O_SYNC; /* * Honor the write-WAL-before-data rule, if appropriate, so that we do not @@ -804,7 +826,7 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata) * don't use O_EXCL or O_TRUNC or anything like that. */ SlruFileName(ctl, path, segno); - fd = OpenTransientFile(path, O_RDWR | O_CREAT | PG_BINARY); + fd = OpenTransientFile(path, O_RDWR | O_CREAT | PG_BINARY | direct_flag); if (fd < 0) { slru_errcause = SLRU_OPEN_FAILED; @@ -840,9 +862,12 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata) return false; } + /* use aligned buffer for O_DIRECT */ + memcpy(aligned_buf, shared->page_buffer[slotno], BLCKSZ); + errno = 0; pgstat_report_wait_start(WAIT_EVENT_SLRU_WRITE); - if (write(fd, shared->page_buffer[slotno], BLCKSZ) != BLCKSZ) + if (write(fd, aligned_buf, BLCKSZ) != BLCKSZ) { pgstat_report_wait_end(); /* if write didn't set errno, assume problem is no disk space */ @@ -858,12 +883,13 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata) /* * If not part of Flush, need to fsync now. We assume this happens - * infrequently enough that it's not a performance issue. + * infrequently enough that it's not a performance issue. O_DIRECT + * has already made sure that this happens. */ if (!fdata) { pgstat_report_wait_start(WAIT_EVENT_SLRU_SYNC); - if (ctl->do_fsync && pg_fsync(fd)) + if (!direct_io && ctl->do_fsync && pg_fsync(fd)) { pgstat_report_wait_end(); slru_errcause = SLRU_FSYNC_FAILED; diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index 213de7698a..92bcb57934 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -148,6 +148,9 @@ int max_safe_fds = 32; /* default if not changed */ /* Whether it is safe to continue running after fsync() fails. */ bool data_sync_retry = false; +/* Use O_DIRECT for operations involving relation files and SLRUs */ +bool direct_io = true; + /* Debugging.... */ #ifdef FDDEBUG diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index c37dd1290b..a927c8bd5a 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -159,6 +159,12 @@ static MemoryContext pendingOpsCxt; /* context for the above */ static CycleCtr mdsync_cycle_ctr = 0; static CycleCtr mdckpt_cycle_ctr = 0; +/* + * Static area used as placeholder for O_DIRECT operations. Make sure + * to keep its size at twice the block size so as it would be aligned + * with the OS pages. + */ +static char direct_buf[BLCKSZ + BLCKSZ]; /*** behavior for mdopen & _mdfd_getseg ***/ /* ereport if segment not present */ @@ -296,6 +302,7 @@ mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo) MdfdVec *mdfd; char *path; File fd; + int direct_flag = 0; if (isRedo && reln->md_num_open_segs[forkNum] > 0) return; /* created and opened already... */ @@ -304,7 +311,11 @@ mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo) path = relpath(reln->smgr_rnode, forkNum); - fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY); + /* Use direct I/O if configured as such */ + if (direct_io) + direct_flag |= O_DIRECT | O_SYNC; + + fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY | direct_flag); if (fd < 0) { @@ -317,7 +328,7 @@ mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo) * already, even if isRedo is not set. (See also mdopen) */ if (isRedo || IsBootstrapProcessingMode()) - fd = PathNameOpenFile(path, O_RDWR | PG_BINARY); + fd = PathNameOpenFile(path, O_RDWR | PG_BINARY | direct_flag); if (fd < 0) { /* be sure to report the error reported by create, not open */ @@ -498,12 +509,18 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, off_t seekpos; int nbytes; MdfdVec *v; + char *aligned_buf = (char *) TYPEALIGN(BLCKSZ, direct_buf); /* This assert is too expensive to have on normally ... */ #ifdef CHECK_WRITE_VS_EXTEND Assert(blocknum >= mdnblocks(reln, forknum)); #endif + /* + * Copy contents into an aligned buffer for direct I/O. + */ + memcpy(aligned_buf, buffer, BLCKSZ); + /* * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any * more --- we mustn't create a block whose number actually is @@ -522,7 +539,8 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); - if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ) + if ((nbytes = FileWrite(v->mdfd_vfd, aligned_buf, BLCKSZ, seekpos, + WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ) { if (nbytes < 0) ereport(ERROR, @@ -561,6 +579,7 @@ mdopen(SMgrRelation reln, ForkNumber forknum, int behavior) MdfdVec *mdfd; char *path; File fd; + int direct_flag = 0; /* No work if already open */ if (reln->md_num_open_segs[forknum] > 0) @@ -568,7 +587,11 @@ mdopen(SMgrRelation reln, ForkNumber forknum, int behavior) path = relpath(reln->smgr_rnode, forknum); - fd = PathNameOpenFile(path, O_RDWR | PG_BINARY); + /* Use direct I/O if configured as such */ + if (direct_io) + direct_flag |= O_DIRECT | O_SYNC; + + fd = PathNameOpenFile(path, O_RDWR | PG_BINARY | direct_flag); if (fd < 0) { @@ -579,7 +602,8 @@ mdopen(SMgrRelation reln, ForkNumber forknum, int behavior) * substitute for mdcreate() in bootstrap mode only. (See mdcreate) */ if (IsBootstrapProcessingMode()) - fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY); + fd = PathNameOpenFile(path, + O_RDWR | O_CREAT | O_EXCL | PG_BINARY | direct_flag); if (fd < 0) { if ((behavior & EXTENSION_RETURN_NULL) && @@ -719,6 +743,7 @@ mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, off_t seekpos; int nbytes; MdfdVec *v; + char *aligned_buf = (char *) TYPEALIGN(BLCKSZ, direct_buf); TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum, reln->smgr_rnode.node.spcNode, @@ -733,7 +758,8 @@ mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); - nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_READ); + nbytes = FileRead(v->mdfd_vfd, aligned_buf, BLCKSZ, seekpos, + WAIT_EVENT_DATA_FILE_READ); TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum, reln->smgr_rnode.node.spcNode, @@ -760,7 +786,7 @@ mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, * update a block that was later truncated away. */ if (zero_damaged_pages || InRecovery) - MemSet(buffer, 0, BLCKSZ); + MemSet(aligned_buf, 0, BLCKSZ); else ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), @@ -768,6 +794,11 @@ mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, blocknum, FilePathName(v->mdfd_vfd), nbytes, BLCKSZ))); } + + /* + * Copy the result from the aligned buffer to the real one. + */ + memcpy(buffer, aligned_buf, BLCKSZ); } /* @@ -784,12 +815,18 @@ mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, off_t seekpos; int nbytes; MdfdVec *v; + char *aligned_buf = (char *) TYPEALIGN(BLCKSZ, direct_buf); /* This assert is too expensive to have on normally ... */ #ifdef CHECK_WRITE_VS_EXTEND Assert(blocknum < mdnblocks(reln, forknum)); #endif + /* + * Copy contents into an aligned buffer for direct I/O. + */ + memcpy(aligned_buf, buffer, BLCKSZ); + TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum, reln->smgr_rnode.node.spcNode, reln->smgr_rnode.node.dbNode, @@ -803,7 +840,8 @@ mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); - nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_WRITE); + nbytes = FileWrite(v->mdfd_vfd, aligned_buf, BLCKSZ, seekpos, + WAIT_EVENT_DATA_FILE_WRITE); TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum, reln->smgr_rnode.node.spcNode, @@ -1011,11 +1049,14 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum) { MdfdVec *v = &reln->md_seg_fds[forknum][segno - 1]; - if (FileSync(v->mdfd_vfd, WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC) < 0) + /* nothing to do with direct I/O */ + if (!direct_io && + FileSync(v->mdfd_vfd, WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC) < 0) ereport(data_sync_elevel(ERROR), (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", FilePathName(v->mdfd_vfd)))); + segno--; } } @@ -1414,9 +1455,15 @@ mdpostckpt(void) static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) { + return; + /* Temp relations should never be fsync'd */ Assert(!SmgrIsTemp(reln)); + /* Not actually needed with direct I/O */ + if (direct_io) + return; + if (pendingOpsTable) { /* push it into local pending-ops table */ @@ -1799,11 +1846,16 @@ _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno, MdfdVec *v; int fd; char *fullpath; + int direct_flag = 0; fullpath = _mdfd_segpath(reln, forknum, segno); + /* Use direct I/O if configured as such */ + if (direct_io) + direct_flag |= O_DIRECT | O_SYNC; + /* open the file */ - fd = PathNameOpenFile(fullpath, O_RDWR | PG_BINARY | oflags); + fd = PathNameOpenFile(fullpath, O_RDWR | PG_BINARY | direct_flag | oflags); pfree(fullpath); diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index f81e0424e7..a0809ad628 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -631,6 +631,8 @@ const char *const config_group_names[] = gettext_noop("Resource Usage / Background Writer"), /* RESOURCES_ASYNCHRONOUS */ gettext_noop("Resource Usage / Asynchronous Behavior"), + /* RESOURCES_IO */ + gettext_noop("Resource Usage / I/O"), /* WAL */ gettext_noop("Write-Ahead Log"), /* WAL_SETTINGS */ @@ -1875,6 +1877,15 @@ static struct config_bool ConfigureNamesBool[] = NULL, NULL, NULL }, + { + {"direct_io", PGC_POSTMASTER, RESOURCES_IO, + gettext_noop("Use direct I/O when flushing relations and SLRUs."), + }, + &direct_io, + true, + NULL, NULL, NULL + }, + /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL, NULL diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h index 74c34757fb..ecaa347657 100644 --- a/src/include/storage/fd.h +++ b/src/include/storage/fd.h @@ -48,6 +48,7 @@ typedef int File; /* GUC parameter */ extern PGDLLIMPORT int max_files_per_process; extern PGDLLIMPORT bool data_sync_retry; +extern PGDLLIMPORT bool direct_io; /* * This is private to fd.c, but exported for save/restore_backend_variables() diff --git a/src/include/utils/guc_tables.h b/src/include/utils/guc_tables.h index a0970b2e1c..17178862a7 100644 --- a/src/include/utils/guc_tables.h +++ b/src/include/utils/guc_tables.h @@ -65,6 +65,7 @@ enum config_group RESOURCES_VACUUM_DELAY, RESOURCES_BGWRITER, RESOURCES_ASYNCHRONOUS, + RESOURCES_IO, WAL, WAL_SETTINGS, WAL_CHECKPOINTS,
signature.asc
Description: PGP signature