Hi,
The Uber blog post, among other things, pointed out that PG uses lseek +
read instead of pread. I didn't see any discussion around that and my
Google searches didn't find any posts about pread / pwrite for the past
10 years.
With that plus the "C++ port" thread in mind, I was wondering if it's
time to see if we could do better by just utilizing newer C and POSIX
features.
The attached patch replaces FileWrite and FileRead with FileWriteAt and
FileReadAt and removes most FileSeek calls. FileSeek is still around so
we can find the end of a file, but it's not used for anything else.
On my laptop a simple pgbench run (scale 100, 15 minutes) shows a 1.5%
performance improvement. A 1.5% performance improvement is small but
measurable - and IMV more importantly it allows us to drop more than 100
lines of backwards (compatible?) code; maybe we could start targeting
more recent platforms in v10?
Obviously this patch needs some more work before it could be merged, and
we probably still need a fallback for some platforms without pread and
pwrite (afaik Windows doesn't implement them.)
/ Oskari
diff --git src/backend/access/heap/rewriteheap.c src/backend/access/heap/rewriteheap.c
index f9ce986..6dc5df3 100644
--- src/backend/access/heap/rewriteheap.c
+++ src/backend/access/heap/rewriteheap.c
@@ -918,7 +918,7 @@ logical_heap_rewrite_flush_mappings(RewriteState state)
* Note that we deviate from the usual WAL coding practices here,
* check the above "Logical rewrite support" comment for reasoning.
*/
- written = FileWrite(src->vfd, waldata_start, len);
+ written = FileWriteAt(src->vfd, waldata_start, len, src->off);
if (written != len)
ereport(ERROR,
(errcode_for_file_access(),
diff --git src/backend/storage/file/buffile.c src/backend/storage/file/buffile.c
index 042be79..39482fa 100644
--- src/backend/storage/file/buffile.c
+++ src/backend/storage/file/buffile.c
@@ -60,12 +60,6 @@ struct BufFile
int numFiles; /* number of physical files in set */
/* all files except the last have length exactly MAX_PHYSICAL_FILESIZE */
File *files; /* palloc'd array with numFiles entries */
- off_t *offsets; /* palloc'd array with numFiles entries */
-
- /*
- * offsets[i] is the current seek position of files[i]. We use this to
- * avoid making redundant FileSeek calls.
- */
bool isTemp; /* can only add files if this is TRUE */
bool isInterXact; /* keep open over transactions? */
@@ -108,8 +102,6 @@ makeBufFile(File firstfile)
file->numFiles = 1;
file->files = (File *) palloc(sizeof(File));
file->files[0] = firstfile;
- file->offsets = (off_t *) palloc(sizeof(off_t));
- file->offsets[0] = 0L;
file->isTemp = false;
file->isInterXact = false;
file->dirty = false;
@@ -143,10 +135,7 @@ extendBufFile(BufFile *file)
file->files = (File *) repalloc(file->files,
(file->numFiles + 1) * sizeof(File));
- file->offsets = (off_t *) repalloc(file->offsets,
- (file->numFiles + 1) * sizeof(off_t));
file->files[file->numFiles] = pfile;
- file->offsets[file->numFiles] = 0L;
file->numFiles++;
}
@@ -210,7 +199,6 @@ BufFileClose(BufFile *file)
FileClose(file->files[i]);
/* release the buffer space */
pfree(file->files);
- pfree(file->offsets);
pfree(file);
}
@@ -241,23 +229,12 @@ BufFileLoadBuffer(BufFile *file)
}
/*
- * May need to reposition physical file.
- */
- thisfile = file->files[file->curFile];
- if (file->curOffset != file->offsets[file->curFile])
- {
- if (FileSeek(thisfile, file->curOffset, SEEK_SET) != file->curOffset)
- return; /* seek failed, read nothing */
- file->offsets[file->curFile] = file->curOffset;
- }
-
- /*
* Read whatever we can get, up to a full bufferload.
*/
- file->nbytes = FileRead(thisfile, file->buffer, sizeof(file->buffer));
+ thisfile = file->files[file->curFile];
+ file->nbytes = FileReadAt(thisfile, file->buffer, sizeof(file->buffer), file->curOffset);
if (file->nbytes < 0)
file->nbytes = 0;
- file->offsets[file->curFile] += file->nbytes;
/* we choose not to advance curOffset here */
pgBufferUsage.temp_blks_read++;
@@ -307,20 +284,10 @@ BufFileDumpBuffer(BufFile *file)
bytestowrite = (int) availbytes;
}
- /*
- * May need to reposition physical file.
- */
thisfile = file->files[file->curFile];
- if (file->curOffset != file->offsets[file->curFile])
- {
- if (FileSeek(thisfile, file->curOffset, SEEK_SET) != file->curOffset)
- return; /* seek failed, give up */
- file->offsets[file->curFile] = file->curOffset;
- }
- bytestowrite = FileWrite(thisfile, file->buffer + wpos, bytestowrite);
+ bytestowrite = FileWriteAt(thisfile, file->buffer + wpos, bytestowrite, file->curOffset);
if (bytestowrite <= 0)
return; /* failed to write */
- file->offsets[file->curFile] += bytestowrite;
file->curOffset += bytestowrite;
wpos += bytestowrite;
diff --git src/backend/storage/file/fd.c src/backend/storage/file/fd.c
index 03143f1..795cb12 100644
--- src/backend/storage/file/fd.c
+++ src/backend/storage/file/fd.c
@@ -16,8 +16,8 @@
* including base tables, scratch files (e.g., sort and hash spool
* files), and random calls to C library routines like system(3); it
* is quite easy to exceed system limits on the number of open files a
- * single process can have. (This is around 256 on many modern
- * operating systems, but can be as low as 32 on others.)
+ * single process can have. (This is around 1024 on many modern
+ * operating systems, but may be lower on others.)
*
* VFDs are managed as an LRU pool, with actual OS file descriptors
* being opened and closed as needed. Obviously, if a routine is
@@ -174,7 +174,6 @@ typedef struct vfd
File nextFree; /* link to next free VFD, if in freelist */
File lruMoreRecently; /* doubly linked recency-of-use list */
File lruLessRecently;
- off_t seekPos; /* current logical file position */
off_t fileSize; /* current size of file (0 if not temporary) */
char *fileName; /* name of file, or NULL for unused VFD */
/* NB: fileName is malloc'd, and must be free'd when closing the VFD */
@@ -970,10 +969,6 @@ LruDelete(File file)
/* delete the vfd record from the LRU ring */
Delete(file);
- /* save the seek position */
- vfdP->seekPos = lseek(vfdP->fd, (off_t) 0, SEEK_CUR);
- Assert(vfdP->seekPos != (off_t) -1);
-
/* close the file */
if (close(vfdP->fd))
elog(ERROR, "could not close file \"%s\": %m", vfdP->fileName);
@@ -1038,15 +1033,6 @@ LruInsert(File file)
DO_DB(elog(LOG, "RE_OPEN SUCCESS"));
++nfile;
}
-
- /* seek to the right position */
- if (vfdP->seekPos != (off_t) 0)
- {
- off_t returnValue PG_USED_FOR_ASSERTS_ONLY;
-
- returnValue = lseek(vfdP->fd, vfdP->seekPos, SEEK_SET);
- Assert(returnValue != (off_t) -1);
- }
}
/*
@@ -1270,7 +1256,6 @@ PathNameOpenFile(FileName fileName, int fileFlags, int fileMode)
/* Saved flags are adjusted to be OK for re-opening file */
vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
vfdP->fileMode = fileMode;
- vfdP->seekPos = 0;
vfdP->fileSize = 0;
vfdP->fdstate = 0x0;
vfdP->resowner = NULL;
@@ -1563,7 +1548,7 @@ FileWriteback(File file, off_t offset, off_t nbytes)
}
int
-FileRead(File file, char *buffer, int amount)
+FileReadAt(File file, char *buffer, int amount, off_t offset)
{
int returnCode;
@@ -1571,7 +1556,7 @@ FileRead(File file, char *buffer, int amount)
DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %d %p",
file, VfdCache[file].fileName,
- (int64) VfdCache[file].seekPos,
+ (int64) offset,
amount, buffer));
returnCode = FileAccess(file);
@@ -1579,11 +1564,9 @@ FileRead(File file, char *buffer, int amount)
return returnCode;
retry:
- returnCode = read(VfdCache[file].fd, buffer, amount);
+ returnCode = pread(VfdCache[file].fd, buffer, amount, offset);
- if (returnCode >= 0)
- VfdCache[file].seekPos += returnCode;
- else
+ if (returnCode < 0)
{
/*
* Windows may run out of kernel buffers and return "Insufficient
@@ -1609,16 +1592,13 @@ retry:
/* OK to retry if interrupted */
if (errno == EINTR)
goto retry;
-
- /* Trouble, so assume we don't know the file position anymore */
- VfdCache[file].seekPos = FileUnknownPos;
}
return returnCode;
}
int
-FileWrite(File file, char *buffer, int amount)
+FileWriteAt(File file, char *buffer, int amount, off_t offset)
{
int returnCode;
@@ -1626,7 +1606,7 @@ FileWrite(File file, char *buffer, int amount)
DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p",
file, VfdCache[file].fileName,
- (int64) VfdCache[file].seekPos,
+ (int64) offset,
amount, buffer));
returnCode = FileAccess(file);
@@ -1643,7 +1623,7 @@ FileWrite(File file, char *buffer, int amount)
*/
if (temp_file_limit >= 0 && (VfdCache[file].fdstate & FD_TEMPORARY))
{
- off_t newPos = VfdCache[file].seekPos + amount;
+ off_t newPos = offset + amount;
if (newPos > VfdCache[file].fileSize)
{
@@ -1660,7 +1640,7 @@ FileWrite(File file, char *buffer, int amount)
retry:
errno = 0;
- returnCode = write(VfdCache[file].fd, buffer, amount);
+ returnCode = pwrite(VfdCache[file].fd, buffer, amount, offset);
/* if write didn't set errno, assume problem is no disk space */
if (returnCode != amount && errno == 0)
@@ -1668,12 +1648,10 @@ retry:
if (returnCode >= 0)
{
- VfdCache[file].seekPos += returnCode;
-
/* maintain fileSize and temporary_files_size if it's a temp file */
if (VfdCache[file].fdstate & FD_TEMPORARY)
{
- off_t newPos = VfdCache[file].seekPos;
+ off_t newPos = offset + returnCode;
if (newPos > VfdCache[file].fileSize)
{
@@ -1685,7 +1663,7 @@ retry:
else
{
/*
- * See comments in FileRead()
+ * See comments in FileReadAt()
*/
#ifdef WIN32
DWORD error = GetLastError();
@@ -1704,9 +1682,6 @@ retry:
/* OK to retry if interrupted */
if (errno == EINTR)
goto retry;
-
- /* Trouble, so assume we don't know the file position anymore */
- VfdCache[file].seekPos = FileUnknownPos;
}
return returnCode;
@@ -1736,78 +1711,33 @@ FileSeek(File file, off_t offset, int whence)
Assert(FileIsValid(file));
- DO_DB(elog(LOG, "FileSeek: %d (%s) " INT64_FORMAT " " INT64_FORMAT " %d",
+ DO_DB(elog(LOG, "FileSeek: %d (%s) " INT64_FORMAT " %d",
file, VfdCache[file].fileName,
- (int64) VfdCache[file].seekPos,
(int64) offset, whence));
- if (FileIsNotOpen(file))
+ switch (whence)
{
- switch (whence)
- {
- case SEEK_SET:
- if (offset < 0)
- elog(ERROR, "invalid seek offset: " INT64_FORMAT,
- (int64) offset);
- VfdCache[file].seekPos = offset;
- break;
- case SEEK_CUR:
- VfdCache[file].seekPos += offset;
- break;
- case SEEK_END:
+ case SEEK_SET:
+ if (offset < 0)
+ elog(ERROR, "invalid seek offset: " INT64_FORMAT,
+ (int64) offset);
+ return offset;
+
+ case SEEK_END:
+ if (FileIsNotOpen(file))
+ {
returnCode = FileAccess(file);
if (returnCode < 0)
return returnCode;
- VfdCache[file].seekPos = lseek(VfdCache[file].fd,
- offset, whence);
- break;
- default:
- elog(ERROR, "invalid whence: %d", whence);
- break;
- }
- }
- else
- {
- switch (whence)
- {
- case SEEK_SET:
- if (offset < 0)
- elog(ERROR, "invalid seek offset: " INT64_FORMAT,
- (int64) offset);
- if (VfdCache[file].seekPos != offset)
- VfdCache[file].seekPos = lseek(VfdCache[file].fd,
- offset, whence);
- break;
- case SEEK_CUR:
- if (offset != 0 || VfdCache[file].seekPos == FileUnknownPos)
- VfdCache[file].seekPos = lseek(VfdCache[file].fd,
- offset, whence);
- break;
- case SEEK_END:
- VfdCache[file].seekPos = lseek(VfdCache[file].fd,
- offset, whence);
- break;
- default:
- elog(ERROR, "invalid whence: %d", whence);
- break;
- }
+ }
+ return lseek(VfdCache[file].fd, offset, whence);
+
+ default:
+ elog(ERROR, "invalid whence: %d", whence);
}
- return VfdCache[file].seekPos;
-}
-/*
- * XXX not actually used but here for completeness
- */
-#ifdef NOT_USED
-off_t
-FileTell(File file)
-{
- Assert(FileIsValid(file));
- DO_DB(elog(LOG, "FileTell %d (%s)",
- file, VfdCache[file].fileName));
- return VfdCache[file].seekPos;
+ return -1;
}
-#endif
int
FileTruncate(File file, off_t offset)
diff --git src/backend/storage/smgr/md.c src/backend/storage/smgr/md.c
index f329d15..5fc9a6b 100644
--- src/backend/storage/smgr/md.c
+++ src/backend/storage/smgr/md.c
@@ -527,22 +527,7 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
- /*
- * Note: because caller usually obtained blocknum by calling mdnblocks,
- * which did a seek(SEEK_END), this seek is often redundant and will be
- * optimized away by fd.c. It's not redundant, however, if there is a
- * partial page at the end of the file. In that case we want to try to
- * overwrite the partial page with a full page. It's also not redundant
- * if bufmgr.c had to dump another buffer of the same file to make room
- * for the new page's buffer.
- */
- if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not seek to block %u in file \"%s\": %m",
- blocknum, FilePathName(v->mdfd_vfd))));
-
- if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
+ if ((nbytes = FileWriteAt(v->mdfd_vfd, buffer, BLCKSZ, seekpos)) != BLCKSZ)
{
if (nbytes < 0)
ereport(ERROR,
@@ -749,13 +734,7 @@ mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
- if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not seek to block %u in file \"%s\": %m",
- blocknum, FilePathName(v->mdfd_vfd))));
-
- nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ);
+ nbytes = FileReadAt(v->mdfd_vfd, buffer, BLCKSZ, seekpos);
TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum,
reln->smgr_rnode.node.spcNode,
@@ -825,13 +804,7 @@ mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
- if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not seek to block %u in file \"%s\": %m",
- blocknum, FilePathName(v->mdfd_vfd))));
-
- nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ);
+ nbytes = FileWriteAt(v->mdfd_vfd, buffer, BLCKSZ, seekpos);
TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum,
reln->smgr_rnode.node.spcNode,
diff --git src/include/storage/fd.h src/include/storage/fd.h
index cbc2224..0b6b846 100644
--- src/include/storage/fd.h
+++ src/include/storage/fd.h
@@ -69,8 +69,8 @@ extern File PathNameOpenFile(FileName fileName, int fileFlags, int fileMode);
extern File OpenTemporaryFile(bool interXact);
extern void FileClose(File file);
extern int FilePrefetch(File file, off_t offset, int amount);
-extern int FileRead(File file, char *buffer, int amount);
-extern int FileWrite(File file, char *buffer, int amount);
+extern int FileReadAt(File file, char *buffer, int amount, off_t offset);
+extern int FileWriteAt(File file, char *buffer, int amount, off_t offset);
extern int FileSync(File file);
extern off_t FileSeek(File file, off_t offset, int whence);
extern int FileTruncate(File file, off_t offset);
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers