Hi,

The Uber blog post, among other things, pointed out that PG uses lseek + read instead of pread. I didn't see any discussion around that and my Google searches didn't find any posts about pread / pwrite for the past 10 years.

With that plus the "C++ port" thread in mind, I was wondering if it's time to see if we could do better by just utilizing newer C and POSIX features.

The attached patch replaces FileWrite and FileRead with FileWriteAt and FileReadAt and removes most FileSeek calls. FileSeek is still around so we can find the end of a file, but it's not used for anything else.

On my laptop a simple pgbench run (scale 100, 15 minutes) shows a 1.5% performance improvement. A 1.5% performance improvement is small but measurable - and IMV more importantly it allows us to drop more than 100 lines of backwards (compatible?) code; maybe we could start targeting more recent platforms in v10?

Obviously this patch needs some more work before it could be merged, and we probably still need a fallback for some platforms without pread and pwrite (afaik Windows doesn't implement them.)

/ Oskari
diff --git src/backend/access/heap/rewriteheap.c src/backend/access/heap/rewriteheap.c
index f9ce986..6dc5df3 100644
--- src/backend/access/heap/rewriteheap.c
+++ src/backend/access/heap/rewriteheap.c
@@ -918,7 +918,7 @@ logical_heap_rewrite_flush_mappings(RewriteState state)
 		 * Note that we deviate from the usual WAL coding practices here,
 		 * check the above "Logical rewrite support" comment for reasoning.
 		 */
-		written = FileWrite(src->vfd, waldata_start, len);
+		written = FileWriteAt(src->vfd, waldata_start, len, src->off);
 		if (written != len)
 			ereport(ERROR,
 					(errcode_for_file_access(),
diff --git src/backend/storage/file/buffile.c src/backend/storage/file/buffile.c
index 042be79..39482fa 100644
--- src/backend/storage/file/buffile.c
+++ src/backend/storage/file/buffile.c
@@ -60,12 +60,6 @@ struct BufFile
 	int			numFiles;		/* number of physical files in set */
 	/* all files except the last have length exactly MAX_PHYSICAL_FILESIZE */
 	File	   *files;			/* palloc'd array with numFiles entries */
-	off_t	   *offsets;		/* palloc'd array with numFiles entries */
-
-	/*
-	 * offsets[i] is the current seek position of files[i].  We use this to
-	 * avoid making redundant FileSeek calls.
-	 */
 
 	bool		isTemp;			/* can only add files if this is TRUE */
 	bool		isInterXact;	/* keep open over transactions? */
@@ -108,8 +102,6 @@ makeBufFile(File firstfile)
 	file->numFiles = 1;
 	file->files = (File *) palloc(sizeof(File));
 	file->files[0] = firstfile;
-	file->offsets = (off_t *) palloc(sizeof(off_t));
-	file->offsets[0] = 0L;
 	file->isTemp = false;
 	file->isInterXact = false;
 	file->dirty = false;
@@ -143,10 +135,7 @@ extendBufFile(BufFile *file)
 
 	file->files = (File *) repalloc(file->files,
 									(file->numFiles + 1) * sizeof(File));
-	file->offsets = (off_t *) repalloc(file->offsets,
-									   (file->numFiles + 1) * sizeof(off_t));
 	file->files[file->numFiles] = pfile;
-	file->offsets[file->numFiles] = 0L;
 	file->numFiles++;
 }
 
@@ -210,7 +199,6 @@ BufFileClose(BufFile *file)
 		FileClose(file->files[i]);
 	/* release the buffer space */
 	pfree(file->files);
-	pfree(file->offsets);
 	pfree(file);
 }
 
@@ -241,23 +229,12 @@ BufFileLoadBuffer(BufFile *file)
 	}
 
 	/*
-	 * May need to reposition physical file.
-	 */
-	thisfile = file->files[file->curFile];
-	if (file->curOffset != file->offsets[file->curFile])
-	{
-		if (FileSeek(thisfile, file->curOffset, SEEK_SET) != file->curOffset)
-			return;				/* seek failed, read nothing */
-		file->offsets[file->curFile] = file->curOffset;
-	}
-
-	/*
 	 * Read whatever we can get, up to a full bufferload.
 	 */
-	file->nbytes = FileRead(thisfile, file->buffer, sizeof(file->buffer));
+	thisfile = file->files[file->curFile];
+	file->nbytes = FileReadAt(thisfile, file->buffer, sizeof(file->buffer), file->curOffset);
 	if (file->nbytes < 0)
 		file->nbytes = 0;
-	file->offsets[file->curFile] += file->nbytes;
 	/* we choose not to advance curOffset here */
 
 	pgBufferUsage.temp_blks_read++;
@@ -307,20 +284,10 @@ BufFileDumpBuffer(BufFile *file)
 				bytestowrite = (int) availbytes;
 		}
 
-		/*
-		 * May need to reposition physical file.
-		 */
 		thisfile = file->files[file->curFile];
-		if (file->curOffset != file->offsets[file->curFile])
-		{
-			if (FileSeek(thisfile, file->curOffset, SEEK_SET) != file->curOffset)
-				return;			/* seek failed, give up */
-			file->offsets[file->curFile] = file->curOffset;
-		}
-		bytestowrite = FileWrite(thisfile, file->buffer + wpos, bytestowrite);
+		bytestowrite = FileWriteAt(thisfile, file->buffer + wpos, bytestowrite, file->curOffset);
 		if (bytestowrite <= 0)
 			return;				/* failed to write */
-		file->offsets[file->curFile] += bytestowrite;
 		file->curOffset += bytestowrite;
 		wpos += bytestowrite;
 
diff --git src/backend/storage/file/fd.c src/backend/storage/file/fd.c
index 03143f1..795cb12 100644
--- src/backend/storage/file/fd.c
+++ src/backend/storage/file/fd.c
@@ -16,8 +16,8 @@
  * including base tables, scratch files (e.g., sort and hash spool
  * files), and random calls to C library routines like system(3); it
  * is quite easy to exceed system limits on the number of open files a
- * single process can have.  (This is around 256 on many modern
- * operating systems, but can be as low as 32 on others.)
+ * single process can have.  (This is around 1024 on many modern
+ * operating systems, but may be lower on others.)
  *
  * VFDs are managed as an LRU pool, with actual OS file descriptors
  * being opened and closed as needed.  Obviously, if a routine is
@@ -174,7 +174,6 @@ typedef struct vfd
 	File		nextFree;		/* link to next free VFD, if in freelist */
 	File		lruMoreRecently;	/* doubly linked recency-of-use list */
 	File		lruLessRecently;
-	off_t		seekPos;		/* current logical file position */
 	off_t		fileSize;		/* current size of file (0 if not temporary) */
 	char	   *fileName;		/* name of file, or NULL for unused VFD */
 	/* NB: fileName is malloc'd, and must be free'd when closing the VFD */
@@ -970,10 +969,6 @@ LruDelete(File file)
 	/* delete the vfd record from the LRU ring */
 	Delete(file);
 
-	/* save the seek position */
-	vfdP->seekPos = lseek(vfdP->fd, (off_t) 0, SEEK_CUR);
-	Assert(vfdP->seekPos != (off_t) -1);
-
 	/* close the file */
 	if (close(vfdP->fd))
 		elog(ERROR, "could not close file \"%s\": %m", vfdP->fileName);
@@ -1038,15 +1033,6 @@ LruInsert(File file)
 			DO_DB(elog(LOG, "RE_OPEN SUCCESS"));
 			++nfile;
 		}
-
-		/* seek to the right position */
-		if (vfdP->seekPos != (off_t) 0)
-		{
-			off_t returnValue PG_USED_FOR_ASSERTS_ONLY;
-
-			returnValue = lseek(vfdP->fd, vfdP->seekPos, SEEK_SET);
-			Assert(returnValue != (off_t) -1);
-		}
 	}
 
 	/*
@@ -1270,7 +1256,6 @@ PathNameOpenFile(FileName fileName, int fileFlags, int fileMode)
 	/* Saved flags are adjusted to be OK for re-opening file */
 	vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
 	vfdP->fileMode = fileMode;
-	vfdP->seekPos = 0;
 	vfdP->fileSize = 0;
 	vfdP->fdstate = 0x0;
 	vfdP->resowner = NULL;
@@ -1563,7 +1548,7 @@ FileWriteback(File file, off_t offset, off_t nbytes)
 }
 
 int
-FileRead(File file, char *buffer, int amount)
+FileReadAt(File file, char *buffer, int amount, off_t offset)
 {
 	int			returnCode;
 
@@ -1571,7 +1556,7 @@ FileRead(File file, char *buffer, int amount)
 
 	DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %d %p",
 			   file, VfdCache[file].fileName,
-			   (int64) VfdCache[file].seekPos,
+			   (int64) offset,
 			   amount, buffer));
 
 	returnCode = FileAccess(file);
@@ -1579,11 +1564,9 @@ FileRead(File file, char *buffer, int amount)
 		return returnCode;
 
 retry:
-	returnCode = read(VfdCache[file].fd, buffer, amount);
+	returnCode = pread(VfdCache[file].fd, buffer, amount, offset);
 
-	if (returnCode >= 0)
-		VfdCache[file].seekPos += returnCode;
-	else
+	if (returnCode < 0)
 	{
 		/*
 		 * Windows may run out of kernel buffers and return "Insufficient
@@ -1609,16 +1592,13 @@ retry:
 		/* OK to retry if interrupted */
 		if (errno == EINTR)
 			goto retry;
-
-		/* Trouble, so assume we don't know the file position anymore */
-		VfdCache[file].seekPos = FileUnknownPos;
 	}
 
 	return returnCode;
 }
 
 int
-FileWrite(File file, char *buffer, int amount)
+FileWriteAt(File file, char *buffer, int amount, off_t offset)
 {
 	int			returnCode;
 
@@ -1626,7 +1606,7 @@ FileWrite(File file, char *buffer, int amount)
 
 	DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p",
 			   file, VfdCache[file].fileName,
-			   (int64) VfdCache[file].seekPos,
+			   (int64) offset,
 			   amount, buffer));
 
 	returnCode = FileAccess(file);
@@ -1643,7 +1623,7 @@ FileWrite(File file, char *buffer, int amount)
 	 */
 	if (temp_file_limit >= 0 && (VfdCache[file].fdstate & FD_TEMPORARY))
 	{
-		off_t		newPos = VfdCache[file].seekPos + amount;
+		off_t		newPos = offset + amount;
 
 		if (newPos > VfdCache[file].fileSize)
 		{
@@ -1660,7 +1640,7 @@ FileWrite(File file, char *buffer, int amount)
 
 retry:
 	errno = 0;
-	returnCode = write(VfdCache[file].fd, buffer, amount);
+	returnCode = pwrite(VfdCache[file].fd, buffer, amount, offset);
 
 	/* if write didn't set errno, assume problem is no disk space */
 	if (returnCode != amount && errno == 0)
@@ -1668,12 +1648,10 @@ retry:
 
 	if (returnCode >= 0)
 	{
-		VfdCache[file].seekPos += returnCode;
-
 		/* maintain fileSize and temporary_files_size if it's a temp file */
 		if (VfdCache[file].fdstate & FD_TEMPORARY)
 		{
-			off_t		newPos = VfdCache[file].seekPos;
+			off_t		newPos = offset + returnCode;
 
 			if (newPos > VfdCache[file].fileSize)
 			{
@@ -1685,7 +1663,7 @@ retry:
 	else
 	{
 		/*
-		 * See comments in FileRead()
+		 * See comments in FileReadAt()
 		 */
 #ifdef WIN32
 		DWORD		error = GetLastError();
@@ -1704,9 +1682,6 @@ retry:
 		/* OK to retry if interrupted */
 		if (errno == EINTR)
 			goto retry;
-
-		/* Trouble, so assume we don't know the file position anymore */
-		VfdCache[file].seekPos = FileUnknownPos;
 	}
 
 	return returnCode;
@@ -1736,78 +1711,33 @@ FileSeek(File file, off_t offset, int whence)
 
 	Assert(FileIsValid(file));
 
-	DO_DB(elog(LOG, "FileSeek: %d (%s) " INT64_FORMAT " " INT64_FORMAT " %d",
+	DO_DB(elog(LOG, "FileSeek: %d (%s) " INT64_FORMAT " %d",
 			   file, VfdCache[file].fileName,
-			   (int64) VfdCache[file].seekPos,
 			   (int64) offset, whence));
 
-	if (FileIsNotOpen(file))
+	switch (whence)
 	{
-		switch (whence)
-		{
-			case SEEK_SET:
-				if (offset < 0)
-					elog(ERROR, "invalid seek offset: " INT64_FORMAT,
-						 (int64) offset);
-				VfdCache[file].seekPos = offset;
-				break;
-			case SEEK_CUR:
-				VfdCache[file].seekPos += offset;
-				break;
-			case SEEK_END:
+		case SEEK_SET:
+			if (offset < 0)
+				elog(ERROR, "invalid seek offset: " INT64_FORMAT,
+					 (int64) offset);
+			return offset;
+
+		case SEEK_END:
+			if (FileIsNotOpen(file))
+			{
 				returnCode = FileAccess(file);
 				if (returnCode < 0)
 					return returnCode;
-				VfdCache[file].seekPos = lseek(VfdCache[file].fd,
-											   offset, whence);
-				break;
-			default:
-				elog(ERROR, "invalid whence: %d", whence);
-				break;
-		}
-	}
-	else
-	{
-		switch (whence)
-		{
-			case SEEK_SET:
-				if (offset < 0)
-					elog(ERROR, "invalid seek offset: " INT64_FORMAT,
-						 (int64) offset);
-				if (VfdCache[file].seekPos != offset)
-					VfdCache[file].seekPos = lseek(VfdCache[file].fd,
-												   offset, whence);
-				break;
-			case SEEK_CUR:
-				if (offset != 0 || VfdCache[file].seekPos == FileUnknownPos)
-					VfdCache[file].seekPos = lseek(VfdCache[file].fd,
-												   offset, whence);
-				break;
-			case SEEK_END:
-				VfdCache[file].seekPos = lseek(VfdCache[file].fd,
-											   offset, whence);
-				break;
-			default:
-				elog(ERROR, "invalid whence: %d", whence);
-				break;
-		}
+			}
+			return lseek(VfdCache[file].fd, offset, whence);
+
+		default:
+			elog(ERROR, "invalid whence: %d", whence);
 	}
-	return VfdCache[file].seekPos;
-}
 
-/*
- * XXX not actually used but here for completeness
- */
-#ifdef NOT_USED
-off_t
-FileTell(File file)
-{
-	Assert(FileIsValid(file));
-	DO_DB(elog(LOG, "FileTell %d (%s)",
-			   file, VfdCache[file].fileName));
-	return VfdCache[file].seekPos;
+	return -1;
 }
-#endif
 
 int
 FileTruncate(File file, off_t offset)
diff --git src/backend/storage/smgr/md.c src/backend/storage/smgr/md.c
index f329d15..5fc9a6b 100644
--- src/backend/storage/smgr/md.c
+++ src/backend/storage/smgr/md.c
@@ -527,22 +527,7 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 
 	Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
 
-	/*
-	 * Note: because caller usually obtained blocknum by calling mdnblocks,
-	 * which did a seek(SEEK_END), this seek is often redundant and will be
-	 * optimized away by fd.c.  It's not redundant, however, if there is a
-	 * partial page at the end of the file. In that case we want to try to
-	 * overwrite the partial page with a full page.  It's also not redundant
-	 * if bufmgr.c had to dump another buffer of the same file to make room
-	 * for the new page's buffer.
-	 */
-	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
-		ereport(ERROR,
-				(errcode_for_file_access(),
-				 errmsg("could not seek to block %u in file \"%s\": %m",
-						blocknum, FilePathName(v->mdfd_vfd))));
-
-	if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
+	if ((nbytes = FileWriteAt(v->mdfd_vfd, buffer, BLCKSZ, seekpos)) != BLCKSZ)
 	{
 		if (nbytes < 0)
 			ereport(ERROR,
@@ -749,13 +734,7 @@ mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 
 	Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
 
-	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
-		ereport(ERROR,
-				(errcode_for_file_access(),
-				 errmsg("could not seek to block %u in file \"%s\": %m",
-						blocknum, FilePathName(v->mdfd_vfd))));
-
-	nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ);
+	nbytes = FileReadAt(v->mdfd_vfd, buffer, BLCKSZ, seekpos);
 
 	TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum,
 									   reln->smgr_rnode.node.spcNode,
@@ -825,13 +804,7 @@ mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 
 	Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
 
-	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
-		ereport(ERROR,
-				(errcode_for_file_access(),
-				 errmsg("could not seek to block %u in file \"%s\": %m",
-						blocknum, FilePathName(v->mdfd_vfd))));
-
-	nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ);
+	nbytes = FileWriteAt(v->mdfd_vfd, buffer, BLCKSZ, seekpos);
 
 	TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum,
 										reln->smgr_rnode.node.spcNode,
diff --git src/include/storage/fd.h src/include/storage/fd.h
index cbc2224..0b6b846 100644
--- src/include/storage/fd.h
+++ src/include/storage/fd.h
@@ -69,8 +69,8 @@ extern File PathNameOpenFile(FileName fileName, int fileFlags, int fileMode);
 extern File OpenTemporaryFile(bool interXact);
 extern void FileClose(File file);
 extern int	FilePrefetch(File file, off_t offset, int amount);
-extern int	FileRead(File file, char *buffer, int amount);
-extern int	FileWrite(File file, char *buffer, int amount);
+extern int	FileReadAt(File file, char *buffer, int amount, off_t offset);
+extern int	FileWriteAt(File file, char *buffer, int amount, off_t offset);
 extern int	FileSync(File file);
 extern off_t FileSeek(File file, off_t offset, int whence);
 extern int	FileTruncate(File file, off_t offset);
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to