17.08.2016, 16:40, Tom Lane kirjoitti:
Oskari Saarenmaa <o...@ohmu.fi> writes:
On my laptop a simple pgbench run (scale 100, 15 minutes) shows a 1.5%
performance improvement.

I would have hoped for a lot better result before anyone would propose
that we should deal with all the portability issues this'll create.

AFAICT pread and pwrite are available on pretty much all operating systems released in 2000s; it was added to Linux in 1997. Windows and HP-UX 10.20 don't have it, but we can just simulate it using lseek + read/write there without adding too much code.

A 1.5% performance improvement is small but
measurable - and IMV more importantly it allows us to drop more than 100
lines of backwards (compatible?) code; maybe we could start targeting
more recent platforms in v10?

That's basically nonsense: we'll end up adding way more than that to
deal with platforms that haven't got these APIs.

Attached an updated patch that adds a configure check and uses lseek+read/write instead pread/pwrite when the latter aren't available. The previous code ended up seeking anyway in most of the cases and pgbench shows no performance regression on my Linux box.

 8 files changed, 54 insertions(+), 168 deletions(-)

/ Oskari
diff --git configure configure
index 45c8eef..4d50b52 100755
--- configure
+++ configure
@@ -12473,7 +12473,7 @@ fi
 LIBS_including_readline="$LIBS"
 LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'`
 
-for ac_func in cbrt dlopen fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll pstat pthread_is_threaded_np readlink setproctitle setsid shm_open symlink sync_file_range towlower utime utimes wcstombs wcstombs_l
+for ac_func in cbrt dlopen fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll pread pstat pthread_is_threaded_np pwrite readlink setproctitle setsid shm_open symlink sync_file_range towlower utime utimes wcstombs wcstombs_l
 do :
   as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
 ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var"
diff --git configure.in configure.in
index c878b4e..5eddaca 100644
--- configure.in
+++ configure.in
@@ -1446,7 +1446,7 @@ PGAC_FUNC_WCSTOMBS_L
 LIBS_including_readline="$LIBS"
 LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'`
 
-AC_CHECK_FUNCS([cbrt dlopen fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll pstat pthread_is_threaded_np readlink setproctitle setsid shm_open symlink sync_file_range towlower utime utimes wcstombs wcstombs_l])
+AC_CHECK_FUNCS([cbrt dlopen fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll pread pstat pthread_is_threaded_np pwrite readlink setproctitle setsid shm_open symlink sync_file_range towlower utime utimes wcstombs wcstombs_l])
 
 AC_REPLACE_FUNCS(fseeko)
 case $host_os in
diff --git src/backend/access/heap/rewriteheap.c src/backend/access/heap/rewriteheap.c
index f9ce986..6dc5df3 100644
--- src/backend/access/heap/rewriteheap.c
+++ src/backend/access/heap/rewriteheap.c
@@ -918,7 +918,7 @@ logical_heap_rewrite_flush_mappings(RewriteState state)
 		 * Note that we deviate from the usual WAL coding practices here,
 		 * check the above "Logical rewrite support" comment for reasoning.
 		 */
-		written = FileWrite(src->vfd, waldata_start, len);
+		written = FileWriteAt(src->vfd, waldata_start, len, src->off);
 		if (written != len)
 			ereport(ERROR,
 					(errcode_for_file_access(),
diff --git src/backend/storage/file/buffile.c src/backend/storage/file/buffile.c
index 042be79..39482fa 100644
--- src/backend/storage/file/buffile.c
+++ src/backend/storage/file/buffile.c
@@ -60,12 +60,6 @@ struct BufFile
 	int			numFiles;		/* number of physical files in set */
 	/* all files except the last have length exactly MAX_PHYSICAL_FILESIZE */
 	File	   *files;			/* palloc'd array with numFiles entries */
-	off_t	   *offsets;		/* palloc'd array with numFiles entries */
-
-	/*
-	 * offsets[i] is the current seek position of files[i].  We use this to
-	 * avoid making redundant FileSeek calls.
-	 */
 
 	bool		isTemp;			/* can only add files if this is TRUE */
 	bool		isInterXact;	/* keep open over transactions? */
@@ -108,8 +102,6 @@ makeBufFile(File firstfile)
 	file->numFiles = 1;
 	file->files = (File *) palloc(sizeof(File));
 	file->files[0] = firstfile;
-	file->offsets = (off_t *) palloc(sizeof(off_t));
-	file->offsets[0] = 0L;
 	file->isTemp = false;
 	file->isInterXact = false;
 	file->dirty = false;
@@ -143,10 +135,7 @@ extendBufFile(BufFile *file)
 
 	file->files = (File *) repalloc(file->files,
 									(file->numFiles + 1) * sizeof(File));
-	file->offsets = (off_t *) repalloc(file->offsets,
-									   (file->numFiles + 1) * sizeof(off_t));
 	file->files[file->numFiles] = pfile;
-	file->offsets[file->numFiles] = 0L;
 	file->numFiles++;
 }
 
@@ -210,7 +199,6 @@ BufFileClose(BufFile *file)
 		FileClose(file->files[i]);
 	/* release the buffer space */
 	pfree(file->files);
-	pfree(file->offsets);
 	pfree(file);
 }
 
@@ -241,23 +229,12 @@ BufFileLoadBuffer(BufFile *file)
 	}
 
 	/*
-	 * May need to reposition physical file.
-	 */
-	thisfile = file->files[file->curFile];
-	if (file->curOffset != file->offsets[file->curFile])
-	{
-		if (FileSeek(thisfile, file->curOffset, SEEK_SET) != file->curOffset)
-			return;				/* seek failed, read nothing */
-		file->offsets[file->curFile] = file->curOffset;
-	}
-
-	/*
 	 * Read whatever we can get, up to a full bufferload.
 	 */
-	file->nbytes = FileRead(thisfile, file->buffer, sizeof(file->buffer));
+	thisfile = file->files[file->curFile];
+	file->nbytes = FileReadAt(thisfile, file->buffer, sizeof(file->buffer), file->curOffset);
 	if (file->nbytes < 0)
 		file->nbytes = 0;
-	file->offsets[file->curFile] += file->nbytes;
 	/* we choose not to advance curOffset here */
 
 	pgBufferUsage.temp_blks_read++;
@@ -307,20 +284,10 @@ BufFileDumpBuffer(BufFile *file)
 				bytestowrite = (int) availbytes;
 		}
 
-		/*
-		 * May need to reposition physical file.
-		 */
 		thisfile = file->files[file->curFile];
-		if (file->curOffset != file->offsets[file->curFile])
-		{
-			if (FileSeek(thisfile, file->curOffset, SEEK_SET) != file->curOffset)
-				return;			/* seek failed, give up */
-			file->offsets[file->curFile] = file->curOffset;
-		}
-		bytestowrite = FileWrite(thisfile, file->buffer + wpos, bytestowrite);
+		bytestowrite = FileWriteAt(thisfile, file->buffer + wpos, bytestowrite, file->curOffset);
 		if (bytestowrite <= 0)
 			return;				/* failed to write */
-		file->offsets[file->curFile] += bytestowrite;
 		file->curOffset += bytestowrite;
 		wpos += bytestowrite;
 
diff --git src/backend/storage/file/fd.c src/backend/storage/file/fd.c
index 03143f1..56348f5 100644
--- src/backend/storage/file/fd.c
+++ src/backend/storage/file/fd.c
@@ -16,8 +16,8 @@
  * including base tables, scratch files (e.g., sort and hash spool
  * files), and random calls to C library routines like system(3); it
  * is quite easy to exceed system limits on the number of open files a
- * single process can have.  (This is around 256 on many modern
- * operating systems, but can be as low as 32 on others.)
+ * single process can have.  (This is around 1024 on many modern
+ * operating systems, but may be lower on others.)
  *
  * VFDs are managed as an LRU pool, with actual OS file descriptors
  * being opened and closed as needed.  Obviously, if a routine is
@@ -174,7 +174,6 @@ typedef struct vfd
 	File		nextFree;		/* link to next free VFD, if in freelist */
 	File		lruMoreRecently;	/* doubly linked recency-of-use list */
 	File		lruLessRecently;
-	off_t		seekPos;		/* current logical file position */
 	off_t		fileSize;		/* current size of file (0 if not temporary) */
 	char	   *fileName;		/* name of file, or NULL for unused VFD */
 	/* NB: fileName is malloc'd, and must be free'd when closing the VFD */
@@ -970,10 +969,6 @@ LruDelete(File file)
 	/* delete the vfd record from the LRU ring */
 	Delete(file);
 
-	/* save the seek position */
-	vfdP->seekPos = lseek(vfdP->fd, (off_t) 0, SEEK_CUR);
-	Assert(vfdP->seekPos != (off_t) -1);
-
 	/* close the file */
 	if (close(vfdP->fd))
 		elog(ERROR, "could not close file \"%s\": %m", vfdP->fileName);
@@ -1038,15 +1033,6 @@ LruInsert(File file)
 			DO_DB(elog(LOG, "RE_OPEN SUCCESS"));
 			++nfile;
 		}
-
-		/* seek to the right position */
-		if (vfdP->seekPos != (off_t) 0)
-		{
-			off_t returnValue PG_USED_FOR_ASSERTS_ONLY;
-
-			returnValue = lseek(vfdP->fd, vfdP->seekPos, SEEK_SET);
-			Assert(returnValue != (off_t) -1);
-		}
 	}
 
 	/*
@@ -1270,7 +1256,6 @@ PathNameOpenFile(FileName fileName, int fileFlags, int fileMode)
 	/* Saved flags are adjusted to be OK for re-opening file */
 	vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
 	vfdP->fileMode = fileMode;
-	vfdP->seekPos = 0;
 	vfdP->fileSize = 0;
 	vfdP->fdstate = 0x0;
 	vfdP->resowner = NULL;
@@ -1563,7 +1548,7 @@ FileWriteback(File file, off_t offset, off_t nbytes)
 }
 
 int
-FileRead(File file, char *buffer, int amount)
+FileReadAt(File file, char *buffer, int amount, off_t offset)
 {
 	int			returnCode;
 
@@ -1571,7 +1556,7 @@ FileRead(File file, char *buffer, int amount)
 
 	DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %d %p",
 			   file, VfdCache[file].fileName,
-			   (int64) VfdCache[file].seekPos,
+			   (int64) offset,
 			   amount, buffer));
 
 	returnCode = FileAccess(file);
@@ -1579,11 +1564,14 @@ FileRead(File file, char *buffer, int amount)
 		return returnCode;
 
 retry:
+#ifdef HAVE_PREAD
+	returnCode = pread(VfdCache[file].fd, buffer, amount, offset);
+#else
+	lseek(VfdCache[file].fd, offset, SEEK_SET);
 	returnCode = read(VfdCache[file].fd, buffer, amount);
+#endif
 
-	if (returnCode >= 0)
-		VfdCache[file].seekPos += returnCode;
-	else
+	if (returnCode < 0)
 	{
 		/*
 		 * Windows may run out of kernel buffers and return "Insufficient
@@ -1609,16 +1597,13 @@ retry:
 		/* OK to retry if interrupted */
 		if (errno == EINTR)
 			goto retry;
-
-		/* Trouble, so assume we don't know the file position anymore */
-		VfdCache[file].seekPos = FileUnknownPos;
 	}
 
 	return returnCode;
 }
 
 int
-FileWrite(File file, char *buffer, int amount)
+FileWriteAt(File file, char *buffer, int amount, off_t offset)
 {
 	int			returnCode;
 
@@ -1626,7 +1611,7 @@ FileWrite(File file, char *buffer, int amount)
 
 	DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p",
 			   file, VfdCache[file].fileName,
-			   (int64) VfdCache[file].seekPos,
+			   (int64) offset,
 			   amount, buffer));
 
 	returnCode = FileAccess(file);
@@ -1643,7 +1628,7 @@ FileWrite(File file, char *buffer, int amount)
 	 */
 	if (temp_file_limit >= 0 && (VfdCache[file].fdstate & FD_TEMPORARY))
 	{
-		off_t		newPos = VfdCache[file].seekPos + amount;
+		off_t		newPos = offset + amount;
 
 		if (newPos > VfdCache[file].fileSize)
 		{
@@ -1660,7 +1645,12 @@ FileWrite(File file, char *buffer, int amount)
 
 retry:
 	errno = 0;
+#ifdef HAVE_PWRITE
+	returnCode = pwrite(VfdCache[file].fd, buffer, amount, offset);
+#else
+	lseek(VfdCache[file].fd, offset, SEEK_SET);
 	returnCode = write(VfdCache[file].fd, buffer, amount);
+#endif
 
 	/* if write didn't set errno, assume problem is no disk space */
 	if (returnCode != amount && errno == 0)
@@ -1668,12 +1658,10 @@ retry:
 
 	if (returnCode >= 0)
 	{
-		VfdCache[file].seekPos += returnCode;
-
 		/* maintain fileSize and temporary_files_size if it's a temp file */
 		if (VfdCache[file].fdstate & FD_TEMPORARY)
 		{
-			off_t		newPos = VfdCache[file].seekPos;
+			off_t		newPos = offset + returnCode;
 
 			if (newPos > VfdCache[file].fileSize)
 			{
@@ -1685,7 +1673,7 @@ retry:
 	else
 	{
 		/*
-		 * See comments in FileRead()
+		 * See comments in FileReadAt()
 		 */
 #ifdef WIN32
 		DWORD		error = GetLastError();
@@ -1704,9 +1692,6 @@ retry:
 		/* OK to retry if interrupted */
 		if (errno == EINTR)
 			goto retry;
-
-		/* Trouble, so assume we don't know the file position anymore */
-		VfdCache[file].seekPos = FileUnknownPos;
 	}
 
 	return returnCode;
@@ -1736,78 +1721,33 @@ FileSeek(File file, off_t offset, int whence)
 
 	Assert(FileIsValid(file));
 
-	DO_DB(elog(LOG, "FileSeek: %d (%s) " INT64_FORMAT " " INT64_FORMAT " %d",
+	DO_DB(elog(LOG, "FileSeek: %d (%s) " INT64_FORMAT " %d",
 			   file, VfdCache[file].fileName,
-			   (int64) VfdCache[file].seekPos,
 			   (int64) offset, whence));
 
-	if (FileIsNotOpen(file))
+	switch (whence)
 	{
-		switch (whence)
-		{
-			case SEEK_SET:
-				if (offset < 0)
-					elog(ERROR, "invalid seek offset: " INT64_FORMAT,
-						 (int64) offset);
-				VfdCache[file].seekPos = offset;
-				break;
-			case SEEK_CUR:
-				VfdCache[file].seekPos += offset;
-				break;
-			case SEEK_END:
+		case SEEK_SET:
+			if (offset < 0)
+				elog(ERROR, "invalid seek offset: " INT64_FORMAT,
+					 (int64) offset);
+			return offset;
+
+		case SEEK_END:
+			if (FileIsNotOpen(file))
+			{
 				returnCode = FileAccess(file);
 				if (returnCode < 0)
 					return returnCode;
-				VfdCache[file].seekPos = lseek(VfdCache[file].fd,
-											   offset, whence);
-				break;
-			default:
-				elog(ERROR, "invalid whence: %d", whence);
-				break;
-		}
-	}
-	else
-	{
-		switch (whence)
-		{
-			case SEEK_SET:
-				if (offset < 0)
-					elog(ERROR, "invalid seek offset: " INT64_FORMAT,
-						 (int64) offset);
-				if (VfdCache[file].seekPos != offset)
-					VfdCache[file].seekPos = lseek(VfdCache[file].fd,
-												   offset, whence);
-				break;
-			case SEEK_CUR:
-				if (offset != 0 || VfdCache[file].seekPos == FileUnknownPos)
-					VfdCache[file].seekPos = lseek(VfdCache[file].fd,
-												   offset, whence);
-				break;
-			case SEEK_END:
-				VfdCache[file].seekPos = lseek(VfdCache[file].fd,
-											   offset, whence);
-				break;
-			default:
-				elog(ERROR, "invalid whence: %d", whence);
-				break;
-		}
+			}
+			return lseek(VfdCache[file].fd, offset, whence);
+
+		default:
+			elog(ERROR, "invalid whence: %d", whence);
 	}
-	return VfdCache[file].seekPos;
-}
 
-/*
- * XXX not actually used but here for completeness
- */
-#ifdef NOT_USED
-off_t
-FileTell(File file)
-{
-	Assert(FileIsValid(file));
-	DO_DB(elog(LOG, "FileTell %d (%s)",
-			   file, VfdCache[file].fileName));
-	return VfdCache[file].seekPos;
+	return -1;
 }
-#endif
 
 int
 FileTruncate(File file, off_t offset)
diff --git src/backend/storage/smgr/md.c src/backend/storage/smgr/md.c
index f329d15..5fc9a6b 100644
--- src/backend/storage/smgr/md.c
+++ src/backend/storage/smgr/md.c
@@ -527,22 +527,7 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 
 	Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
 
-	/*
-	 * Note: because caller usually obtained blocknum by calling mdnblocks,
-	 * which did a seek(SEEK_END), this seek is often redundant and will be
-	 * optimized away by fd.c.  It's not redundant, however, if there is a
-	 * partial page at the end of the file. In that case we want to try to
-	 * overwrite the partial page with a full page.  It's also not redundant
-	 * if bufmgr.c had to dump another buffer of the same file to make room
-	 * for the new page's buffer.
-	 */
-	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
-		ereport(ERROR,
-				(errcode_for_file_access(),
-				 errmsg("could not seek to block %u in file \"%s\": %m",
-						blocknum, FilePathName(v->mdfd_vfd))));
-
-	if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
+	if ((nbytes = FileWriteAt(v->mdfd_vfd, buffer, BLCKSZ, seekpos)) != BLCKSZ)
 	{
 		if (nbytes < 0)
 			ereport(ERROR,
@@ -749,13 +734,7 @@ mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 
 	Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
 
-	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
-		ereport(ERROR,
-				(errcode_for_file_access(),
-				 errmsg("could not seek to block %u in file \"%s\": %m",
-						blocknum, FilePathName(v->mdfd_vfd))));
-
-	nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ);
+	nbytes = FileReadAt(v->mdfd_vfd, buffer, BLCKSZ, seekpos);
 
 	TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum,
 									   reln->smgr_rnode.node.spcNode,
@@ -825,13 +804,7 @@ mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 
 	Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
 
-	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
-		ereport(ERROR,
-				(errcode_for_file_access(),
-				 errmsg("could not seek to block %u in file \"%s\": %m",
-						blocknum, FilePathName(v->mdfd_vfd))));
-
-	nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ);
+	nbytes = FileWriteAt(v->mdfd_vfd, buffer, BLCKSZ, seekpos);
 
 	TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum,
 										reln->smgr_rnode.node.spcNode,
diff --git src/include/pg_config.h.in src/include/pg_config.h.in
index b621ff2..7d6723a 100644
--- src/include/pg_config.h.in
+++ src/include/pg_config.h.in
@@ -382,6 +382,9 @@
 /* Define to 1 if the assembler supports PPC's LWARX mutex hint bit. */
 #undef HAVE_PPC_LWARX_MUTEX_HINT
 
+/* Define to 1 if you have the `pread' function. */
+#undef HAVE_PREAD
+
 /* Define to 1 if you have the `pstat' function. */
 #undef HAVE_PSTAT
 
@@ -400,6 +403,9 @@
 /* Define to 1 if you have the <pwd.h> header file. */
 #undef HAVE_PWD_H
 
+/* Define to 1 if you have the `pwrite' function. */
+#undef HAVE_PWRITE
+
 /* Define to 1 if you have the `random' function. */
 #undef HAVE_RANDOM
 
diff --git src/include/storage/fd.h src/include/storage/fd.h
index cbc2224..0b6b846 100644
--- src/include/storage/fd.h
+++ src/include/storage/fd.h
@@ -69,8 +69,8 @@ extern File PathNameOpenFile(FileName fileName, int fileFlags, int fileMode);
 extern File OpenTemporaryFile(bool interXact);
 extern void FileClose(File file);
 extern int	FilePrefetch(File file, off_t offset, int amount);
-extern int	FileRead(File file, char *buffer, int amount);
-extern int	FileWrite(File file, char *buffer, int amount);
+extern int	FileReadAt(File file, char *buffer, int amount, off_t offset);
+extern int	FileWriteAt(File file, char *buffer, int amount, off_t offset);
 extern int	FileSync(File file);
 extern off_t FileSeek(File file, off_t offset, int whence);
 extern int	FileTruncate(File file, off_t offset);
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to