From e5cd7f0f8af107646613c9d22a6a399340887b56 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@enterprisedb.com>
Date: Thu, 12 Jul 2018 13:14:02 +1200
Subject: [PATCH] Use pread()/pwrite() instead of lseek() + read()/write().

Cut down on system calls by doing random IO using POSIX.1-2008
offset-based IO routines, where available.  Remove the code for
tracking the 'virtual' seek position.  The only reason left to
call FileSeek() was to get the file's size, so provide a new
function FileSize() instead.

Author: Oskari Saarenmaa, Thomas Munro
Reviewed-by: Thomas Munro, Jesper Pedersen
Discussion: https://postgr.es/m/CAEepm=02rapCpPR3ZGF2vW=SBHSdFYO_bz_f-wwWJonmA3APgw@mail.gmail.com
Discussion: https://postgr.es/m/b8748d39-0b19-0514-a1b9-4e5a28e6a208%40gmail.com
Discussion: https://postgr.es/m/a86bd200-ebbe-d829-e3ca-0c4474b2fcb7%40ohmu.fi
---
 configure                             |   2 +-
 configure.in                          |   2 +-
 src/backend/access/heap/rewriteheap.c |   2 +-
 src/backend/access/transam/xlog.c     |  13 ++
 src/backend/storage/file/buffile.c    |  46 +-----
 src/backend/storage/file/fd.c         | 217 +++++---------------------
 src/backend/storage/smgr/md.c         |  35 +----
 src/include/pg_config.h.in            |   6 +
 src/include/storage/fd.h              |  12 +-
 9 files changed, 72 insertions(+), 263 deletions(-)

diff --git a/configure b/configure
index 0448c6bfebf..a0ee8c569cf 100755
--- a/configure
+++ b/configure
@@ -15100,7 +15100,7 @@ fi
 LIBS_including_readline="$LIBS"
 LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'`
 
-for ac_func in cbrt clock_gettime fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll posix_fallocate ppoll pstat pthread_is_threaded_np readlink setproctitle setproctitle_fast setsid shm_open strchrnul symlink sync_file_range utime utimes wcstombs_l
+for ac_func in cbrt clock_gettime fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll posix_fallocate ppoll pread pstat pthread_is_threaded_np pwrite readlink setproctitle setproctitle_fast setsid shm_open strchrnul symlink sync_file_range utime utimes wcstombs_l
 do :
   as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
 ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var"
diff --git a/configure.in b/configure.in
index 23b5bb867bb..a4531ff3036 100644
--- a/configure.in
+++ b/configure.in
@@ -1571,7 +1571,7 @@ PGAC_FUNC_WCSTOMBS_L
 LIBS_including_readline="$LIBS"
 LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'`
 
-AC_CHECK_FUNCS([cbrt clock_gettime fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll posix_fallocate ppoll pstat pthread_is_threaded_np readlink setproctitle setproctitle_fast setsid shm_open strchrnul symlink sync_file_range utime utimes wcstombs_l])
+AC_CHECK_FUNCS([cbrt clock_gettime fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll posix_fallocate ppoll pread pstat pthread_is_threaded_np pwrite readlink setproctitle setproctitle_fast setsid shm_open strchrnul symlink sync_file_range utime utimes wcstombs_l])
 
 AC_REPLACE_FUNCS(fseeko)
 case $host_os in
diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c
index 85f92973c95..5f573bafda6 100644
--- a/src/backend/access/heap/rewriteheap.c
+++ b/src/backend/access/heap/rewriteheap.c
@@ -922,7 +922,7 @@ logical_heap_rewrite_flush_mappings(RewriteState state)
 		 * Note that we deviate from the usual WAL coding practices here,
 		 * check the above "Logical rewrite support" comment for reasoning.
 		 */
-		written = FileWrite(src->vfd, waldata_start, len,
+		written = FileWrite(src->vfd, waldata_start, len, src->off,
 							WAIT_EVENT_LOGICAL_REWRITE_WRITE);
 		if (written != len)
 			ereport(ERROR,
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 7375a78ffcf..c5b99c93237 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -2484,6 +2484,7 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
 			Size		nleft;
 			int			written;
 
+#ifndef HAVE_PWRITE
 			/* Need to seek in the file? */
 			if (openLogOff != startoffset)
 			{
@@ -2495,6 +2496,7 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
 									startoffset)));
 				openLogOff = startoffset;
 			}
+#endif
 
 			/* OK to write the page(s) */
 			from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
@@ -2504,7 +2506,11 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
 			{
 				errno = 0;
 				pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE);
+#ifdef HAVE_PWRITE
+				written = pwrite(openLogFile, from, nleft, startoffset);
+#else
 				written = write(openLogFile, from, nleft);
+#endif
 				pgstat_report_wait_end();
 				if (written <= 0)
 				{
@@ -2519,6 +2525,7 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
 				}
 				nleft -= written;
 				from += written;
+				startoffset += written;
 			} while (nleft > 0);
 
 			/* Update state for write */
@@ -11827,6 +11834,7 @@ retry:
 
 	/* Read the requested page */
 	readOff = targetPageOff;
+#ifndef HAVE_PREAD
 	if (lseek(readFile, (off_t) readOff, SEEK_SET) < 0)
 	{
 		char		fname[MAXFNAMELEN];
@@ -11840,9 +11848,14 @@ retry:
 						fname, readOff)));
 		goto next_record_is_invalid;
 	}
+#endif
 
 	pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
+#ifdef HAVE_PREAD
+	r = pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff);
+#else
 	r = read(readFile, readBuf, XLOG_BLCKSZ);
+#endif
 	if (r != XLOG_BLCKSZ)
 	{
 		char		fname[MAXFNAMELEN];
diff --git a/src/backend/storage/file/buffile.c b/src/backend/storage/file/buffile.c
index e93813d9737..dd687dfe71f 100644
--- a/src/backend/storage/file/buffile.c
+++ b/src/backend/storage/file/buffile.c
@@ -67,12 +67,6 @@ struct BufFile
 	int			numFiles;		/* number of physical files in set */
 	/* all files except the last have length exactly MAX_PHYSICAL_FILESIZE */
 	File	   *files;			/* palloc'd array with numFiles entries */
-	off_t	   *offsets;		/* palloc'd array with numFiles entries */
-
-	/*
-	 * offsets[i] is the current seek position of files[i].  We use this to
-	 * avoid making redundant FileSeek calls.
-	 */
 
 	bool		isInterXact;	/* keep open over transactions? */
 	bool		dirty;			/* does buffer need to be written? */
@@ -116,7 +110,6 @@ makeBufFileCommon(int nfiles)
 	BufFile    *file = (BufFile *) palloc(sizeof(BufFile));
 
 	file->numFiles = nfiles;
-	file->offsets = (off_t *) palloc0(sizeof(off_t) * nfiles);
 	file->isInterXact = false;
 	file->dirty = false;
 	file->resowner = CurrentResourceOwner;
@@ -170,10 +163,7 @@ extendBufFile(BufFile *file)
 
 	file->files = (File *) repalloc(file->files,
 									(file->numFiles + 1) * sizeof(File));
-	file->offsets = (off_t *) repalloc(file->offsets,
-									   (file->numFiles + 1) * sizeof(off_t));
 	file->files[file->numFiles] = pfile;
-	file->offsets[file->numFiles] = 0L;
 	file->numFiles++;
 }
 
@@ -396,7 +386,6 @@ BufFileClose(BufFile *file)
 		FileClose(file->files[i]);
 	/* release the buffer space */
 	pfree(file->files);
-	pfree(file->offsets);
 	pfree(file);
 }
 
@@ -422,27 +411,17 @@ BufFileLoadBuffer(BufFile *file)
 		file->curOffset = 0L;
 	}
 
-	/*
-	 * May need to reposition physical file.
-	 */
-	thisfile = file->files[file->curFile];
-	if (file->curOffset != file->offsets[file->curFile])
-	{
-		if (FileSeek(thisfile, file->curOffset, SEEK_SET) != file->curOffset)
-			return;				/* seek failed, read nothing */
-		file->offsets[file->curFile] = file->curOffset;
-	}
-
 	/*
 	 * Read whatever we can get, up to a full bufferload.
 	 */
+	thisfile = file->files[file->curFile];
 	file->nbytes = FileRead(thisfile,
 							file->buffer.data,
 							sizeof(file->buffer),
+							file->curOffset,
 							WAIT_EVENT_BUFFILE_READ);
 	if (file->nbytes < 0)
 		file->nbytes = 0;
-	file->offsets[file->curFile] += file->nbytes;
 	/* we choose not to advance curOffset here */
 
 	if (file->nbytes > 0)
@@ -491,23 +470,14 @@ BufFileDumpBuffer(BufFile *file)
 		if ((off_t) bytestowrite > availbytes)
 			bytestowrite = (int) availbytes;
 
-		/*
-		 * May need to reposition physical file.
-		 */
 		thisfile = file->files[file->curFile];
-		if (file->curOffset != file->offsets[file->curFile])
-		{
-			if (FileSeek(thisfile, file->curOffset, SEEK_SET) != file->curOffset)
-				return;			/* seek failed, give up */
-			file->offsets[file->curFile] = file->curOffset;
-		}
 		bytestowrite = FileWrite(thisfile,
 								 file->buffer.data + wpos,
 								 bytestowrite,
+								 file->curOffset,
 								 WAIT_EVENT_BUFFILE_WRITE);
 		if (bytestowrite <= 0)
 			return;				/* failed to write */
-		file->offsets[file->curFile] += bytestowrite;
 		file->curOffset += bytestowrite;
 		wpos += bytestowrite;
 
@@ -803,11 +773,10 @@ BufFileSize(BufFile *file)
 {
 	off_t		lastFileSize;
 
-	/* Get the size of the last physical file by seeking to end. */
-	lastFileSize = FileSeek(file->files[file->numFiles - 1], 0, SEEK_END);
+	/* Get the size of the last physical file. */
+	lastFileSize = FileSize(file->files[file->numFiles - 1]);
 	if (lastFileSize < 0)
 		return -1;
-	file->offsets[file->numFiles - 1] = lastFileSize;
 
 	return ((file->numFiles - 1) * (off_t) MAX_PHYSICAL_FILESIZE) +
 		lastFileSize;
@@ -849,13 +818,8 @@ BufFileAppend(BufFile *target, BufFile *source)
 
 	target->files = (File *)
 		repalloc(target->files, sizeof(File) * newNumFiles);
-	target->offsets = (off_t *)
-		repalloc(target->offsets, sizeof(off_t) * newNumFiles);
 	for (i = target->numFiles; i < newNumFiles; i++)
-	{
 		target->files[i] = source->files[i - target->numFiles];
-		target->offsets[i] = source->offsets[i - target->numFiles];
-	}
 	target->numFiles = newNumFiles;
 
 	return startBlock;
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index 8dd51f17674..a380f794014 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -16,8 +16,8 @@
  * including base tables, scratch files (e.g., sort and hash spool
  * files), and random calls to C library routines like system(3); it
  * is quite easy to exceed system limits on the number of open files a
- * single process can have.  (This is around 256 on many modern
- * operating systems, but can be as low as 32 on others.)
+ * single process can have.  (This is around 1024 on many modern
+ * operating systems, but may be lower on others.)
  *
  * VFDs are managed as an LRU pool, with actual OS file descriptors
  * being opened and closed as needed.  Obviously, if a routine is
@@ -167,15 +167,6 @@ int			max_safe_fds = 32;	/* default if not changed */
 
 #define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
 
-/*
- * Note: a VFD's seekPos is normally always valid, but if for some reason
- * an lseek() fails, it might become set to FileUnknownPos.  We can struggle
- * along without knowing the seek position in many cases, but in some places
- * we have to fail if we don't have it.
- */
-#define FileUnknownPos ((off_t) -1)
-#define FilePosIsUnknown(pos) ((pos) < 0)
-
 /* these are the assigned bits in fdstate below: */
 #define FD_DELETE_AT_CLOSE	(1 << 0)	/* T = delete when closed */
 #define FD_CLOSE_AT_EOXACT	(1 << 1)	/* T = close at eoXact */
@@ -189,7 +180,6 @@ typedef struct vfd
 	File		nextFree;		/* link to next free VFD, if in freelist */
 	File		lruMoreRecently;	/* doubly linked recency-of-use list */
 	File		lruLessRecently;
-	off_t		seekPos;		/* current logical file position, or -1 */
 	off_t		fileSize;		/* current size of file (0 if not temporary) */
 	char	   *fileName;		/* name of file, or NULL for unused VFD */
 	/* NB: fileName is malloc'd, and must be free'd when closing the VFD */
@@ -407,9 +397,7 @@ pg_fdatasync(int fd)
 /*
  * pg_flush_data --- advise OS that the described dirty data should be flushed
  *
- * offset of 0 with nbytes 0 means that the entire file should be flushed;
- * in this case, this function may have side-effects on the file's
- * seek position!
+ * offset of 0 with nbytes 0 means that the entire file should be flushed
  */
 void
 pg_flush_data(int fd, off_t offset, off_t nbytes)
@@ -1029,22 +1017,6 @@ LruDelete(File file)
 
 	vfdP = &VfdCache[file];
 
-	/*
-	 * Normally we should know the seek position, but if for some reason we
-	 * have lost track of it, try again to get it.  If we still can't get it,
-	 * we have a problem: we will be unable to restore the file seek position
-	 * when and if the file is re-opened.  But we can't really throw an error
-	 * and refuse to close the file, or activities such as transaction cleanup
-	 * will be broken.
-	 */
-	if (FilePosIsUnknown(vfdP->seekPos))
-	{
-		vfdP->seekPos = lseek(vfdP->fd, (off_t) 0, SEEK_CUR);
-		if (FilePosIsUnknown(vfdP->seekPos))
-			elog(LOG, "could not seek file \"%s\" before closing: %m",
-				 vfdP->fileName);
-	}
-
 	/*
 	 * Close the file.  We aren't expecting this to fail; if it does, better
 	 * to leak the FD than to mess up our internal state.
@@ -1113,33 +1085,6 @@ LruInsert(File file)
 		{
 			++nfile;
 		}
-
-		/*
-		 * Seek to the right position.  We need no special case for seekPos
-		 * equal to FileUnknownPos, as lseek() will certainly reject that
-		 * (thus completing the logic noted in LruDelete() that we will fail
-		 * to re-open a file if we couldn't get its seek position before
-		 * closing).
-		 */
-		if (vfdP->seekPos != (off_t) 0)
-		{
-			if (lseek(vfdP->fd, vfdP->seekPos, SEEK_SET) < 0)
-			{
-				/*
-				 * If we fail to restore the seek position, treat it like an
-				 * open() failure.
-				 */
-				int			save_errno = errno;
-
-				elog(LOG, "could not seek file \"%s\" after re-opening: %m",
-					 vfdP->fileName);
-				(void) close(vfdP->fd);
-				vfdP->fd = VFD_CLOSED;
-				--nfile;
-				errno = save_errno;
-				return -1;
-			}
-		}
 	}
 
 	/*
@@ -1406,7 +1351,6 @@ PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
 	/* Saved flags are adjusted to be OK for re-opening file */
 	vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
 	vfdP->fileMode = fileMode;
-	vfdP->seekPos = 0;
 	vfdP->fileSize = 0;
 	vfdP->fdstate = 0x0;
 	vfdP->resowner = NULL;
@@ -1820,7 +1764,6 @@ FileClose(File file)
 
 /*
  * FilePrefetch - initiate asynchronous read of a given range of the file.
- * The logical seek position is unaffected.
  *
  * Currently the only implementation of this function is using posix_fadvise
  * which is the simplest standardized interface that accomplishes this.
@@ -1867,10 +1810,6 @@ FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
 			   file, VfdCache[file].fileName,
 			   (int64) offset, (int64) nbytes));
 
-	/*
-	 * Caution: do not call pg_flush_data with nbytes = 0, it could trash the
-	 * file's seek position.  We prefer to define that as a no-op here.
-	 */
 	if (nbytes <= 0)
 		return;
 
@@ -1884,7 +1823,8 @@ FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
 }
 
 int
-FileRead(File file, char *buffer, int amount, uint32 wait_event_info)
+FileRead(File file, char *buffer, int amount, off_t offset,
+		 uint32 wait_event_info)
 {
 	int			returnCode;
 	Vfd		   *vfdP;
@@ -1893,7 +1833,7 @@ FileRead(File file, char *buffer, int amount, uint32 wait_event_info)
 
 	DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %d %p",
 			   file, VfdCache[file].fileName,
-			   (int64) VfdCache[file].seekPos,
+			   (int64) offset,
 			   amount, buffer));
 
 	returnCode = FileAccess(file);
@@ -1904,16 +1844,16 @@ FileRead(File file, char *buffer, int amount, uint32 wait_event_info)
 
 retry:
 	pgstat_report_wait_start(wait_event_info);
-	returnCode = read(vfdP->fd, buffer, amount);
+#ifdef HAVE_PREAD
+	returnCode = pread(vfdP->fd, buffer, amount, offset);
+#else
+	returnCode = lseek(VfdCache[file].fd, offset, SEEK_SET);
+	if (returnCode >= 0)
+		returnCode = read(vfdP->fd, buffer, amount);
+#endif
 	pgstat_report_wait_end();
 
-	if (returnCode >= 0)
-	{
-		/* if seekPos is unknown, leave it that way */
-		if (!FilePosIsUnknown(vfdP->seekPos))
-			vfdP->seekPos += returnCode;
-	}
-	else
+	if (returnCode < 0)
 	{
 		/*
 		 * Windows may run out of kernel buffers and return "Insufficient
@@ -1939,16 +1879,14 @@ retry:
 		/* OK to retry if interrupted */
 		if (errno == EINTR)
 			goto retry;
-
-		/* Trouble, so assume we don't know the file position anymore */
-		vfdP->seekPos = FileUnknownPos;
 	}
 
 	return returnCode;
 }
 
 int
-FileWrite(File file, char *buffer, int amount, uint32 wait_event_info)
+FileWrite(File file, char *buffer, int amount, off_t offset,
+		  uint32 wait_event_info)
 {
 	int			returnCode;
 	Vfd		   *vfdP;
@@ -1957,7 +1895,7 @@ FileWrite(File file, char *buffer, int amount, uint32 wait_event_info)
 
 	DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p",
 			   file, VfdCache[file].fileName,
-			   (int64) VfdCache[file].seekPos,
+			   (int64) offset,
 			   amount, buffer));
 
 	returnCode = FileAccess(file);
@@ -1976,26 +1914,13 @@ FileWrite(File file, char *buffer, int amount, uint32 wait_event_info)
 	 */
 	if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
 	{
-		off_t		newPos;
+		off_t		past_write = offset + amount;
 
-		/*
-		 * Normally we should know the seek position, but if for some reason
-		 * we have lost track of it, try again to get it.  Here, it's fine to
-		 * throw an error if we still can't get it.
-		 */
-		if (FilePosIsUnknown(vfdP->seekPos))
-		{
-			vfdP->seekPos = lseek(vfdP->fd, (off_t) 0, SEEK_CUR);
-			if (FilePosIsUnknown(vfdP->seekPos))
-				elog(ERROR, "could not seek file \"%s\": %m", vfdP->fileName);
-		}
-
-		newPos = vfdP->seekPos + amount;
-		if (newPos > vfdP->fileSize)
+		if (past_write > vfdP->fileSize)
 		{
 			uint64		newTotal = temporary_files_size;
 
-			newTotal += newPos - vfdP->fileSize;
+			newTotal += past_write - vfdP->fileSize;
 			if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
 				ereport(ERROR,
 						(errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
@@ -2007,7 +1932,13 @@ FileWrite(File file, char *buffer, int amount, uint32 wait_event_info)
 retry:
 	errno = 0;
 	pgstat_report_wait_start(wait_event_info);
-	returnCode = write(vfdP->fd, buffer, amount);
+#ifdef HAVE_PWRITE
+	returnCode = pwrite(VfdCache[file].fd, buffer, amount, offset);
+#else
+	returnCode = lseek(VfdCache[file].fd, offset, SEEK_SET);
+	if (returnCode >= 0)
+		returnCode = write(VfdCache[file].fd, buffer, amount);
+#endif
 	pgstat_report_wait_end();
 
 	/* if write didn't set errno, assume problem is no disk space */
@@ -2016,10 +1947,6 @@ retry:
 
 	if (returnCode >= 0)
 	{
-		/* if seekPos is unknown, leave it that way */
-		if (!FilePosIsUnknown(vfdP->seekPos))
-			vfdP->seekPos += returnCode;
-
 		/*
 		 * Maintain fileSize and temporary_files_size if it's a temp file.
 		 *
@@ -2029,12 +1956,12 @@ retry:
 		 */
 		if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
 		{
-			off_t		newPos = vfdP->seekPos;
+			off_t		past_write = offset + amount;
 
-			if (newPos > vfdP->fileSize)
+			if (past_write > vfdP->fileSize)
 			{
-				temporary_files_size += newPos - vfdP->fileSize;
-				vfdP->fileSize = newPos;
+				temporary_files_size += past_write - vfdP->fileSize;
+				vfdP->fileSize = past_write;
 			}
 		}
 	}
@@ -2060,9 +1987,6 @@ retry:
 		/* OK to retry if interrupted */
 		if (errno == EINTR)
 			goto retry;
-
-		/* Trouble, so assume we don't know the file position anymore */
-		vfdP->seekPos = FileUnknownPos;
 	}
 
 	return returnCode;
@@ -2090,92 +2014,25 @@ FileSync(File file, uint32 wait_event_info)
 }
 
 off_t
-FileSeek(File file, off_t offset, int whence)
+FileSize(File file)
 {
 	Vfd		   *vfdP;
 
 	Assert(FileIsValid(file));
 
-	DO_DB(elog(LOG, "FileSeek: %d (%s) " INT64_FORMAT " " INT64_FORMAT " %d",
-			   file, VfdCache[file].fileName,
-			   (int64) VfdCache[file].seekPos,
-			   (int64) offset, whence));
+	DO_DB(elog(LOG, "FileSize %d (%s)",
+			   file, VfdCache[file].fileName));
 
 	vfdP = &VfdCache[file];
 
 	if (FileIsNotOpen(file))
 	{
-		switch (whence)
-		{
-			case SEEK_SET:
-				if (offset < 0)
-				{
-					errno = EINVAL;
-					return (off_t) -1;
-				}
-				vfdP->seekPos = offset;
-				break;
-			case SEEK_CUR:
-				if (FilePosIsUnknown(vfdP->seekPos) ||
-					vfdP->seekPos + offset < 0)
-				{
-					errno = EINVAL;
-					return (off_t) -1;
-				}
-				vfdP->seekPos += offset;
-				break;
-			case SEEK_END:
-				if (FileAccess(file) < 0)
-					return (off_t) -1;
-				vfdP->seekPos = lseek(vfdP->fd, offset, whence);
-				break;
-			default:
-				elog(ERROR, "invalid whence: %d", whence);
-				break;
-		}
-	}
-	else
-	{
-		switch (whence)
-		{
-			case SEEK_SET:
-				if (offset < 0)
-				{
-					errno = EINVAL;
-					return (off_t) -1;
-				}
-				if (vfdP->seekPos != offset)
-					vfdP->seekPos = lseek(vfdP->fd, offset, whence);
-				break;
-			case SEEK_CUR:
-				if (offset != 0 || FilePosIsUnknown(vfdP->seekPos))
-					vfdP->seekPos = lseek(vfdP->fd, offset, whence);
-				break;
-			case SEEK_END:
-				vfdP->seekPos = lseek(vfdP->fd, offset, whence);
-				break;
-			default:
-				elog(ERROR, "invalid whence: %d", whence);
-				break;
-		}
+		if (FileAccess(file) < 0)
+			return (off_t) -1;
 	}
 
-	return vfdP->seekPos;
-}
-
-/*
- * XXX not actually used but here for completeness
- */
-#ifdef NOT_USED
-off_t
-FileTell(File file)
-{
-	Assert(FileIsValid(file));
-	DO_DB(elog(LOG, "FileTell %d (%s)",
-			   file, VfdCache[file].fileName));
-	return VfdCache[file].seekPos;
+	return lseek(VfdCache[file].fd, 0, SEEK_END);
 }
-#endif
 
 int
 FileTruncate(File file, off_t offset, uint32 wait_event_info)
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index f4374d077be..86013a5c8b2 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -522,22 +522,7 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 
 	Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
 
-	/*
-	 * Note: because caller usually obtained blocknum by calling mdnblocks,
-	 * which did a seek(SEEK_END), this seek is often redundant and will be
-	 * optimized away by fd.c.  It's not redundant, however, if there is a
-	 * partial page at the end of the file. In that case we want to try to
-	 * overwrite the partial page with a full page.  It's also not redundant
-	 * if bufmgr.c had to dump another buffer of the same file to make room
-	 * for the new page's buffer.
-	 */
-	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
-		ereport(ERROR,
-				(errcode_for_file_access(),
-				 errmsg("could not seek to block %u in file \"%s\": %m",
-						blocknum, FilePathName(v->mdfd_vfd))));
-
-	if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ)
+	if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ)
 	{
 		if (nbytes < 0)
 			ereport(ERROR,
@@ -748,13 +733,7 @@ mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 
 	Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
 
-	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
-		ereport(ERROR,
-				(errcode_for_file_access(),
-				 errmsg("could not seek to block %u in file \"%s\": %m",
-						blocknum, FilePathName(v->mdfd_vfd))));
-
-	nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ, WAIT_EVENT_DATA_FILE_READ);
+	nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_READ);
 
 	TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum,
 									   reln->smgr_rnode.node.spcNode,
@@ -824,13 +803,7 @@ mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 
 	Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
 
-	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
-		ereport(ERROR,
-				(errcode_for_file_access(),
-				 errmsg("could not seek to block %u in file \"%s\": %m",
-						blocknum, FilePathName(v->mdfd_vfd))));
-
-	nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, WAIT_EVENT_DATA_FILE_WRITE);
+	nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_WRITE);
 
 	TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum,
 										reln->smgr_rnode.node.spcNode,
@@ -1979,7 +1952,7 @@ _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
 {
 	off_t		len;
 
-	len = FileSeek(seg->mdfd_vfd, 0L, SEEK_END);
+	len = FileSize(seg->mdfd_vfd);
 	if (len < 0)
 		ereport(ERROR,
 				(errcode_for_file_access(),
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 7894caa8c12..660dec5afb4 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -438,6 +438,9 @@
 /* Define to 1 if you have the `ppoll' function. */
 #undef HAVE_PPOLL
 
+/* Define to 1 if you have the `pread' function. */
+#undef HAVE_PREAD
+
 /* Define to 1 if you have the `pstat' function. */
 #undef HAVE_PSTAT
 
@@ -453,6 +456,9 @@
 /* Have PTHREAD_PRIO_INHERIT. */
 #undef HAVE_PTHREAD_PRIO_INHERIT
 
+/* Define to 1 if you have the `pwrite' function. */
+#undef HAVE_PWRITE
+
 /* Define to 1 if you have the `random' function. */
 #undef HAVE_RANDOM
 
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index 8e7c9728f4b..f8b6fa8ece5 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -15,7 +15,7 @@
 /*
  * calls:
  *
- *	File {Close, Read, Write, Seek, Tell, Sync}
+ *	File {Close, Read, Write, Size, Tell, Sync}
  *	{Path Name Open, Allocate, Free} File
  *
  * These are NOT JUST RENAMINGS OF THE UNIX ROUTINES.
@@ -42,10 +42,6 @@
 #include <dirent.h>
 
 
-/*
- * FileSeek uses the standard UNIX lseek(2) flags.
- */
-
 typedef int File;
 
 
@@ -68,10 +64,10 @@ extern File PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fil
 extern File OpenTemporaryFile(bool interXact);
 extern void FileClose(File file);
 extern int	FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_info);
-extern int	FileRead(File file, char *buffer, int amount, uint32 wait_event_info);
-extern int	FileWrite(File file, char *buffer, int amount, uint32 wait_event_info);
+extern int	FileRead(File file, char *buffer, int amount, off_t offset, uint32 wait_event_info);
+extern int	FileWrite(File file, char *buffer, int amount, off_t offset, uint32 wait_event_info);
 extern int	FileSync(File file, uint32 wait_event_info);
-extern off_t FileSeek(File file, off_t offset, int whence);
+extern off_t FileSize(File file);
 extern int	FileTruncate(File file, off_t offset, uint32 wait_event_info);
 extern void FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info);
 extern char *FilePathName(File file);
-- 
2.17.1 (Apple Git-112)

