On Tue, 2020-05-26 at 17:40 -0700, Jeff Davis wrote:
> On Tue, 2020-05-26 at 21:15 +0200, Tomas Vondra wrote:
> > Yeah. I agree prefetching is definitely out of v13 scope. It might
> > be
> > interesting to try how useful would it be, if you're willing to
> > spend
> > some time on a prototype.
>
> I think a POC would be pretty quick; I'll see if I can hack something
> together.
Attached (intended for v14).
I changed the list from a simple array to a circular buffer so that we
can keep enough preallocated block numbers in it to do prefetching.
On SSD I didn't see any improvement, but perhaps it will do better on
magnetic storage.
Regards,
Jeff Davis
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index d7f99d9944c..89edbc4c579 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -3941,6 +3941,9 @@ pgstat_get_wait_io(WaitEventIO w)
case WAIT_EVENT_BUFFILE_WRITE:
event_name = "BufFileWrite";
break;
+ case WAIT_EVENT_BUFFILE_PREFETCH:
+ event_name = "BufFilePrefetch";
+ break;
case WAIT_EVENT_CONTROL_FILE_READ:
event_name = "ControlFileRead";
break;
diff --git a/src/backend/storage/file/buffile.c b/src/backend/storage/file/buffile.c
index 35e8f12e62d..4e2d4f059a7 100644
--- a/src/backend/storage/file/buffile.c
+++ b/src/backend/storage/file/buffile.c
@@ -618,6 +618,20 @@ BufFileWrite(BufFile *file, void *ptr, size_t size)
return nwritten;
}
+/*
+ * BufFilePrefetch
+ */
+void
+BufFilePrefetchBlock(BufFile *buffile, long blknum)
+{
+ int filenum = (blknum / BUFFILE_SEG_SIZE);
+ off_t offset = (blknum % BUFFILE_SEG_SIZE) * BLCKSZ;
+ File file = buffile->files[filenum];
+
+ (void) FilePrefetch(file, offset, BLCKSZ,
+ WAIT_EVENT_BUFFILE_PREFETCH);
+}
+
/*
* BufFileFlush
*
diff --git a/src/backend/utils/sort/logtape.c b/src/backend/utils/sort/logtape.c
index 666a7c0e81c..014ec67b8ff 100644
--- a/src/backend/utils/sort/logtape.c
+++ b/src/backend/utils/sort/logtape.c
@@ -97,6 +97,7 @@ typedef struct TapeBlockTrailer
* block */
long next; /* next block on this tape, or # of valid
* bytes on last block (if < 0) */
+ long prefetch; /* block to prefetch when reading this one */
} TapeBlockTrailer;
#define TapeBlockPayloadSize (BLCKSZ - sizeof(TapeBlockTrailer))
@@ -123,6 +124,11 @@ typedef struct TapeBlockTrailer
#define TAPE_WRITE_PREALLOC_MIN 8
#define TAPE_WRITE_PREALLOC_MAX 128
+#define TAPE_PREFETCH_DISTANCE_MAX 64
+
+StaticAssertDecl(TAPE_PREFETCH_DISTANCE_MAX * 2 <= TAPE_WRITE_PREALLOC_MAX,
+ "Tape prefetch distance too large.");
+
/*
* This data structure represents a single "logical tape" within the set
* of logical tapes stored in the same file.
@@ -165,13 +171,13 @@ typedef struct LogicalTape
int nbytes; /* total # of valid bytes in buffer */
/*
- * Preallocated block numbers are held in an array sorted in descending
- * order; blocks are consumed from the end of the array (lowest block
- * numbers first).
+ * Preallocated block numbers are held in a sorted circular array.
*/
long *prealloc;
- int nprealloc; /* number of elements in list */
- int prealloc_size; /* number of elements list can hold */
+ int prealloc_next; /* next element in circular array */
+ int prealloc_nelem; /* number of elements in circular array */
+ int prealloc_size; /* total size of circular array */
+ int prefetch_dist; /* distance to prefetch */
} LogicalTape;
/*
@@ -219,7 +225,8 @@ struct LogicalTapeSet
static void ltsWriteBlock(LogicalTapeSet *lts, long blocknum, void *buffer);
static void ltsReadBlock(LogicalTapeSet *lts, long blocknum, void *buffer);
static long ltsGetFreeBlock(LogicalTapeSet *lts);
-static long ltsGetPreallocBlock(LogicalTapeSet *lts, LogicalTape *lt);
+static long ltsGetPreallocBlock(LogicalTapeSet *lts, LogicalTape *lt,
+ long *prefetch_block);
static void ltsReleaseBlock(LogicalTapeSet *lts, long blocknum);
static void ltsConcatWorkerTapes(LogicalTapeSet *lts, TapeShare *shared,
SharedFileSet *fileset);
@@ -329,6 +336,10 @@ ltsReadFillBuffer(LogicalTapeSet *lts, LogicalTape *lt)
else
lt->nextBlockNumber = TapeBlockGetTrailer(thisbuf)->next;
+ if (TapeBlockGetTrailer(thisbuf)->prefetch >= 0)
+ BufFilePrefetchBlock(lts->pfile,
+ TapeBlockGetTrailer(thisbuf)->prefetch);
+
/* Advance to next block, if we have buffer space left */
} while (lt->buffer_size - lt->nbytes > BLCKSZ);
@@ -424,38 +435,79 @@ ltsGetFreeBlock(LogicalTapeSet *lts)
* Refill the preallocation list if necessary.
*/
static long
-ltsGetPreallocBlock(LogicalTapeSet *lts, LogicalTape *lt)
+ltsGetPreallocBlock(LogicalTapeSet *lts, LogicalTape *lt, long *prefetch)
{
- /* sorted in descending order, so return the last element */
- if (lt->nprealloc > 0)
- return lt->prealloc[--lt->nprealloc];
+ long block;
+ int prefetch_idx;
+
+ if (lt->prealloc_nelem > lt->prefetch_dist)
+ {
+ prefetch_idx = (lt->prealloc_next + lt->prefetch_dist) % lt->prealloc_size;
+ *prefetch = lt->prealloc[prefetch_idx];
+ block = lt->prealloc[lt->prealloc_next];
+ lt->prealloc_next = (lt->prealloc_next + 1) % lt->prealloc_size;
+ lt->prealloc_nelem--;
+ return block;
+ }
+ /* get a preallocated block number, or allocate a new block number */
+ if (lt->prealloc_nelem > 0)
+ {
+ block = lt->prealloc[lt->prealloc_next];
+ lt->prealloc_next = (lt->prealloc_next + 1) % lt->prealloc_size;
+ lt->prealloc_nelem--;
+ }
+ else
+ block = ltsGetFreeBlock(lts);
+
+ /* create or grow the circular buffer, as needed */
if (lt->prealloc == NULL)
{
lt->prealloc_size = TAPE_WRITE_PREALLOC_MIN;
+ lt->prefetch_dist = lt->prealloc_size / 2;
lt->prealloc = (long *) palloc(sizeof(long) * lt->prealloc_size);
}
else if (lt->prealloc_size < TAPE_WRITE_PREALLOC_MAX)
{
+ int oldsize = lt->prealloc_size;
+ int n;
+
/* when the preallocation list runs out, double the size */
lt->prealloc_size *= 2;
- if (lt->prealloc_size > TAPE_WRITE_PREALLOC_MAX)
- lt->prealloc_size = TAPE_WRITE_PREALLOC_MAX;
- lt->prealloc = (long *) repalloc(lt->prealloc,
- sizeof(long) * lt->prealloc_size);
+ Assert(lt->prealloc_size <= TAPE_WRITE_PREALLOC_MAX);
+
+ /*
+ * The prefetch distance is always half the prealloc size, until the
+ * maximum is reached.
+ */
+ lt->prefetch_dist *= 2;
+ if (lt->prefetch_dist > TAPE_PREFETCH_DISTANCE_MAX)
+ lt->prefetch_dist = TAPE_PREFETCH_DISTANCE_MAX;
+
+ lt->prealloc = (long *) repalloc(
+ lt->prealloc, sizeof(long) * lt->prealloc_size);
+
+ /* now that the buffer is larger, move elements into the new space */
+ n = lt->prealloc_nelem - (oldsize - lt->prealloc_next);
+ if (n > 0)
+ memcpy(lt->prealloc + oldsize, lt->prealloc, sizeof(long) * n);
}
- /* refill preallocation list */
- lt->nprealloc = lt->prealloc_size;
- for (int i = lt->nprealloc; i > 0; i--)
+ /* preallocate more block numbers */
+ while (lt->prealloc_nelem < lt->prealloc_size)
{
- lt->prealloc[i - 1] = ltsGetFreeBlock(lts);
+ int end;
- /* verify descending order */
- Assert(i == lt->nprealloc || lt->prealloc[i - 1] > lt->prealloc[i]);
+ end = (lt->prealloc_next + lt->prealloc_nelem) % lt->prealloc_size;
+ lt->prealloc[end] = ltsGetFreeBlock(lts);
+ lt->prealloc_nelem++;
}
- return lt->prealloc[--lt->nprealloc];
+ Assert(lt->prealloc_nelem >= lt->prefetch_dist);
+
+ prefetch_idx = (lt->prealloc_next + lt->prefetch_dist) % lt->prealloc_size;
+ *prefetch = lt->prealloc[prefetch_idx];
+ return block;
}
/*
@@ -619,8 +671,10 @@ ltsInitTape(LogicalTape *lt)
lt->pos = 0;
lt->nbytes = 0;
lt->prealloc = NULL;
- lt->nprealloc = 0;
+ lt->prealloc_next = 0;
+ lt->prealloc_nelem = 0;
lt->prealloc_size = 0;
+ lt->prefetch_dist = 0;
}
/*
@@ -770,13 +824,16 @@ LogicalTapeWrite(LogicalTapeSet *lts, int tapenum,
}
if (lt->curBlockNumber == -1)
{
+ long prefetch;
+
Assert(lt->firstBlockNumber == -1);
Assert(lt->pos == 0);
- lt->curBlockNumber = ltsGetPreallocBlock(lts, lt);
+ lt->curBlockNumber = ltsGetPreallocBlock(lts, lt, &prefetch);
lt->firstBlockNumber = lt->curBlockNumber;
TapeBlockGetTrailer(lt->buffer)->prev = -1L;
+ TapeBlockGetTrailer(lt->buffer)->prefetch = prefetch;
}
Assert(lt->buffer_size == BLCKSZ);
@@ -786,6 +843,7 @@ LogicalTapeWrite(LogicalTapeSet *lts, int tapenum,
{
/* Buffer full, dump it out */
long nextBlockNumber;
+ long prefetch;
if (!lt->dirty)
{
@@ -797,7 +855,7 @@ LogicalTapeWrite(LogicalTapeSet *lts, int tapenum,
* First allocate the next block, so that we can store it in the
* 'next' pointer of this block.
*/
- nextBlockNumber = ltsGetPreallocBlock(lts, lt);
+ nextBlockNumber = ltsGetPreallocBlock(lts, lt, &prefetch);
/* set the next-pointer and dump the current block. */
TapeBlockGetTrailer(lt->buffer)->next = nextBlockNumber;
@@ -805,6 +863,7 @@ LogicalTapeWrite(LogicalTapeSet *lts, int tapenum,
/* initialize the prev-pointer of the next block */
TapeBlockGetTrailer(lt->buffer)->prev = lt->curBlockNumber;
+ TapeBlockGetTrailer(lt->buffer)->prefetch = prefetch;
lt->curBlockNumber = nextBlockNumber;
lt->pos = 0;
lt->nbytes = 0;
@@ -909,12 +968,20 @@ LogicalTapeRewindForRead(LogicalTapeSet *lts, int tapenum, size_t buffer_size)
/* free the preallocation list, and return unused block numbers */
if (lt->prealloc != NULL)
{
- for (int i = lt->nprealloc; i > 0; i--)
- ltsReleaseBlock(lts, lt->prealloc[i - 1]);
+ while (lt->prealloc_nelem > 0)
+ {
+ long block = lt->prealloc[lt->prealloc_next];
+ lt->prealloc_next = (lt->prealloc_next + 1) % lt->prealloc_size;
+ lt->prealloc_nelem--;
+ ltsReleaseBlock(lts, block);
+ }
+
pfree(lt->prealloc);
lt->prealloc = NULL;
- lt->nprealloc = 0;
+ lt->prealloc_next = 0;
+ lt->prealloc_nelem = 0;
lt->prealloc_size = 0;
+ lt->prefetch_dist = 0;
}
}
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index c55dc1481ca..c95fbf67ef9 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -915,6 +915,7 @@ typedef enum
{
WAIT_EVENT_BUFFILE_READ = PG_WAIT_IO,
WAIT_EVENT_BUFFILE_WRITE,
+ WAIT_EVENT_BUFFILE_PREFETCH,
WAIT_EVENT_CONTROL_FILE_READ,
WAIT_EVENT_CONTROL_FILE_SYNC,
WAIT_EVENT_CONTROL_FILE_SYNC_UPDATE,
diff --git a/src/include/storage/buffile.h b/src/include/storage/buffile.h
index 60433f35b45..016971ef29d 100644
--- a/src/include/storage/buffile.h
+++ b/src/include/storage/buffile.h
@@ -40,6 +40,7 @@ extern BufFile *BufFileCreateTemp(bool interXact);
extern void BufFileClose(BufFile *file);
extern size_t BufFileRead(BufFile *file, void *ptr, size_t size);
extern size_t BufFileWrite(BufFile *file, void *ptr, size_t size);
+extern void BufFilePrefetchBlock(BufFile *file, long blknum);
extern int BufFileSeek(BufFile *file, int fileno, off_t offset, int whence);
extern void BufFileTell(BufFile *file, int *fileno, off_t *offset);
extern int BufFileSeekBlock(BufFile *file, long blknum);