Hi,
I am working on using read streams in the CREATE DATABASE command when the
strategy is wal_log. RelationCopyStorageUsingBuffer() function is used in
this context. This function reads source buffers then copies them to the
destination buffers. I used read streams only when reading source buffers
because the destination buffers are read by 'RBM_ZERO_AND_LOCK' option, so
it is not important.
I created a ~6 GB table [1] and created a new database with the wal_log
strategy using the database that table was created in as a template [2]. My
benchmarking results are:
a. Timings:
patched:
12955.027 ms
12917.475 ms
13177.846 ms
12971.308 ms
13059.985 ms
master:
13156.375 ms
13054.071 ms
13151.607 ms
13152.633 ms
13160.538 ms
There is no difference in timings, the patched version is a tiny bit better
but it is negligible. I actually expected the patched version to be better
because there was no prefetching before, but the read stream API detects
sequential access and disables prefetching.
b. strace:
patched:
% time seconds usecs/call calls errors syscall
------ ----------- ----------- --------- --------- ----------------
68.02 3.749359 2 1285782 pwrite64
18.54 1.021734 21 46730 preadv
9.49 0.522889 826 633 fdatasync
2.55 0.140339 59 2368 pwritev
1.14 0.062583 409 153 fsync
master:
% time seconds usecs/call calls errors syscall
------ ----------- ----------- --------- --------- ----------------
59.71 3.831542 2 1288365 pwrite64
29.84 1.914540 2 747936 pread64
7.90 0.506843 837 605 fdatasync
1.58 0.101575 54 1856 pwritev
0.75 0.048431 400 121 fsync
There are fewer (~1/16) read system calls in the patched version.
c. perf:
patched:
- 97.83% 1.13% postgres postgres [.]
RelationCopyStorageUsingBuffer
- 97.83% RelationCopyStorageUsingBuffer
- 44.28% ReadBufferWithoutRelcache
+ 42.20% GetVictimBuffer
0.81% ZeroBuffer
+ 31.86% log_newpage_buffer
- 19.51% read_stream_next_buffer
- 17.92% WaitReadBuffers
+ 17.61% mdreadv
- 1.47% read_stream_start_pending_read
+ 1.46% StartReadBuffers
master:
- 97.68% 0.57% postgres postgres [.]
RelationCopyStorageUsingBuffer
- RelationCopyStorageUsingBuffer
- 65.48% ReadBufferWithoutRelcache
+ 41.16% GetVictimBuffer
- 20.42% WaitReadBuffers
+ 19.90% mdreadv
+ 1.85% StartReadBuffer
0.75% ZeroBuffer
+ 30.82% log_newpage_buffer
Patched version spends less CPU time in read calls and more CPU time in
other calls such as write.
There are three patch files attached. First two are optimization and adding
a way to create a read stream object by using SMgrRelation, these are
already proposed in the streaming I/O thread [3]. The third one is the
actual patch file.
Any kind of feedback would be appreciated.
[1] CREATE TABLE t as select repeat('a', 100) || i || repeat('b', 500) as
filler from generate_series(1, 9000000) as i;
[2] CREATE DATABASE test_1 STRATEGY 'wal_log' TEMPLATE test;
[3]
https://www.postgresql.org/message-id/CAN55FZ1yGvCzCW_aufu83VimdEYHbG_zuOY3J9JL-nBptyJyKA%40mail.gmail.com
--
Regards,
Nazir Bilal Yavuz
Microsoft
From edcfacec40a70a8747c5f18777b6c28b0fccba7a Mon Sep 17 00:00:00 2001
From: Nazir Bilal Yavuz <[email protected]>
Date: Sun, 7 Apr 2024 22:33:36 +0300
Subject: [PATCH v1 1/3] Refactor PinBufferForBlock() to remove if checks about
persistence
There are if checks in PinBufferForBlock() function to set persistence
of the relation and this function is called for the each block in the
relation. Instead of that, set persistence of the relation before
PinBufferForBlock() function.
---
src/backend/storage/aio/read_stream.c | 2 +-
src/backend/storage/buffer/bufmgr.c | 31 +++++++++++----------------
2 files changed, 14 insertions(+), 19 deletions(-)
diff --git a/src/backend/storage/aio/read_stream.c b/src/backend/storage/aio/read_stream.c
index f54dacdd914..d155dde5ce3 100644
--- a/src/backend/storage/aio/read_stream.c
+++ b/src/backend/storage/aio/read_stream.c
@@ -549,7 +549,7 @@ read_stream_begin_relation(int flags,
{
stream->ios[i].op.rel = rel;
stream->ios[i].op.smgr = RelationGetSmgr(rel);
- stream->ios[i].op.smgr_persistence = 0;
+ stream->ios[i].op.smgr_persistence = rel->rd_rel->relpersistence;
stream->ios[i].op.forknum = forknum;
stream->ios[i].op.strategy = strategy;
}
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 901b7230fb9..17cb7127f99 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -1067,24 +1067,10 @@ PinBufferForBlock(Relation rel,
BufferDesc *bufHdr;
IOContext io_context;
IOObject io_object;
- char persistence;
Assert(blockNum != P_NEW);
- /*
- * If there is no Relation it usually implies recovery and thus permanent,
- * but we take an argmument because CreateAndCopyRelationData can reach us
- * with only an SMgrRelation for an unlogged relation that we don't want
- * to flag with BM_PERMANENT.
- */
- if (rel)
- persistence = rel->rd_rel->relpersistence;
- else if (smgr_persistence == 0)
- persistence = RELPERSISTENCE_PERMANENT;
- else
- persistence = smgr_persistence;
-
- if (persistence == RELPERSISTENCE_TEMP)
+ if (smgr_persistence == RELPERSISTENCE_TEMP)
{
io_context = IOCONTEXT_NORMAL;
io_object = IOOBJECT_TEMP_RELATION;
@@ -1101,7 +1087,7 @@ PinBufferForBlock(Relation rel,
smgr->smgr_rlocator.locator.relNumber,
smgr->smgr_rlocator.backend);
- if (persistence == RELPERSISTENCE_TEMP)
+ if (smgr_persistence == RELPERSISTENCE_TEMP)
{
bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, foundPtr);
if (*foundPtr)
@@ -1109,7 +1095,7 @@ PinBufferForBlock(Relation rel,
}
else
{
- bufHdr = BufferAlloc(smgr, persistence, forkNum, blockNum,
+ bufHdr = BufferAlloc(smgr, smgr_persistence, forkNum, blockNum,
strategy, foundPtr, io_context);
if (*foundPtr)
pgBufferUsage.shared_blks_hit++;
@@ -1157,6 +1143,7 @@ ReadBuffer_common(Relation rel, SMgrRelation smgr, char smgr_persistence,
ReadBuffersOperation operation;
Buffer buffer;
int flags;
+ char persistence;
/*
* Backward compatibility path, most code should use ExtendBufferedRel()
@@ -1195,7 +1182,15 @@ ReadBuffer_common(Relation rel, SMgrRelation smgr, char smgr_persistence,
flags = 0;
operation.smgr = smgr;
operation.rel = rel;
- operation.smgr_persistence = smgr_persistence;
+
+ if (rel)
+ persistence = rel->rd_rel->relpersistence;
+ else if (smgr_persistence == 0)
+ persistence = RELPERSISTENCE_PERMANENT;
+ else
+ persistence = smgr_persistence;
+ operation.smgr_persistence = persistence;
+
operation.forknum = forkNum;
operation.strategy = strategy;
if (StartReadBuffer(&operation,
--
2.43.0
From 2c6947231ba6377f291a1bd7c9cee55e5833fdbf Mon Sep 17 00:00:00 2001
From: Nazir Bilal Yavuz <[email protected]>
Date: Mon, 8 Apr 2024 00:14:52 +0300
Subject: [PATCH v1 2/3] Add a way to create read stream object by using
SMgrRelation
Currently read stream object can be created only by using Relation,
there is no way to create it by using SMgrRelation.
To achieve that, read_stream_begin_impl() function is created and
contents of the read_stream_begin_relation() is moved into this
function.
Both read_stream_begin_relation() and read_stream_begin_smgr_relation()
calls read_stream_begin_impl() by Relation and SMgrRelation
respectively.
---
src/include/storage/read_stream.h | 9 +++
src/backend/storage/aio/read_stream.c | 83 ++++++++++++++++++++++-----
2 files changed, 78 insertions(+), 14 deletions(-)
diff --git a/src/include/storage/read_stream.h b/src/include/storage/read_stream.h
index fae09d2b4cc..c7d688d8c9c 100644
--- a/src/include/storage/read_stream.h
+++ b/src/include/storage/read_stream.h
@@ -15,6 +15,7 @@
#define READ_STREAM_H
#include "storage/bufmgr.h"
+#include "storage/smgr.h"
/* Default tuning, reasonable for many users. */
#define READ_STREAM_DEFAULT 0x00
@@ -56,6 +57,14 @@ extern ReadStream *read_stream_begin_relation(int flags,
ReadStreamBlockNumberCB callback,
void *callback_private_data,
size_t per_buffer_data_size);
+extern ReadStream *read_stream_begin_smgr_relation(int flags,
+ BufferAccessStrategy strategy,
+ SMgrRelation smgr,
+ char smgr_persistence,
+ ForkNumber forknum,
+ ReadStreamBlockNumberCB callback,
+ void *callback_private_data,
+ size_t per_buffer_data_size);
extern Buffer read_stream_next_buffer(ReadStream *stream, void **per_buffer_private);
extern void read_stream_reset(ReadStream *stream);
extern void read_stream_end(ReadStream *stream);
diff --git a/src/backend/storage/aio/read_stream.c b/src/backend/storage/aio/read_stream.c
index d155dde5ce3..8dd100f016c 100644
--- a/src/backend/storage/aio/read_stream.c
+++ b/src/backend/storage/aio/read_stream.c
@@ -406,14 +406,16 @@ read_stream_look_ahead(ReadStream *stream, bool suppress_advice)
* write extra data for each block into the space provided to it. It will
* also receive callback_private_data for its own purposes.
*/
-ReadStream *
-read_stream_begin_relation(int flags,
- BufferAccessStrategy strategy,
- Relation rel,
- ForkNumber forknum,
- ReadStreamBlockNumberCB callback,
- void *callback_private_data,
- size_t per_buffer_data_size)
+static ReadStream *
+read_stream_begin_impl(int flags,
+ BufferAccessStrategy strategy,
+ Relation rel,
+ SMgrRelation smgr,
+ char smgr_persistence,
+ ForkNumber forknum,
+ ReadStreamBlockNumberCB callback,
+ void *callback_private_data,
+ size_t per_buffer_data_size)
{
ReadStream *stream;
size_t size;
@@ -422,9 +424,6 @@ read_stream_begin_relation(int flags,
int strategy_pin_limit;
uint32 max_pinned_buffers;
Oid tablespace_id;
- SMgrRelation smgr;
-
- smgr = RelationGetSmgr(rel);
/*
* Decide how many I/Os we will allow to run at the same time. That
@@ -434,7 +433,7 @@ read_stream_begin_relation(int flags,
*/
tablespace_id = smgr->smgr_rlocator.locator.spcOid;
if (!OidIsValid(MyDatabaseId) ||
- IsCatalogRelation(rel) ||
+ (rel && IsCatalogRelation(rel)) ||
IsCatalogRelationOid(smgr->smgr_rlocator.locator.relNumber))
{
/*
@@ -548,8 +547,8 @@ read_stream_begin_relation(int flags,
for (int i = 0; i < max_ios; ++i)
{
stream->ios[i].op.rel = rel;
- stream->ios[i].op.smgr = RelationGetSmgr(rel);
- stream->ios[i].op.smgr_persistence = rel->rd_rel->relpersistence;
+ stream->ios[i].op.smgr = smgr;
+ stream->ios[i].op.smgr_persistence = smgr_persistence;
stream->ios[i].op.forknum = forknum;
stream->ios[i].op.strategy = strategy;
}
@@ -557,6 +556,62 @@ read_stream_begin_relation(int flags,
return stream;
}
+/*
+ * Create a new read stream for reading a relation.
+ * See read_stream_begin_impl() for the detailed explanation.
+ */
+ReadStream *
+read_stream_begin_relation(int flags,
+ BufferAccessStrategy strategy,
+ Relation rel,
+ ForkNumber forknum,
+ ReadStreamBlockNumberCB callback,
+ void *callback_private_data,
+ size_t per_buffer_data_size)
+{
+ return read_stream_begin_impl(flags,
+ strategy,
+ rel,
+ RelationGetSmgr(rel),
+ rel->rd_rel->relpersistence,
+ forknum,
+ callback,
+ callback_private_data,
+ per_buffer_data_size);
+}
+
+/*
+ * Create a new read stream for reading a SMgr relation.
+ * See read_stream_begin_impl() for the detailed explanation.
+ */
+ReadStream *
+read_stream_begin_smgr_relation(int flags,
+ BufferAccessStrategy strategy,
+ SMgrRelation smgr,
+ char smgr_persistence,
+ ForkNumber forknum,
+ ReadStreamBlockNumberCB callback,
+ void *callback_private_data,
+ size_t per_buffer_data_size)
+{
+ char persistence;
+
+ if (smgr_persistence == 0)
+ persistence = RELPERSISTENCE_PERMANENT;
+ else
+ persistence = smgr_persistence;
+
+ return read_stream_begin_impl(flags,
+ strategy,
+ NULL,
+ smgr,
+ persistence,
+ forknum,
+ callback,
+ callback_private_data,
+ per_buffer_data_size);
+}
+
/*
* Pull one pinned buffer out of a stream. Each call returns successive
* blocks in the order specified by the callback. If per_buffer_data_size was
--
2.43.0
From b97e1f39ef53c331090597aaf2d38d66a9b49e89 Mon Sep 17 00:00:00 2001
From: Nazir Bilal Yavuz <[email protected]>
Date: Tue, 16 Apr 2024 11:36:49 +0300
Subject: [PATCH v1 3/3] Use read streams in CREATE DATABASE when strategy is
wal_log
CREATE DABASE command uses RelationCopyStorageUsingBuffer() function to
copy buffers when the strategy is wal_log. This function reads source
buffers then copies them to the destination buffers. Read streams are
used only only while reading source buffers because the destination
buffer is read by 'RBM_ZERO_AND_LOCK' option, so it is not important.
---
src/backend/storage/buffer/bufmgr.c | 53 ++++++++++++++++++++++++++---
1 file changed, 49 insertions(+), 4 deletions(-)
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 17cb7127f99..e02b699f6cf 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -54,6 +54,7 @@
#include "storage/ipc.h"
#include "storage/lmgr.h"
#include "storage/proc.h"
+#include "storage/read_stream.h"
#include "storage/smgr.h"
#include "storage/standby.h"
#include "utils/memdebug.h"
@@ -135,6 +136,33 @@ typedef struct SMgrSortArray
SMgrRelation srel;
} SMgrSortArray;
+/*
+ * Helper struct for read stream object used in
+ * RelationCopyStorageUsingBuffer() function.
+ */
+struct copy_storage_using_buffer_read_stream_private
+{
+ BlockNumber blocknum;
+ int64 last_block;
+};
+
+/*
+ * Callback function to get next block for read stream object used in
+ * RelationCopyStorageUsingBuffer() function.
+ */
+static BlockNumber
+copy_storage_using_buffer_read_stream_next_block(ReadStream *stream,
+ void *callback_private_data,
+ void *per_buffer_data)
+{
+ struct copy_storage_using_buffer_read_stream_private *p = callback_private_data;
+
+ if (p->blocknum < p->last_block)
+ return p->blocknum++;
+
+ return InvalidBlockNumber;
+}
+
/* GUC variables */
bool zero_damaged_pages = false;
int bgwriter_lru_maxpages = 100;
@@ -4639,6 +4667,9 @@ RelationCopyStorageUsingBuffer(RelFileLocator srclocator,
PGIOAlignedBlock buf;
BufferAccessStrategy bstrategy_src;
BufferAccessStrategy bstrategy_dst;
+ struct copy_storage_using_buffer_read_stream_private p;
+ ReadStream *src_stream;
+ SMgrRelation src_smgr;
/*
* In general, we want to write WAL whenever wal_level > 'minimal', but we
@@ -4667,19 +4698,31 @@ RelationCopyStorageUsingBuffer(RelFileLocator srclocator,
bstrategy_src = GetAccessStrategy(BAS_BULKREAD);
bstrategy_dst = GetAccessStrategy(BAS_BULKWRITE);
+ /* Initalize streaming read */
+ p.blocknum = 0;
+ p.last_block = nblocks;
+ src_smgr = smgropen(srclocator, INVALID_PROC_NUMBER);
+ src_stream = read_stream_begin_smgr_relation(READ_STREAM_FULL,
+ bstrategy_src,
+ src_smgr,
+ permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED,
+ forkNum,
+ copy_storage_using_buffer_read_stream_next_block,
+ &p,
+ 0);
+
/* Iterate over each block of the source relation file. */
for (blkno = 0; blkno < nblocks; blkno++)
{
CHECK_FOR_INTERRUPTS();
/* Read block from source relation. */
- srcBuf = ReadBufferWithoutRelcache(srclocator, forkNum, blkno,
- RBM_NORMAL, bstrategy_src,
- permanent);
+ srcBuf = read_stream_next_buffer(src_stream, NULL);
LockBuffer(srcBuf, BUFFER_LOCK_SHARE);
srcPage = BufferGetPage(srcBuf);
- dstBuf = ReadBufferWithoutRelcache(dstlocator, forkNum, blkno,
+ dstBuf = ReadBufferWithoutRelcache(dstlocator, forkNum,
+ BufferGetBlockNumber(srcBuf),
RBM_ZERO_AND_LOCK, bstrategy_dst,
permanent);
dstPage = BufferGetPage(dstBuf);
@@ -4699,6 +4742,8 @@ RelationCopyStorageUsingBuffer(RelFileLocator srclocator,
UnlockReleaseBuffer(dstBuf);
UnlockReleaseBuffer(srcBuf);
}
+ Assert(read_stream_next_buffer(src_stream, NULL) == InvalidBuffer);
+ read_stream_end(src_stream);
FreeAccessStrategy(bstrategy_src);
FreeAccessStrategy(bstrategy_dst);
--
2.43.0