Hi! Here is the case.
Assume we have master-to-slave replication with shared_buffers set to 2 GB on the master and 4 GB on the slave. All of the data is written on the master, while reads are served from the slave.
Now suppose we decide to drop many tables — say 1000 or 10000 — not in a single transaction, but each table in a separate one. Because shared buffers are "plain" memory that must be scanned linearly, we have to walk the whole buffer pool once per relation, which leads to lag between the master and the slave.
In a real-world scenario such an issue leads not to minutes of lag, but hours. At the same time, PostgreSQL already has an efficient routine to delete many relations in a single transaction.
So, to get rid of this kind of issue, here is the idea: what if we do not invalidate the buffers of every dropped relation right away, but instead store the dropped relations in an array, preventing the corresponding shared buffers from being flushed. Then, once the array reaches its maximum size, we walk all shared buffers only once to "free" the shared buffers corresponding to the deleted relations.
Here are some values from the test which I made. Without patch: 1. (master 2 GB) - drop 1000 tables took 6 sec (slave 4 GB) - drop 1000 tables took 8 sec 2. (master 4 GB) - drop 1000 tables took 10 sec (slave 8 GB) - drop 1000 tables took 16 sec 3. (master 10 GB) - drop 1000 tables took 22 sec (slave 20 GB) - drop 1000 tables took 38 sec With patch: 1. (master 2 GB) - drop 1000 tables took 2 sec (slave 4 GB) - drop 1000 tables took 2 sec 2. (master 4 GB) - drop 1000 tables took 3 sec (slave 8 GB) - drop 1000 tables took 3 sec 3. (master 10 GB) - drop 1000 tables took 4 sec (slave 20 GB) - drop 1000 tables took 4 sec -- Max Orlov E-mail: [email protected]
init.sh
Description: application/shellscript
run-001.sh
Description: application/shellscript
run-001-core.sh
Description: application/shellscript
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 1f10a97dc78..e0d1fb6a824 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -66,8 +66,6 @@
#define BUF_WRITTEN 0x01
#define BUF_REUSABLE 0x02
-#define DROP_RELS_BSEARCH_THRESHOLD 20
-
typedef struct PrivateRefCountEntry
{
Buffer buffer;
@@ -410,6 +408,72 @@ ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
}
}
+/*
+ * Lazy relation delete mechanism.
+ *
+ * On relation drop there is no need to walk the shared buffers every time;
+ * we just record the dropped RelFileNode in an array.  RelFileNodes are
+ * accumulated in the RelNodeDeleted struct and invalidated in a single
+ * buffer-pool sweep later, once the number of deleted relations reaches
+ * LAZY_DELETE_ARRAY_SIZE.
+ */
+
+/* Max number of dropped relations to accumulate before a buffer-pool sweep. */
+#define LAZY_DELETE_ARRAY_SIZE 128
+
+/*
+ * Shared-memory wrapper for the array of lazily-deleted RelFileNodes.
+ * All accesses are protected by LazyRelDeleteLock.
+ */
+typedef struct RelNodeDeletedBuffer
+{
+	RelFileNode rnodes[LAZY_DELETE_ARRAY_SIZE]; /* dropped relations */
+	int			idx;		/* number of valid entries (next free slot) */
+} RelNodeDeletedBuffer;
+
+/* Backend-local pointer to the shared struct; set by InitRelNodeDeletedBuffer. */
+static RelNodeDeletedBuffer *RelNodeDeleted = NULL;
+
+/*
+ * Initialize the shared array of lazily-deleted relations.
+ *
+ * This is called once during shared-memory initialization (either in the
+ * postmaster, or in a standalone backend).
+ */
+void
+InitRelNodeDeletedBuffer(void)
+{
+	bool		found;
+
+	RelNodeDeleted = ShmemInitStruct("Lazy Deleted Relations",
+									 sizeof(RelNodeDeletedBuffer),
+									 &found);
+
+	/* Zero the array only when the segment is first created. */
+	if (!found)
+		MemSet(RelNodeDeleted, 0, sizeof(RelNodeDeletedBuffer));
+}
+
+/*
+ * Compute the amount of shared memory required for the buffer of lazily
+ * deleted relations.
+ */
+Size
+RelNodeDeletedBufferSize(void)
+{
+	/* Just one fixed-size struct; add_size guards against overflow. */
+	return add_size(0, sizeof(RelNodeDeletedBuffer));
+}
+
+/*
+ * Return true if "rnode" matches one of the first "n" entries of the shared
+ * lazy-deleted array (RelNodeDeleted->rnodes).
+ *
+ * Note: the caller must hold LazyRelDeleteLock (any mode) so the array
+ * cannot change underneath us.  This is a linear scan, acceptable because
+ * n is bounded by LAZY_DELETE_ARRAY_SIZE.
+ */
+static inline bool
+IsRelFileNodeDeleted(RelFileNode rnode, int n)
+{
+	int i;
+
+	for (i = 0; i < n; i++)
+		if (RelFileNodeEquals(rnode, RelNodeDeleted->rnodes[i]))
+			return true;
+
+	return false;
+}
+
/*
* BufferIsPinned
* True iff the buffer is pinned (also checks for valid buffer number).
@@ -452,6 +516,7 @@ static BufferDesc *BufferAlloc(SMgrRelation smgr,
BlockNumber blockNum,
BufferAccessStrategy strategy,
bool *foundPtr);
+static void InvalidateRelFileNodesBuffers(RelFileNode *nodes, int nnodes);
static void FlushBuffer(BufferDesc *buf, SMgrRelation reln);
static void AtProcExit_Buffers(int code, Datum arg);
static void CheckForBufferLeaks(void);
@@ -2690,6 +2755,22 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln)
char *bufToWrite;
uint32 buf_state;
+	/*
+	 * If this buffer belongs to a lazily-deleted relation, its contents are
+	 * obsolete and must never reach disk: clear the dirty and checkpoint
+	 * flags and pretend the flush succeeded.  A shared lock suffices here
+	 * because we only read the lazy-deleted array.
+	 *
+	 * NOTE(review): clearing BM_DIRTY without touching BM_JUST_DIRTIED may
+	 * interact with a concurrent re-dirtying of the buffer — confirm this
+	 * is safe before committing.
+	 */
+	LWLockAcquire(LazyRelDeleteLock, LW_SHARED);
+
+	if (IsRelFileNodeDeleted(buf->tag.rnode, RelNodeDeleted->idx))
+	{
+		buf_state = LockBufHdr(buf);
+		buf_state &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
+		UnlockBufHdr(buf, buf_state);
+		LWLockRelease(LazyRelDeleteLock);
+		return;
+	}
+
+	LWLockRelease(LazyRelDeleteLock);
+
/*
* Acquire the buffer's io_in_progress lock. If StartBufferIO returns
* false, then someone else flushed the buffer before we could, so we need
@@ -2993,6 +3074,43 @@ DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber *forkNum,
}
}
+/*
+ * InvalidateRelFileNodesBuffers
+ *		Invalidate all shared buffers belonging to any of the given
+ *		relfilenodes.
+ *
+ * nodes[] is sorted in place so that each buffer tag can be matched with a
+ * binary search; the array must therefore be writable.  Declared static to
+ * match the forward declaration (it is file-local).
+ */
+static void
+InvalidateRelFileNodesBuffers(RelFileNode *nodes, int nnodes)
+{
+	int			i;
+
+	pg_qsort(nodes, nnodes, sizeof(RelFileNode), rnode_comparator);
+
+	for (i = 0; i < NBuffers; i++)
+	{
+		RelFileNode *rnode = NULL;
+		BufferDesc *bufHdr = GetBufferDescriptor(i);
+		uint32		buf_state;
+
+		/*
+		 * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
+		 * and saves some cycles: the bsearch reads the buffer tag without
+		 * the header spinlock, and a positive hit is re-verified under the
+		 * lock below.
+		 */
+		rnode = bsearch((const void *) &(bufHdr->tag.rnode),
+						nodes, nnodes, sizeof(RelFileNode),
+						rnode_comparator);
+
+		/* buffer doesn't belong to any of the given relfilenodes; skip it */
+		if (rnode == NULL)
+			continue;
+
+		buf_state = LockBufHdr(bufHdr);
+		if (RelFileNodeEquals(bufHdr->tag.rnode, (*rnode)))
+			InvalidateBuffer(bufHdr);	/* releases spinlock */
+		else
+			UnlockBufHdr(bufHdr, buf_state);
+	}
+}
+
/* ---------------------------------------------------------------------
* DropRelFileNodesAllBuffers
*
@@ -3006,25 +3124,27 @@ void
DropRelFileNodesAllBuffers(RelFileNodeBackend *rnodes, int nnodes)
{
int i,
+ j,
n = 0;
RelFileNode *nodes;
- bool use_bsearch;
if (nnodes == 0)
return;
nodes = palloc(sizeof(RelFileNode) * nnodes); /* non-local relations */
- /* If it's a local relation, it's localbuf.c's problem. */
for (i = 0; i < nnodes; i++)
{
+ /* If it's a local relation, it's localbuf.c's problem. */
if (RelFileNodeBackendIsTemp(rnodes[i]))
{
if (rnodes[i].backend == MyBackendId)
DropRelFileNodeAllLocalBuffers(rnodes[i].node);
}
else
+ {
nodes[n++] = rnodes[i].node;
+ }
}
/*
@@ -3037,58 +3157,41 @@ DropRelFileNodesAllBuffers(RelFileNodeBackend *rnodes, int nnodes)
return;
}
- /*
- * For low number of relations to drop just use a simple walk through, to
- * save the bsearch overhead. The threshold to use is rather a guess than
- * an exactly determined value, as it depends on many factors (CPU and RAM
- * speeds, amount of shared buffers etc.).
- */
- use_bsearch = n > DROP_RELS_BSEARCH_THRESHOLD;
+	if (n > LAZY_DELETE_ARRAY_SIZE)
+	{
+		/* Too many to buffer lazily: sweep the buffer pool right away. */
+		InvalidateRelFileNodesBuffers(nodes, n);
+	}
+	else
+	{
+		int			idx;
-	/* sort the list of rnodes if necessary */
-	if (use_bsearch)
-		pg_qsort(nodes, n, sizeof(RelFileNode), rnode_comparator);
+		/*
+		 * We modify the shared lazy-deleted array below, so the lock must
+		 * be held in exclusive mode; a shared lock would let two backends
+		 * corrupt the array concurrently.
+		 */
+		LWLockAcquire(LazyRelDeleteLock, LW_EXCLUSIVE);
+		idx = RelNodeDeleted->idx;
+		Assert(idx < LAZY_DELETE_ARRAY_SIZE);
-	for (i = 0; i < NBuffers; i++)
-	{
-		RelFileNode *rnode = NULL;
-		BufferDesc *bufHdr = GetBufferDescriptor(i);
-		uint32		buf_state;
+		/*
+		 * Copy deleted relations into the lazy-deleted array, but watch the
+		 * size.  Copy from nodes[] (non-local relations only), not from
+		 * rnodes[], which may still contain temp relations already handled
+		 * by localbuf.c above.
+		 */
+		for (i = idx, j = 0; i < LAZY_DELETE_ARRAY_SIZE && j < n; i++, j++)
+			RelNodeDeleted->rnodes[i] = nodes[j];
-		/*
-		 * As in DropRelFileNodeBuffers, an unlocked precheck should be safe
-		 * and saves some cycles.
-		 */
+		idx += j;
-		if (!use_bsearch)
+		if (idx == LAZY_DELETE_ARRAY_SIZE)
 		{
-			int			j;
+			/* Array is full: invalidate everything accumulated so far. */
+			InvalidateRelFileNodesBuffers(RelNodeDeleted->rnodes, idx);
-			for (j = 0; j < n; j++)
-			{
-				if (RelFileNodeEquals(bufHdr->tag.rnode, nodes[j]))
-				{
-					rnode = &nodes[j];
-					break;
-				}
-			}
+			/* ... and copy the rest of the deleted relations. */
+			for (i = 0; j < n; i++, j++)
+				RelNodeDeleted->rnodes[i] = nodes[j];
+
+			RelNodeDeleted->idx = i;
 		}
 		else
 		{
-			rnode = bsearch((const void *) &(bufHdr->tag.rnode),
-							nodes, n, sizeof(RelFileNode),
-							rnode_comparator);
+			RelNodeDeleted->idx = idx;
 		}
-		/* buffer doesn't belong to any of the given relfilenodes; skip it */
-		if (rnode == NULL)
-			continue;
-
-		buf_state = LockBufHdr(bufHdr);
-		if (RelFileNodeEquals(bufHdr->tag.rnode, (*rnode)))
-			InvalidateBuffer(bufHdr);	/* releases spinlock */
-		else
-			UnlockBufHdr(bufHdr, buf_state);
+		Assert(RelNodeDeleted->idx < LAZY_DELETE_ARRAY_SIZE);
+		LWLockRelease(LazyRelDeleteLock);
}
pfree(nodes);
@@ -3133,6 +3236,22 @@ DropDatabaseBuffers(Oid dbid)
else
UnlockBufHdr(bufHdr, buf_state);
}
+
+	/*
+	 * Remove all relations of dbid from the lazy-deleted array.  The array
+	 * is modified here, so the lock must be taken in exclusive mode, not
+	 * shared.
+	 */
+	LWLockAcquire(LazyRelDeleteLock, LW_EXCLUSIVE);
+	{
+		int			idx = RelNodeDeleted->idx;
+
+		for (i = 0; i < idx; i++)
+		{
+			if (RelNodeDeleted->rnodes[i].dbNode == dbid)
+			{
+				/*
+				 * Swap-remove: move the last entry into this slot and stay
+				 * on the same index, since the moved-in entry may also
+				 * belong to dbid and must be examined too.
+				 */
+				RelNodeDeleted->rnodes[i--] = RelNodeDeleted->rnodes[--idx];
+			}
+		}
+		RelNodeDeleted->idx = idx;
+	}
+	LWLockRelease(LazyRelDeleteLock);
}
/* -----------------------------------------------------------------
@@ -3330,6 +3449,15 @@ FlushDatabaseBuffers(Oid dbid)
if (bufHdr->tag.rnode.dbNode != dbid)
continue;
+		/*
+		 * Skip buffers belonging to lazily-deleted relations: their
+		 * contents are obsolete and must not be flushed.  A shared lock is
+		 * sufficient because we only read the lazy-deleted array here.
+		 */
+		LWLockAcquire(LazyRelDeleteLock, LW_SHARED);
+
+		if (IsRelFileNodeDeleted(bufHdr->tag.rnode, RelNodeDeleted->idx))
+		{
+			LWLockRelease(LazyRelDeleteLock);
+			continue;
+		}
+
+		LWLockRelease(LazyRelDeleteLock);
ReservePrivateRefCountEntry();
buf_state = LockBufHdr(bufHdr);
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
index 4829953ee64..fdfdb78dff8 100644
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -120,6 +120,7 @@ CreateSharedMemoryAndSemaphores(void)
size = add_size(size, hash_estimate_size(SHMEM_INDEX_SIZE,
sizeof(ShmemIndexEnt)));
size = add_size(size, BufferShmemSize());
+ size = add_size(size, RelNodeDeletedBufferSize());
size = add_size(size, LockShmemSize());
size = add_size(size, PredicateLockShmemSize());
size = add_size(size, ProcGlobalShmemSize());
@@ -217,6 +218,7 @@ CreateSharedMemoryAndSemaphores(void)
SUBTRANSShmemInit();
MultiXactShmemInit();
InitBufferPool();
+ InitRelNodeDeletedBuffer();
/*
* Set up lock manager
diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt
index db478432291..a35e99190ff 100644
--- a/src/backend/storage/lmgr/lwlocknames.txt
+++ b/src/backend/storage/lmgr/lwlocknames.txt
@@ -49,3 +49,4 @@ MultiXactTruncationLock 41
OldSnapshotTimeMapLock 42
LogicalRepWorkerLock 43
CLogTruncationLock 44
+LazyRelDeleteLock 45
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 17b97f7e38f..9cb51bc5f0d 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -179,6 +179,7 @@ extern Buffer ReleaseAndReadBuffer(Buffer buffer, Relation relation,
BlockNumber blockNum);
extern void InitBufferPool(void);
+extern void InitRelNodeDeletedBuffer(void);
extern void InitBufferPoolAccess(void);
extern void InitBufferPoolBackend(void);
extern void AtEOXact_Buffers(bool isCommit);
@@ -205,6 +206,7 @@ extern XLogRecPtr BufferGetLSNAtomic(Buffer buffer);
extern void PrintPinnedBufs(void);
#endif
extern Size BufferShmemSize(void);
+extern Size RelNodeDeletedBufferSize(void);
extern void BufferGetTag(Buffer buffer, RelFileNode *rnode,
ForkNumber *forknum, BlockNumber *blknum);
