On 02/01/2019 17:35, Heikki Linnakangas wrote:
On 28/10/2018 19:32, Andrey Borodin wrote:
Hi everyone!
2 окт. 2018 г., в 6:14, Michael Paquier <mich...@paquier.xyz> написал(а):
Andrey, your latest patch does not apply. I am moving this to the next
CF, waiting for your input.
I'm doing preps for CF.
Here's rebased version.
Thanks, I had another look at these.
Here are new patch versions, with the fixes I mentioned. Forgot to
attach these earlier.
- Heikki
>From 46456dbf1d07aaf3e6963035a02aaa060decace3 Mon Sep 17 00:00:00 2001
From: Andrey Borodin <amboro...@acm.org>
Date: Sun, 28 Oct 2018 22:20:58 +0500
Subject: [PATCH 1/2] Physical GiST scan in VACUUM v18-heikki
---
src/backend/access/gist/gist.c | 8 +-
src/backend/access/gist/gistvacuum.c | 430 +++++++++++++++------------
2 files changed, 247 insertions(+), 191 deletions(-)
diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index a2cb84800e8..d42e810c6b3 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -38,7 +38,7 @@ static bool gistinserttuples(GISTInsertState *state, GISTInsertStack *stack,
bool unlockbuf, bool unlockleftchild);
static void gistfinishsplit(GISTInsertState *state, GISTInsertStack *stack,
GISTSTATE *giststate, List *splitinfo, bool releasebuf);
-static void gistvacuumpage(Relation rel, Page page, Buffer buffer,
+static void gistprunepage(Relation rel, Page page, Buffer buffer,
Relation heapRel);
@@ -261,7 +261,7 @@ gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
*/
if (is_split && GistPageIsLeaf(page) && GistPageHasGarbage(page))
{
- gistvacuumpage(rel, page, buffer, heapRel);
+ gistprunepage(rel, page, buffer, heapRel);
is_split = gistnospace(page, itup, ntup, oldoffnum, freespace);
}
@@ -1544,11 +1544,11 @@ freeGISTstate(GISTSTATE *giststate)
}
/*
- * gistvacuumpage() -- try to remove LP_DEAD items from the given page.
+ * gistprunepage() -- try to remove LP_DEAD items from the given page.
* Function assumes that buffer is exclusively locked.
*/
static void
-gistvacuumpage(Relation rel, Page page, Buffer buffer, Relation heapRel)
+gistprunepage(Relation rel, Page page, Buffer buffer, Relation heapRel)
{
OffsetNumber deletable[MaxIndexTuplesPerPage];
int ndeletable = 0;
diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c
index 5948218c779..4fb32bf76bf 100644
--- a/src/backend/access/gist/gistvacuum.c
+++ b/src/backend/access/gist/gistvacuum.c
@@ -21,6 +21,34 @@
#include "storage/indexfsm.h"
#include "storage/lmgr.h"
+/* Working state needed by gistbulkdelete */
+typedef struct
+{
+ IndexVacuumInfo *info;
+ IndexBulkDeleteResult *stats;
+ IndexBulkDeleteCallback callback;
+ void *callback_state;
+ GistNSN startNSN;
+ BlockNumber totFreePages; /* true total # of free pages */
+} GistVacState;
+
+static void gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
+ IndexBulkDeleteCallback callback, void *callback_state);
+static void gistvacuumpage(GistVacState *vstate, BlockNumber blkno,
+ BlockNumber orig_blkno);
+
+IndexBulkDeleteResult *
+gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
+ IndexBulkDeleteCallback callback, void *callback_state)
+{
+ /* allocate stats if first time through, else re-use existing struct */
+ if (stats == NULL)
+ stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
+
+ gistvacuumscan(info, stats, callback, callback_state);
+
+ return stats;
+}
/*
* VACUUM cleanup: update FSM
@@ -28,104 +56,36 @@
IndexBulkDeleteResult *
gistvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
{
- Relation rel = info->index;
- BlockNumber npages,
- blkno;
- BlockNumber totFreePages;
- double tuplesCount;
- bool needLock;
-
/* No-op in ANALYZE ONLY mode */
if (info->analyze_only)
return stats;
- /* Set up all-zero stats if gistbulkdelete wasn't called */
+ /*
+ * If gistbulkdelete was called, we need not do anything, just return the
+ * stats from the latest gistbulkdelete call. If it wasn't called, we
+ * still need to do a pass over the index, to obtain index statistics.
+ */
if (stats == NULL)
+ {
stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
+ gistvacuumscan(info, stats, NULL, NULL);
+ }
/*
- * Need lock unless it's local to this backend.
+ * It's quite possible for us to be fooled by concurrent page splits into
+ * double-counting some index tuples, so disbelieve any total that exceeds
+ * the underlying heap's count ... if we know that accurately. Otherwise
+ * this might just make matters worse.
*/
- needLock = !RELATION_IS_LOCAL(rel);
-
- /* try to find deleted pages */
- if (needLock)
- LockRelationForExtension(rel, ExclusiveLock);
- npages = RelationGetNumberOfBlocks(rel);
- if (needLock)
- UnlockRelationForExtension(rel, ExclusiveLock);
-
- totFreePages = 0;
- tuplesCount = 0;
- for (blkno = GIST_ROOT_BLKNO + 1; blkno < npages; blkno++)
+ if (!info->estimated_count)
{
- Buffer buffer;
- Page page;
-
- vacuum_delay_point();
-
- buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
- info->strategy);
- LockBuffer(buffer, GIST_SHARE);
- page = (Page) BufferGetPage(buffer);
-
- if (PageIsNew(page) || GistPageIsDeleted(page))
- {
- totFreePages++;
- RecordFreeIndexPage(rel, blkno);
- }
- else if (GistPageIsLeaf(page))
- {
- /* count tuples in index (considering only leaf tuples) */
- tuplesCount += PageGetMaxOffsetNumber(page);
- }
- UnlockReleaseBuffer(buffer);
+ if (stats->num_index_tuples > info->num_heap_tuples)
+ stats->num_index_tuples = info->num_heap_tuples;
}
- /* Finally, vacuum the FSM */
- IndexFreeSpaceMapVacuum(info->index);
-
- /* return statistics */
- stats->pages_free = totFreePages;
- if (needLock)
- LockRelationForExtension(rel, ExclusiveLock);
- stats->num_pages = RelationGetNumberOfBlocks(rel);
- if (needLock)
- UnlockRelationForExtension(rel, ExclusiveLock);
- stats->num_index_tuples = tuplesCount;
- stats->estimated_count = false;
-
return stats;
}
-typedef struct GistBDItem
-{
- GistNSN parentlsn;
- BlockNumber blkno;
- struct GistBDItem *next;
-} GistBDItem;
-
-static void
-pushStackIfSplited(Page page, GistBDItem *stack)
-{
- GISTPageOpaque opaque = GistPageGetOpaque(page);
-
- if (stack->blkno != GIST_ROOT_BLKNO && !XLogRecPtrIsInvalid(stack->parentlsn) &&
- (GistFollowRight(page) || stack->parentlsn < GistPageGetNSN(page)) &&
- opaque->rightlink != InvalidBlockNumber /* sanity check */ )
- {
- /* split page detected, install right link to the stack */
-
- GistBDItem *ptr = (GistBDItem *) palloc(sizeof(GistBDItem));
-
- ptr->blkno = opaque->rightlink;
- ptr->parentlsn = stack->parentlsn;
- ptr->next = stack->next;
- stack->next = ptr;
- }
-}
-
-
/*
* Bulk deletion of all index entries pointing to a set of heap tuples and
* check invalid tuples left after upgrade.
@@ -134,141 +94,237 @@ pushStackIfSplited(Page page, GistBDItem *stack)
*
* Result: a palloc'd struct containing statistical info for VACUUM displays.
*/
-IndexBulkDeleteResult *
-gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
+static void
+gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
IndexBulkDeleteCallback callback, void *callback_state)
{
Relation rel = info->index;
- GistBDItem *stack,
- *ptr;
+ GistVacState vstate;
+ BlockNumber num_pages;
+ bool needLock;
+ BlockNumber blkno;
- /* first time through? */
- if (stats == NULL)
- stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
- /* we'll re-count the tuples each time */
+ /*
+ * Reset counts that will be incremented during the scan; needed in case
+ * of multiple scans during a single VACUUM command.
+ */
stats->estimated_count = false;
stats->num_index_tuples = 0;
+ stats->pages_deleted = 0;
+
+ /* Set up info to pass down to gistvacuumpage */
+ vstate.info = info;
+ vstate.stats = stats;
+ vstate.callback = callback;
+ vstate.callback_state = callback_state;
+ if (RelationNeedsWAL(rel))
+ vstate.startNSN = GetInsertRecPtr();
+ else
+ vstate.startNSN = gistGetFakeLSN(rel);
+ vstate.totFreePages = 0;
- stack = (GistBDItem *) palloc0(sizeof(GistBDItem));
- stack->blkno = GIST_ROOT_BLKNO;
-
- while (stack)
- {
- Buffer buffer;
- Page page;
- OffsetNumber i,
- maxoff;
- IndexTuple idxtuple;
- ItemId iid;
-
- buffer = ReadBufferExtended(rel, MAIN_FORKNUM, stack->blkno,
- RBM_NORMAL, info->strategy);
- LockBuffer(buffer, GIST_SHARE);
- gistcheckpage(rel, buffer);
- page = (Page) BufferGetPage(buffer);
-
- if (GistPageIsLeaf(page))
- {
- OffsetNumber todelete[MaxOffsetNumber];
- int ntodelete = 0;
+ /*
+ * Need lock unless it's local to this backend.
+ */
+ needLock = !RELATION_IS_LOCAL(rel);
- LockBuffer(buffer, GIST_UNLOCK);
- LockBuffer(buffer, GIST_EXCLUSIVE);
+ /*
+ * The outer loop iterates over all index pages, in physical order (we
+ * hope the kernel will cooperate in providing read-ahead for speed). It
+ * is critical that we visit all leaf pages, including ones added after we
+ * start the scan, else we might fail to delete some deletable tuples.
+ * Hence, we must repeatedly check the relation length. We must acquire
+ * the relation-extension lock while doing so to avoid a race condition:
+ * if someone else is extending the relation, there is a window where
+ * bufmgr/smgr have created a new all-zero page but it hasn't yet been
+ * write-locked by gistNewBuffer(). If we manage to scan such a page
+ * here, we'll improperly assume it can be recycled. Taking the lock
+ * synchronizes things enough to prevent a problem: either num_pages won't
+ * include the new page, or gistNewBuffer already has write lock on the
+ * buffer and it will be fully initialized before we can examine it. (See
+ * also vacuumlazy.c, which has the same issue.) Also, we need not worry
+ * if a page is added immediately after we look; the page splitting code
+ * already has write-lock on the left page before it adds a right page, so
+ * we must already have processed any tuples due to be moved into such a
+ * page.
+ *
+ * We can skip locking for new or temp relations, however, since no one
+ * else could be accessing them.
+ */
+ needLock = !RELATION_IS_LOCAL(rel);
- page = (Page) BufferGetPage(buffer);
- if (stack->blkno == GIST_ROOT_BLKNO && !GistPageIsLeaf(page))
- {
- /* only the root can become non-leaf during relock */
- UnlockReleaseBuffer(buffer);
- /* one more check */
- continue;
- }
+ blkno = GIST_ROOT_BLKNO;
+ for (;;)
+ {
+ /* Get the current relation length */
+ if (needLock)
+ LockRelationForExtension(rel, ExclusiveLock);
+ num_pages = RelationGetNumberOfBlocks(rel);
+ if (needLock)
+ UnlockRelationForExtension(rel, ExclusiveLock);
+
+ /* Quit if we've scanned the whole relation */
+ if (blkno >= num_pages)
+ break;
+ /* Iterate over pages, then loop back to recheck length */
+ for (; blkno < num_pages; blkno++)
+ gistvacuumpage(&vstate, blkno, blkno);
+ }
- /*
- * check for split proceeded after look at parent, we should check
- * it after relock
- */
- pushStackIfSplited(page, stack);
+ /*
+ * If we found any recyclable pages (and recorded them in the FSM), then
+ * forcibly update the upper-level FSM pages to ensure that searchers can
+ * find them. It's possible that the pages were also found during
+ * previous scans and so this is a waste of time, but it's cheap enough
+ * relative to scanning the index that it shouldn't matter much, and
+ * making sure that free pages are available sooner not later seems
+ * worthwhile.
+ *
+ * Note that if no recyclable pages exist, we don't bother vacuuming the
+ * FSM at all.
+ */
+ if (vstate.totFreePages > 0)
+ IndexFreeSpaceMapVacuum(rel);
- /*
- * Remove deletable tuples from page
- */
+ /* update statistics */
+ stats->num_pages = num_pages;
+ stats->pages_free = vstate.totFreePages;
+}
- maxoff = PageGetMaxOffsetNumber(page);
+/*
+ * gistvacuumpage --- VACUUM one page
+ *
+ * This processes a single page for gistbulkdelete(). In some cases we
+ * must go back and re-examine previously-scanned pages; this routine
+ * recurses when necessary to handle that case.
+ *
+ * blkno is the page to process. orig_blkno is the highest block number
+ * reached by the outer gistvacuumscan loop (the same as blkno, unless we
+ * are recursing to re-examine a previous page).
+ */
+static void
+gistvacuumpage(GistVacState *vstate, BlockNumber blkno, BlockNumber orig_blkno)
+{
+ IndexVacuumInfo *info = vstate->info;
+ IndexBulkDeleteResult *stats = vstate->stats;
+ IndexBulkDeleteCallback callback = vstate->callback;
+ void *callback_state = vstate->callback_state;
+ Relation rel = info->index;
+ Buffer buffer;
+ Page page;
+ BlockNumber recurse_to;
- for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
- {
- iid = PageGetItemId(page, i);
- idxtuple = (IndexTuple) PageGetItem(page, iid);
+restart:
+ recurse_to = InvalidBlockNumber;
- if (callback(&(idxtuple->t_tid), callback_state))
- todelete[ntodelete++] = i;
- else
- stats->num_index_tuples += 1;
- }
+ /* call vacuum_delay_point while not holding any buffer lock */
+ vacuum_delay_point();
- stats->tuples_removed += ntodelete;
+ buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
+ info->strategy);
- if (ntodelete)
- {
- START_CRIT_SECTION();
+ /*
+ * We are not going to stay here for a long time, so aggressively grab an
+ * exclusive lock.
+ */
+ LockBuffer(buffer, GIST_EXCLUSIVE);
+ page = (Page) BufferGetPage(buffer);
- MarkBufferDirty(buffer);
+ if (PageIsNew(page) || GistPageIsDeleted(page))
+ {
+ UnlockReleaseBuffer(buffer);
+ vstate->totFreePages++;
+ RecordFreeIndexPage(rel, blkno);
+ return;
+ }
- PageIndexMultiDelete(page, todelete, ntodelete);
- GistMarkTuplesDeleted(page);
+ if (GistPageIsLeaf(page))
+ {
+ OffsetNumber todelete[MaxOffsetNumber];
+ int ntodelete = 0;
+ GISTPageOpaque opaque = GistPageGetOpaque(page);
+ OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
+
+ /*
+ * Check whether we need to recurse back to earlier pages. What we
+ * are concerned about is a page split that happened since we started
+ * the vacuum scan. If the split moved some tuples to a lower page
+ * then we might have missed 'em. If so, set up for tail recursion.
+ */
+ if ((GistFollowRight(page) ||
+ vstate->startNSN < GistPageGetNSN(page)) &&
+ (opaque->rightlink != InvalidBlockNumber) &&
+ (opaque->rightlink < orig_blkno))
+ {
+ recurse_to = opaque->rightlink;
+ }
- if (RelationNeedsWAL(rel))
- {
- XLogRecPtr recptr;
+ /*
+ * Scan over all items to see which ones need to be deleted according
+ * to the callback function.
+ */
+ if (callback)
+ {
+ OffsetNumber off;
- recptr = gistXLogUpdate(buffer,
- todelete, ntodelete,
- NULL, 0, InvalidBuffer);
- PageSetLSN(page, recptr);
- }
- else
- PageSetLSN(page, gistGetFakeLSN(rel));
+ for (off = FirstOffsetNumber; off <= maxoff; off = OffsetNumberNext(off))
+ {
+ ItemId iid = PageGetItemId(page, off);
+ IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid);
- END_CRIT_SECTION();
+ if (callback(&(idxtuple->t_tid), callback_state))
+ todelete[ntodelete++] = off;
}
-
}
- else
+
+ /*
+ * Apply any needed deletes. We issue just one WAL record per page,
+ * so as to minimize WAL traffic.
+ */
+ if (ntodelete)
{
- /* check for split proceeded after look at parent */
- pushStackIfSplited(page, stack);
+ START_CRIT_SECTION();
- maxoff = PageGetMaxOffsetNumber(page);
+ MarkBufferDirty(buffer);
- for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
+ PageIndexMultiDelete(page, todelete, ntodelete);
+ GistMarkTuplesDeleted(page);
+
+ if (RelationNeedsWAL(rel))
{
- iid = PageGetItemId(page, i);
- idxtuple = (IndexTuple) PageGetItem(page, iid);
-
- ptr = (GistBDItem *) palloc(sizeof(GistBDItem));
- ptr->blkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
- ptr->parentlsn = BufferGetLSNAtomic(buffer);
- ptr->next = stack->next;
- stack->next = ptr;
-
- if (GistTupleIsInvalid(idxtuple))
- ereport(LOG,
- (errmsg("index \"%s\" contains an inner tuple marked as invalid",
- RelationGetRelationName(rel)),
- errdetail("This is caused by an incomplete page split at crash recovery before upgrading to PostgreSQL 9.1."),
- errhint("Please REINDEX it.")));
+ XLogRecPtr recptr;
+
+ recptr = gistXLogUpdate(buffer,
+ todelete, ntodelete,
+ NULL, 0, InvalidBuffer);
+ PageSetLSN(page, recptr);
}
- }
+ else
+ PageSetLSN(page, gistGetFakeLSN(rel));
- UnlockReleaseBuffer(buffer);
+ END_CRIT_SECTION();
+
+ stats->tuples_removed += ntodelete;
+ /* must recompute maxoff */
+ maxoff = PageGetMaxOffsetNumber(page);
+ }
- ptr = stack->next;
- pfree(stack);
- stack = ptr;
+ stats->num_index_tuples += maxoff - FirstOffsetNumber + 1;
- vacuum_delay_point();
}
- return stats;
+ UnlockReleaseBuffer(buffer);
+
+ /*
+ * This is really tail recursion, but if the compiler is too stupid to
+ * optimize it as such, we'd eat an uncomfortably large amount of stack
+ * space per recursion level (due to the deletable[] array). A failure is
+ * improbable since the number of levels isn't likely to be large ... but
+ * just in case, let's hand-optimize into a loop.
+ */
+ if (recurse_to != InvalidBlockNumber)
+ {
+ blkno = recurse_to;
+ goto restart;
+ }
}
--
2.19.2
>From 62fbe0ce5506e006b92dbfb07aee7414040d982f Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakan...@iki.fi>
Date: Wed, 2 Jan 2019 16:00:16 +0200
Subject: [PATCH 2/2] Delete pages during GiST VACUUM v18-heikki
---
src/backend/access/gist/README | 14 +++
src/backend/access/gist/gist.c | 18 +++
src/backend/access/gist/gistutil.c | 3 +-
src/backend/access/gist/gistvacuum.c | 152 ++++++++++++++++++++++++-
src/backend/access/gist/gistxlog.c | 60 ++++++++++
src/backend/access/rmgrdesc/gistdesc.c | 3 +
src/backend/nodes/bitmapset.c | 16 +++
src/include/access/gist.h | 3 +
src/include/access/gist_private.h | 7 +-
src/include/access/gistxlog.h | 10 +-
src/include/nodes/bitmapset.h | 1 +
src/test/regress/expected/gist.out | 4 +-
src/test/regress/sql/gist.sql | 4 +-
13 files changed, 282 insertions(+), 13 deletions(-)
diff --git a/src/backend/access/gist/README b/src/backend/access/gist/README
index 02228662b81..c84359de310 100644
--- a/src/backend/access/gist/README
+++ b/src/backend/access/gist/README
@@ -413,6 +413,20 @@ emptied yet; tuples never move upwards in the tree. The final emptying loops
through buffers at a given level until all buffers at that level have been
emptied, and then moves down to the next level.
+Bulk delete algorithm (VACUUM)
+------------------------------
+
+Function gistbulkdelete() is responsible for marking empty leaf pages as free
+so that they can be used to allocate newly split pages. To find these pages,
+the function scans the index in physical order.
+
+The physical scan reads the entire index from the first page to the last. This
+scan maintains the information necessary to collect the block numbers of
+internal pages that need cleaning and the block numbers of empty leaf pages.
+
+After the scan, each internal page is locked exclusively, and each potentially
+free leaf page under it is examined. gistbulkdelete() never deletes the last
+remaining downlink from an internal page, to preserve balanced tree properties.
Authors:
Teodor Sigaev <teo...@sigaev.ru>
diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index d42e810c6b3..bbfd5a92b88 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -704,6 +704,11 @@ gistdoinsert(Relation r, IndexTuple itup, Size freespace,
GISTInsertStack *item;
OffsetNumber downlinkoffnum;
+ /*
+ * Currently internal pages are not deleted during vacuum,
+ * so we do not need to check if page is deleted
+ */
+
downlinkoffnum = gistchoose(state.r, stack->page, itup, giststate);
iid = PageGetItemId(stack->page, downlinkoffnum);
idxtuple = (IndexTuple) PageGetItem(stack->page, iid);
@@ -838,6 +843,19 @@ gistdoinsert(Relation r, IndexTuple itup, Size freespace,
}
}
+ /*
+ * Leaf pages can be left deleted but still referenced
+ * until their space is reused. The downlink to such a page may already
+ * have been removed from the internal page, but this scan can still possess it.
+ */
+ if(GistPageIsDeleted(stack->page))
+ {
+ UnlockReleaseBuffer(stack->buffer);
+ xlocked = false;
+ state.stack = stack = stack->parent;
+ continue;
+ }
+
/* now state.stack->(page, buffer and blkno) points to leaf page */
gistinserttuple(&state, stack, giststate, itup,
diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c
index 70627e5df66..adb316c6afa 100644
--- a/src/backend/access/gist/gistutil.c
+++ b/src/backend/access/gist/gistutil.c
@@ -23,6 +23,7 @@
#include "storage/lmgr.h"
#include "utils/float.h"
#include "utils/syscache.h"
+#include "utils/snapmgr.h"
#include "utils/lsyscache.h"
@@ -807,7 +808,7 @@ gistNewBuffer(Relation r)
gistcheckpage(r, buffer);
- if (GistPageIsDeleted(page))
+ if (GistPageIsDeleted(page) && TransactionIdPrecedes(GistPageGetDeleteXid(page), RecentGlobalDataXmin))
return buffer; /* OK to use */
LockBuffer(buffer, GIST_UNLOCK);
diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c
index 4fb32bf76bf..bac6b8c77af 100644
--- a/src/backend/access/gist/gistvacuum.c
+++ b/src/backend/access/gist/gistvacuum.c
@@ -16,8 +16,10 @@
#include "access/genam.h"
#include "access/gist_private.h"
+#include "access/transam.h"
#include "commands/vacuum.h"
#include "miscadmin.h"
+#include "nodes/bitmapset.h"
#include "storage/indexfsm.h"
#include "storage/lmgr.h"
@@ -30,6 +32,10 @@ typedef struct
void *callback_state;
GistNSN startNSN;
BlockNumber totFreePages; /* true total # of free pages */
+ BlockNumber emptyPages;
+
+ Bitmapset *internalPagesMap;
+ Bitmapset *emptyLeafPagesMap;
} GistVacState;
static void gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
@@ -91,8 +97,6 @@ gistvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
* check invalid tuples left after upgrade.
* The set of target tuples is specified via a callback routine that tells
* whether any given heap tuple (identified by ItemPointer) is being deleted.
- *
- * Result: a palloc'd struct containing statistical info for VACUUM displays.
*/
static void
gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
@@ -122,6 +126,9 @@ gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
else
vstate.startNSN = gistGetFakeLSN(rel);
vstate.totFreePages = 0;
+ vstate.emptyPages = 0;
+ vstate.internalPagesMap = NULL;
+ vstate.emptyLeafPagesMap = NULL;
/*
* Need lock unless it's local to this backend.
@@ -166,6 +173,12 @@ gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
/* Quit if we've scanned the whole relation */
if (blkno >= num_pages)
break;
+
+ if (!vstate.internalPagesMap)
+ vstate.internalPagesMap = bms_make_empty(num_pages);
+ if (!vstate.emptyLeafPagesMap)
+ vstate.emptyLeafPagesMap = bms_make_empty(num_pages);
+
/* Iterate over pages, then loop back to recheck length */
for (; blkno < num_pages; blkno++)
gistvacuumpage(&vstate, blkno, blkno);
@@ -189,6 +202,126 @@ gistvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
/* update statistics */
stats->num_pages = num_pages;
stats->pages_free = vstate.totFreePages;
+
+ /* rescan inner pages that had empty child pages */
+ if (vstate.emptyPages > 0)
+ {
+ int x;
+
+ x = -1;
+ while (vstate.emptyPages > 0 &&
+ (x = bms_next_member(vstate.internalPagesMap, x)) >= 0)
+ {
+ Buffer buffer;
+ Page page;
+ OffsetNumber off,
+ maxoff;
+ IndexTuple idxtuple;
+ ItemId iid;
+ OffsetNumber todelete[MaxOffsetNumber];
+ Buffer buftodelete[MaxOffsetNumber];
+ int ntodelete = 0;
+
+ /* FIXME: 'x' is signed, so this will not work with indexes larger than 2^31 blocks */
+ blkno = (BlockNumber) x;
+
+ buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
+ info->strategy);
+
+ LockBuffer(buffer, GIST_EXCLUSIVE);
+ page = (Page) BufferGetPage(buffer);
+ if (PageIsNew(page) || GistPageIsDeleted(page) || GistPageIsLeaf(page))
+ {
+ UnlockReleaseBuffer(buffer);
+ continue;
+ }
+
+ maxoff = PageGetMaxOffsetNumber(page);
+ /* Check that leafs are still empty and decide what to delete */
+ for (off = FirstOffsetNumber; off <= maxoff; off = OffsetNumberNext(off))
+ {
+ Buffer leafBuffer;
+ Page leafPage;
+ BlockNumber leafBlockNo;
+
+ iid = PageGetItemId(page, off);
+ idxtuple = (IndexTuple) PageGetItem(page, iid);
+ /* if this page was not empty in previous scan - we do not consider it */
+ leafBlockNo = ItemPointerGetBlockNumber(&(idxtuple->t_tid));
+ if (!bms_is_member(leafBlockNo, vstate.emptyLeafPagesMap))
+ continue;
+
+ leafBuffer = ReadBufferExtended(rel, MAIN_FORKNUM, leafBlockNo,
+ RBM_NORMAL, info->strategy);
+ LockBuffer(leafBuffer, GIST_EXCLUSIVE);
+ gistcheckpage(rel, leafBuffer);
+ leafPage = (Page) BufferGetPage(leafBuffer);
+ if (!GistPageIsLeaf(leafPage))
+ {
+ UnlockReleaseBuffer(leafBuffer);
+ continue;
+ }
+
+ if (PageGetMaxOffsetNumber(leafPage) == InvalidOffsetNumber /* Nothing left to split */
+ && !(GistFollowRight(leafPage) || GistPageGetNSN(page) < GistPageGetNSN(leafPage)) /* No follow-right */
+ && ntodelete < maxoff-1) /* We must keep at least one leaf page per each */
+ {
+ buftodelete[ntodelete] = leafBuffer;
+ todelete[ntodelete++] = off;
+ }
+ else
+ UnlockReleaseBuffer(leafBuffer);
+ }
+
+ if (ntodelete)
+ {
+ /*
+ * As in _bt_unlink_halfdead_page, we need an upper bound on the xid
+ * of any transaction that could still hold a downlink to this page.
+ * We use ReadNewTransactionId() instead of GetCurrentTransactionId()
+ * since we are in a VACUUM.
+ */
+ TransactionId txid = ReadNewTransactionId();
+ int i;
+
+ START_CRIT_SECTION();
+
+ /* Mark pages as deleted dropping references from internal pages */
+ for (i = 0; i < ntodelete; i++)
+ {
+ Page leafPage = (Page) BufferGetPage(buftodelete[i]);
+ XLogRecPtr recptr;
+
+ GistPageSetDeleteXid(leafPage,txid);
+
+ GistPageSetDeleted(leafPage);
+ MarkBufferDirty(buftodelete[i]);
+ stats->pages_deleted++;
+ vstate.emptyPages--;
+
+ MarkBufferDirty(buffer);
+ /* Offsets are changed as long as we delete tuples from internal page */
+ PageIndexTupleDelete(page, todelete[i] - i);
+
+ if (RelationNeedsWAL(rel))
+ recptr = gistXLogSetDeleted(rel->rd_node, buftodelete[i],
+ txid, buffer, todelete[i] - i);
+ else
+ recptr = gistGetFakeLSN(rel);
+ PageSetLSN(page, recptr);
+ PageSetLSN(leafPage, recptr);
+
+ UnlockReleaseBuffer(buftodelete[i]);
+ }
+ END_CRIT_SECTION();
+ }
+
+ UnlockReleaseBuffer(buffer);
+ }
+ }
+
+ bms_free(vstate.emptyLeafPagesMap);
+ bms_free(vstate.internalPagesMap);
}
/*
@@ -242,6 +375,7 @@ restart:
{
OffsetNumber todelete[MaxOffsetNumber];
int ntodelete = 0;
+ int nremain;
GISTPageOpaque opaque = GistPageGetOpaque(page);
OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
@@ -309,8 +443,18 @@ restart:
maxoff = PageGetMaxOffsetNumber(page);
}
- stats->num_index_tuples += maxoff - FirstOffsetNumber + 1;
-
+ nremain = maxoff - FirstOffsetNumber + 1;
+ if (nremain == 0)
+ {
+ vstate->emptyLeafPagesMap = bms_add_member(vstate->emptyLeafPagesMap, blkno);
+ vstate->emptyPages++;
+ }
+ else
+ stats->num_index_tuples += nremain;
+ }
+ else
+ {
+ vstate->internalPagesMap = bms_add_member(vstate->internalPagesMap, blkno);
}
UnlockReleaseBuffer(buffer);
diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c
index 01e025d5fdb..bb0fa473f5e 100644
--- a/src/backend/access/gist/gistxlog.c
+++ b/src/backend/access/gist/gistxlog.c
@@ -64,6 +64,39 @@ gistRedoClearFollowRight(XLogReaderState *record, uint8 block_id)
UnlockReleaseBuffer(buffer);
}
+static void
+gistRedoPageSetDeleted(XLogReaderState *record)
+{
+ XLogRecPtr lsn = record->EndRecPtr;
+ gistxlogPageDelete *xldata = (gistxlogPageDelete *) XLogRecGetData(record);
+ Buffer buffer;
+ Page page;
+
+ if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
+ {
+ page = (Page) BufferGetPage(buffer);
+
+ GistPageSetDeleteXid(page, xldata->deleteXid);
+ GistPageSetDeleted(page);
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+ }
+ if (BufferIsValid(buffer))
+ UnlockReleaseBuffer(buffer);
+
+ if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO)
+ {
+ page = (Page) BufferGetPage(buffer);
+
+ PageIndexTupleDelete(page, xldata->downlinkOffset);
+
+ PageSetLSN(page, lsn);
+ MarkBufferDirty(buffer);
+ }
+ if (BufferIsValid(buffer))
+ UnlockReleaseBuffer(buffer);
+}
/*
* redo any page update (except page split)
*/
@@ -116,6 +149,7 @@ gistRedoPageUpdateRecord(XLogReaderState *record)
data += sizeof(OffsetNumber) * xldata->ntodelete;
PageIndexMultiDelete(page, todelete, xldata->ntodelete);
+
if (GistPageIsLeaf(page))
GistMarkTuplesDeleted(page);
}
@@ -535,6 +569,9 @@ gist_redo(XLogReaderState *record)
case XLOG_GIST_CREATE_INDEX:
gistRedoCreateIndex(record);
break;
+ case XLOG_GIST_PAGE_DELETE:
+ gistRedoPageSetDeleted(record);
+ break;
default:
elog(PANIC, "gist_redo: unknown op code %u", info);
}
@@ -653,6 +690,29 @@ gistXLogSplit(bool page_is_leaf,
return recptr;
}
+/*
+ * Write XLOG record describing a page delete. This also includes removal of
+ * downlink from internal page.
+ */
+XLogRecPtr
+gistXLogSetDeleted(RelFileNode node, Buffer buffer, TransactionId xid,
+ Buffer internalPageBuffer, OffsetNumber internalPageOffset) {
+ gistxlogPageDelete xlrec;
+ XLogRecPtr recptr;
+
+ xlrec.deleteXid = xid;
+ xlrec.downlinkOffset = internalPageOffset;
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec, sizeof(gistxlogPageDelete));
+
+ XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
+ XLogRegisterBuffer(1, internalPageBuffer, REGBUF_STANDARD);
+ /* new tuples */
+ recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_DELETE);
+ return recptr;
+}
+
/*
* Write XLOG record describing a page update. The update can include any
* number of deletions and/or insertions of tuples on a single index page.
diff --git a/src/backend/access/rmgrdesc/gistdesc.c b/src/backend/access/rmgrdesc/gistdesc.c
index b79ed1dfdc8..f65335ba23a 100644
--- a/src/backend/access/rmgrdesc/gistdesc.c
+++ b/src/backend/access/rmgrdesc/gistdesc.c
@@ -76,6 +76,9 @@ gist_identify(uint8 info)
case XLOG_GIST_CREATE_INDEX:
id = "CREATE_INDEX";
break;
+ case XLOG_GIST_PAGE_DELETE:
+ id = "PAGE_DELETE";
+ break;
}
return id;
diff --git a/src/backend/nodes/bitmapset.c b/src/backend/nodes/bitmapset.c
index 8ce253c88df..29cfcd78984 100644
--- a/src/backend/nodes/bitmapset.c
+++ b/src/backend/nodes/bitmapset.c
@@ -258,6 +258,22 @@ bms_make_singleton(int x)
return result;
}
+/*
+ * bms_make_empty - preallocate an empty bitmapset sized for members 0..size-1
+ */
+Bitmapset *
+bms_make_empty(int size)
+{
+ Bitmapset *result;
+ int wordnum;
+
+ if (size < 0)
+ elog(ERROR, "negative bitmapset member not allowed");
+ wordnum = WORDNUM(size - 1);
+ result = (Bitmapset *) palloc0(BITMAPSET_SIZE(wordnum + 1));
+ return result;
+}
+
/*
* bms_free - free a bitmapset
*
diff --git a/src/include/access/gist.h b/src/include/access/gist.h
index 827566dc6e7..0dd2bf47c8c 100644
--- a/src/include/access/gist.h
+++ b/src/include/access/gist.h
@@ -151,6 +151,9 @@ typedef struct GISTENTRY
#define GistPageGetNSN(page) ( PageXLogRecPtrGet(GistPageGetOpaque(page)->nsn))
#define GistPageSetNSN(page, val) ( PageXLogRecPtrSet(GistPageGetOpaque(page)->nsn, val))
+#define GistPageGetDeleteXid(page) ( ((PageHeader) (page))->pd_prune_xid )
+#define GistPageSetDeleteXid(page, val) ( ((PageHeader) (page))->pd_prune_xid = val)
+
/*
* Vector of GISTENTRY structs; user-defined methods union and picksplit
* take it as one of their arguments
diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h
index a73716d6eaa..5d02800dac6 100644
--- a/src/include/access/gist_private.h
+++ b/src/include/access/gist_private.h
@@ -412,12 +412,17 @@ extern bool gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
extern SplitedPageLayout *gistSplit(Relation r, Page page, IndexTuple *itup,
int len, GISTSTATE *giststate);
+/* gistxlog.c */
+extern XLogRecPtr gistXLogSetDeleted(RelFileNode node, Buffer buffer,
+ TransactionId xid, Buffer internalPageBuffer,
+ OffsetNumber internalPageOffset);
+
extern XLogRecPtr gistXLogUpdate(Buffer buffer,
OffsetNumber *todelete, int ntodelete,
IndexTuple *itup, int ntup,
Buffer leftchild);
-XLogRecPtr gistXLogDelete(Buffer buffer, OffsetNumber *todelete,
+extern XLogRecPtr gistXLogDelete(Buffer buffer, OffsetNumber *todelete,
int ntodelete, RelFileNode hnode);
extern XLogRecPtr gistXLogSplit(bool page_is_leaf,
diff --git a/src/include/access/gistxlog.h b/src/include/access/gistxlog.h
index b67c7100500..3c71d0261a1 100644
--- a/src/include/access/gistxlog.h
+++ b/src/include/access/gistxlog.h
@@ -17,13 +17,15 @@
#include "access/xlogreader.h"
#include "lib/stringinfo.h"
+/* XLog stuff */
+
#define XLOG_GIST_PAGE_UPDATE 0x00
#define XLOG_GIST_DELETE 0x10 /* delete leaf index tuples for a page */
/* #define XLOG_GIST_NEW_ROOT 0x20 */ /* not used anymore */
#define XLOG_GIST_PAGE_SPLIT 0x30
/* #define XLOG_GIST_INSERT_COMPLETE 0x40 */ /* not used anymore */
#define XLOG_GIST_CREATE_INDEX 0x50
- /* #define XLOG_GIST_PAGE_DELETE 0x60 */ /* not used anymore */
+#define XLOG_GIST_PAGE_DELETE 0x60
/*
* Backup Blk 0: updated page.
@@ -76,6 +78,12 @@ typedef struct gistxlogPageSplit
*/
} gistxlogPageSplit;
+typedef struct gistxlogPageDelete
+{
+ TransactionId deleteXid; /* last Xid which could see page in scan */
+ OffsetNumber downlinkOffset; /* Offset of the downlink referencing this page */
+} gistxlogPageDelete;
+
extern void gist_redo(XLogReaderState *record);
extern void gist_desc(StringInfo buf, XLogReaderState *record);
extern const char *gist_identify(uint8 info);
diff --git a/src/include/nodes/bitmapset.h b/src/include/nodes/bitmapset.h
index 433df8a46d0..55435f9ae64 100644
--- a/src/include/nodes/bitmapset.h
+++ b/src/include/nodes/bitmapset.h
@@ -79,6 +79,7 @@ extern Bitmapset *bms_copy(const Bitmapset *a);
extern bool bms_equal(const Bitmapset *a, const Bitmapset *b);
extern int bms_compare(const Bitmapset *a, const Bitmapset *b);
extern Bitmapset *bms_make_singleton(int x);
+extern Bitmapset *bms_make_empty(int size);
extern void bms_free(Bitmapset *a);
extern Bitmapset *bms_union(const Bitmapset *a, const Bitmapset *b);
diff --git a/src/test/regress/expected/gist.out b/src/test/regress/expected/gist.out
index f5a2993aaf2..5b92f08c747 100644
--- a/src/test/regress/expected/gist.out
+++ b/src/test/regress/expected/gist.out
@@ -27,9 +27,7 @@ insert into gist_point_tbl (id, p)
select g+100000, point(g*10+1, g*10+1) from generate_series(1, 10000) g;
-- To test vacuum, delete some entries from all over the index.
delete from gist_point_tbl where id % 2 = 1;
--- And also delete some concentration of values. (GiST doesn't currently
--- attempt to delete pages even when they become empty, but if it did, this
--- would exercise it)
+-- And also delete some concentration of values.
delete from gist_point_tbl where id < 10000;
vacuum analyze gist_point_tbl;
-- rebuild the index with a different fillfactor
diff --git a/src/test/regress/sql/gist.sql b/src/test/regress/sql/gist.sql
index bae722fe13c..e66396e851b 100644
--- a/src/test/regress/sql/gist.sql
+++ b/src/test/regress/sql/gist.sql
@@ -28,9 +28,7 @@ select g+100000, point(g*10+1, g*10+1) from generate_series(1, 10000) g;
-- To test vacuum, delete some entries from all over the index.
delete from gist_point_tbl where id % 2 = 1;
--- And also delete some concentration of values. (GiST doesn't currently
--- attempt to delete pages even when they become empty, but if it did, this
--- would exercise it)
+-- And also delete some concentration of values.
delete from gist_point_tbl where id < 10000;
vacuum analyze gist_point_tbl;
--
2.19.2