On Mon, Aug 3, 2015 at 12:27 PM, Anastasia Lubennikova
<a.lubennik...@postgrespro.ru <mailto:a.lubennik...@postgrespro.ru>>
wrote:
1) Test and results are in attachments. Everything seems to work
as expected.
2) I dropped these notices. It was done only for debug purposes.
Updated patch is attached.
3) fixed
Good! Another couple of notes from me:
1) I think gistvacuumpage() and gistkillitems() need function-level
comments.
2) ItemIdIsDead() can be used in index scan like it's done
in _bt_checkkeys().
------
Alexander Korotkov
Postgres Professional: http://www.postgrespro.com
<http://www.postgrespro.com/>
The Russian Postgres Company
I've added some comments.
ItemIdIsDead() is used now (just skip dead tuples as not matching the
quals).
And there is one else check of LSN in gistkillitems to make sure that
page was not changed between reads.
--
Anastasia Lubennikova
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company
*** a/src/backend/access/gist/gist.c
--- b/src/backend/access/gist/gist.c
***************
*** 36,42 **** static bool gistinserttuples(GISTInsertState *state,
GISTInsertStack *stack,
bool unlockbuf, bool unlockleftchild);
static void gistfinishsplit(GISTInsertState *state, GISTInsertStack *stack,
GISTSTATE *giststate, List *splitinfo, bool
releasebuf);
!
#define ROTATEDIST(d) do { \
SplitedPageLayout
*tmp=(SplitedPageLayout*)palloc(sizeof(SplitedPageLayout)); \
--- 36,42 ----
bool unlockbuf, bool unlockleftchild);
static void gistfinishsplit(GISTInsertState *state, GISTInsertStack *stack,
GISTSTATE *giststate, List *splitinfo, bool
releasebuf);
! static void gistvacuumpage(Relation rel, Page page, Buffer buffer);
#define ROTATEDIST(d) do { \
SplitedPageLayout
*tmp=(SplitedPageLayout*)palloc(sizeof(SplitedPageLayout)); \
***************
*** 209,214 **** gistplacetopage(Relation rel, Size freespace, GISTSTATE
*giststate,
--- 209,225 ----
* because the tuple vector passed to gistSplit won't include this
tuple.
*/
is_split = gistnospace(page, itup, ntup, oldoffnum, freespace);
+
+ /*
+ * If leaf page is full, try at first to delete dead tuples. And then
+ * check again.
+ */
+ if ((is_split) && GistPageIsLeaf(page) && GistPageHasGarbage(page))
+ {
+ gistvacuumpage(rel, page, buffer);
+ is_split = gistnospace(page, itup, ntup, oldoffnum, freespace);
+ }
+
if (is_split)
{
/* no space for insertion */
***************
*** 1440,1442 **** freeGISTstate(GISTSTATE *giststate)
--- 1451,1522 ----
/* It's sufficient to delete the scanCxt */
MemoryContextDelete(giststate->scanCxt);
}
+
+ /*
+ * gistvacuumpage() -- try to remove LP_DEAD items from the given page.
+ */
+ static void
+ gistvacuumpage(Relation rel, Page page, Buffer buffer)
+ {
+ OffsetNumber deletable[MaxOffsetNumber];
+ int ndeletable = 0;
+ OffsetNumber offnum,
+ minoff,
+ maxoff;
+
+ /*
+ * Scan over all items to see which ones need to be deleted according to
+ * LP_DEAD flags.
+ */
+ maxoff = PageGetMaxOffsetNumber(page);
+ for (offnum = FirstOffsetNumber;
+ offnum <= maxoff;
+ offnum = OffsetNumberNext(offnum))
+ {
+ ItemId itemId = PageGetItemId(page, offnum);
+
+ if (ItemIdIsDead(itemId))
+ deletable[ndeletable++] = offnum;
+ }
+
+ if (ndeletable > 0)
+ {
+ START_CRIT_SECTION();
+
+ PageIndexMultiDelete(page, deletable, ndeletable);
+
+ /*
+ * Mark the page as not containing any LP_DEAD items. This is
not
+ * certainly true (there might be some that have recently been
marked,
+ * but weren't included in our target-item list), but it will
almost
+ * always be true and it doesn't seem worth an additional page
scan to
+ * check it. Remember that F_HAS_GARBAGE is only a hint anyway.
+ */
+ GistClearPageHasGarbage(page);
+
+ MarkBufferDirty(buffer);
+
+ /* XLOG stuff */
+ if (RelationNeedsWAL(rel))
+ {
+ XLogRecPtr recptr;
+
+ recptr = gistXLogUpdate(rel->rd_node, buffer,
+
deletable, ndeletable,
+ NULL,
0, InvalidBuffer);
+
+ PageSetLSN(page, recptr);
+ }
+ else
+ PageSetLSN(page, gistGetFakeLSN(rel));
+
+ END_CRIT_SECTION();
+ }
+
+ /*
+ * Note: if we didn't find any LP_DEAD items, then the page's
+ * F_HAS_GARBAGE hint bit is falsely set. We do not bother expending a
+ * separate write to clear it, however. We will clear it when we split
+ * the page.
+ */
+ }
*** a/src/backend/access/gist/gistget.c
--- b/src/backend/access/gist/gistget.c
***************
*** 24,29 ****
--- 24,116 ----
#include "utils/memutils.h"
#include "utils/rel.h"
+ /*
+ * gistkillitems() -- set LP_DEAD state for items an indexscan caller has
+ * told us were killed.
+ *
+ * We match items by heap TID before mark them. If an item has moved off
+ * the current page due to a split, we'll fail to find it and do nothing
+ * (this is not an error case --- we assume the item will eventually get
+ * marked in a future indexscan).
+ *
+ * We re-read page here, so it's significant to check page LSN. If page
+ * has been modified since the last read (as determined by LSN), we dare not
+ * flag any antries because it is possible that the old entry was vacuumed
+ * away and the TID was re-used by a completely different heap tuple.
+ */
+ static void
+ gistkillitems(IndexScanDesc scan)
+ {
+ GISTScanOpaque so = (GISTScanOpaque) scan->opaque;
+ Buffer buffer;
+ Page page;
+ OffsetNumber minoff;
+ OffsetNumber maxoff;
+ int i;
+ bool killedsomething = false;
+
+ Assert(so->curBlkno != InvalidBlockNumber);
+
+ buffer = ReadBuffer(scan->indexRelation, so->curBlkno);
+ if (!BufferIsValid(buffer))
+ return;
+
+ LockBuffer(buffer, GIST_SHARE);
+ gistcheckpage(scan->indexRelation, buffer);
+ page = BufferGetPage(buffer);
+
+ /*
+ * If page LSN differs it means that the page was modified since the
last read.
+ * killedItemes could be not valid so LP_DEAD hints applying is not
safe.
+ */
+ if(PageGetLSN(page) != so->curPageLSN)
+ {
+ UnlockReleaseBuffer(buffer);
+ so->numKilled = 0; /* reset counter */
+ return;
+ }
+
+ minoff = FirstOffsetNumber;
+ maxoff = PageGetMaxOffsetNumber(page);
+
+ maxoff = PageGetMaxOffsetNumber(page);
+ for (i = 0; i < so->numKilled; i++)
+ {
+ if (so->killedItems != NULL)
+ {
+ OffsetNumber offnum = FirstOffsetNumber;
+
+ while (offnum <= maxoff)
+ {
+ ItemId iid = PageGetItemId(page,
offnum);
+ IndexTuple ituple = (IndexTuple)
PageGetItem(page, iid);
+
+ if (ItemPointerEquals(&ituple->t_tid,
&(so->killedItems[i])))
+ {
+ /* found the item */
+ ItemIdMarkDead(iid);
+ killedsomething = true;
+ break; /* out of inner search
loop */
+ }
+ offnum = OffsetNumberNext(offnum);
+ }
+ }
+ }
+
+ if (killedsomething)
+ {
+ GistMarkPageHasGarbage(page);
+ MarkBufferDirtyHint(buffer, true);
+ }
+
+ UnlockReleaseBuffer(buffer);
+
+ /*
+ * Always reset the scan state, so we don't look for same items on other
+ * pages.
+ */
+ so->numKilled = 0;
+ }
/*
* gistindex_keytest() -- does this index tuple satisfy the scan key(s)?
***************
*** 306,322 **** gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem,
double *myDistances,
MemoryContextReset(so->pageDataCxt);
/*
* check all tuples on page
*/
maxoff = PageGetMaxOffsetNumber(page);
for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
{
! IndexTuple it = (IndexTuple) PageGetItem(page,
PageGetItemId(page, i));
bool match;
bool recheck;
bool recheck_distances;
/*
* Must call gistindex_keytest in tempCxt, and clean up any
leftover
* junk afterward.
*/
--- 393,425 ----
MemoryContextReset(so->pageDataCxt);
/*
+ * We save the LSN of the page as we read it, so that we know whether it
+ * safe to apply LP_DEAD hints to the page later. This allows us to drop
+ * the pin for MVCC scans, which allows vacuum to avoid blocking.
+ */
+ so->curPageLSN = PageGetLSN(page);
+
+ /*
* check all tuples on page
*/
maxoff = PageGetMaxOffsetNumber(page);
for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
{
! ItemId iid = PageGetItemId(page, i);
! IndexTuple it;
bool match;
bool recheck;
bool recheck_distances;
/*
+ * If the scan specifies not to return killed tuples, then we
treat a
+ * killed tuple as not passing the qual.
+ */
+ if(scan->ignore_killed_tuples && ItemIdIsDead(iid))
+ continue;
+
+ it = (IndexTuple) PageGetItem(page, iid);
+ /*
* Must call gistindex_keytest in tempCxt, and clean up any
leftover
* junk afterward.
*/
***************
*** 331,337 **** gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem,
double *myDistances,
/* Ignore tuple if it doesn't match */
if (!match)
continue;
-
if (tbm && GistPageIsLeaf(page))
{
/*
--- 434,439 ----
***************
*** 451,456 **** getNextNearest(IndexScanDesc scan)
--- 553,575 ----
if (scan->xs_itup)
{
+ /*
+ * If previously returned index tuple is not visible save it
into
+ * so->killedItems. And at the end of the page scan mark all
saved
+ * tuples as dead.
+ */
+ if (scan->kill_prior_tuple)
+ {
+ if (so->killedItems == NULL)
+ {
+ MemoryContext oldCxt2 =
MemoryContextSwitchTo(so->giststate->scanCxt);
+
+ so->killedItems = (ItemPointerData *)
palloc(MaxIndexTuplesPerPage * sizeof(ItemPointerData));
+ MemoryContextSwitchTo(oldCxt2);
+ }
+ if ((so->numKilled < MaxIndexTuplesPerPage))
+ so->killedItems[so->numKilled++] =
scan->xs_ctup.t_self;
+ }
/* free previously returned tuple */
pfree(scan->xs_itup);
scan->xs_itup = NULL;
***************
*** 515,520 **** getNextNearest(IndexScanDesc scan)
--- 634,645 ----
}
else
{
+ if ((so->curBlkno != InvalidBlockNumber) &&
(so->numKilled > 0))
+ gistkillitems(scan);
+
+ /* save current item BlockNumber for next
gistkillitems() call */
+ so->curBlkno = item->blkno;
+
/* visit an index page, extract its items into queue */
CHECK_FOR_INTERRUPTS();
***************
*** 572,578 **** gistgettuple(PG_FUNCTION_ARGS)
--- 697,715 ----
{
if (so->curPageData < so->nPageData)
{
+ if ((scan->kill_prior_tuple) &&
(so->curPageData > 0))
+ {
+ if (so->killedItems == NULL)
+ {
+ MemoryContext oldCxt =
MemoryContextSwitchTo(so->giststate->scanCxt);
+
+ so->killedItems =
(ItemPointerData *) palloc(MaxIndexTuplesPerPage * sizeof(ItemPointerData));
+ MemoryContextSwitchTo(oldCxt);
+ }
+ if (so->numKilled <
MaxIndexTuplesPerPage)
+
so->killedItems[so->numKilled++] = so->pageData[so->curPageData - 1].heapPtr;
+ }
/* continuing to return tuples from a leaf page
*/
scan->xs_ctup.t_self =
so->pageData[so->curPageData].heapPtr;
scan->xs_recheck =
so->pageData[so->curPageData].recheck;
***************
*** 586,594 **** gistgettuple(PG_FUNCTION_ARGS)
--- 723,751 ----
PG_RETURN_BOOL(true);
}
+ /*
+ * Check the last returned tuple and add it to
killitems if
+ * necessary
+ */
+ if ((scan->kill_prior_tuple) && (so->curPageData > 0)
&& (so->curPageData == so->nPageData))
+ {
+
+ if (so->killedItems == NULL)
+ {
+ MemoryContext oldCxt =
MemoryContextSwitchTo(so->giststate->scanCxt);
+
+ so->killedItems = (ItemPointerData *)
palloc(MaxIndexTuplesPerPage * sizeof(ItemPointerData));
+ MemoryContextSwitchTo(oldCxt);
+ }
+ if ((so->numKilled < MaxIndexTuplesPerPage))
+ so->killedItems[so->numKilled++] =
so->pageData[so->curPageData - 1].heapPtr;
+ }
/* find and process the next index page */
do
{
+ if ((so->curBlkno != InvalidBlockNumber) &&
(so->numKilled > 0))
+ gistkillitems(scan);
+
GISTSearchItem *item =
getNextGISTSearchItem(so);
if (!item)
***************
*** 596,601 **** gistgettuple(PG_FUNCTION_ARGS)
--- 753,761 ----
CHECK_FOR_INTERRUPTS();
+ /* save current item BlockNumber for next
gistkillitems() call */
+ so->curBlkno = item->blkno;
+
/*
* While scanning a leaf page, ItemPointers of
matching heap
* tuples are stored in so->pageData. If there
are any on
*** a/src/backend/access/gist/gistscan.c
--- b/src/backend/access/gist/gistscan.c
***************
*** 93,98 **** gistbeginscan(PG_FUNCTION_ARGS)
--- 93,103 ----
memset(scan->xs_orderbynulls, true, sizeof(bool) *
scan->numberOfOrderBys);
}
+ so->killedItems = NULL; /* until needed */
+ so->numKilled = 0;
+ so->curBlkno = InvalidBlockNumber;
+ so->curPageLSN = InvalidXLogRecPtr;
+
scan->opaque = so;
/*
*** a/src/include/access/gist.h
--- b/src/include/access/gist.h
***************
*** 41,48 ****
*/
#define F_LEAF (1 << 0) /* leaf page */
#define F_DELETED (1 << 1) /* the page has been
deleted */
! #define F_TUPLES_DELETED (1 << 2) /* some tuples on the page are
dead */
#define F_FOLLOW_RIGHT (1 << 3) /* page to the right
has no downlink */
typedef XLogRecPtr GistNSN;
--- 41,51 ----
*/
#define F_LEAF (1 << 0) /* leaf page */
#define F_DELETED (1 << 1) /* the page has been
deleted */
! #define F_TUPLES_DELETED (1 << 2) /* some tuples on the page were
!
* deleted */
#define F_FOLLOW_RIGHT (1 << 3) /* page to the right
has no downlink */
+ #define F_HAS_GARBAGE (1 << 4) /* some tuples on the page are
dead,
+
* but not deleted yet */
typedef XLogRecPtr GistNSN;
***************
*** 137,142 **** typedef struct GISTENTRY
--- 140,149 ----
#define GistMarkTuplesDeleted(page) ( GistPageGetOpaque(page)->flags |=
F_TUPLES_DELETED)
#define GistClearTuplesDeleted(page) ( GistPageGetOpaque(page)->flags &=
~F_TUPLES_DELETED)
+ #define GistPageHasGarbage(page) ( GistPageGetOpaque(page)->flags &
F_HAS_GARBAGE)
+ #define GistMarkPageHasGarbage(page) ( GistPageGetOpaque(page)->flags |=
F_HAS_GARBAGE)
+ #define GistClearPageHasGarbage(page) ( GistPageGetOpaque(page)->flags &=
~F_HAS_GARBAGE)
+
#define GistFollowRight(page) ( GistPageGetOpaque(page)->flags &
F_FOLLOW_RIGHT)
#define GistMarkFollowRight(page) ( GistPageGetOpaque(page)->flags |=
F_FOLLOW_RIGHT)
#define GistClearFollowRight(page) ( GistPageGetOpaque(page)->flags &=
~F_FOLLOW_RIGHT)
*** a/src/include/access/gist_private.h
--- b/src/include/access/gist_private.h
***************
*** 22,27 ****
--- 22,28 ----
#include "storage/bufmgr.h"
#include "storage/buffile.h"
#include "utils/hsearch.h"
+ #include "access/genam.h"
/*
* Maximum number of "halves" a page can be split into in one operation.
***************
*** 161,166 **** typedef struct GISTScanOpaqueData
--- 162,173 ----
/* pre-allocated workspace arrays */
double *distances; /* output area for gistindex_keytest */
+ /* info about killed items if any (killedItems is NULL if never used) */
+ ItemPointerData *killedItems; /* heap pointers of killed
items */
+ int numKilled; /* number of currently
stored items */
+ BlockNumber curBlkno; /* current number of block */
+ GistNSN curPageLSN; /* pos in the WAL stream when page was
read */
+
/* In a non-ordered search, returnable heap items are stored here: */
GISTSearchHeapItem pageData[BLCKSZ / sizeof(IndexTupleData)];
OffsetNumber nPageData; /* number of valid items in array */
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers