On Thu, Mar 23, 2017 at 8:29 PM, Jesper Pedersen
<jesper.peder...@redhat.com> wrote:
> Hi,
>
> On 03/22/2017 09:32 AM, Ashutosh Sharma wrote:
>>
>> Done. Please refer to the attached v2 version of patch.
>>
>
> Thanks.
>
>>>> 1) 0001-Rewrite-hash-index-scans-to-work-a-page-at-a-time.patch: this
>>>> patch rewrites the hash index scan module to work in page-at-a-time
>>>> mode. It basically introduces two new functions: _hash_readpage() and
>>>> _hash_saveitem(). The former is used to load all the qualifying tuples
>>>> from a target bucket or overflow page into an items array. The latter
>>>> is used by _hash_readpage() to save each qualifying tuple found on a
>>>> page into that array. Apart from that, this patch also cleans up
>>>> _hash_first(), _hash_next(), and hashgettuple().
>>>>
>
> 0001v2:
>
> In hashgettuple() you can remove the 'currItem' and 'offnum' from the 'else'
> part, and do the assignment inside
>
>   if (so->numKilled < MaxIndexTuplesPerPage)
>
> instead.
>

Done. Please have a look at the attached v3 patch.
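
To make the new scan flow easier to follow, here is a tiny standalone
sketch of the currPos bookkeeping that this patch introduces. This is
illustrative C only, not the server code: the struct, MAX_ITEMS, and
main() are simplified stand-ins for HashScanPosData,
MaxIndexTuplesPerPage, and the _hash_first()/_hash_next() callers.

#include <stdio.h>

#define MAX_ITEMS 8				/* stand-in for MaxIndexTuplesPerPage */

typedef struct ScanPos
{
	int		items[MAX_ITEMS];	/* matches saved from one index page */
	int		firstItem;			/* first valid index in items[] */
	int		lastItem;			/* last valid index in items[] */
	int		itemIndex;			/* entry last returned to the caller */
} ScanPos;

int
main(void)
{
	ScanPos pos;

	/* forward scan: the "readpage" step fills items[] front-to-back */
	pos.items[0] = 100;
	pos.items[1] = 101;
	pos.items[2] = 102;
	pos.firstItem = 0;
	pos.lastItem = 2;

	/* the "first" step returns items[0] ... */
	pos.itemIndex = 0;
	printf("%d\n", pos.items[pos.itemIndex]);

	/*
	 * ... and each "next" step just bumps the cursor; only when the
	 * page is exhausted do we need to lock and read another page.
	 */
	while (++pos.itemIndex <= pos.lastItem)
		printf("%d\n", pos.items[pos.itemIndex]);

	/*
	 * A backward scan fills items[] back-to-front instead, so it sets
	 * firstItem = MAX_ITEMS - nmatches, lastItem = itemIndex =
	 * MAX_ITEMS - 1, and decrements the cursor until it drops below
	 * firstItem.
	 */
	return 0;
}

The point is that the buffer content lock is held only while
_hash_readpage() fills items[]; between hashgettuple() calls the scan
holds at most a pin, and killed tuples are remembered in killedItems[]
and flushed by _hash_kill_items() before the scan leaves the page.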

>
> No new comments for 0002 and 0003.

Okay, thanks.

--
With Regards,
Ashutosh Sharma
EnterpriseDB: http://www.enterprisedb.com
From 4e953c35da2274165b00d763500b83e0f3f9e2a9 Mon Sep 17 00:00:00 2001
From: ashu <ashu@localhost.localdomain>
Date: Thu, 23 Mar 2017 23:36:05 +0530
Subject: [PATCH] Rewrite hash index scans to work a page at a time v3

Patch by Ashutosh Sharma
---
 src/backend/access/hash/README       |   9 +-
 src/backend/access/hash/hash.c       | 121 +++----------
 src/backend/access/hash/hashpage.c   |  14 +-
 src/backend/access/hash/hashsearch.c | 330 ++++++++++++++++++++++++++++++-----
 src/backend/access/hash/hashutil.c   |  23 ++-
 src/include/access/hash.h            |  44 +++++
 6 files changed, 385 insertions(+), 156 deletions(-)

diff --git a/src/backend/access/hash/README b/src/backend/access/hash/README
index 1541438..f0a7bdf 100644
--- a/src/backend/access/hash/README
+++ b/src/backend/access/hash/README
@@ -243,10 +243,11 @@ The reader algorithm is:
 -- then, per read request:
 	reacquire content lock on current page
 	step to next page if necessary (no chaining of content locks, but keep
-     the pin on the primary bucket throughout the scan; we also maintain
-     a pin on the page currently being scanned)
-	get tuple
-	release content lock
+     the pin on the primary bucket throughout the scan)
+    save all the matching tuples from the current index page into an items array
+    release pin and content lock (but if it is the primary bucket page,
+    retain its pin till the end of the scan)
+    get tuple from the items array
 -- at scan shutdown:
 	release all pins still held
 
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
index 34cc08f..8c28fbd 100644
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -268,66 +268,23 @@ bool
 hashgettuple(IndexScanDesc scan, ScanDirection dir)
 {
 	HashScanOpaque so = (HashScanOpaque) scan->opaque;
-	Relation	rel = scan->indexRelation;
-	Buffer		buf;
-	Page		page;
 	OffsetNumber offnum;
-	ItemPointer current;
 	bool		res;
+	HashScanPosItem	*currItem;
 
 	/* Hash indexes are always lossy since we store only the hash code */
 	scan->xs_recheck = true;
 
 	/*
-	 * We hold pin but not lock on current buffer while outside the hash AM.
-	 * Reacquire the read lock here.
-	 */
-	if (BufferIsValid(so->hashso_curbuf))
-		LockBuffer(so->hashso_curbuf, BUFFER_LOCK_SHARE);
-
-	/*
 	 * If we've already initialized this scan, we can just advance it in the
 	 * appropriate direction.  If we haven't done so yet, we call a routine to
 	 * get the first item in the scan.
 	 */
-	current = &(so->hashso_curpos);
-	if (ItemPointerIsValid(current))
+	if (!HashScanPosIsValid(so->currPos))
+		res = _hash_first(scan, dir);
+	else
 	{
 		/*
-		 * An insertion into the current index page could have happened while
-		 * we didn't have read lock on it.  Re-find our position by looking
-		 * for the TID we previously returned.  (Because we hold a pin on the
-		 * primary bucket page, no deletions or splits could have occurred;
-		 * therefore we can expect that the TID still exists in the current
-		 * index page, at an offset >= where we were.)
-		 */
-		OffsetNumber maxoffnum;
-
-		buf = so->hashso_curbuf;
-		Assert(BufferIsValid(buf));
-		page = BufferGetPage(buf);
-
-		/*
-		 * We don't need test for old snapshot here as the current buffer is
-		 * pinned, so vacuum can't clean the page.
-		 */
-		maxoffnum = PageGetMaxOffsetNumber(page);
-		for (offnum = ItemPointerGetOffsetNumber(current);
-			 offnum <= maxoffnum;
-			 offnum = OffsetNumberNext(offnum))
-		{
-			IndexTuple	itup;
-
-			itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
-			if (ItemPointerEquals(&(so->hashso_heappos), &(itup->t_tid)))
-				break;
-		}
-		if (offnum > maxoffnum)
-			elog(ERROR, "failed to re-find scan position within index \"%s\"",
-				 RelationGetRelationName(rel));
-		ItemPointerSetOffsetNumber(current, offnum);
-
-		/*
 		 * Check to see if we should kill the previously-fetched tuple.
 		 */
 		if (scan->kill_prior_tuple)
@@ -346,9 +303,11 @@ hashgettuple(IndexScanDesc scan, ScanDirection dir)
 
 			if (so->numKilled < MaxIndexTuplesPerPage)
 			{
-				so->killedItems[so->numKilled].heapTid = so->hashso_heappos;
-				so->killedItems[so->numKilled].indexOffset =
-							ItemPointerGetOffsetNumber(&(so->hashso_curpos));
+				currItem = &so->currPos.items[so->currPos.itemIndex];
+				offnum = currItem->indexOffset;
+
+				so->killedItems[so->numKilled].heapTid = currItem->heapTid;
+				so->killedItems[so->numKilled].indexOffset = offnum;
 				so->numKilled++;
 			}
 		}
@@ -358,30 +317,10 @@ hashgettuple(IndexScanDesc scan, ScanDirection dir)
 		 */
 		res = _hash_next(scan, dir);
 	}
-	else
-		res = _hash_first(scan, dir);
-
-	/*
-	 * Skip killed tuples if asked to.
-	 */
-	if (scan->ignore_killed_tuples)
-	{
-		while (res)
-		{
-			offnum = ItemPointerGetOffsetNumber(current);
-			page = BufferGetPage(so->hashso_curbuf);
-			if (!ItemIdIsDead(PageGetItemId(page, offnum)))
-				break;
-			res = _hash_next(scan, dir);
-		}
-	}
-
-	/* Release read lock on current buffer, but keep it pinned */
-	if (BufferIsValid(so->hashso_curbuf))
-		LockBuffer(so->hashso_curbuf, BUFFER_LOCK_UNLOCK);
 
 	/* Return current heap TID on success */
-	scan->xs_ctup.t_self = so->hashso_heappos;
+	currItem = &so->currPos.items[so->currPos.itemIndex];
+	scan->xs_ctup.t_self = currItem->heapTid;
 
 	return res;
 }
@@ -396,35 +335,22 @@ hashgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
 	HashScanOpaque so = (HashScanOpaque) scan->opaque;
 	bool		res;
 	int64		ntids = 0;
+	HashScanPosItem *currItem;
 
 	res = _hash_first(scan, ForwardScanDirection);
 
 	while (res)
 	{
-		bool		add_tuple;
+		currItem = &so->currPos.items[so->currPos.itemIndex];
 
 		/*
-		 * Skip killed tuples if asked to.
+		 * _hash_first() and _hash_next() never return
+		 * dead tuples.  Therefore, we can always add
+		 * the tuples to the TIDBitmap without checking
+		 * whether each one is dead.
 		 */
-		if (scan->ignore_killed_tuples)
-		{
-			Page		page;
-			OffsetNumber offnum;
-
-			offnum = ItemPointerGetOffsetNumber(&(so->hashso_curpos));
-			page = BufferGetPage(so->hashso_curbuf);
-			add_tuple = !ItemIdIsDead(PageGetItemId(page, offnum));
-		}
-		else
-			add_tuple = true;
-
-		/* Save tuple ID, and continue scanning */
-		if (add_tuple)
-		{
-			/* Note we mark the tuple ID as requiring recheck */
-			tbm_add_tuples(tbm, &(so->hashso_heappos), 1, true);
-			ntids++;
-		}
+		tbm_add_tuples(tbm, &(currItem->heapTid), 1, true);
+		ntids++;
 
 		res = _hash_next(scan, ForwardScanDirection);
 	}
@@ -448,12 +374,9 @@ hashbeginscan(Relation rel, int nkeys, int norderbys)
 	scan = RelationGetIndexScan(rel, nkeys, norderbys);
 
 	so = (HashScanOpaque) palloc(sizeof(HashScanOpaqueData));
-	so->hashso_curbuf = InvalidBuffer;
+	HashScanPosInvalidate(so->currPos);
 	so->hashso_bucket_buf = InvalidBuffer;
 	so->hashso_split_bucket_buf = InvalidBuffer;
-	/* set position invalid (this will cause _hash_first call) */
-	ItemPointerSetInvalid(&(so->hashso_curpos));
-	ItemPointerSetInvalid(&(so->hashso_heappos));
 
 	so->hashso_buc_populated = false;
 	so->hashso_buc_split = false;
@@ -482,10 +405,6 @@ hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 
 	_hash_dropscanbuf(rel, so);
 
-	/* set position invalid (this will cause _hash_first call) */
-	ItemPointerSetInvalid(&(so->hashso_curpos));
-	ItemPointerSetInvalid(&(so->hashso_heappos));
-
 	/* Update scan key, if a new one is given */
 	if (scankey && scan->numberOfKeys > 0)
 	{
diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c
index 622cc4b..8515c28 100644
--- a/src/backend/access/hash/hashpage.c
+++ b/src/backend/access/hash/hashpage.c
@@ -298,20 +298,22 @@ _hash_dropscanbuf(Relation rel, HashScanOpaque so)
 {
 	/* release pin we hold on primary bucket page */
 	if (BufferIsValid(so->hashso_bucket_buf) &&
-		so->hashso_bucket_buf != so->hashso_curbuf)
+		so->hashso_bucket_buf != so->currPos.buf)
 		_hash_dropbuf(rel, so->hashso_bucket_buf);
-	so->hashso_bucket_buf = InvalidBuffer;
 
 	/* release pin we hold on primary bucket page  of bucket being split */
 	if (BufferIsValid(so->hashso_split_bucket_buf) &&
-		so->hashso_split_bucket_buf != so->hashso_curbuf)
+		so->hashso_split_bucket_buf != so->currPos.buf)
 		_hash_dropbuf(rel, so->hashso_split_bucket_buf);
 	so->hashso_split_bucket_buf = InvalidBuffer;
 
 	/* release any pin we still hold */
-	if (BufferIsValid(so->hashso_curbuf))
-		_hash_dropbuf(rel, so->hashso_curbuf);
-	so->hashso_curbuf = InvalidBuffer;
+	if (BufferIsValid(so->currPos.buf) &&
+		so->hashso_bucket_buf == so->currPos.buf)
+		_hash_dropbuf(rel, so->currPos.buf);
+
+	so->currPos.buf = InvalidBuffer;
+	so->hashso_bucket_buf = InvalidBuffer;
 
 	/* reset split scan */
 	so->hashso_buc_populated = false;
diff --git a/src/backend/access/hash/hashsearch.c b/src/backend/access/hash/hashsearch.c
index 2d92049..1f05b1f 100644
--- a/src/backend/access/hash/hashsearch.c
+++ b/src/backend/access/hash/hashsearch.c
@@ -20,44 +20,87 @@
 #include "pgstat.h"
 #include "utils/rel.h"
 
+static bool _hash_readpage(IndexScanDesc scan, Buffer *bufP,
+			ScanDirection dir);
+static inline void _hash_saveitem(HashScanOpaque so, int itemIndex,
+			OffsetNumber offnum, IndexTuple itup);
 
 /*
  *	_hash_next() -- Get the next item in a scan.
  *
- *		On entry, we have a valid hashso_curpos in the scan, and a
- *		pin and read lock on the page that contains that item.
- *		We find the next item in the scan, if any.
- *		On success exit, we have the page containing the next item
- *		pinned and locked.
+ *		On entry, so->currPos describes the current page, which may
+ *		be pinned but not locked, and so->currPos.itemIndex identifies
+ *		which item was previously returned.
+ *
+ *		On successful exit, scan->xs_ctup.t_self is set to the TID
+ *		of the next heap tuple, and if requested, scan->xs_itup
+ *		points to a copy of the index tuple.  so->currPos is updated
+ *		as needed.
+ *
+ *		On failure exit (no more tuples), we return FALSE with no
+ *		pins or locks held.
  */
 bool
 _hash_next(IndexScanDesc scan, ScanDirection dir)
 {
 	Relation	rel = scan->indexRelation;
 	HashScanOpaque so = (HashScanOpaque) scan->opaque;
+	HashScanPosItem *currItem;
+	BlockNumber blkno;
 	Buffer		buf;
-	Page		page;
-	OffsetNumber offnum;
-	ItemPointer current;
-	IndexTuple	itup;
-
-	/* we still have the buffer pinned and read-locked */
-	buf = so->hashso_curbuf;
-	Assert(BufferIsValid(buf));
+	bool        tuples_to_read;
 
 	/*
-	 * step to next valid tuple.
+	 * Advance to the next tuple on the current page; or, if there are
+	 * no more, try to read tuples from the next or previous page,
+	 * depending on the scan direction.  Before moving to another page,
+	 * make sure that we deal with all the killed items.
 	 */
-	if (!_hash_step(scan, &buf, dir))
-		return false;
+	if (ScanDirectionIsForward(dir))
+	{
+		if (++so->currPos.itemIndex > so->currPos.lastItem)
+		{
+			if (so->numKilled > 0)
+				_hash_kill_items(scan);
+
+			blkno = so->currPos.nextPage;
+			if (BlockNumberIsValid(blkno))
+			{
+				buf = _hash_getbuf(rel, blkno, HASH_READ, LH_OVERFLOW_PAGE);
+				so->currPos.buf = buf;
+				tuples_to_read = _hash_readpage(scan, &buf, dir);
+				if (!tuples_to_read)
+					return false;
+			}
+			else
+				return false;
+		}
+	}
+	else
+	{
+		if (--so->currPos.itemIndex < so->currPos.firstItem)
+		{
+			if (so->numKilled > 0)
+				_hash_kill_items(scan);
+
+			blkno = so->currPos.prevPage;
+			if (BlockNumberIsValid(blkno))
+			{
+				buf = _hash_getbuf(rel, blkno, HASH_READ,
+								   LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
+				so->currPos.buf = buf;
+				tuples_to_read = _hash_readpage(scan, &buf, dir);
+				if (!tuples_to_read)
+					return false;
+			}
+			else
+				return false;
+		}
+	}
 
-	/* if we're here, _hash_step found a valid tuple */
-	current = &(so->hashso_curpos);
-	offnum = ItemPointerGetOffsetNumber(current);
-	_hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
-	page = BufferGetPage(buf);
-	itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
-	so->hashso_heappos = itup->t_tid;
+	/* OK, itemIndex says what to return */
+	currItem = &so->currPos.items[so->currPos.itemIndex];
+	scan->xs_ctup.t_self = currItem->heapTid;
 
 	return true;
 }
@@ -212,11 +255,15 @@ _hash_readprev(IndexScanDesc scan,
 /*
  *	_hash_first() -- Find the first item in a scan.
  *
- *		Find the first item in the index that
- *		satisfies the qualification associated with the scan descriptor. On
- *		success, the page containing the current index tuple is read locked
- *		and pinned, and the scan's opaque data entry is updated to
- *		include the buffer.
+ *		We find the first item (or, in a backward scan, the last item)
+ *		in the index that satisfies the qualification associated with
+ *		the scan descriptor.  On success, the page containing the
+ *		current index tuple is read locked and pinned, data about the
+ *		matching tuple(s) on the page is loaded into so->currPos, and
+ *		scan->xs_ctup.t_self is set to the heap TID of the current tuple.
+ *
+ *		If there are no matching items in the index, we return FALSE,
+ *		with no pins or locks held.
  */
 bool
 _hash_first(IndexScanDesc scan, ScanDirection dir)
@@ -229,15 +276,9 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
 	Buffer		buf;
 	Page		page;
 	HashPageOpaque opaque;
-	IndexTuple	itup;
-	ItemPointer current;
-	OffsetNumber offnum;
 
 	pgstat_count_index_scan(rel);
 
-	current = &(so->hashso_curpos);
-	ItemPointerSetInvalid(current);
-
 	/*
 	 * We do not support hash scans with no index qualification, because we
 	 * would have to read the whole index rather than just one bucket. That
@@ -356,17 +397,15 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
 			_hash_readnext(scan, &buf, &page, &opaque);
 	}
 
-	/* Now find the first tuple satisfying the qualification */
-	if (!_hash_step(scan, &buf, dir))
-		return false;
+	/* remember which buffer we have pinned, if any */
+	Assert(BufferIsInvalid(so->currPos.buf));
+	so->currPos.buf = buf;
 
-	/* if we're here, _hash_step found a valid tuple */
-	offnum = ItemPointerGetOffsetNumber(current);
-	_hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
-	page = BufferGetPage(buf);
-	itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
-	so->hashso_heappos = itup->t_tid;
+	/* Now find all the tuples on the page that satisfy the qualification */
+	if (!_hash_readpage(scan, &buf, dir))
+		return false;
 
+	/* if we're here, _hash_readpage found at least one matching tuple */
 	return true;
 }
 
@@ -575,3 +614,208 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
 	ItemPointerSet(current, blkno, offnum);
 	return true;
 }
+
+/*
+ *	_hash_readpage() -- Load data from current index page into so->currPos
+ *
+ *	We scan all the items on the current index page and save those that
+ *	satisfy the qualification into so->currPos. If no matching items
+ *	are found on the current page, we move to the next or previous page
+ *	in the bucket chain, as indicated by the scan direction.
+ *
+ *	Returns true if any matching items are found, else false.
+ */
+static bool
+_hash_readpage(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
+{
+	Relation	rel = scan->indexRelation;
+	HashScanOpaque so = (HashScanOpaque) scan->opaque;
+	Buffer		buf;
+	Page		page;
+	HashPageOpaque	opaque;
+	OffsetNumber	maxoff;
+	OffsetNumber	offnum;
+	IndexTuple		itup;
+	uint16			itemIndex;
+
+	so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf);
+
+	buf = *bufP;
+	Assert(BufferIsValid(buf));
+	_hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
+	page = BufferGetPage(buf);
+	opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+
+	maxoff = PageGetMaxOffsetNumber(page);
+
+	if (ScanDirectionIsForward(dir))
+	{
+loop_top_fwd:
+		/* load items[] in ascending order */
+		itemIndex = 0;
+
+		/* new page, locate starting position by binary search */
+		offnum = _hash_binsearch(page, so->hashso_sk_hash);
+
+		while (offnum <= maxoff)
+		{
+			Assert(offnum >= FirstOffsetNumber);
+			itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
+
+			/*
+			 * Skip tuples that were moved by a split operation,
+			 * if this scan started while the split was in
+			 * progress.  Also, skip tuples that are marked as
+			 * dead.
+			 */
+			if ((so->hashso_buc_populated && !so->hashso_buc_split &&
+				(itup->t_info & INDEX_MOVED_BY_SPLIT_MASK)) ||
+				(scan->ignore_killed_tuples &&
+				(ItemIdIsDead(PageGetItemId(page, offnum)))))
+			{
+				offnum = OffsetNumberNext(offnum);	/* move forward */
+				continue;
+			}
+
+			if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup) &&
+				_hash_checkqual(scan, itup))
+			{
+				/* tuple is qualified, so remember it */
+				_hash_saveitem(so, itemIndex, offnum, itup);
+				itemIndex++;
+			}
+
+			offnum = OffsetNumberNext(offnum);
+		}
+
+		Assert(itemIndex <= MaxIndexTuplesPerPage);
+
+		if (itemIndex == 0)
+		{
+			/*
+			 * Could not find any matching tuples on the current page;
+			 * move to the next page.  Before leaving the current page,
+			 * also deal with any killed items.
+			 */
+			if (so->numKilled > 0)
+				_hash_kill_items(scan);
+
+			_hash_readnext(scan, &buf, &page, &opaque);
+			if (BufferIsValid(buf))
+			{
+				so->currPos.buf = buf;
+				so->currPos.currPage = BufferGetBlockNumber(buf);
+				maxoff = PageGetMaxOffsetNumber(page);
+				offnum = _hash_binsearch(page, so->hashso_sk_hash);
+				goto loop_top_fwd;
+			}
+			else
+				return false;
+		}
+		else
+		{
+			/* remember the next page before releasing the buffer */
+			so->currPos.nextPage = opaque->hasho_nextblkno;
+			if (so->currPos.buf == so->hashso_bucket_buf ||
+				so->currPos.buf == so->hashso_split_bucket_buf)
+				LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
+			else
+				_hash_relbuf(rel, so->currPos.buf);
+		}
+
+		so->currPos.firstItem = 0;
+		so->currPos.lastItem = itemIndex - 1;
+		so->currPos.itemIndex = 0;
+	}
+	else
+	{
+loop_top_bwd:
+		/* load items[] in descending order */
+		itemIndex = MaxIndexTuplesPerPage;
+
+		/* new page, locate starting position by binary search */
+		offnum = _hash_binsearch_last(page, so->hashso_sk_hash);
+
+		while (offnum >= FirstOffsetNumber)
+		{
+			Assert(offnum <= maxoff);
+			itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
+
+			/*
+			 * Skip tuples that were moved by a split operation,
+			 * if this scan started while the split was in
+			 * progress.  Also, skip tuples that are marked as
+			 * dead.
+			 */
+			if ((so->hashso_buc_populated && !so->hashso_buc_split &&
+				(itup->t_info & INDEX_MOVED_BY_SPLIT_MASK)) ||
+				(scan->ignore_killed_tuples &&
+				(ItemIdIsDead(PageGetItemId(page, offnum)))))
+			{
+				offnum = OffsetNumberPrev(offnum);	/* move back */
+				continue;
+			}
+
+			if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup) &&
+				_hash_checkqual(scan, itup))
+			{
+				itemIndex--;
+				/* tuple is qualified, so remember it */
+				_hash_saveitem(so, itemIndex, offnum, itup);
+			}
+
+			offnum = OffsetNumberPrev(offnum);
+		}
+
+		Assert(itemIndex >= 0);
+
+		if (itemIndex == MaxIndexTuplesPerPage)
+		{
+			/*
+			 * Could not find any matching tuples on the current page;
+			 * move to the previous page.  Before leaving the current
+			 * page, also deal with any killed items.
+			 */
+			if (so->numKilled > 0)
+				_hash_kill_items(scan);
+
+			_hash_readprev(scan, &buf, &page, &opaque);
+			if (BufferIsValid(buf))
+			{
+				so->currPos.buf = buf;
+				so->currPos.currPage = BufferGetBlockNumber(buf);
+				maxoff = PageGetMaxOffsetNumber(page);
+				offnum = _hash_binsearch_last(page, so->hashso_sk_hash);
+				goto loop_top_bwd;
+			}
+			else
+				return false;
+		}
+		else
+		{
+			/* remember the prev page before releasing the buffer */
+			so->currPos.prevPage = opaque->hasho_prevblkno;
+			if (so->currPos.buf == so->hashso_bucket_buf ||
+				so->currPos.buf == so->hashso_split_bucket_buf)
+				LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
+			else
+				_hash_relbuf(rel, so->currPos.buf);
+		}
+
+		so->currPos.firstItem = itemIndex;
+		so->currPos.lastItem = MaxIndexTuplesPerPage - 1;
+		so->currPos.itemIndex = MaxIndexTuplesPerPage - 1;
+	}
+
+	return (so->currPos.firstItem <= so->currPos.lastItem);
+}
+
+/* Save an index item into so->currPos.items[itemIndex] */
+static inline void
+_hash_saveitem(HashScanOpaque so, int itemIndex,
+			   OffsetNumber offnum, IndexTuple itup)
+{
+	HashScanPosItem *currItem = &so->currPos.items[itemIndex];
+
+	currItem->heapTid = itup->t_tid;
+	currItem->indexOffset = offnum;
+}
diff --git a/src/backend/access/hash/hashutil.c b/src/backend/access/hash/hashutil.c
index 2e99719..ecda225 100644
--- a/src/backend/access/hash/hashutil.c
+++ b/src/backend/access/hash/hashutil.c
@@ -463,6 +463,9 @@ void
 _hash_kill_items(IndexScanDesc scan)
 {
 	HashScanOpaque	so = (HashScanOpaque) scan->opaque;
+	Relation    rel = scan->indexRelation;
+	BlockNumber	blkno;
+	Buffer	buf;
 	Page	page;
 	HashPageOpaque	opaque;
 	OffsetNumber	offnum, maxoff;
@@ -479,7 +482,19 @@ _hash_kill_items(IndexScanDesc scan)
 	 */
 	so->numKilled = 0;
 
-	page = BufferGetPage(so->hashso_curbuf);
+	blkno = so->currPos.currPage;
+	if (so->hashso_bucket_buf == so->currPos.buf)
+	{
+		buf = so->currPos.buf;
+		LockBuffer(buf, BUFFER_LOCK_SHARE);
+	}
+	else
+	{
+		Assert(BlockNumberIsValid(blkno));
+		buf = _hash_getbuf(rel, blkno, HASH_READ, LH_OVERFLOW_PAGE);
+	}
+
+	page = BufferGetPage(buf);
 	opaque = (HashPageOpaque) PageGetSpecialPointer(page);
 	maxoff = PageGetMaxOffsetNumber(page);
 
@@ -511,6 +526,10 @@ _hash_kill_items(IndexScanDesc scan)
 	if (killedsomething)
 	{
 		opaque->hasho_flag |= LH_PAGE_HAS_DEAD_TUPLES;
-		MarkBufferDirtyHint(so->hashso_curbuf, true);
+		MarkBufferDirtyHint(buf, true);
 	}
+	if (so->hashso_bucket_buf == so->currPos.buf)
+		LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
+	else
+		_hash_relbuf(rel, buf);
 }
diff --git a/src/include/access/hash.h b/src/include/access/hash.h
index eb1df57..3b01e3e 100644
--- a/src/include/access/hash.h
+++ b/src/include/access/hash.h
@@ -103,6 +103,44 @@ typedef struct HashScanPosItem    /* what we remember about each match */
 	OffsetNumber indexOffset;	/* index item's location within page */
 } HashScanPosItem;
 
+typedef struct HashScanPosData
+{
+	Buffer		buf;			/* if valid, the buffer is pinned */
+	BlockNumber currPage;		/* current hash index page */
+	BlockNumber nextPage;		/* next overflow page */
+	BlockNumber prevPage;		/* prev overflow or bucket page */
+
+	/*
+	 * The items array is always ordered in index order (ie, increasing
+	 * indexOffset).  When scanning backwards it is convenient to fill the
+	 * array back-to-front, so we start at the last slot and fill downwards.
+	 * Hence we need both a first-valid-entry and a last-valid-entry counter.
+	 * itemIndex is a cursor showing which entry was last returned to caller.
+	 */
+	int			firstItem;		/* first valid index in items[] */
+	int			lastItem;		/* last valid index in items[] */
+	int			itemIndex;		/* current index in items[] */
+
+	HashScanPosItem items[MaxIndexTuplesPerPage];		/* MUST BE LAST */
+}	HashScanPosData;
+
+#define HashScanPosIsValid(scanpos) \
+( \
+	AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
+				!BufferIsValid((scanpos).buf)), \
+	BlockNumberIsValid((scanpos).currPage) \
+)
+
+#define HashScanPosInvalidate(scanpos) \
+	do { \
+		(scanpos).buf = InvalidBuffer; \
+		(scanpos).currPage = InvalidBlockNumber; \
+		(scanpos).nextPage = InvalidBlockNumber; \
+		(scanpos).prevPage = InvalidBlockNumber; \
+		(scanpos).firstItem = 0; \
+		(scanpos).lastItem = 0; \
+		(scanpos).itemIndex = 0; \
+	} while (0)
 
 /*
  *	HashScanOpaqueData is private state for a hash index scan.
@@ -147,6 +185,12 @@ typedef struct HashScanOpaqueData
 	/* info about killed items if any (killedItems is NULL if never used) */
 	HashScanPosItem	*killedItems;	/* tids and offset numbers of killed items */
 	int			numKilled;			/* number of currently stored items */
+
+	/*
+	 * All the matching items found on the current page are saved in
+	 * currPos, along with the scan's position within them.
+	 */
+	HashScanPosData	currPos;		/* current position data */
 } HashScanOpaqueData;
 
 typedef HashScanOpaqueData *HashScanOpaque;
-- 
1.8.3.1
